# 1.导入工具库

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from datetime import timedelta, datetime
from sklearn.feature_selection import chi2, SelectPercentile
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(
    level = logging.WARNING,
    format = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
)

# 2.数据集

|数据类别|变量|数据类型|解释|
|-------|-----|-----|-----|
|基本数据|sid|string|样本id/请求会话sid|
|基本数据|label|int|是否作弊|
|媒体信息|pkgname|string|包名(MD5加密)|
|媒体信息|ver|string|app版本|
|媒体信息|adunitshowid|string|对外广告位ID(MD5加密)|
|媒体信息|mediashowid|string|对外媒体ID(MD5加密)|
|媒体信息|apptype|int|app所属分类|
|时间|nginxtime|bigint|请求到达服务时间, 单位ms|
|IP信息|ip|string|客户端IP地址|
|IP信息|city|string|城市|
|IP信息|province|string|省份|
|IP信息|reqrealip|string|请求的http协议头携带IP, 有可能是下|游服务器的ip|
|设备信息|adidmd5|string|Adroid ID的MD5值|
|设备信息|imeimd5|string|imei的MD5值|
|设备信息|idfamd5|string|idfa的MD5值|
|设备信息|openudidmd5|string|openudid的MD5值|
|设备信息|macmd5|string|mac的MD5值|
|设备信息|dvctype|int|设备类型 0–未知,1-PC,2–手机,3–平板,4–电视盒|
|       |       |   |5–智能电视,6–可穿戴设备,7–智能家电,8-音箱,9-智能硬件|
|设备信息|model|string|机型|
|设备信息|make|string|厂商|
|设备信息|ntt|int|网络类型 0-未知,1-有线网,2-WIFI,3-蜂窝网络未知,4-2G,5-3G,6–4G|
|设备信息|carrier|string|运营商 0-未知, 46000-移动, |46001-联通, 46003-电信|
|设备信息|os|string|操作系统 : Android, iOS|
|设备信息|osv|string|操作系统版本|
|设备信息|orientation|int|横竖屏:0竖屏, 1横屏|
|设备信息|lan|string|语言|
|设备信息|h|int|设备高|
|设备信息|w|int|设备宽|
|设备信息|ppi|int|屏幕密度|

# 3.评价指标

宏平均 F1-score:

- TP: 正确预测作弊记录
- FP: 错将正常记录预测为作弊记录
- FN: 作弊记录预测为非作弊记录
- $precision = \frac{TP}{TP + FP}$
- $recall = \frac{TP}{TP + FN}$
- $F_{1} = \frac{2 \times precision \times recall}{precision + recall}$

# 4.读取数据集

In [2]:
# 读取 train data 和 test data
path = "/Users/zfwang/machinelearning/datasets/aichallenge_2019_ad_fraud/"
train_data = pd.read_csv(path + "round1_iflyad_anticheat_traindata.txt", sep = "\t")
test_data = pd.read_csv(path + "round1_iflyad_anticheat_testdata_feature.txt", sep = "\t")

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 29 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   sid           1000000 non-null  object 
 1   label         1000000 non-null  int64  
 2   pkgname       1000000 non-null  object 
 3   ver           678176 non-null   object 
 4   adunitshowid  1000000 non-null  object 
 5   mediashowid   1000000 non-null  object 
 6   apptype       1000000 non-null  float64
 7   nginxtime     1000000 non-null  float64
 8   ip            1000000 non-null  object 
 9   city          984807 non-null   object 
 10  province      1000000 non-null  float64
 11  reqrealip     1000000 non-null  object 
 12  adidmd5       1000000 non-null  object 
 13  imeimd5       1000000 non-null  object 
 14  idfamd5       1000000 non-null  object 
 15  openudidmd5   1000000 non-null  object 
 16  macmd5        1000000 non-null  object 
 17  dvctype       1000000 non-nu

In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   sid           100000 non-null  object 
 1   pkgname       100000 non-null  object 
 2   ver           70808 non-null   object 
 3   adunitshowid  100000 non-null  object 
 4   mediashowid   100000 non-null  object 
 5   apptype       100000 non-null  float64
 6   nginxtime     100000 non-null  float64
 7   ip            100000 non-null  object 
 8   city          98533 non-null   object 
 9   province      100000 non-null  float64
 10  reqrealip     100000 non-null  object 
 11  adidmd5       100000 non-null  object 
 12  imeimd5       100000 non-null  object 
 13  idfamd5       100000 non-null  object 
 14  openudidmd5   100000 non-null  object 
 15  macmd5        100000 non-null  object 
 16  dvctype       100000 non-null  float64
 17  model         99905 non-null   object 
 18  make 

In [5]:
train_data.describe()

Unnamed: 0,label,apptype,nginxtime,province,dvctype,ntt,carrier,orientation,h,w,ppi
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.482894,296.17687,1559815000000.0,0.429006,1.636588,3.089714,40016.597331,0.279626,1264.178138,703.122219,71.740154
std,0.499708,42.996455,168161700.0,6.116586,0.814882,1.84268,15474.197096,1.040451,853.682627,506.04734,167.470664
min,0.0,-1.0,1559491000000.0,-1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0
25%,0.0,273.0,1559664000000.0,-1.0,2.0,2.0,46000.0,0.0,720.0,360.0,0.0
50%,0.0,301.0,1559816000000.0,-1.0,2.0,2.0,46000.0,0.0,1280.0,720.0,0.0
75%,1.0,326.0,1559965000000.0,-1.0,2.0,5.0,46000.0,1.0,2040.0,1080.0,0.0
max,1.0,381.0,1560096000000.0,34.0,3.0,7.0,46003.0,90.0,9024.0,9024.0,720.0


In [6]:
test_data.describe()

Unnamed: 0,apptype,nginxtime,province,dvctype,ntt,carrier,orientation,h,w,ppi
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,293.61359,1560147000000.0,0.52118,1.69787,3.20868,40043.25535,0.28422,1324.67806,726.28144,76.32633
std,42.307144,22901880.0,6.28967,0.793016,1.869681,15444.931973,1.185316,839.494123,488.670876,171.341531
min,-1.0,1560096000000.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0
25%,273.0,1560131000000.0,-1.0,2.0,2.0,46000.0,0.0,760.0,360.0,0.0
50%,301.0,1560150000000.0,-1.0,2.0,2.0,46000.0,0.0,1424.0,720.0,0.0
75%,319.0,1560167000000.0,-1.0,2.0,6.0,46000.0,1.0,2130.0,1080.0,0.0
max,381.0,1560182000000.0,34.0,3.0,7.0,46003.0,90.0,3180.0,2960.0,1000.0


In [7]:
train_data.head()

Unnamed: 0,sid,label,pkgname,ver,adunitshowid,mediashowid,apptype,nginxtime,ip,city,...,make,ntt,carrier,os,osv,orientation,lan,h,w,ppi
0,d7460126-e071-4979-9ee8-42f72777a28a-156009070...,1,2d2664e827bcbb8b07100c7fbe072e9b,11.11.0,907d0f8c29663840491577a21c7b612a,ca64a500000d84c8fcb8a0587d0e1e0c,280.0,1560091000000.0,183.197.47.83,石家庄市,...,vivo,2.0,46000.0,Android,8.1.0,0.0,zh-CN,2340.0,1080.0,3.0
1,b660d559-db97-4b5f-9bd2-2450cb89ce77-156005074...,1,empty,,10199dc8fea2e173525bc3151bd84312,3f2fc57a6e1f9c6fa4464c25cc1e88a3,319.0,1560051000000.0,106.34.14.149,开封市,...,HUAWEI,5.0,0.0,android,Android_9,0.0,,1080.0,2040.0,0.0
2,f49a740e-66c3-4605-9b67-4d3079fe69cb-156008914...,0,16b81f93f4b1a35cebbf15f07683f171,3.2.1.0524.1958,83f2ecfe65f936f5f2ed59f8e8ff1d01,eea7280e1a2313e4c2e89290b01d196c,273.0,1560089000000.0,223.104.16.151,长春市,...,OPPO,2.0,46000.0,Android,8.1.0,1.0,zh-CN,2196.0,1080.0,0.0
3,fd60d096-f168-4540-b782-729d64d0fcc6-156006253...,0,empty,4.7.5,9f1eadd9092b19bc86ee0cacde1c867f,eec946a5a66c023ec9d3b2ede5900626,265.0,1560063000000.0,223.104.239.101,曲靖市,...,,6.0,0.0,android,7.0,0.0,,0.0,0.0,0.0
4,a037b032-a5c7-40ea-9161-26b118b12406-156007938...,1,cf4821986014a7fef9d7b7ad8de655e4,228,2af944462e43cd2f59acbbfd37445413,57b3053174973702549ba88b6017ac30,336.0,1560079000000.0,220.152.155.170,深圳市,...,EML-AL00,2.0,46000.0,Android,9,0.0,Zh-CN,2244.0,1080.0,0.0


In [8]:
test_data.head()

Unnamed: 0,sid,pkgname,ver,adunitshowid,mediashowid,apptype,nginxtime,ip,city,province,...,make,ntt,carrier,os,osv,orientation,lan,h,w,ppi
0,4601f528-865c-4238-a43f-cf4a4e11027d-156017418...,empty,190516,51f02f6007e49301adb4ccb1da0fbf12,5f83d3d963b19352aff3d9b5b29fa736,273.0,1560174000000.0,117.136.86.37,西安市,-1.0,...,HUAWEI,6.0,46000.0,android,9,0.0,zh_CN,2137.0,1080.0,0.0
1,c727e4bf-2ae1-4829-9aa1-730277ee2042-156013863...,empty,4.2.0,96b536f3322df1404342c933fe597569,4d24206fcbc095ad0c27c7dd10e83047,318.0,1560139000000.0,110.52.34.129,株洲市,-1.0,...,OPPO,2.0,46003.0,android,6.0,0.0,,1080.0,1920.0,480.0
2,fe23d40a-0db8-403f-8df0-6e8b9ea461c3-156009638...,ae90f961dbafe0f78f8a1cd7775362ed,,0d116d0416862dea0ff81905de4df513,fff9eaef85a611570dda886b390fee8a,281.0,1560096000000.0,120.229.25.233,深圳市,-1.0,...,HUAWEI,2.0,-1.0,android,3.9.0,0.0,,0.0,0.0,0.0
3,308b053c-5d7c-43b8-99dc-8aad647f953f-156017652...,empty,190516,51f02f6007e49301adb4ccb1da0fbf12,5f83d3d963b19352aff3d9b5b29fa736,273.0,1560177000000.0,117.136.19.239,无锡市,-1.0,...,OPPO,6.0,46000.0,android,8.1.0,0.0,zh_CN,2280.0,1080.0,0.0
4,94477644-63cd-4f3c-9b2a-f3d25f1ec7e2-156014459...,170a88a12e36f8a0f1b73442304398b1,30927000,6618408bae888ad02c4e0d9bd4d88d29,d53d2af198ebef9544f0823c3c8e84f8,301.0,1560145000000.0,218.89.52.71,南充市,-1.0,...,SMARTISAN,2.0,46000.0,Android,7.1.2,1.0,zh-CN,720.0,360.0,0.0


In [9]:
# 合并 df_train 和 df_test
df_uni = pd.concat([train_data, test_data], ignore_index = True)
df_uni["label"] = df_uni["label"].fillna(-1).astype(int)
df_uni.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 29 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   sid           1100000 non-null  object 
 1   label         1100000 non-null  int64  
 2   pkgname       1100000 non-null  object 
 3   ver           748984 non-null   object 
 4   adunitshowid  1100000 non-null  object 
 5   mediashowid   1100000 non-null  object 
 6   apptype       1100000 non-null  float64
 7   nginxtime     1100000 non-null  float64
 8   ip            1100000 non-null  object 
 9   city          1083340 non-null  object 
 10  province      1100000 non-null  float64
 11  reqrealip     1100000 non-null  object 
 12  adidmd5       1100000 non-null  object 
 13  imeimd5       1100000 non-null  object 
 14  idfamd5       1100000 non-null  object 
 15  openudidmd5   1100000 non-null  object 
 16  macmd5        1100000 non-null  object 
 17  dvctype       1100000 non-n

# 5.特征工程

## 5.1 特征分类

In [10]:
# 连续型特征
num_cols = [
    "reqrealip", 
]
# 类别型特征
cat_cols = [
    'pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype', 'ip',
    'city', 'province', 'adidmd5', 'imeimd5', 'idfamd5',
    'openudidmd5', 'macmd5', 'dvctype', 'model', 'make', 'ntt',
    'carrier', 'os', 'osv', 'orientation', 'lan', 'h', 'w', 'ppi'
]
# 待删除特征
drop_cols = ['sid', 'label', 'nginxtime']

# 对特征样本的计数
value_counts_col = [
    'pkgname', 'adunitshowid', 'ip', 'reqrealip',
    'adidmd5', 'imeimd5', 'idfamd5', 'macmd5'
]

## 5.2 缺失值填充

In [11]:
# 对含有缺失值的特征用`null_value`进行填充
for cat_col in cat_cols:
    if df_uni[cat_col].isnull().sum() > 0:
        df_uni[cat_col].fillna('null_value', inplace = True)

In [12]:
df_uni.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 29 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   sid           1100000 non-null  object 
 1   label         1100000 non-null  int64  
 2   pkgname       1100000 non-null  object 
 3   ver           1100000 non-null  object 
 4   adunitshowid  1100000 non-null  object 
 5   mediashowid   1100000 non-null  object 
 6   apptype       1100000 non-null  float64
 7   nginxtime     1100000 non-null  float64
 8   ip            1100000 non-null  object 
 9   city          1100000 non-null  object 
 10  province      1100000 non-null  float64
 11  reqrealip     1100000 non-null  object 
 12  adidmd5       1100000 non-null  object 
 13  imeimd5       1100000 non-null  object 
 14  idfamd5       1100000 non-null  object 
 15  openudidmd5   1100000 non-null  object 
 16  macmd5        1100000 non-null  object 
 17  dvctype       1100000 non-n

## 5.3 生成特征

In [13]:
print("=" * 23)
print("generate some features:")
print("=" * 23)

def gen_value_counts(data, col):
    print('value counts', col)
    df_tmp = pd.DataFrame(data[col].value_counts().reset_index())
    df_tmp.columns = [col, 'tmp']
    r = pd.merge(data, df_tmp, how = 'left', on = col)['tmp']
    return r.fillna(0)

for col in value_counts_col:
    df_uni['vc_' + col] = gen_value_counts(df_uni, col)

print("=" * 23)
print("当前df_uni的形状: ")
print("=" * 23)
print(df_uni.shape)

print("=" * 23)
print("当前df_uni的信息: ")
print("=" * 23)
print(df_uni.info())

generate some features:
value counts pkgname
value counts adunitshowid
value counts ip
value counts reqrealip
value counts adidmd5
value counts imeimd5
value counts idfamd5
value counts macmd5
当前df_uni的形状: 
(1100000, 37)
当前df_uni的信息: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 37 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   sid              1100000 non-null  object 
 1   label            1100000 non-null  int64  
 2   pkgname          1100000 non-null  object 
 3   ver              1100000 non-null  object 
 4   adunitshowid     1100000 non-null  object 
 5   mediashowid      1100000 non-null  object 
 6   apptype          1100000 non-null  float64
 7   nginxtime        1100000 non-null  float64
 8   ip               1100000 non-null  object 
 9   city             1100000 non-null  object 
 10  province         1100000 non-null  float64
 11  reqrealip        110000

## 5.4 特征转换

In [14]:
print("=" * 23)
print('cut features:')
print("=" * 23)
def cut_col(data, col_name, cut_list):
    """
    
    """
    print('cutting', col_name)
    def _trans(array):
        count = array['box_counts']
        for box in cut_list:
            if count <= box:
                return 'count_' + str(box)
        return array[col_name]

    df_counts = pd.DataFrame(data[col_name].value_counts())
    df_counts.columns = ['box_counts']
    df_counts[col_name] = df_counts.index
    df = pd.merge(data, df_counts, on = col_name, how = 'left')
    column = df.apply(_trans, axis = 1)
    return column

cut_col_dict = {
    ('pkgname', 'ver', 'reqrealip', 'adidmd5', 'imeimd5', 'openudidmd5', 'macmd5', 'model', 'make'): [3],
    ('ip',): [3, 5, 10],
}
for cut_cols, cut_list in cut_col_dict.items():
    for col in cut_cols:
        df_uni[col] = cut_col(df_uni, col, cut_list)

        
print("=" * 23)
print("当前df_uni的信息: ")
print("=" * 23)
print(df_uni.info())

cut features:
cutting pkgname
cutting ver
cutting reqrealip
cutting adidmd5
cutting imeimd5
cutting openudidmd5
cutting macmd5
cutting model
cutting make
cutting ip
当前df_uni的信息: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 37 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   sid              1100000 non-null  object 
 1   label            1100000 non-null  int64  
 2   pkgname          1100000 non-null  object 
 3   ver              1100000 non-null  object 
 4   adunitshowid     1100000 non-null  object 
 5   mediashowid      1100000 non-null  object 
 6   apptype          1100000 non-null  float64
 7   nginxtime        1100000 non-null  float64
 8   ip               1100000 non-null  object 
 9   city             1100000 non-null  object 
 10  province         1100000 non-null  float64
 11  reqrealip        1100000 non-null  object 
 12  adidmd5          1100000 non-nu

## 5.5 日期特征处理

In [15]:
print("=" * 23)
print('feature time')
print("=" * 23)
# 将`请求到达服务时间`转换为"s",并且转换为北京时间
df_uni['datetime'] = pd.to_datetime(df_uni['nginxtime'] / 1000, unit = 's') + timedelta(hours = 8)

# 提取`请求到达服务时间`的`小时`
df_uni['hour'] = df_uni['datetime'].dt.hour

# 计算当前`请求到达服务时间`距离第一次请求到达服务时间的天数
df_uni['day'] = df_uni['datetime'].dt.day - df_uni['datetime'].dt.day.min()

# 生成`hour(时)`特征
cat_cols += ['hour']
drop_cols += ['datetime', 'day']

print("=" * 23)
print("当前df_uni的信息: ")
print("=" * 23)
print(df_uni.info())

feature time
当前df_uni的信息: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 40 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   sid              1100000 non-null  object        
 1   label            1100000 non-null  int64         
 2   pkgname          1100000 non-null  object        
 3   ver              1100000 non-null  object        
 4   adunitshowid     1100000 non-null  object        
 5   mediashowid      1100000 non-null  object        
 6   apptype          1100000 non-null  float64       
 7   nginxtime        1100000 non-null  float64       
 8   ip               1100000 non-null  object        
 9   city             1100000 non-null  object        
 10  province         1100000 non-null  float64       
 11  reqrealip        1100000 non-null  object        
 12  adidmd5          1100000 non-null  object        
 13  imeimd5          1100000 non-n

## 5.6 

In [16]:
print("=" * 23)
print('post process')
print("=" * 23)
for col in cat_cols:
    df_uni[col] = df_uni[col].map(dict(zip(df_uni[col].unique(), range(0, df_uni[col].nunique()))))

print("=" * 23)
print("当前df_uni的信息: ")
print("=" * 23)
print(df_uni.info())

post process
当前df_uni的信息: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100000 entries, 0 to 1099999
Data columns (total 40 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   sid              1100000 non-null  object        
 1   label            1100000 non-null  int64         
 2   pkgname          1100000 non-null  int64         
 3   ver              1100000 non-null  int64         
 4   adunitshowid     1100000 non-null  int64         
 5   mediashowid      1100000 non-null  int64         
 6   apptype          1100000 non-null  int64         
 7   nginxtime        1100000 non-null  float64       
 8   ip               1100000 non-null  int64         
 9   city             1100000 non-null  int64         
 10  province         1100000 non-null  int64         
 11  reqrealip        1100000 non-null  object        
 12  adidmd5          1100000 non-null  int64         
 13  imeimd5          1100000 non-n

## 5.7 重新创建 train, validation, test 数据集

In [17]:
all_train_index = (df_uni['day'] <= 6).values
train_index     = (df_uni['day'] <= 5).values
valid_index     = (df_uni['day'] == 6).values
test_index      = (df_uni['day'] == 7).values
train_label     = (df_uni['label']).values
print("all_train_index.shape:", all_train_index.shape)
print("train_index.shape:", train_index.shape)
print("valid_index.shape:", valid_index.shape)
print("test_index.shape:", test_index.shape)
print("train_label.shape:", train_label.shape)

all_train_index.shape: (1100000,)
train_index.shape: (1100000,)
valid_index.shape: (1100000,)
test_index.shape: (1100000,)
train_label.shape: (1100000,)


## 5.8 删除无用特征

In [18]:
for col in drop_cols:
    if col in df_uni.columns:
        df_uni.drop([col], axis=1, inplace=True)
print("=" * 23)
print("当前df_uni的形状: ")
print("=" * 23)
print(df_uni.shape)

当前df_uni的形状: 
(1100000, 35)


## 5.9 类别特征One-Hot编码

In [19]:
ohe = OneHotEncoder()
mtx_cat = ohe.fit_transform(df_uni[cat_cols])

num_cols = list(set(df_uni.columns).difference(set(cat_cols)))
mtx_num = sparse.csr_matrix(df_uni[num_cols].astype(float).values)
mtx_uni = sparse.hstack([mtx_num, mtx_cat])
mtx_uni = mtx_uni.tocsr()

ValueError: could not convert string to float: '180.76.181.7'

## 5.10 低方差特征删除

In [None]:
def col_filter(mtx_train, y_train, mtx_test, func=chi2, percentile=90):
    feature_select = SelectPercentile(func, percentile=percentile)
    feature_select.fit(mtx_train, y_train)
    mtx_train = feature_select.transform(mtx_train)
    mtx_test = feature_select.transform(mtx_test)
    return mtx_train, mtx_test

all_train_x, test_x = col_filter(
    mtx_uni[all_train_index, :],
    train_label[all_train_index],
    mtx_uni[test_index, :]
)

## 5.11 模型数据准备

In [None]:
train_x = all_train_x[train_index[:all_train_x.shape[0]], :]
train_y = train_label[train_index]

val_x = all_train_x[valid_index[:all_train_x.shape[0]], :]
val_y = train_label[valid_index]

# 6.训练模型

## 6.1 定义模型评估指标(metric)

In [None]:
# 混淆矩阵
def confusion_matrix():
    pass

# 精度和召回率
def precision_recall():
    pass

# f1 score
def lgb_f1(labels, preds):
    score = f1_score(labels, np.round(preds))
    return 'f1', score, True

## 6.2 模型训练

### 6.2.1 LightGBM

In [None]:
print("=" * 23)
print('training...')
print("=" * 23)

# models
lgb = LGBMClassifier(random_seed = 2019, 
                     n_jobs = -1, 
                     objective = 'binary',
                     learning_rate = 0.1, 
                     n_estimators = 4000, 
                     num_leaves = 64, 
                     max_depth = -1,
                     min_child_samples = 20, 
                     min_child_weight = 9, 
                     subsample_freq = 1,
                     subsample = 0.8, 
                     colsample_bytree = 0.8, 
                     reg_alpha = 1, 
                     reg_lambda = 5)

lgb.fit(
    train_x,
    train_y,
    eval_set = [(train_x, train_y), (val_x, val_y)],
    eval_names = ['train', 'val'],
    eval_metric = lgb_f1,
    early_stopping_rounds = 100,
    verbose = 10,
)

print('best score', lgb.best_score_)

### 6.2.1 XGBoost

In [None]:
xgbc = XGBClassifier(random_seed = 2019,
                     n_jobs = -1,
                     objective = "binary",
                     learning_rate = 0.1,
                     n_estimators = 4000,
                     num_leaves = 64,
                     max_depth = -1,
                     min_child_samples = 20)

# 7.模型预测

In [None]:
print("=" * 23)
print("predict:")
print("=" * 23)
all_train_y = train_label[all_train_index]
lgb.n_estimators = lgb.best_iteration_
lgb.fit(all_train_x, all_train_y)
test_y = lgb.predict(test_x)

# 8.生成submission文件

In [None]:
print("=" * 23)
print("generate submission file")
print("=" * 23)
df_sub = pd.concat([df_test["sid"], pd.Series(test_y)], axis = 1)
df_sub.columns = ["sid", "label"]
df_sub.to_csv("submit-{}.csv".format(datetime.now().strftime("%m%d_%H%M%S")), 
              sep = ",", 
              index = False)