## 导入库、数据

基本上结构化数据，导入这些库（无脑导入~）

In [20]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use("ggplot")      
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [21]:
# 数据读取，删除无效列

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test1.csv')

train.drop(['Unnamed: 0'],axis=1,inplace=True)
test.drop(['Unnamed: 0'],axis=1,inplace=True)
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [22]:
train.describe()
train.columns,len(train.columns)-1

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,media_id,ntt,package,sid,timestamp,location,fea1_hash,cus_type
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,261359.275126,986.64011,40028.788034,1264.986626,72.027966,703.486166,0.48448,124.08762,3.089808,38.465876,1500335.0,1559814000000.0,96.040504,2300866000.0,730.824682
std,233616.172774,128.956348,15460.788899,853.37133,167.66493,505.751343,0.49976,164.25454,1.843088,136.321129,288429.2,168073500.0,85.65274,1236593000.0,331.946854
min,0.0,95.0,-1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1000005.0,1559491000000.0,-1.0,12400.0,297.0
25%,0.0,917.0,46000.0,720.0,0.0,360.0,0.0,29.0,2.0,0.0,1250850.0,1559664000000.0,23.0,1376752000.0,411.0
50%,228563.0,1001.0,46000.0,1280.0,0.0,720.0,0.0,64.0,2.0,7.0,1500358.0,1559816000000.0,64.0,2490131000.0,658.0
75%,465701.5,1076.0,46000.0,2040.0,0.0,1080.0,1.0,139.0,5.0,24.0,1750028.0,1559964000000.0,154.0,3062465000.0,1019.0
max,709898.0,1241.0,46003.0,9024.0,720.0,8832.0,1.0,1544.0,7.0,2327.0,1999999.0,1560096000000.0,330.0,4291920000.0,1380.0


(Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
        'dev_width', 'label', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package',
        'sid', 'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
        'cus_type'],
       dtype='object'),
 19)

In [23]:
# 特征取值类别数
category_nums = []
for col in train.columns:
    category_nums.append(train[col].nunique())

# 类别数小于200的  
pd.DataFrame({'col':train.columns,'category_nums':category_nums}).sort_values(by=['category_nums'])

Unnamed: 0,col,category_nums
10,os,2
6,label,2
2,carrier,5
9,ntt,8
7,lan,21
...,...,...
18,fea1_hash,4959
0,android_id,362258
16,fea_hash,402980
13,sid,500000


In [24]:
# 训练集、测试集拼接，一起处理特征
data = pd.concat([train,test])

## 特征工程

结构化数据特征工程，主要的方向就是针对有效特征做的，有一个很管用的方法就是，用一个简单的基模型比如决策树、LR等这些跑下baseline，输出feature_importance，大致方向就是针对前几的特征重点做。所以以下特征工程也是基于上述的思路的结果，feature_importance的过程可以加上去（当时未记录这些模块，可动手尝试下）

In [25]:
# 操作系统版本，可以合并成少数几个类
data.osv.unique()

array(['9', '8.1', '8.1.0', '8.0.0', '5.1', '9.0.0', '7.1.1', '5.1.1',
       '7.8.7', '4.4.4', nan, '6.0', '6.0.1', '3.9.0', 'Android_8.1.0',
       '7.0', '7.9.2', 'Android_9', '7.9.0', '7.1.2', '3.8.6', '5.1.0',
       '9.0', '7.7.7', '6.0.0', '5.0.2', 'Android_6.0.1', '7.0.0',
       '6.0_23', '4.2.2', '7.1', '8.0', '4.3', 'Android_8.0.0', '4.4.2',
       '4.1', 'Android_7.1.1', '5.0', 'Android_5.1.1', '2.3', '5.0.0',
       '7.8.0', '5.0.1', '4.4', '7.8.6', 'Android_5.1', 'Android_7.1.2',
       'Android_4.4.4', '8.0.1', '3.1.1', '7.7.2', '6.1', '3.8.4',
       '7.8.9', '7.7.5', '7.6.8', 'Android_7.0', '7.8.4', '7', '7.7.3',
       '7.1.0', '9.1', '4.1.1', 'Android_4.4.2', '3.8.0', '6.1.2',
       '7.2.1', '7.6.9', '3.7.8', '7.8.2', '8', '7910', '7.6.4', '7.8.5',
       '2.2.3', '5.2', 'Android_6.0', '4.2.1', '10.3.3', '4.0.2', '7.8.8',
       '4.4.3', '21100', '7.7.0', '4.0.3', '4.0.4', '5', '2.3.6', '6.1.0',
       '21000', '4.3.0', '4.1.2', '4.2', '7.6.7', '9.0.5', '4.4.2_19',


In [26]:
def handle_osv(osv):
    osv = str(osv).replace('.','').replace('W','').replace('Android_','').replace('Android', '').replace('十核20G_HD', '').replace(' ','')
    # 这2个值的前面的取值都为8.1.0，类似于聚类操作
    if osv=='nan' or osv=='GIONEE_YNGA': 
        result = 810
    elif osv=='f073b_changxiang_v01_b1b8_20180915':
        result = 810
    elif osv=='%E6%B1%9F%E7%81%B5OS+50':
        result=500
    elif osv.count('-')>0:
        result = int(osv.split('-')[0])
    else:
        result = int(osv)
        
    # 都处理成3位数
    if result<10:
        result*=100
    elif result<100:
        result*=10
    
    return int(result)

# 还可以根据label继续分箱，尝试了但效果不太好，所以没做
data['osv'] = data['osv'].apply(handle_osv)     
data['osv'].unique()

array([  900,   810,   800,   510,   711,   511,   787,   444,   600,
         601,   390,   700,   792,   790,   712,   386,   777,   502,
        6023,   422,   710,   430,   442,   410,   500,   230,   780,
         501,   440,   786,   801,   311,   772,   610,   384,   789,
         775,   768,   784,   773,   910,   411,   380,   612,   721,
         769,   378,   782,  7910,   764,   785,   223,   520,   421,
        1033,   402,   788,   443, 21100,   770,   403,   404,   236,
       21000,   412,   420,   767,   905, 44219,   383,   433,   445,
         376,   235,   713,   303,   621,  5122, 60119,  7930,  4232,
         512,   774,   602,   431,   237,   530, 71200,   212,   766,
         234,   320,   110, 51122,   400,   731,   802,  5021,   201,
         429,  7920,   503,   292,   222, 60122,   446, 71300,   120],
      dtype=int64)

In [27]:
data['version'].unique()

array(['8', '4', '0', '5', '9', '7', '3', '1', '6', '11', '2', 'v1', 'V3',
       'GA3', '10', 'P_Final_6', '15', 'V6', ' 2', 'GA2', 'V2', '50',
       '20'], dtype=object)

In [28]:
def handle_version(version):
    version = str(version).replace(' ','').replace('v','').replace('V','').replace('GA','').replace('P_Final_','').replace('GA','')
    # 发现version=50附近都是5
    if version == '50':
        return int(5)
    return int(version)

data['version'] = data['version'].apply(handle_version)
data['version'].unique()

array([ 8,  4,  0,  5,  9,  7,  3,  1,  6, 11,  2, 10, 15, 20],
      dtype=int64)

In [29]:
# 设备采用的语言,大多是是中文，也存在外语，是否在国外
data['lan'].unique()

array([nan, 'zh-CN', 'zh', 'cn', 'zh-cn', 'zh_CN', 'Zh-CN', 'tw', 'ZH',
       'en', 'CN', 'en-GB', 'TW', 'zh_CN_#Hans', 'zh-HK', 'en-US',
       'zh-TW', 'ko', 'zh-MO', 'it', 'mi', 'ja', 'en_US', 'zh-US',
       'in_ID'], dtype=object)

In [30]:
def foreign_lan(x):
    set1 = {'zh-CN', 'zh', 'cn', 'zh_CN', 'Zh-CN', 'zh-cn', 'ZH', 'CN', 'zh_CN_#Hans'}
    if x in set1:
        return 0
    elif x=='unk':
        return 2
    else:
        return 1
    
data["vpn"] = data["lan"].apply(foreign_lan)

In [32]:
# 再对特征lan label-encode
from sklearn.preprocessing import LabelEncoder

LabelEncoder= LabelEncoder()
data['lan']=LabelEncoder.fit_transform(data['lan'].astype('str'))
data['lan'].unique()

array([ 6,  9,  8, 19, 15, 16, 18,  7, 12, 20,  0, 21,  1, 17, 10, 22, 13,
        4, 11,  2,  5,  3, 23, 14, 24])

In [33]:
import datetime

data['timestamp'] = data['timestamp'].apply(lambda x : datetime.datetime.fromtimestamp(x/1000))
# 常规处理
data['year']= data['timestamp'].dt.year
data['month']= data['timestamp'].dt.month
data['day']= data['timestamp'].dt.day
data['hour']= data['timestamp'].dt.hour
data['minute']= data['timestamp'].dt.minute
data['weekday']= data['timestamp'].dt.weekday

# 划分训练集
train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)

# 当前时间与最小时间的时间差（注意：训练集、测试集需分开处理）
start_time1 = train['timestamp'].min()
train['timestamp_diff'] = train['timestamp']-start_time1
train['timestamp_diff'] = train['timestamp_diff'].dt.days - train['timestamp_diff'].dt.seconds/3600/24

start_time2 = test['timestamp'].min()
test['timestamp_diff'] = test['timestamp']-start_time2
test['timestamp_diff'] = test['timestamp_diff'].dt.days - test['timestamp_diff'].dt.seconds/3600/24

# 处理后合并
data = pd.concat([train,test])

In [34]:
# fea_hash 有5w多个不同的取值,大多数就10位，大于10位的就处理成0

train['fea_hash'].apply(len).quantile([.1,.2,.5,.75,.95])
data['fea_hash'] = data['fea_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea_hash_len'] = data['fea_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea_hash, dtype: float64

In [35]:
# fea1_hash 大多数就10位，大于10位的就处理成-1

train['fea1_hash'].astype('str').map(len).quantile([.1,.2,.5,.75,.95])
data['fea1_hash'] = data['fea1_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea1_hash_len'] = data['fea1_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea1_hash, dtype: float64

In [37]:
def divided(x):
    if x % 40 == 0:
        return 2
    elif not x:
        return 1
    else:
        return 0

# 其他数值特征构造      
data["160_height"] = data.dev_height.apply(divided)
data["160_width"] = data.dev_width.apply(divided)
data["hw_ratio"] = data.dev_height / data.dev_width
data["hw_matrix"] = data.dev_height * data.dev_width
data['osv_ver'] = data['osv'] - data['version']

In [39]:
data.head()
data.shape

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,...,minute,weekday,timestamp_diff,fea_hash_len,fea1_hash_len,160_height,160_width,hw_ratio,hw_matrix,osv_ver
0,316361,1199,46000.0,0.0,0.0,0.0,1.0,6,104,6.0,...,32,4,3.352697,10,10,2,2,,0.0,892
1,135939,893,0.0,0.0,0.0,0.0,1.0,6,19,6.0,...,40,5,4.180336,10,10,2,2,,0.0,806
2,399254,821,0.0,760.0,0.0,360.0,1.0,6,559,0.0,...,58,3,2.001204,10,9,2,2,2.111111,273600.0,810
3,68983,1004,46000.0,2214.0,0.0,1080.0,0.0,6,129,2.0,...,59,6,5.625278,10,10,0,2,2.05,2391120.0,810
4,288999,1076,46000.0,2280.0,0.0,1080.0,1.0,9,64,2.0,...,28,4,3.646991,10,10,2,2,2.111111,2462400.0,795


(650000, 35)

## 模型训练、预测

模型主要采用梯度提升模型，代表模型有xgb、lgb、catboost这些，下面采用了xgb、lgb（一般lgb，内存占用少、训练快）。

In [40]:
feat_cols = ['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
             'dev_width', 'lan', 'media_id', 'ntt', 'osv', 'package',
             'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type',
             'fea_hash_len', 'fea1_hash_len', 'vpn', 'year', 'month', 'day', 'weekday',
             'hour', 'minute', 'timestamp_diff', 'osv_ver', '160_height',
             '160_width','hw_matrix','hw_ratio']

print('参与训练的特征个数：{}'.format(len(feat_cols)))

train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)
print('训练集、测试集已准备~')

参与训练的特征个数：31
训练集、测试集已准备~


训练的轮数较多，比较耗时，所以lgb\xgb均未做调参。但可继续尝试调参，比如随机搜索、贝叶斯调参等。

In [43]:
lgb_model = lgb.LGBMClassifier(objective= 'binary',
                               metric= 'auc',
                               num_leaves= 512,  # num_leaves < 2^max_depth
                               boosting_type= 'gbdt',
                               bagging_freq= 1,
                               lambda_l1= 0.5,
                               lambda_l2= 0.5,
                               n_estimators= 5000,
                               learning_rate= 0.005,
                               feature_fraction= 0.8,
                               bagging_fraction= 0.8,
                               max_depth= 12,
                               n_jobs= -1,
                               random_state= 2021
                               )

# 划分训练集
X = train[feat_cols]
y = train['label']

# 5折交叉验证
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
pred = []
for folder,(train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    lgb_model = lgb_model.fit(X_train,
                              y_train,
                              eval_set=[(X_val, y_val)],
                              eval_metric='auc',
                              verbose=True)
    # 预测
    test_pre = lgb_model.predict_proba(test[feat_cols])
    print('第{}轮训练结束，正在保存预测数据-------------------------'.format(folder+1))
    pred.append(test_pre)

[1]	valid_0's auc: 0.932958
[2]	valid_0's auc: 0.938484
[3]	valid_0's auc: 0.940172
[4]	valid_0's auc: 0.940404
[5]	valid_0's auc: 0.940744
[6]	valid_0's auc: 0.940948
[7]	valid_0's auc: 0.941024
[8]	valid_0's auc: 0.941127
[9]	valid_0's auc: 0.941077
[10]	valid_0's auc: 0.941122
[11]	valid_0's auc: 0.941103
[12]	valid_0's auc: 0.941161
[13]	valid_0's auc: 0.94115
[14]	valid_0's auc: 0.941251
[15]	valid_0's auc: 0.941206
[16]	valid_0's auc: 0.941226
[17]	valid_0's auc: 0.941275
[18]	valid_0's auc: 0.941296
[19]	valid_0's auc: 0.941248
[20]	valid_0's auc: 0.941366
[21]	valid_0's auc: 0.941299
[22]	valid_0's auc: 0.941292
[23]	valid_0's auc: 0.941289
[24]	valid_0's auc: 0.941272
[25]	valid_0's auc: 0.941308
[26]	valid_0's auc: 0.941311
[27]	valid_0's auc: 0.941286
[28]	valid_0's auc: 0.941295
[29]	valid_0's auc: 0.94132
[30]	valid_0's auc: 0.941353
[31]	valid_0's auc: 0.941422
[32]	valid_0's auc: 0.941433
[33]	valid_0's auc: 0.941499
[34]	valid_0's auc: 0.941523
[35]	valid_0's auc: 0.941

In [44]:
xgb_model = xgb.XGBClassifier(max_depth=15,
                              learning_rate=0.005,
                              n_estimators=1000,
                              objective='binary:logistic',
                              subsample=0.8, # Subsample ratio of the training instance.
                              colsample_bytree=0.8, # Subsample ratio of columns when constructing each tree.
                              min_child_samples=3, 
                              eval_metric='auc',
                              random_state=2022,
                              reg_alpha =0.5, # L1 
                              reg_lambda  = 0.5 # L2
                              )

# 5折交叉验证
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
pre2 = []
for kfolder,(train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    xgb_model = xgb_model.fit(X_train,
                              y_train,
                              eval_set=[(X_val, y_val)],
                              eval_metric='auc',
                              verbose=True)
    
    acc =accuracy_score(y_val,xgb_model.predict(X_val))
    print('第{}轮，验证集accuracy={}'.format(kfolder+1,acc))
    # 预测
    test_pre2=xgb_model.predict_proba(test[feat_cols])
    print('第{}轮训练结束，正在保存预测数据-------------------------'.format(folder+1))
    pre2.append(test_pre2)

Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-auc:0.93927
[1]	validation_0-auc:0.94174
[2]	validation_0-auc:0.94255
[3]	validation_0-auc:0.94300
[4]	validation_0-auc:0.94314
[5]	validation_0-auc:0.94328
[6]	validation_0-auc:0.94335
[7]	validation_0-auc:0.94351
[8]	validation_0-auc:0.94335
[9]	validation_0-auc:0.94344
[10]	validation_0-auc:0.94349
[11]	validation_0-auc:0.94365
[12]	validation_0-auc:0.94370
[13]	validation_0-auc:0.94374
[14]	validation_0-auc:0.94377
[15]	validation_0-auc:0.94379
[16]	validation_0-auc:0.94380
[17]	validation_0-auc:0.94384
[18]	validation_0-auc:0.94386
[19]	validation_0-auc:0.94390
[20]	validation_0-auc:0.94391
[21]	validation_0-auc:0.94392
[22]	validation_0-auc:0.94

下面做了一个简单加权的模型融合，取最终2个模型结果的均值作为类别预测的概率，选择0.5作为阈值划分点。

还可继续优化阈值划分点，这里提下思路：在0.5附近选择一定步长进行遍历，计算不同阈值点下的模型评分，选择模型评分最高的阈值作为正负样本划分点。（but，效果不一定好，可试试~）

In [45]:
def result_mean(pred,nfolder=5,class_num=2):
    result_prob = np.zeros((test.shape[0],class_num))
    for folder in range(0,nfolder):
        result_prob+=pred[folder]
    result_prob = result_prob/nfolder
    return result_prob

# 结果 
lgb_result = result_mean(pred)
xgb_result = result_mean(pre2)

# 模型融合
result_submit = test[['sid']]
out_put=[]
for i in range(test.shape[0]):
    merge_result = (lgb_result[i][0]+xgb_result[i][0])/2
    out_put.append(np.where(merge_result>0.5,0,1))
result_submit['label'] = out_put

#输出预测结果
result_submit.to_csv('./result.csv', index=False)
print('预测结果已输出~')

预测结果已输出~
