In [61]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer

import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use("ggplot")      
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


# from pandas_profiling import ProfileReport

# ProfileReport(train).to_notebook_iframe()

In [62]:
# 数据读取，删除无效列

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test1.csv')

train.drop(['Unnamed: 0'],axis=1,inplace=True)
test.drop(['Unnamed: 0'],axis=1,inplace=True)
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [63]:
train.describe()
train.columns,len(train.columns)-1

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,media_id,ntt,package,sid,timestamp,location,fea1_hash,cus_type
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,261359.275126,986.64011,40028.788034,1264.986626,72.027966,703.486166,0.48448,124.08762,3.089808,38.465876,1500335.0,1559814000000.0,96.040504,2300866000.0,730.824682
std,233616.172774,128.956348,15460.788899,853.37133,167.66493,505.751343,0.49976,164.25454,1.843088,136.321129,288429.2,168073500.0,85.65274,1236593000.0,331.946854
min,0.0,95.0,-1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1000005.0,1559491000000.0,-1.0,12400.0,297.0
25%,0.0,917.0,46000.0,720.0,0.0,360.0,0.0,29.0,2.0,0.0,1250850.0,1559664000000.0,23.0,1376752000.0,411.0
50%,228563.0,1001.0,46000.0,1280.0,0.0,720.0,0.0,64.0,2.0,7.0,1500358.0,1559816000000.0,64.0,2490131000.0,658.0
75%,465701.5,1076.0,46000.0,2040.0,0.0,1080.0,1.0,139.0,5.0,24.0,1750028.0,1559964000000.0,154.0,3062465000.0,1019.0
max,709898.0,1241.0,46003.0,9024.0,720.0,8832.0,1.0,1544.0,7.0,2327.0,1999999.0,1560096000000.0,330.0,4291920000.0,1380.0


(Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
        'dev_width', 'label', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package',
        'sid', 'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
        'cus_type'],
       dtype='object'),
 19)

In [64]:
# 把各个特征的分布都用直方图画一下(除了fea_hash、fea1_hash、label)

# fea_plt = train.columns.tolist()
# fea_plt.remove('label')
# fea_plt.remove('fea_hash')
# fea_plt.remove('fea1_hash')

# for i in range(len(fea_plt)):
#     print('{}'.format(fea_plt[i]))
#     train[fea_plt[i]].hist()
#     plt.show();
# print(i)    



# plt.figure()
# fig, axs = plt.subplots(4, 5, figsize=(18, 6))

# n = 0
# for i in range(4):
#     x=0
#     for j in range(5):   
#         axs[i][j].hist(train[fea_plt[x+n*5]])
#         axs[i][j].set_title('{}'.format(fea_plt[x+n*5]))
#         x+=1
#     n+=1

# fig.tight_layout();


In [65]:
# plt.figure(figsize=(12, 10))
# sns.heatmap(train.corr(),linewidths = 0.05);

# # 少了fea_hash，发现这列是str，并且有些奇怪的值
# print('与label的相关性：')
# train.corr().label.sort_values()

In [66]:
# # 特征取值类别

# for col in train.columns:
#     col,train[col].nunique()

特征处理

osv、version、timestamp、fea_hash、fea1_hash

In [67]:
data = pd.concat([train,test])

data.osv.unique()

array(['9', '8.1', '8.1.0', '8.0.0', '5.1', '9.0.0', '7.1.1', '5.1.1',
       '7.8.7', '4.4.4', nan, '6.0', '6.0.1', '3.9.0', 'Android_8.1.0',
       '7.0', '7.9.2', 'Android_9', '7.9.0', '7.1.2', '3.8.6', '5.1.0',
       '9.0', '7.7.7', '6.0.0', '5.0.2', 'Android_6.0.1', '7.0.0',
       '6.0_23', '4.2.2', '7.1', '8.0', '4.3', 'Android_8.0.0', '4.4.2',
       '4.1', 'Android_7.1.1', '5.0', 'Android_5.1.1', '2.3', '5.0.0',
       '7.8.0', '5.0.1', '4.4', '7.8.6', 'Android_5.1', 'Android_7.1.2',
       'Android_4.4.4', '8.0.1', '3.1.1', '7.7.2', '6.1', '3.8.4',
       '7.8.9', '7.7.5', '7.6.8', 'Android_7.0', '7.8.4', '7', '7.7.3',
       '7.1.0', '9.1', '4.1.1', 'Android_4.4.2', '3.8.0', '6.1.2',
       '7.2.1', '7.6.9', '3.7.8', '7.8.2', '8', '7910', '7.6.4', '7.8.5',
       '2.2.3', '5.2', 'Android_6.0', '4.2.1', '10.3.3', '4.0.2', '7.8.8',
       '4.4.3', '21100', '7.7.0', '4.0.3', '4.0.4', '5', '2.3.6', '6.1.0',
       '21000', '4.3.0', '4.1.2', '4.2', '7.6.7', '9.0.5', '4.4.2_19',


## 特征工程对比赛字段进行编码

In [68]:
def handle_osv(osv):
    osv = str(osv).replace('.','').replace('W','').replace('Android_','').replace('Android', '').replace('十核20G_HD', '').replace(' ','')
    if osv=='nan' or osv=='GIONEE_YNGA': # 这2个值的前面的取值都为8.1.0
        result = 810
    elif osv=='f073b_changxiang_v01_b1b8_20180915':
        result = 810
    elif osv=='%E6%B1%9F%E7%81%B5OS+50':
        result=500
    elif osv.count('-')>0:
        result = int(osv.split('-')[0])
    else:
        result = int(osv)
    
    if result<10:
        result*=100
    elif result<100:
        result*=10
    
    return int(result)


data['osv'] = data['osv'].apply(handle_osv)     
data['osv'].unique()

array([  900,   810,   800,   510,   711,   511,   787,   444,   600,
         601,   390,   700,   792,   790,   712,   386,   777,   502,
        6023,   422,   710,   430,   442,   410,   500,   230,   780,
         501,   440,   786,   801,   311,   772,   610,   384,   789,
         775,   768,   784,   773,   910,   411,   380,   612,   721,
         769,   378,   782,  7910,   764,   785,   223,   520,   421,
        1033,   402,   788,   443, 21100,   770,   403,   404,   236,
       21000,   412,   420,   767,   905, 44219,   383,   433,   445,
         376,   235,   713,   303,   621,  5122, 60119,  7930,  4232,
         512,   774,   602,   431,   237,   530, 71200,   212,   766,
         234,   320,   110, 51122,   400,   731,   802,  5021,   201,
         429,  7920,   503,   292,   222, 60122,   446, 71300,   120],
      dtype=int64)

In [69]:
def handle_version(version):
    version = str(version).replace(' ','').replace('v','').replace('V','').replace('GA','').replace('P_Final_','').replace('GA','')
    # 发现version=50附近都是5
    if version == '50':
        return int(5)
    return int(version)

data['version'] = data['version'].apply(handle_version)
data['version'].unique()
    

array([ 8,  4,  0,  5,  9,  7,  3,  1,  6, 11,  2, 10, 15, 20],
      dtype=int64)

In [70]:
from sklearn.preprocessing import LabelEncoder

LabelEncoder= LabelEncoder()
data['lan']=LabelEncoder.fit_transform(data['lan'].astype('str'))

data['lan'].unique()

array([14, 17, 16,  4, 22, 23,  3, 15,  2,  5,  0,  6,  1, 24, 18,  7, 20,
       12, 19, 10, 13, 11,  8, 21,  9])

In [71]:
import datetime
data['timestamp'] = data['timestamp'].apply(lambda x : datetime.datetime.fromtimestamp(x/1000))

data['year']= data['timestamp'].dt.year
data['month']= data['timestamp'].dt.month
data['day']= data['timestamp'].dt.day
data['hour']= data['timestamp'].dt.hour
data['minute']= data['timestamp'].dt.minute
data['weekday']= data['timestamp'].dt.weekday

# 划分 训练集

train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)

# 获取time_diff
start_time1 = train['timestamp'].min()
train['timestamp_diff'] = train['timestamp']-start_time1
train['timestamp_diff'] = train['timestamp_diff'].dt.days - train['timestamp_diff'].dt.seconds/3600/24

start_time2 = test['timestamp'].min()
test['timestamp_diff'] = test['timestamp']-start_time2
test['timestamp_diff'] = test['timestamp_diff'].dt.days - test['timestamp_diff'].dt.seconds/3600/24

data = pd.concat([train,test])

In [73]:
# fea_hash 有5w多个不同的取值,大多数就10位，大于10位的就处理成0

train['fea_hash'].apply(len).quantile([.1,.2,.5,.75,.95])
data['fea_hash'] = data['fea_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea_hash_len'] = data['fea_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea_hash, dtype: float64

In [74]:
# fea1_hash 大多数就10位，大于10位的就处理成-1

train['fea1_hash'].astype('str').map(len).quantile([.1,.2,.5,.75,.95])
data['fea1_hash'] = data['fea1_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea1_hash_len'] = data['fea1_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea1_hash, dtype: float64

In [75]:
# data['os'] = LabelEncoder.fit_transform(data.os)

data['osv_ver'] = data['osv'] - data['version']

In [76]:
data.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,...,year,month,day,hour,minute,weekday,timestamp_diff,fea_hash_len,fea1_hash_len,osv_ver
0,316361,1199,46000.0,0.0,0.0,0.0,1.0,14,104,6.0,...,2019,6,7,15,32,4,3.352697,10,10,892
1,135939,893,0.0,0.0,0.0,0.0,1.0,14,19,6.0,...,2019,6,8,19,40,5,4.180336,10,10,806
2,399254,821,0.0,760.0,0.0,360.0,1.0,14,559,0.0,...,2019,6,6,23,58,3,2.001204,10,9,810
3,68983,1004,46000.0,2214.0,0.0,1080.0,0.0,14,129,2.0,...,2019,6,9,8,59,6,5.625278,10,10,810
4,288999,1076,46000.0,2280.0,0.0,1080.0,1.0,17,64,2.0,...,2019,6,7,8,28,4,3.646991,10,10,795


In [81]:
feat_cols = ['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
             'dev_width', 'lan', 'media_id', 'ntt', 'osv', 'package', 
             'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type',
             'fea_hash_len', 'fea1_hash_len', 'year', 'month', 'day', 'weekday',
             'hour', 'minute', 'timestamp_diff', 'osv_ver']


In [79]:
train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)

In [82]:
model = lgb.LGBMClassifier(num_leaves=512,
                           max_depth=10,
                           learning_rate=0.005,
                           n_estimators=5000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=2021,
                           metric='auc',
                           boosting_type='gbdt',
                           subsample_freq=1,
                           bagging_fraction=0.8)


X = train[feat_cols]
y = train['label']

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
prob = []
# mean_acc = 0
for k,(train_index, test_index) in enumerate(skf.split(X, y)):
    print(k)
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    print('开始训练~~~')

    model = model.fit(X_train,
                      y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='auc',
                      verbose=True)

    test_y_pred = model.predict_proba(test[feat_cols])
    prob.append(test_y_pred)


0
训练
[1]	valid_0's auc: 0.925465
[2]	valid_0's auc: 0.930735
[3]	valid_0's auc: 0.932773
[4]	valid_0's auc: 0.933429
[5]	valid_0's auc: 0.934104
[6]	valid_0's auc: 0.934562
[7]	valid_0's auc: 0.934274
[8]	valid_0's auc: 0.934158
[9]	valid_0's auc: 0.933922
[10]	valid_0's auc: 0.933728
[11]	valid_0's auc: 0.933766
[12]	valid_0's auc: 0.934127
[13]	valid_0's auc: 0.934315
[14]	valid_0's auc: 0.934463
[15]	valid_0's auc: 0.934371
[16]	valid_0's auc: 0.934546
[17]	valid_0's auc: 0.934628
[18]	valid_0's auc: 0.934524
[19]	valid_0's auc: 0.934365
[20]	valid_0's auc: 0.934525
[21]	valid_0's auc: 0.934466
[22]	valid_0's auc: 0.93461
[23]	valid_0's auc: 0.934652
[24]	valid_0's auc: 0.93473
[25]	valid_0's auc: 0.934845
[26]	valid_0's auc: 0.934953
[27]	valid_0's auc: 0.935008
[28]	valid_0's auc: 0.935014
[29]	valid_0's auc: 0.935052
[30]	valid_0's auc: 0.935157
[31]	valid_0's auc: 0.935095
[32]	valid_0's auc: 0.935075
[33]	valid_0's auc: 0.93519
[34]	valid_0's auc: 0.935131
[35]	valid_0's auc: 0

In [89]:
lgb_result = (prob[0]+prob[1]+prob[2]+prob[3]+prob[4])/5

In [88]:
result = []

for i in lgb_result:
    if i[0]>0.5:
        result.append(0)
    else:
        result.append(1)


submit = pd.DataFrame(test['sid'])
submit['label'] = result
submit.to_csv('./result.csv', index=False)