In [356]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer
import gc
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use("ggplot")      
import seaborn as sns
# import missingno as msno
import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


# from pandas_profiling import ProfileReport

# ProfileReport(train).to_notebook_iframe()

In [357]:
# 数据读取，删除无效列

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test1.csv')

train.drop(['Unnamed: 0'],axis=1,inplace=True)
test.drop(['Unnamed: 0'],axis=1,inplace=True)
train.head()

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,os,osv,package,sid,timestamp,version,fea_hash,location,fea1_hash,cus_type
0,316361,1199,46000.0,0.0,0.0,0.0,1,,104,6.0,android,9,18,1438873,1559893000000.0,8,2135019403,0,2329670524,601
1,135939,893,0.0,0.0,0.0,0.0,1,,19,6.0,android,8.1,0,1185582,1559994000000.0,4,2782306428,1,2864801071,1000
2,399254,821,0.0,760.0,0.0,360.0,1,,559,0.0,android,8.1.0,0,1555716,1559837000000.0,0,1392806005,2,628911675,696
3,68983,1004,46000.0,2214.0,0.0,1080.0,0,,129,2.0,android,8.1.0,0,1093419,1560042000000.0,0,3562553457,3,1283809327,753
4,288999,1076,46000.0,2280.0,0.0,1080.0,1,zh-CN,64,2.0,android,8.0.0,0,1400089,1559867000000.0,5,2364522023,4,1510695983,582


In [358]:
# train.describe()
# train.columns,len(train.columns)-1

In [359]:
# 把各个特征的分布都用直方图画一下(除了fea_hash、fea1_hash、label)

# fea_plt = train.columns.tolist()
# fea_plt.remove('label')
# fea_plt.remove('fea_hash')
# fea_plt.remove('fea1_hash')

# for i in range(len(fea_plt)):
#     print('{}'.format(fea_plt[i]))
#     train[fea_plt[i]].hist()
#     plt.show();
# print(i)    



# plt.figure()
# fig, axs = plt.subplots(4, 5, figsize=(18, 6))

# n = 0
# for i in range(4):
#     x=0
#     for j in range(5):   
#         axs[i][j].hist(train[fea_plt[x+n*5]])
#         axs[i][j].set_title('{}'.format(fea_plt[x+n*5]))
#         x+=1
#     n+=1

# fig.tight_layout();


In [360]:
# plt.figure(figsize=(12, 10))
# sns.heatmap(train.corr(),linewidths = 0.05);

# # 少了fea_hash，发现这列是str，并且有些奇怪的值
# print('与label的相关性：')
# train.corr().label.sort_values()

In [361]:
# # 特征取值类别

# for col in train.columns:
#     col,train[col].nunique()

In [362]:
data = pd.concat([train,test])

## 特征工程

In [363]:
data.osv.unique()

array(['9', '8.1', '8.1.0', '8.0.0', '5.1', '9.0.0', '7.1.1', '5.1.1',
       '7.8.7', '4.4.4', nan, '6.0', '6.0.1', '3.9.0', 'Android_8.1.0',
       '7.0', '7.9.2', 'Android_9', '7.9.0', '7.1.2', '3.8.6', '5.1.0',
       '9.0', '7.7.7', '6.0.0', '5.0.2', 'Android_6.0.1', '7.0.0',
       '6.0_23', '4.2.2', '7.1', '8.0', '4.3', 'Android_8.0.0', '4.4.2',
       '4.1', 'Android_7.1.1', '5.0', 'Android_5.1.1', '2.3', '5.0.0',
       '7.8.0', '5.0.1', '4.4', '7.8.6', 'Android_5.1', 'Android_7.1.2',
       'Android_4.4.4', '8.0.1', '3.1.1', '7.7.2', '6.1', '3.8.4',
       '7.8.9', '7.7.5', '7.6.8', 'Android_7.0', '7.8.4', '7', '7.7.3',
       '7.1.0', '9.1', '4.1.1', 'Android_4.4.2', '3.8.0', '6.1.2',
       '7.2.1', '7.6.9', '3.7.8', '7.8.2', '8', '7910', '7.6.4', '7.8.5',
       '2.2.3', '5.2', 'Android_6.0', '4.2.1', '10.3.3', '4.0.2', '7.8.8',
       '4.4.3', '21100', '7.7.0', '4.0.3', '4.0.4', '5', '2.3.6', '6.1.0',
       '21000', '4.3.0', '4.1.2', '4.2', '7.6.7', '9.0.5', '4.4.2_19',


In [364]:
def handle_osv(osv):
    osv = str(osv).replace('.','').replace('W','').replace('Android_','').replace('Android', '').replace('十核20G_HD', '').replace(' ','')
    if osv=='nan' or osv=='GIONEE_YNGA': # 这2个值的前面的取值都为8.1.0
        result = 810
    elif osv=='f073b_changxiang_v01_b1b8_20180915':
        result = 810
    elif osv=='%E6%B1%9F%E7%81%B5OS+50':
        result=500
    elif osv.count('-')>0:
        result = int(osv.split('-')[0])
    else:
        result = int(osv)
    
    if result<10:
        result*=100
    elif result<100:
        result*=10
    
    return int(result)


data['osv'] = data['osv'].apply(handle_osv)     
data['osv'].unique()

array([  900,   810,   800,   510,   711,   511,   787,   444,   600,
         601,   390,   700,   792,   790,   712,   386,   777,   502,
        6023,   422,   710,   430,   442,   410,   500,   230,   780,
         501,   440,   786,   801,   311,   772,   610,   384,   789,
         775,   768,   784,   773,   910,   411,   380,   612,   721,
         769,   378,   782,  7910,   764,   785,   223,   520,   421,
        1033,   402,   788,   443, 21100,   770,   403,   404,   236,
       21000,   412,   420,   767,   905, 44219,   383,   433,   445,
         376,   235,   713,   303,   621,  5122, 60119,  7930,  4232,
         512,   774,   602,   431,   237,   530, 71200,   212,   766,
         234,   320,   110, 51122,   400,   731,   802,  5021,   201,
         429,  7920,   503,   292,   222, 60122,   446, 71300,   120],
      dtype=int64)

In [365]:
def handle_version(version):
    version = str(version).replace(' ','').replace('v','').replace('V','').replace('GA','').replace('P_Final_','').replace('GA','')
    # 发现version=50附近都是5
    if version == '50':
        return int(5)
    return int(version)

data['version'] = data['version'].apply(handle_version)
data['version'].unique()
    

array([ 8,  4,  0,  5,  9,  7,  3,  1,  6, 11,  2, 10, 15, 20],
      dtype=int64)

In [366]:
import datetime
data['timestamp'] = data['timestamp'].apply(lambda x : datetime.datetime.fromtimestamp(x/1000))

data['year']= data['timestamp'].dt.year
data['month']= data['timestamp'].dt.month
data['day']= data['timestamp'].dt.day
data['hour']= data['timestamp'].dt.hour
data['minute']= data['timestamp'].dt.minute
data['weekday']= data['timestamp'].dt.weekday

# 划分 训练集

train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)

# 获取time_diff
start_time1 = train['timestamp'].min()
train['timestamp_diff'] = train['timestamp']-start_time1
train['timestamp_diff'] = train['timestamp_diff'].dt.days - train['timestamp_diff'].dt.seconds/3600/24

start_time2 = test['timestamp'].min()
test['timestamp_diff'] = test['timestamp']-start_time2
test['timestamp_diff'] = test['timestamp_diff'].dt.days - test['timestamp_diff'].dt.seconds/3600/24

data = pd.concat([train,test])

In [367]:
# fea_hash 有5w多个不同的取值,大多数就10位，大于10位的就处理成0

train['fea_hash'].apply(len).quantile([.1,.2,.5,.75,.95])
data['fea_hash'] = data['fea_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea_hash_len'] = data['fea_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea_hash, dtype: float64

In [368]:
# fea1_hash 大多数就10位，大于10位的就处理成-1

train['fea1_hash'].astype('str').map(len).quantile([.1,.2,.5,.75,.95])
data['fea1_hash'] = data['fea1_hash'].map(lambda i:0 if len(str(i))>10 else int(i))
data['fea1_hash_len'] = data['fea1_hash'].map(lambda i:len(str(i)))

0.10     9.0
0.20     9.0
0.50    10.0
0.75    10.0
0.95    10.0
Name: fea1_hash, dtype: float64

In [369]:
def rf_cast(df1, df2):
    c1 = df1.lan.notnull()
    c2 = df1.lan.isnull()
    df1["mynull1"] = c2 
    df1["mynull2"] = c1
    
    predict = df1[["sid","apptype", "carrier", "dev_height", "dev_ppi", "dev_width", "media_id", "ntt", "lan", "mynull1", "mynull2"]]
    df_notnans = predict[predict.mynull2 == True].copy()
    df_nans = predict[predict.mynull1 == True].copy()

    # 划分训练集、测试集
    X_train, X_test, y_train, y_test = train_test_split(df_notnans[["apptype", "carrier", "dev_height", "dev_ppi","dev_width", "media_id", "ntt"]],
                                                        df_notnans["lan"], train_size=0.75, random_state=2022)

    # 随机森林分类
    # 训练
    regr_multirf = RandomForestClassifier(n_estimators=100, max_depth=40, random_state=0, n_jobs=-1)
    regr_multirf.fit(X_train, y_train)
    score = regr_multirf.score(X_test, y_test)
    print("prediction score is {:.2f}%".format(score * 100))
    # 预测
    df_nans["lan"] = regr_multirf.predict(df_nans[["apptype", "carrier", "dev_height", "dev_ppi","dev_width", "media_id", "ntt"]])
    df1 = pd.concat([df_nans,df_notnans]).reset_index()

    
    c1 = df2.lan.notnull()
    c2 = df2.lan.isnull()
    df2["mynull1"] = c2
    df2["mynull2"] = c1 
    predict_test = df2[["sid","apptype", "carrier", "dev_height", "dev_ppi", "dev_width", "media_id", "ntt", "lan", "mynull1", "mynull2"]]
    df_notnans = predict_test[predict_test.mynull2 == True].copy()
    df_nans = predict_test[predict_test.mynull1 == True].copy()
    
    df_nans["lan"] = regr_multirf.predict(df_nans[["apptype", "carrier", "dev_height", "dev_ppi","dev_width", "media_id", "ntt"]])
    df2 = pd.concat([df_nans,df_notnans]).reset_index()

    gc.collect()
    print('lan，缺失值预测填充成功~')
    return df1, df2


train,test = rf_cast(train,test)
data['lan'] = pd.concat([train,test])['lan']

prediction score is 99.47%
lan，缺失值预测填充成功~


In [370]:

def foreign_lan(x):
    set1 = {'zh-CN', 'zh', 'cn', 'zh_CN', 'Zh-CN', 'zh-cn', 'ZH', 'CN', 'zh_CN_#Hans'}
    if x in set1:
        return 0
    # elif x=='unk':
    #     return 2
    else:
        return 1
    
data["vpn"] = data["lan"].apply(foreign_lan)
data["vpn"] = data["lan"].apply(foreign_lan)

In [371]:
from sklearn.preprocessing import LabelEncoder

LabelEncoder= LabelEncoder()
data['lan']=LabelEncoder.fit_transform(data['lan'].astype('str'))

data['lan'].unique()

array([ 4, 16,  3, 22, 15, 21,  0, 13, 14,  2,  5,  6,  1, 23, 17,  7, 19,
       12, 18, 10, 11,  8, 20,  9])

In [372]:

def divided(x):
    if x % 40 == 0:
        return 2
    elif not x:
        return 1
    else:
        return 0
        
data["160_height"] = data.dev_height.apply(divided)
data["160_width"] = data.dev_width.apply(divided)

# data["dev_ppi"] = data["dev_ppi"].astype('float').apply(lambda x : 1.0 if x==0.0 else x)
data["hw_ratio"] = data.dev_height / data.dev_width
data["hw_matrix"] = data.dev_height * data.dev_width
# data["inch"] = (data.dev_height ** 2 + data.dev_width ** 2) ** 0.5 / data.dev_ppi

data['osv_ver'] = data['osv'] - data['version']

In [373]:
data

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,label,lan,media_id,ntt,...,weekday,timestamp_diff,fea_hash_len,fea1_hash_len,vpn,160_height,160_width,hw_ratio,hw_matrix,osv_ver
0,316361,1199,46000.0,0.0,0.0,0.0,1.0,4,104,6.0,...,4,3.352697,10,10,0,2,2,,0.0,892
1,135939,893,0.0,0.0,0.0,0.0,1.0,16,19,6.0,...,5,4.180336,10,10,0,2,2,,0.0,806
2,399254,821,0.0,760.0,0.0,360.0,1.0,16,559,0.0,...,3,2.001204,10,9,0,2,2,2.111111,273600.0,810
3,68983,1004,46000.0,2214.0,0.0,1080.0,0.0,16,129,2.0,...,6,5.625278,10,10,0,0,2,2.050000,2391120.0,810
4,288999,1076,46000.0,2280.0,0.0,1080.0,1.0,4,64,2.0,...,4,3.646991,10,10,0,2,2,2.111111,2462400.0,795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,0,1001,46000.0,760.0,0.0,360.0,,16,29,2.0,...,5,4.610069,10,10,0,2,2,2.111111,273600.0,810
149996,0,1001,46000.0,780.0,0.0,360.0,,16,29,2.0,...,4,3.697292,8,10,0,0,2,2.166667,280800.0,900
149997,0,1001,46000.0,780.0,0.0,360.0,,16,29,5.0,...,6,5.632685,10,10,0,0,2,2.166667,280800.0,810
149998,500925,1052,46000.0,854.0,240.0,480.0,,16,249,6.0,...,2,1.723484,10,10,0,0,2,1.779167,409920.0,440


In [374]:
data.shape,data.columns

((650000, 35),
 Index(['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
        'dev_width', 'label', 'lan', 'media_id', 'ntt', 'os', 'osv', 'package',
        'sid', 'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash',
        'cus_type', 'year', 'month', 'day', 'hour', 'minute', 'weekday',
        'timestamp_diff', 'fea_hash_len', 'fea1_hash_len', 'vpn', '160_height',
        '160_width', 'hw_ratio', 'hw_matrix', 'osv_ver'],
       dtype='object'))

## 模型预测

In [375]:
feat_cols = ['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
             'dev_width', 'lan', 'media_id', 'ntt', 'osv', 'package',
             'version', 'fea_hash', 'location', 'fea1_hash',
        'cus_type', 'year', 'month', 'day', 'hour', 'minute', 'weekday',
        'timestamp_diff', 'fea_hash_len', 'fea1_hash_len', 'vpn', '160_height',
        '160_width', 'hw_ratio', 'hw_matrix', 'osv_ver']

print('参与训练的特征个数：{}'.format(len(feat_cols)))



参与训练的特征个数：31


In [376]:
def train_lgb_kfold(X_train, y_train, X_test, n_fold=5):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=2022, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 512,      # num_leaves < 2^max_depth
            'boosting_type':'gbdt',
            'bagging_freq':1,
            'lambda_l1':0.5,
            'lambda_l2':0.5,
            'n_estimators':5000,
            'learning_rate': 0.005, 
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'max_depth':12,
            'n_jobs': -1,
            'random_state': 2022
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=100,
                        valid_sets=[dtrain, dvalid]
                        # verbose_eval=50,
                        # early_stopping_rounds=20
                        )

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)
        print('--------------------------------------------------------第{}轮结束-----------------------------------------------------'.format(fold+1))

    return gbms, oof_preds, test_preds



In [377]:
def train_lgb(train, test, feat_cols, label_col, n_fold=5):
    '''训练lightgbm'''
    X_train = train[feat_cols]
    y_train = train[label_col]
    X_test = test[feat_cols]
    gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb_kfold(X_train, y_train, X_test, n_fold=n_fold)
    
    return gbms_lgb, oof_preds_lgb, test_preds_lgb


In [378]:
train = data[data['label'].notnull()]
test = data[data['label'].isnull()].drop(['label'],axis=1)

In [379]:
gbms_lgb, oof_preds_lgb, test_preds_lgb = train_lgb(train, test,
                                                    feat_cols=feat_cols,
                                                    label_col='label')


--------------------------------------------------------第1轮-----------------------------------------------------
[LightGBM] [Info] Number of positive: 193792, number of negative: 206208
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3025
[LightGBM] [Info] Number of data points in the train set: 400000, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.484480 -> initscore=-0.062100
[LightGBM] [Info] Start training from score -0.062100
[1]	training's auc: 0.935886	valid_1's auc: 0.935085
[2]	training's auc: 0.938309	valid_1's auc: 0.937432
[3]	training's auc: 0.939278	valid_1's auc: 0.938417
[4]	training's auc: 0.93901	valid_1's auc: 0.938213
[5]	training's auc: 0.939503	valid_1's auc: 0.938785
[6]	training's auc: 0.939515	valid_1's auc: 0.93885
[7]	training's auc: 0.939786	valid_1's auc: 0.939083
[8]	training's auc: 0.939984	valid_1's auc: 0.939289
[9]	trai

In [381]:
def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    thres = df_train['label'].mean() 
    # thres = df_train['oof_preds'].quantile(1 - quantile_point) # 比如 0,1,1,1 mean=0.75 1-mean=0.25,也就是25%分位数取值为0

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01): #  按照理论阈值的上下0.2范围，0.01步长，找到最佳阈值，f1分数最高对应的阈值即为最佳阈值
        _thresh.append(
            [thres_item, f1_score(df_train['label'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax() # 找到f1最高对应的行
    best_thresh = _thresh[best_id][0] # 取出最佳阈值

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh

lgb_thres =  gen_thres_new(train, oof_preds_lgb)
print(lgb_thres)



# 测试集预测结果
df_test_res = pd.DataFrame({'sid': test['sid'],
                            'test_preds_lgb': test_preds_lgb})

## 直接按 0.5 划分
df_test_res['preds'] =  np.where(df_test_res['test_preds_lgb']>0.5,1,0)


## 按最佳阈值划分
# df_test_res['preds'] =  np.where(df_test_res['test_preds_lgb']>lgb_thres,1,0)


df_test_submit = df_test_res.iloc[:,[0,2]].rename(columns={'preds':'label'})
df_test_submit.to_csv('./result.csv', index=False)
print('预测结果，输出成功~')


阈值: 0.5044800000000003
训练集的f1: 0.8901218694134707
0.5044800000000003
预测结果，输出成功~
