# 数据合并处理

In [None]:
import pandas as pd
import numpy as np
import scipy as sp

def read_csv_file(f, logging=False):
    """Load a CSV file into a DataFrame, optionally printing a summary.

    Args:
        f: Path or file-like object accepted by ``pandas.read_csv``.
        logging: When true, print the first rows, the column names, the
            ``describe()`` statistics and the ``info()`` report.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    data = pd.read_csv(f)
    if logging:
        print(data.head())
        print(data.columns.values)
        print(data.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would add a stray "None" line.
        data.info()
    return data

def categories_process_first_class(cate):
    """Return the first character of the category code as an integer."""
    leading_char = str(cate)[0]
    return int(leading_char)
        
    
def categories_process_second_class(cate):
    """Return everything after the first character of the code as an integer.

    Codes whose text form is shorter than three characters yield 0.
    """
    code_text = str(cate)
    return int(code_text[1:]) if len(code_text) >= 3 else 0
    
def age_process(age):
    """Map an age to an ordinal bucket.

    Returns 0 for an age of exactly 0, 1 for ages below 16, 2 for ages
    below 23, 3 for ages below 27 and 4 for everything else. The input is
    truncated with ``int()`` before bucketing.
    """
    years = int(age)
    if years == 0:
        return 0
    # Upper (exclusive) bounds of buckets 1, 2 and 3, in ascending order.
    for bucket, upper_bound in enumerate((16, 23, 27), start=1):
        if years < upper_bound:
            return bucket
    return 4

def province_process(hometown):
    """Return the province part of a location code as an integer.

    The code is read as text; its last two characters are treated as the
    city part (see ``city_process``) and everything before them as the
    province part.

    Args:
        hometown: Location code, any value convertible with ``str()``.

    Returns:
        The province part as an int, or 0 when the code has no province
        part (one or two characters).
    """
    code = str(hometown)
    # With two characters or fewer the province slice would be empty and
    # int('') would raise ValueError, so report "no province" instead.
    if len(code) <= 2:
        return 0
    return int(code[:-2])

def city_process(hometown):
    """Return the last two characters of a location code as an integer.

    A code that is a single character long yields 0.
    """
    code = str(hometown)
    return 0 if len(code) == 1 else int(code[-2:])

def get_time_day(t):
    """Return the first two characters of the timestamp as an integer."""
    return int(str(t)[:2])

def get_time_hour(t):
    """Bucket characters 2-3 of the timestamp (the hour) into four slots.

    Returns 0 for hours below 6, 1 for hours below 12, 2 for hours below 18
    and 3 otherwise.
    """
    hour = int(str(t)[2:4])
    # Each boundary the hour has reached moves it up one slot.
    return sum(hour >= boundary for boundary in (6, 12, 18))

def get_time_min(t):
    """Return everything from the fifth character of the timestamp as an int."""
    return int(str(t)[4:])

def logloss(act, pred):
    """Return the mean binary cross-entropy (log loss) of the predictions.

    Args:
        act: Array-like of true labels (0 or 1).
        pred: Array-like of predicted probabilities, same shape as ``act``.

    Returns:
        The average negative log-likelihood as a float.

    Raises:
        ValueError: If the inputs are empty or their shapes differ.
    """
    epsilon = 1e-5
    act = np.asarray(act, dtype=float)
    pred = np.asarray(pred, dtype=float)
    if act.shape != pred.shape:
        raise ValueError(
            "act and pred must have the same shape, got %s and %s"
            % (act.shape, pred.shape)
        )
    if act.size == 0:
        raise ValueError("logloss is undefined for empty input")
    # Keep probabilities strictly inside (0, 1) so log() stays finite.
    pred = np.clip(pred, epsilon, 1 - epsilon)
    log_likelihood = act * np.log(pred) + (1 - act) * np.log(1 - pred)
    return float(-np.mean(log_likelihood))

In [None]:
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Load every raw table from ./pre/ with read_csv_file. logging = True makes
# the helper print a summary of each table; a separator line is printed
# between tables.
print('test dataset')
test_data = read_csv_file('./pre/test.csv', logging = True)
print('================================================\n')
print('train dataset:')
train_data = read_csv_file('./pre/train.csv', logging = True)
print('================================================\n')
print('ad dataset')
ad = read_csv_file('./pre/ad.csv', logging = True)
print('================================================\n')
print('app dataset')
app_categories = read_csv_file('./pre/app_categories.csv', logging = True)
print('================================================\n')
print('user dataset')
user = read_csv_file('./pre/user.csv', logging = True)
print('================================================\n')
print('position dataset')
position = read_csv_file('./pre/position.csv', logging= True)
print('================================================\n')
print('user_app_actions dataset')
user_app_actions = read_csv_file('./pre/user_app_actions.csv', logging = True)

In [None]:
# Load the installed-apps table; logging = False skips the summary printout.
print('================================================\n')
print('user_installedapps dataset')
user_installedapps = read_csv_file('./pre/user_installedapps.csv', logging = False)

In [None]:
# Drop 'label' and 'conversionTime' from the training frame and 'label' and
# 'instanceID' from the test frame, then stack the training rows on top of
# the test rows so both get the same feature processing.
train = train_data.drop(labels=['label', 'conversionTime'], axis='columns')
test = test_data.drop(labels=['label', 'instanceID'], axis='columns')
data = pd.concat([train, test], axis='index')

In [None]:
# Print the shapes of the combined frame and of its train and test parts.
print(data.shape)
print(train.shape)
print(test.shape)

In [None]:
# Count the missing values in each column of the combined frame.
data.isnull().sum()

In [None]:
# Left-join the ad table onto the click rows by creativeID, then show the
# shape of the merged frame.
data_ad = data.merge(ad, on = 'creativeID', how = 'left')
data_ad.shape

In [None]:
# Derive day, minute and hour-bucket features from clickTime, then drop the
# raw column and show the last rows.
# NOTE(review): the helpers slice the value as text (chars 0-1, 4-end and 2-3),
# which presumably assumes a DDHHMM layout -- confirm against the data spec.
data_ad['click_day'] = data_ad['clickTime'].apply(get_time_day)
data_ad['click_min'] = data_ad['clickTime'].apply(get_time_min)
data_ad['click_hour'] = data_ad['clickTime'].apply(get_time_hour)
data_ad.drop(['clickTime'], inplace = True, axis = 1)
data_ad.tail()

In [None]:
# Left-join the app category table by appID, split appCategory into its
# first-level and second-level parts, then drop the raw column.
# NOTE(review): the helpers call str() on each value; if the left join leaves
# missing categories the column may hold floats/NaN -- confirm every appID matches.
data_ad_app = data_ad.merge(app_categories, on = 'appID', how = 'left')
data_ad_app['app_first_categories'] = data_ad_app['appCategory'].apply(categories_process_first_class)
data_ad_app['app_second_categories'] = data_ad_app['appCategory'].apply(categories_process_second_class)
data_ad_app.drop(['appCategory'], axis = 1, inplace = True)

In [None]:
# Left-join the user table by userID.
data_ad_app_user = data_ad_app.merge(user, on = 'userID', how = 'left')

In [None]:
## The age values are frequently missing in the test dataset
data_ad_app_user['age'].describe()

In [None]:
# Replace ages recorded as 0 with the mean of the non-zero ages.
# The result is assigned back instead of calling replace(..., inplace = True)
# on the selected column: the chained in-place form acts on an intermediate
# object and is not guaranteed to update the frame (it does not under pandas
# copy-on-write).
mean_nonzero_age = data_ad_app_user['age'][data_ad_app_user['age'] != 0.0].mean()
data_ad_app_user['age'] = data_ad_app_user['age'].replace(0.0, mean_nonzero_age)

In [None]:
# Fill missing ages with the mean of the non-zero ages, then show how many
# missing values remain.
# The result is assigned back instead of calling fillna(..., inplace = True)
# on the selected column, because the chained in-place form is not guaranteed
# to update the frame (it does not under pandas copy-on-write).
age_fill_value = data_ad_app_user['age'][data_ad_app_user['age'] != 0.0].mean()
data_ad_app_user['age'] = data_ad_app_user['age'].fillna(age_fill_value)
data_ad_app_user['age'].isnull().sum()

In [None]:
# Convert each age into its ordinal bucket with age_process.
data_ad_app_user['age'] = data_ad_app_user['age'].apply(age_process)

In [None]:
# Split residence into province and city features, drop the raw column and
# show the last rows.
# NOTE(review): the helpers slice str(value); if residence became a float
# column after the left join the text would end in '.0' -- confirm the dtype.
data_ad_app_user['residence_province'] = data_ad_app_user['residence'].apply(province_process)
data_ad_app_user['residence_city'] = data_ad_app_user['residence'].apply(city_process)
data_ad_app_user.drop(['residence'], inplace = True, axis = 1)
data_ad_app_user.tail()

In [None]:
# Left-join the position table by positionID.
data_ad_app_user_position = data_ad_app_user.merge(position, on = 'positionID', how = 'left')

In [None]:
# Split the combined feature frame back into its train and test parts.
# The boundary is taken from train_data instead of a hard-coded row count, so
# the cell keeps working when the input files change.
n_train = len(train_data)
# The positional split is only valid if the left joins kept exactly one row
# per original click; otherwise features and labels would be misaligned.
if len(data_ad_app_user_position) != n_train + len(test_data):
    raise ValueError(
        'merged frame has %d rows, expected %d; a join changed the row count'
        % (len(data_ad_app_user_position), n_train + len(test_data))
    )
x_train = data_ad_app_user_position[: n_train]
y_train = train_data['label']
test = data_ad_app_user_position[n_train: ]

In [None]:
import pickle
# Write the three objects back-to-back into one pickle file; they have to be
# read back in the same order (x_train, y_train, test).
with open('values.pkl', 'wb') as f:
    pickle.dump(x_train, f)
    pickle.dump(y_train, f)
    pickle.dump(test, f)

# 数据建模

## 导入数据

In [None]:
import pickle
# Read the three pickled objects from values.pkl in sequence:
# x_train first, then y_train, then test.
with open('values.pkl', 'rb') as f:
    x_train, y_train, test = (pickle.load(f) for _ in range(3))

In [None]:
y_train.value_counts()
## The positive and negative classes are heavily imbalanced; because the oversampled data is too large for this machine's limited memory, undersampling is used instead

## 过采样后存储数据

In [None]:
from imblearn.over_sampling import ADASYN

# Oversample with ADASYN.
# imbalanced-learn renamed fit_sample() to fit_resample() and later removed
# the old name, so prefer the new method and fall back for old installs.
adasyn = ADASYN()
resample = getattr(adasyn, 'fit_resample', None) or adasyn.fit_sample
x_train_oversam, y_train_oversam = resample(x_train, y_train)

In [None]:
import pickle
# Store the oversampled features and labels back-to-back in one pickle file.
with open('values_resam.pkl', 'wb') as f:
    pickle.dump(x_train_oversam, f)
    pickle.dump(y_train_oversam, f)

In [None]:
# Print the shapes of the oversampled features and labels.
print(x_train_oversam.shape)
print(y_train_oversam.shape)

In [None]:
## EasyEnsemble
# The experiment below is wrapped in a string literal, so none of it runs.
# NOTE(review): before re-enabling, check that the positional indexing
# x_train[train_index, : ] fits the type of x_train, and that logloss() gets
# arguments of matching shape -- neither can be confirmed from this cell.
'''
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold

bbc = BalancedBaggingClassifier(
    base_estimator = DecisionTreeClassifier(),
    ratio = 'auto',
    replacement = False,
    random_state = 100,
    n_jobs = -1
)

kf = KFold(n_splits = 5)
for train_index, val_index in kf.split(x_train, y_train):
    X_train,Y_train = x_train[train_index, : ], y_train[train_index]
    X_val, Y_val = x_train[val_index, : ], y_train[val_index]
    bbc.fit(X_train, Y_train)
    pred_bbc = bbc.predict_proba(X_val)
    print(pred_bbc)
    print('The logloss is ', logloss(Y_val, pred_bbc))
'''

## 欠采样并存储数据

In [None]:
import numpy as np
import pickle
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC

# Reload the merged data written by the data-preparation section.
with open('values.pkl', 'rb') as f:
    x_train = pickle.load(f)
    y_train = pickle.load(f)
    test = pickle.load(f)

# Convert to plain arrays before resampling and storing.
x_train = np.array(x_train)
y_train = np.array(y_train)
test = np.array(test)

# Randomly undersample the training data; the fixed seed keeps it repeatable.
bc = RandomUnderSampler(random_state = 100)
# imbalanced-learn renamed fit_sample() to fit_resample() and later removed
# the old name, so prefer the new method and fall back for old installs.
resample = getattr(bc, 'fit_resample', None) or bc.fit_sample
x_train_resam, y_train_resam = resample(x_train, y_train)

# Store the resampled training data together with the untouched test features.
with open('values_undersampling.pkl', 'wb') as f:
    pickle.dump(x_train_resam, f)
    pickle.dump(y_train_resam, f)
    pickle.dump(test, f)

## 载入数据

In [1]:
import pickle

# Read the three pickled objects from values_undersampling.pkl in sequence:
# resampled features, resampled labels, then the test features.
with open('values_undersampling.pkl', 'rb') as f:
    x_train_resam, y_train_resam, test = (pickle.load(f) for _ in range(3))

### XGBoost

#### 用sklearn API

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np

# Hyper-parameter grid for the scikit-learn XGBoost wrapper.
# NOTE: this is 5 * 7 * 3 * 2 * 2 = 420 combinations, each fitted 5 times.
param_grid = {
    'max_depth': [3, 4, 5, 8, 10],
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000],
    # Was misspelled 'laerning_rate', so the learning rate was never tuned.
    'learning_rate': [0.1, 0.2, 0.3],
    'gamma': [0, 0.2],
    'subsample': [0.8, 1],
}

xgb_model = xgb.XGBClassifier()
# Model selection still uses accuracy (the classifier's default score), so
# best_score_ / best_params_ keep their meaning; log loss is tracked as a
# second metric so it can be reported for the selected configuration.
rgs = GridSearchCV(
    xgb_model,
    param_grid,
    scoring = ['accuracy', 'neg_log_loss'],
    refit = 'accuracy',
    n_jobs = -1,
    cv = 5,
)
rgs.fit(x_train_resam, y_train_resam)
print(rgs.best_score_)
print(rgs.best_params_)
pred = rgs.predict_proba(test)
# The original call used Y_val and pred_bbc, which are not defined in any
# executed cell. Report the cross-validated log loss of the best
# configuration instead (the scorer is negated, hence the minus sign).
cv_logloss = -rgs.cv_results_['mean_test_neg_log_loss'][rgs.best_index_]
print('The logloss is ', cv_logloss)

#### 用Python API

In [3]:
import xgboost as xgb

# Wrap the undersampled training data for the native XGBoost API.
dtrain = xgb.DMatrix(x_train_resam, label = y_train_resam)
num_xgb = 500  # number of boosting rounds evaluated by the cross-validation

params = {
    'booster': 'gbtree',
    'eta': 0.2,
    'eval_metric': 'logloss',
    # Was written 'tree method' (with a space), which is not a parameter name.
    'tree_method': 'auto',
    'objective': 'binary:logistic'
}

# 5-fold cross-validation. xgb.cv returns the per-round evaluation history,
# not a trained model.
bst = xgb.cv(params, dtrain, num_xgb, nfold = 5, verbose_eval = True)

[0]	train-logloss:0.662994+0.000289113	test-logloss:0.663231+0.000402351
[1]	train-logloss:0.642744+0.000555967	test-logloss:0.643205+0.000724138
[2]	train-logloss:0.627459+0.000494531	test-logloss:0.628052+0.00126858
[3]	train-logloss:0.616709+0.00055443	test-logloss:0.617552+0.00132264
[4]	train-logloss:0.608691+0.000874165	test-logloss:0.609752+0.00144459
[5]	train-logloss:0.601819+0.00074481	test-logloss:0.60304+0.00185067
[6]	train-logloss:0.597109+0.000931505	test-logloss:0.59856+0.0020078
[7]	train-logloss:0.592709+0.000790944	test-logloss:0.594419+0.00180797
[8]	train-logloss:0.589358+0.00110724	test-logloss:0.591284+0.00203231
[9]	train-logloss:0.5871+0.000944274	test-logloss:0.589226+0.00227144
[10]	train-logloss:0.584939+0.000879835	test-logloss:0.587297+0.00233238
[11]	train-logloss:0.582778+0.000867472	test-logloss:0.585386+0.00292344
[12]	train-logloss:0.579652+0.00132008	test-logloss:0.582572+0.00314231
[13]	train-logloss:0.57792+0.000460898	test-logloss:0.581054+0.00242

[115]	train-logloss:0.527062+0.00137147	test-logloss:0.553303+0.00195657
[116]	train-logloss:0.526818+0.0013113	test-logloss:0.553317+0.0019283
[117]	train-logloss:0.526549+0.00129633	test-logloss:0.553288+0.00194048
[118]	train-logloss:0.526387+0.00131524	test-logloss:0.5533+0.00192613
[119]	train-logloss:0.526132+0.00128433	test-logloss:0.553248+0.0019128
[120]	train-logloss:0.525889+0.00137856	test-logloss:0.553256+0.00192598
[121]	train-logloss:0.525728+0.00146693	test-logloss:0.553213+0.00190001
[122]	train-logloss:0.525475+0.00139006	test-logloss:0.55317+0.00191421
[123]	train-logloss:0.525189+0.00131317	test-logloss:0.553091+0.0019354
[124]	train-logloss:0.52488+0.00126585	test-logloss:0.553097+0.00194123
[125]	train-logloss:0.524623+0.00136907	test-logloss:0.553071+0.00194326
[126]	train-logloss:0.52435+0.00133443	test-logloss:0.553005+0.00197181
[127]	train-logloss:0.524021+0.00133208	test-logloss:0.552947+0.00195158
[128]	train-logloss:0.523722+0.00134036	test-logloss:0.55285

[228]	train-logloss:0.501996+0.000874475	test-logloss:0.551775+0.00224076
[229]	train-logloss:0.501867+0.00082819	test-logloss:0.551778+0.00224372
[230]	train-logloss:0.501593+0.00083106	test-logloss:0.551801+0.00223259
[231]	train-logloss:0.501397+0.000780539	test-logloss:0.551791+0.00229302
[232]	train-logloss:0.501154+0.000779219	test-logloss:0.551774+0.00231978
[233]	train-logloss:0.500952+0.000830103	test-logloss:0.551797+0.00233644
[234]	train-logloss:0.500762+0.000805606	test-logloss:0.551837+0.00233178
[235]	train-logloss:0.500554+0.00078577	test-logloss:0.551842+0.00231724
[236]	train-logloss:0.500344+0.000763132	test-logloss:0.551794+0.00234256
[237]	train-logloss:0.500162+0.000776637	test-logloss:0.551804+0.00229651
[238]	train-logloss:0.499994+0.000791717	test-logloss:0.551799+0.00229374
[239]	train-logloss:0.499846+0.000803244	test-logloss:0.551811+0.00228391
[240]	train-logloss:0.499634+0.000834049	test-logloss:0.551813+0.00227793
[241]	train-logloss:0.499409+0.000820167	

[340]	train-logloss:0.481138+0.000764024	test-logloss:0.552612+0.00254887
[341]	train-logloss:0.480981+0.000811363	test-logloss:0.552621+0.00256449
[342]	train-logloss:0.480804+0.000820766	test-logloss:0.55264+0.00258427
[343]	train-logloss:0.480608+0.000816856	test-logloss:0.552638+0.00259153
[344]	train-logloss:0.480449+0.000804295	test-logloss:0.552645+0.00260831
[345]	train-logloss:0.480272+0.000811903	test-logloss:0.552672+0.00262223
[346]	train-logloss:0.480141+0.000806074	test-logloss:0.552664+0.00260787
[347]	train-logloss:0.479965+0.000774026	test-logloss:0.552679+0.00258241
[348]	train-logloss:0.47985+0.000754662	test-logloss:0.55268+0.00257466
[349]	train-logloss:0.479682+0.000723204	test-logloss:0.552695+0.00258251
[350]	train-logloss:0.479541+0.000652389	test-logloss:0.552713+0.00255525
[351]	train-logloss:0.479416+0.000661524	test-logloss:0.552728+0.0025502
[352]	train-logloss:0.479271+0.000696417	test-logloss:0.55274+0.00255869
[353]	train-logloss:0.479099+0.000736417	te

[452]	train-logloss:0.46321+0.000878561	test-logloss:0.55401+0.00272488
[453]	train-logloss:0.463061+0.00087521	test-logloss:0.554023+0.00271781
[454]	train-logloss:0.462914+0.000868299	test-logloss:0.554029+0.00272442
[455]	train-logloss:0.462778+0.000844391	test-logloss:0.554059+0.00271484
[456]	train-logloss:0.462579+0.000796816	test-logloss:0.554048+0.00268132
[457]	train-logloss:0.462502+0.000761562	test-logloss:0.554058+0.00268516
[458]	train-logloss:0.462329+0.000792599	test-logloss:0.554055+0.00269604
[459]	train-logloss:0.462223+0.000787065	test-logloss:0.554064+0.00269483
[460]	train-logloss:0.462067+0.000788579	test-logloss:0.554126+0.00271364
[461]	train-logloss:0.461918+0.000842252	test-logloss:0.55413+0.00269475
[462]	train-logloss:0.461712+0.000879605	test-logloss:0.554154+0.00272257
[463]	train-logloss:0.461528+0.00089636	test-logloss:0.554196+0.00273437
[464]	train-logloss:0.461321+0.000923065	test-logloss:0.554207+0.00274371
[465]	train-logloss:0.461119+0.000957759	te

In [None]:
dtest = xgb.DMatrix(test)
# `bst` holds the evaluation history returned by xgb.cv, which has no
# predict() method, so train a booster for the round with the lowest
# cross-validated log loss and predict with that.
# NOTE(review): assumes xgb.cv returned a pandas DataFrame (its default when
# pandas is installed) with a 'test-logloss-mean' column.
best_round = int(bst['test-logloss-mean'].idxmin()) + 1
model = xgb.train(params, dtrain, num_boost_round = best_round)
pred = model.predict(dtest)
# 'label' was dropped from the test features and no test labels are loaded in
# this section, so report the cross-validated log loss instead of the
# unfinished logloss() call.
print('The CV logloss is ', bst['test-logloss-mean'].min())

### LightGBM

In [None]:
import lightgbm as lgb