In [1]:
import lightgbm as lgb

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

import gc
import time
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:
# Narrow unsigned integer dtypes for the id-like columns, passed to read_csv
# so they are not parsed with pandas' default integer width.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# Skip file rows 1..122,903,890 (row 0, the header, is kept) and read the
# following 40,000,000 rows of the training file.
train_sample = pd.read_csv(
    "~/environment/mnt/ssd/kaggle-talkingdata2/competition_files/train.csv",
    skiprows=range(1, 122903891),
    nrows=40000000,
    dtype=dtypes,
)
print(train_sample.shape)
train_sample.head()

(40000000, 8)


Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,304218,12,1,13,245,2017-11-08 16:16:45,,0
1,70656,3,1,19,379,2017-11-08 16:16:45,,0
2,54039,2,1,41,477,2017-11-08 16:16:45,,0
3,258387,9,1,13,232,2017-11-08 16:16:45,,0
4,152156,12,1,15,259,2017-11-08 16:16:45,,0


In [3]:
def _merge_group_stat(frame, keys, value_col, stat, new_name):
    """Left-join a per-group statistic of ``value_col`` onto ``frame``.

    Rows are grouped by the columns in ``keys``; ``stat`` (a pandas aggregation
    name such as 'count', 'var' or 'mean') is computed over ``value_col`` within
    each group and stored in a new column called ``new_name``.

    Returns the merged frame; ``frame`` itself is not modified.
    """
    grouped = frame.groupby(keys)[value_col].agg(stat).reset_index(name=new_name)
    merged = frame.merge(grouped, on=keys, how='left')
    # Release the intermediate table right away; the frames here are large.
    del grouped
    gc.collect()
    return merged


# Calendar parts of the click timestamp. The vectorised .dt accessor replaces
# a Python-level lambda call per row.
train_sample["click_time_dt"] = pd.to_datetime(train_sample["click_time"])
train_sample["hour"] = train_sample["click_time_dt"].dt.hour
train_sample["day"] = train_sample["click_time_dt"].dt.day

print("grouping by ip")
# Total clicks per ip (not part of the `feature` list used for training).
train_sample = _merge_group_stat(train_sample, ["ip"], "channel", "count", "click_by_ip")

print('grouping by ip-day-hour combination...')
train_sample = _merge_group_stat(train_sample, ['ip', 'day', 'hour'], 'channel', 'count', 'ip_tcount')

print('grouping by ip-app combination...')
train_sample = _merge_group_stat(train_sample, ['ip', 'app'], 'channel', 'count', 'ip_app_count')

print('grouping by ip-app-os combination...')
train_sample = _merge_group_stat(train_sample, ['ip', 'app', 'os'], 'channel', 'count', 'ip_app_os_count')

print('grouping by : ip_day_chl_var_hour')
# Despite its name, 'ip_tchan_count' holds the variance of `hour` per
# ip/day/channel. The name is kept because the feature list refers to it.
train_sample = _merge_group_stat(train_sample, ['ip', 'day', 'channel'], 'hour', 'var', 'ip_tchan_count')

print('grouping by : ip_app_os_var_hour')
train_sample = _merge_group_stat(train_sample, ['ip', 'app', 'os'], 'hour', 'var', 'ip_app_os_var')

print('grouping by : ip_app_channel_var_day')
train_sample = _merge_group_stat(train_sample, ['ip', 'app', 'channel'], 'day', 'var', 'ip_app_channel_var_day')

print('grouping by : ip_app_chl_mean_hour')
print("merging...")
train_sample = _merge_group_stat(train_sample, ['ip', 'app', 'channel'], 'hour', 'mean', 'ip_app_channel_mean_hour')

print("vars and data type: ")
train_sample.info()
# Downcast the count columns to save memory. uint16 would silently wrap for any
# group with more than 65,535 clicks; a group count can be as large as the
# number of rows (40,000,000 here), which uint32 holds safely.
for count_col in ('ip_tcount', 'ip_app_count', 'ip_app_os_count'):
    train_sample[count_col] = train_sample[count_col].astype('uint32')

print(train_sample.shape)
train_sample.head()

grouping by ip
grouping by ip-day-hour combination...
grouping by ip-app combination...
grouping by ip-app-os combination...
grouping by : ip_day_chl_var_hour
grouping by : ip_app_os_var_hour
grouping by : ip_app_channel_var_day
grouping by : ip_app_chl_mean_hour
merging...
vars and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000000 entries, 0 to 39999999
Data columns (total 19 columns):
ip                          uint32
app                         uint16
device                      uint16
os                          uint16
channel                     uint16
click_time                  object
attributed_time             object
is_attributed               uint8
click_time_dt               datetime64[ns]
hour                        int64
day                         int64
click_by_ip                 int64
ip_tcount                   int64
ip_app_count                int64
ip_app_os_count             int64
ip_tchan_count              float64
ip_app_os_var              

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,click_time_dt,hour,day,click_by_ip,ip_tcount,ip_app_count,ip_app_os_count,ip_tchan_count,ip_app_os_var,ip_app_channel_var_day,ip_app_channel_mean_hour
0,304218,12,1,13,245,2017-11-08 16:16:45,,0,2017-11-08 16:16:45,16,8,399,11,90,23,14.25,16.573123,0.333333,11.25
1,70656,3,1,19,379,2017-11-08 16:16:45,,0,2017-11-08 16:16:45,16,8,2847,238,511,104,7.780952,57.463686,0.264706,10.941176
2,54039,2,1,41,477,2017-11-08 16:16:45,,0,2017-11-08 16:16:45,16,8,2651,236,236,4,6.809524,36.0,0.066502,5.068966
3,258387,9,1,13,232,2017-11-08 16:16:45,,0,2017-11-08 16:16:45,16,8,599,39,45,11,24.5,61.963636,0.3,10.0
4,152156,12,1,15,259,2017-11-08 16:16:45,,0,2017-11-08 16:16:45,16,8,461,26,35,1,12.0,,0.0,18.0


In [4]:
# Model inputs: raw id columns, calendar parts and the engineered group statistics.
feature = [
    'app', 'device', 'os', 'channel', 'hour', 'day',
    'ip_tcount', 'ip_tchan_count', 'ip_app_count',
    'ip_app_os_count', 'ip_app_os_var',
    'ip_app_channel_var_day', 'ip_app_channel_mean_hour',
]
# Subset of `feature` handed to the training helper as categorical features.
categorical = ['app', 'device', 'os', 'channel', 'hour', 'day']
target = ["is_attributed"]

# X keeps the label column next to the features; y is the label column on its own.
X = train_sample[feature + target]
y = train_sample[target]

# The full frame is no longer needed once the slices above exist.
del train_sample
gc.collect()

44

In [5]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    """Fit a LightGBM booster on one train/validation split and print the validation score.

    Parameters
    ----------
    params : dict
        Overrides merged on top of the defaults defined below. Not modified.
    dtrain, dvalid : pandas.DataFrame
        Training and validation frames. Each must contain the ``predictors``
        columns and the ``target`` column.
    predictors : list of str
        Feature column names; also passed to LightGBM as ``feature_name``.
    target : str or list of str
        Label column. The selected values are flattened to one dimension.
    objective, metrics : str
        Stored under the 'objective' and 'metric' parameter keys. ``metrics``
        is also used as the key into the recorded evaluation history, so it
        must be a single metric name.
    feval, early_stopping_rounds, num_boost_round, verbose_eval
        Forwarded unchanged to ``lgb.train``.
    categorical_features : list of str, optional
        Forwarded to ``lgb.Dataset`` as ``categorical_feature``.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }

    # Caller-supplied values win over the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values.reshape(-1),
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values.reshape(-1),
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    # Filled in by lgb.train with the per-round metric history of each valid set.
    evals_results = {}

    bst1 = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain, xgvalid],
                     valid_names=['train', 'valid'],
                     evals_result=evals_results,
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval,  # honour the caller's logging period
                     feval=feval)

    valid_history = evals_results['valid'][metrics]
    n_estimators = bst1.best_iteration
    # best_iteration may be None or non-positive when early stopping never
    # fires (this appears to depend on the LightGBM version); in that case
    # report the last recorded round instead of a misleading 0.
    if not n_estimators or n_estimators < 1:
        n_estimators = len(valid_history)

    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics+":", valid_history[n_estimators-1])

    return bst1

In [6]:
# Hold out 10% of the rows for validation. A fixed random_state makes the split,
# and therefore the validation scores reported later, repeatable across kernel
# restarts.
# NOTE(review): rows are shuffled at random, so validation clicks are interleaved
# in time with training clicks. Confirm that is acceptable for this data.
tr_X, val_X, tr_y, val_y = train_test_split(X, y, test_size=0.1, random_state=42)
del X, y
gc.collect()

26

In [7]:
# Sanity check: the label column of the training split flattens to a 1-D array.
tr_X.loc[:, target].values.ravel().shape

(36000000,)

In [12]:
print("Training...")
start_time = time.time()

# Overrides applied on top of the defaults inside lgb_modelfit_nocv.
params = {
    'learning_rate': 0.15,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 7,  # 2^max_depth - 1
    'max_depth': 3,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.9,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight': 99.74  # because training data is extremely unbalanced
}
bst = lgb_modelfit_nocv(params,
                        tr_X,
                        val_X,
                        feature,
                        target,
                        objective='binary',
                        metrics='auc',
                        early_stopping_rounds=30,
                        # Log every 10 rounds. `True` would mean "every round"
                        # to LightGBM; 10 matches the log this cell produced.
                        verbose_eval=10,
                        num_boost_round=500,
                        categorical_features=categorical)

print('[{}]: model training time'.format(time.time() - start_time))



Training...
preparing validation datasets




Training until validation scores don't improve for 30 rounds.
[10]	train's auc: 0.95492	valid's auc: 0.952781
[20]	train's auc: 0.961418	valid's auc: 0.959198
[30]	train's auc: 0.965297	valid's auc: 0.963219
[40]	train's auc: 0.967372	valid's auc: 0.965491
[50]	train's auc: 0.968315	valid's auc: 0.966217
[60]	train's auc: 0.969278	valid's auc: 0.9669
[70]	train's auc: 0.969874	valid's auc: 0.967427
[80]	train's auc: 0.970415	valid's auc: 0.967718
[90]	train's auc: 0.970921	valid's auc: 0.968039
[100]	train's auc: 0.971349	valid's auc: 0.968332
[110]	train's auc: 0.971637	valid's auc: 0.968548
[120]	train's auc: 0.971896	valid's auc: 0.968759
[130]	train's auc: 0.97214	valid's auc: 0.968875
[140]	train's auc: 0.972444	valid's auc: 0.96916
[150]	train's auc: 0.972653	valid's auc: 0.969293
[160]	train's auc: 0.97284	valid's auc: 0.969397
[170]	train's auc: 0.973048	valid's auc: 0.969488
[180]	train's auc: 0.973247	valid's auc: 0.96957
[190]	train's auc: 0.973406	valid's auc: 0.9697
[200]	

NameError: name 'train_df' is not defined

In [13]:
import pickle

# Persist the trained booster. The context manager guarantees the file handle is
# flushed and closed even if pickling fails part-way.
with open("lightgbm.pkl", "wb") as model_file:
    pickle.dump(bst, model_file)

In [19]:
import pickle

# Reload the booster saved earlier. The context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code from the file; only load pickles
# produced by this notebook.
# NOTE(review): the save cell writes to the relative path "lightgbm.pkl". This
# absolute path is the same file only if the kernel's working directory is
# /home/ec2-user/environment. Confirm.
with open('/home/ec2-user/environment/lightgbm.pkl', 'rb') as model_file:
    bst = pickle.load(model_file)

# Prediction (予測)

In [20]:
# Read the test file with the same column dtypes that were declared for training.
test = pd.read_csv(
    "~/environment/test.csv",
    dtype=dtypes,
)
print(test.shape)
test.head()

(18790469, 7)


Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [21]:
def _merge_group_stat(frame, keys, value_col, stat, new_name):
    """Left-join a per-group statistic of ``value_col`` onto ``frame``.

    Rows are grouped by the columns in ``keys``; ``stat`` (a pandas aggregation
    name such as 'count', 'var' or 'mean') is computed over ``value_col`` within
    each group and stored in a new column called ``new_name``.

    Returns the merged frame; ``frame`` itself is not modified.
    """
    grouped = frame.groupby(keys)[value_col].agg(stat).reset_index(name=new_name)
    merged = frame.merge(grouped, on=keys, how='left')
    # Release the intermediate table right away; the frames here are large.
    del grouped
    gc.collect()
    return merged


# Calendar parts of the click timestamp. The vectorised .dt accessor replaces
# a Python-level lambda call per row.
test["click_time_dt"] = pd.to_datetime(test["click_time"])
test["hour"] = test["click_time_dt"].dt.hour
test["day"] = test["click_time_dt"].dt.day

print("grouping by ip")
# Total clicks per ip (not part of the `feature` list used for prediction).
test = _merge_group_stat(test, ["ip"], "channel", "count", "click_by_ip")

print('grouping by ip-day-hour combination...')
test = _merge_group_stat(test, ['ip', 'day', 'hour'], 'channel', 'count', 'ip_tcount')

print('grouping by ip-app combination...')
test = _merge_group_stat(test, ['ip', 'app'], 'channel', 'count', 'ip_app_count')

print('grouping by ip-app-os combination...')
test = _merge_group_stat(test, ['ip', 'app', 'os'], 'channel', 'count', 'ip_app_os_count')

print('grouping by : ip_day_chl_var_hour')
# Despite its name, 'ip_tchan_count' holds the variance of `hour` per
# ip/day/channel. The name is kept because the feature list refers to it.
test = _merge_group_stat(test, ['ip', 'day', 'channel'], 'hour', 'var', 'ip_tchan_count')

print('grouping by : ip_app_os_var_hour')
test = _merge_group_stat(test, ['ip', 'app', 'os'], 'hour', 'var', 'ip_app_os_var')

print('grouping by : ip_app_channel_var_day')
test = _merge_group_stat(test, ['ip', 'app', 'channel'], 'day', 'var', 'ip_app_channel_var_day')

print('grouping by : ip_app_chl_mean_hour')
print("merging...")
test = _merge_group_stat(test, ['ip', 'app', 'channel'], 'hour', 'mean', 'ip_app_channel_mean_hour')

print("vars and data type: ")
test.info()
# Downcast the count columns to save memory. uint16 would silently wrap for any
# group with more than 65,535 clicks; a group count can be as large as the
# number of rows in the frame, which uint32 holds safely here.
for count_col in ('ip_tcount', 'ip_app_count', 'ip_app_os_count'):
    test[count_col] = test[count_col].astype('uint32')

print(test.shape)
test.head()

grouping by ip
grouping by ip-day-hour combination...
grouping by ip-app combination...
grouping by ip-app-os combination...
grouping by : ip_day_chl_var_hour
grouping by : ip_app_os_var_hour
grouping by : ip_app_channel_var_day
grouping by : ip_app_chl_mean_hour
merging...
vars and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18790469 entries, 0 to 18790468
Data columns (total 18 columns):
click_id                    uint32
ip                          uint32
app                         uint16
device                      uint16
os                          uint16
channel                     uint16
click_time                  object
click_time_dt               datetime64[ns]
hour                        int64
day                         int64
click_by_ip                 int64
ip_tcount                   int64
ip_app_count                int64
ip_app_os_count             int64
ip_tchan_count              float64
ip_app_os_var               float64
ip_app_channel_var_day   

Unnamed: 0,click_id,ip,app,device,os,channel,click_time,click_time_dt,hour,day,click_by_ip,ip_tcount,ip_app_count,ip_app_os_count,ip_tchan_count,ip_app_os_var,ip_app_channel_var_day,ip_app_channel_mean_hour
0,0,5744,9,1,3,107,2017-11-10 04:00:00,2017-11-10 04:00:00,4,10,91,34,28,1,0.333333,,0.0,4.5
1,1,119901,9,1,3,466,2017-11-10 04:00:00,2017-11-10 04:00:00,4,10,2083,403,289,5,13.130156,4.7,0.0,8.228571
2,2,72287,21,1,19,128,2017-11-10 04:00:00,2017-11-10 04:00:00,4,10,2135,229,312,24,10.612795,9.027174,0.0,7.969697
3,3,78477,15,1,13,111,2017-11-10 04:00:00,2017-11-10 04:00:00,4,10,1201,239,42,23,0.0,15.873518,0.0,4.0
4,4,123080,12,1,13,328,2017-11-10 04:00:00,2017-11-10 04:00:00,4,10,208,60,24,7,0.0,11.904762,0.0,4.0


In [22]:
# Keep only the model input columns, in the order given by `feature`, then drop
# the full test frame to free memory.
X_test = test[feature]
del test
gc.collect()
print(X_test.shape)
X_test.head()

(18790469, 13)


Unnamed: 0,app,device,os,channel,hour,day,ip_tcount,ip_tchan_count,ip_app_count,ip_app_os_count,ip_app_os_var,ip_app_channel_var_day,ip_app_channel_mean_hour
0,9,1,3,107,4,10,34,0.333333,28,1,,0.0,4.5
1,9,1,3,466,4,10,403,13.130156,289,5,4.7,0.0,8.228571
2,21,1,19,128,4,10,229,10.612795,312,24,9.027174,0.0,7.969697
3,15,1,13,111,4,10,239,0.0,42,23,15.873518,0.0,4.0
4,12,1,13,328,4,10,60,0.0,24,7,11.904762,0.0,4.0


In [23]:
# Score the test rows with the trees up to the early-stopping optimum.
# Passing best_iteration explicitly keeps the result the same on LightGBM
# versions whose predict() would otherwise use every trained tree.
pred = bst.predict(X_test[feature], num_iteration=bst.best_iteration)

In [24]:
# Load the submission template; its is_attributed column is overwritten later.
submission = pd.read_csv(
    "~/environment/sample_submission.csv",
)
print(submission.shape)
submission.head()

(18790469, 2)


Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [25]:
# Replace the placeholder column with the predicted probabilities. Plain column
# assignment swaps in a new float column, whereas `.loc[:, col] = ...` is an
# in-place set onto the existing integer column.
# NOTE(review): this assumes `pred` is in the same row order as the template's
# click_id column. Confirm.
submission["is_attributed"] = pred
submission.head()

Unnamed: 0,click_id,is_attributed
0,0,0.093081
1,1,0.039815
2,2,0.005675
3,3,0.026374
4,4,0.009867


In [26]:
# Write the submission without the DataFrame index. `index` is documented as a
# bool, so pass False rather than None.
submission.to_csv("submission.csv", index=False)

In [27]:
# Read the written file back and show its first rows as a quick sanity check.
pd.read_csv("submission.csv").head(n=5)

Unnamed: 0,click_id,is_attributed
0,0,0.093081
1,1,0.039815
2,2,0.005675
3,3,0.026374
4,4,0.009867
