In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc

In [2]:
# Column name -> dtype string, passed as `dtype=` to pd.read_csv when the
# per-day CSV files are loaded. Columns are listed in runs that share a dtype;
# the runs are flattened in order to build the final mapping.
_dtype_groups = [
    ('uint16', ['app', 'device', 'os', 'channel']),
    ('uint8', ['hour', 'is_attributed']),
    ('uint32', ['ip_day_hour_count',
                'ip_os_day_hour_count',
                'ip_app_day_hour_count',
                'ip_app_os_day_hour_count',
                'app_day_hour_count',
                'ip_device_os_count',
                'ip_app_device_os_count']),
    ('float16', ['ip_device_os_mean',
                 'ip_app_device_os_mean',
                 'ip_app_device_mean',
                 'app_device_os_mean']),
    ('int32', ['ip_device_os_time2nextclick',
               'ip_app_device_os_time2nextclick',
               'ip_app_device_time2nextclick',
               'ip_device_os_time2previousclick',
               'ip_app_device_os_time2previousclick',
               'ip_app_device_time2previousclick']),
    ('uint32', ['ip_device_os_countfromfuture',
                'ip_app_device_os_countfromfuture',
                'ip_app_device_countfromfuture',
                'ip_device_os_countfrompast',
                'ip_app_device_os_countfrompast',
                'ip_app_device_countfrompast']),
    ('int32', ['ip_device_os_lasttimediff',
               'ip_app_device_os_lasttimediff',
               'ip_app_device_lasttimediff',
               'ip_device_os_firsttimediff',
               'ip_app_device_os_firsttimediff',
               'ip_app_device_firsttimediff']),
    ('float16', ['matrixFact_user_iposdeviceapp_item_app',
                 'matrixFact_user_ip_item_appdeviceos',
                 'matrixFact_user_ipchannel_item_appdeviceos',
                 'ip_device_os_regression',
                 'ip_app_device_os_regression',
                 'ip_app_device_regression',
                 'ip_app_device_os_channel_regression']),
]
dtypes = {column: kind for kind, columns in _dtype_groups for column in columns}

In [5]:
# Read the three labelled days plus the test split into a dict of DataFrames
# keyed by split name, echoing each file name once it has been loaded.
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_equalhour_supplementV1.csv'
day_list = ['day7', 'day8', 'day9']
df_dict = {}
for file in day_list + ['test']:
    csv_name = file_format.format(file)
    df_dict[file] = pd.read_csv(load_path + csv_name, dtype=dtypes)
    print(csv_name)

day7_equalhour_supplementV1.csv
day8_equalhour_supplementV1.csv
day9_equalhour_supplementV1.csv
test_equalhour_supplementV1.csv


# Define parameters

In [6]:
# Label column.
target = 'is_attributed'

# Split names of the labelled days.
day_list = ['day7', 'day8', 'day9']

combine = 0

# Baseline LightGBM parameters. get_parameters() copies this dict and
# overrides individual entries with the values of one grid-search row.
params_raw = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=4000,
    learning_rate=0.1,
    num_leaves=61,
    # Best speed: number of real CPU cores, i.e. vCPU / 2.
    num_threads=4,
    device='cpu',
    # -1 means no depth limit; limiting depth helps against over-fitting on small data.
    max_depth=-1,
    # Minimum number of rows per leaf; larger values fight over-fitting.
    min_data_in_leaf=390,
    # Fraction of features sampled before each tree (e.g. 0.8 -> 80%);
    # speeds up training and reduces over-fitting.
    feature_fraction=0.7,
    feature_fraction_seed=1,
    early_stopping_round=70,
    # Fraction of rows randomly selected (without resampling) for bagging.
    bagging_fraction=0.7,
    # Bagging is performed every `bagging_freq` iterations; 0 disables it.
    # bagging_fraction must be set as well for bagging to take effect.
    bagging_freq=1,
    bagging_seed=1,
    verbose=0,
    scale_pos_weight=400,
    metric=['auc'],
)

# Columns handed to LightGBM as categorical features.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']

# Candidate features: every column present in the test split.
feature_cols = df_dict['test'].columns.tolist()

In [7]:
# Drop features that should not be used for training.
# `extra` holds the column names to exclude; the commented-out variants below
# are alternative exclusion sets kept for experimentation.
# extra = set(['matrixFact_user_iposdeviceapp_item_channel', 'matrixFact_user_iposdeviceapp_item_device'])
extra = set(['ip_device_os_countself',
             'ip_day_hour_countself', 
             'app_day_hour_countself',
             'ip_app_device_os_countself',
             'ip_app_day_hour_countself', 
             'ip_os_day_hour_countself', 
             'ip_app_os_day_hour_countself' ])

# extra = set([])
# extra = set(['ip_device_os_count',
#              'ip_day_hour_count', 
#              'app_day_hour_count',
#              'ip_app_device_os_count',
#              'ip_app_day_hour_count', 
#              'ip_os_day_hour_count', 
#              'ip_app_os_day_hour_count' ])

# Filter with a list comprehension instead of `list(set(...) - extra)`: a set
# difference returns the columns in an arbitrary order that can change from
# one kernel session to the next (string hashing is randomised per process),
# so the column order given to LightGBM would not be reproducible. This keeps
# the original column order and is safe to re-run.
feature_cols = [col for col in feature_cols if col not in extra]

# Create the list of parameter combinations to search

In [4]:
# Search space. `combine1` pairs (min_data_in_leaf, scale_pos_weight); the
# other lists are swept independently.
combine1 = [(390,400), (100, 99.7), (4000, 400), (1000, 400), (1999, 400)]
learning_rate = [0.03, 0.01]
feature_fraction = [0.7, 0.6, 0.5, 0.4]
bagging_fraction = [0.7, 0.6, 0.8, 0.9, 1]
num_leaves = [31, 61]

# One dict per combination (full cartesian product). 'ROC' and 'num_rounds'
# start at 0 as placeholders to be filled in once a combination is trained.
list_of_parameter = [
    {
        'min_data_in_leaf': leaf_min,
        'scale_pos_weight': pos_weight,
        'learning_rate': rate,
        'feature_fraction': feat_frac,
        'bagging_fraction': bag_frac,
        'ROC': 0,
        'num_rounds': 0,
        'max_depth': -1,
        'num_leaves': leaves,
    }
    for leaves in num_leaves
    for leaf_min, pos_weight in combine1
    for rate in learning_rate
    for feat_frac in feature_fraction
    for bag_frac in bagging_fraction
]
# df_grid = pd.DataFrame(list_of_parameter)
# df_grid.to_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv', index=False)

In [33]:
# Read the grid-search table (one row per parameter combination) from disk.
df_grid = pd.read_csv(
    filepath_or_buffer='/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv'
)

# Get parameters

In [34]:
def get_parameters(df ,param):
    """Draw one untrained grid row at random and merge it into a copy of `param`.

    A row is eligible when its ROC is 0, its learning_rate is 0.03 and its
    min_data_in_leaf is 390 or 100.

    Parameters
    ----------
    df : DataFrame with the grid columns (ROC, num_rounds, learning_rate,
        min_data_in_leaf, max_depth, num_leaves, ...).
    param : dict of base parameters; it is copied, never modified.

    Returns
    -------
    (params, num) where `params` is the merged dict and `num` is the index
    label of the chosen row, or (None, None) when no row is eligible.
    """
    merged = param.copy()

    not_trained = df.ROC == 0
    wanted_rate = df.learning_rate == 0.03
    wanted_leaf = (df.min_data_in_leaf == 390) | (df.min_data_in_leaf == 100)
    candidates = df[not_trained & wanted_rate & wanted_leaf]

    if len(candidates) == 0:
        return (None, None)

    # Shuffle a copy of the eligible labels and take the first one.
    labels = candidates.index.values.copy()
    np.random.shuffle(labels)
    num = labels[0]

    chosen = candidates.loc[num].to_dict()
    # Result columns are bookkeeping, not model parameters.
    for result_key in ('ROC', 'num_rounds'):
        chosen.pop(result_key)
    # Cast the integer-valued parameters to plain int.
    for int_key in ('max_depth', 'min_data_in_leaf', 'num_leaves'):
        chosen[int_key] = int(chosen[int_key])

    merged.update(chosen)
    return (merged, num)

In [35]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM model on `x_train`, evaluating on `x_val`.

    Parameters
    ----------
    x_train, x_val : DataFrames containing `feature_cols` and the `target` column.
    feature_cols : list of column names used as model inputs.
    categorical_feature : column names passed to lgb.Dataset as categorical.
    params : dict of LightGBM parameters; copied, never modified.
    best_round : optional fixed number of boosting rounds. When given, it
        replaces 'num_rounds' and early stopping is switched off.
    target : name of the label column.

    Returns
    -------
    The object returned by lgb.train.
    """
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values
    
    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        # Remove early stopping under every spelling LightGBM accepts. Using
        # pop() with a default avoids the KeyError that `del` raised when the
        # caller's params did not contain 'early_stopping_round'.
        for stop_key in ('early_stopping_round', 'early_stopping_rounds',
                         'early_stopping', 'n_iter_no_change'):
            param.pop(stop_key, None)
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

# Train on day 8 and day 9, validate on day 7

In [None]:
from sklearn.metrics import roc_auc_score
%env JOBLIB_TEMP_FOLDER=/tmp
day = 'day7'

# counter = 0
# save_path = '/home/kai/data/kaggle/talkingdata/wl/data/gridsearch/'
# file_format = 'train_day8day9_val_day7_index_{}.npy'
# train_day = list(set(day_list)-set([day]))
# trainset = pd.concat([df_dict[train_day[0]],df_dict[train_day[1]]])
# print('building train set done!')
# valset = df_dict[day]
# print('building train val done!')
# test = df_dict['test']
# del df_dict
gc.collect()

df_grid = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv')
params, index = get_parameters(df_grid, params_raw)
while index is not None:
    counter += 1
    print('=================================================')
    print(counter)
    print(index)
    print(params)
    model = train_lightgbm(trainset, valset, feature_cols, categorical_col, params)
    best_round = model.best_iteration
    # Calculate ROC-AUC
    ROC = roc_auc_score(valset[target].values, model.predict(valset[feature_cols]))
    df_grid.loc[index, 'ROC'] = ROC
    df_grid.loc[index, 'num_rounds'] = best_round
    df_grid.to_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv', index=False)
    
    print('pred and save')
    pred = model.predict(test[feature_cols])
    file_name = file_format.format(index)
    np.save(save_path+file_name, pred)
    print('saving... {}'.format(file_name))
    
    df_grid = pd.read_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv')
    params, index = get_parameters(df_grid, params_raw)
    gc.collect()
    
print('done!')
    


env: JOBLIB_TEMP_FOLDER=/tmp
10
186
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 4000, 'learning_rate': 0.029999999999999999, 'num_leaves': 31, 'num_threads': 4, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 100, 'feature_fraction': 0.5, 'feature_fraction_seed': 1, 'early_stopping_round': 70, 'bagging_fraction': 0.90000000000000002, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 99.700000000000003, 'metric': ['auc']}
start training




Training until validation scores don't improve for 70 rounds.
[10]	valid_0's auc: 0.971577
[20]	valid_0's auc: 0.972614
[30]	valid_0's auc: 0.972944
[40]	valid_0's auc: 0.974221
[50]	valid_0's auc: 0.975999
[60]	valid_0's auc: 0.977362
[70]	valid_0's auc: 0.977849
[80]	valid_0's auc: 0.978471
[90]	valid_0's auc: 0.978936
[100]	valid_0's auc: 0.979468
[110]	valid_0's auc: 0.979912
[120]	valid_0's auc: 0.980866
[130]	valid_0's auc: 0.981632
[140]	valid_0's auc: 0.982033
[150]	valid_0's auc: 0.982531
[160]	valid_0's auc: 0.982878
[170]	valid_0's auc: 0.983163
[180]	valid_0's auc: 0.983484
[190]	valid_0's auc: 0.983726
[200]	valid_0's auc: 0.983961
[210]	valid_0's auc: 0.984164
[220]	valid_0's auc: 0.98432
[230]	valid_0's auc: 0.984452
[240]	valid_0's auc: 0.984598
[250]	valid_0's auc: 0.984727
[260]	valid_0's auc: 0.98482
[270]	valid_0's auc: 0.984902
[280]	valid_0's auc: 0.984973
[290]	valid_0's auc: 0.985023
[300]	valid_0's auc: 0.985077
[310]	valid_0's auc: 0.985119
[320]	valid_0's auc

[690]	valid_0's auc: 0.985523
[700]	valid_0's auc: 0.985534
[710]	valid_0's auc: 0.985535
[720]	valid_0's auc: 0.985532
[730]	valid_0's auc: 0.985538
[740]	valid_0's auc: 0.985543
[750]	valid_0's auc: 0.985541
[760]	valid_0's auc: 0.985546
[770]	valid_0's auc: 0.985546
[780]	valid_0's auc: 0.98555
[790]	valid_0's auc: 0.985556
[800]	valid_0's auc: 0.985562
[810]	valid_0's auc: 0.985565
[820]	valid_0's auc: 0.985571
[830]	valid_0's auc: 0.985571
[840]	valid_0's auc: 0.985566
[850]	valid_0's auc: 0.985564
[860]	valid_0's auc: 0.985568
[870]	valid_0's auc: 0.985564
[880]	valid_0's auc: 0.985563
[890]	valid_0's auc: 0.985565
Early stopping, best iteration is:
[824]	valid_0's auc: 0.985574
pred and save
saving... train_day8day9_val_day7_index_244.npy
13
184
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 4000, 'learning_rate': 0.029999999999999999, 'num_leaves': 31, 'num_threads': 4, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 390, 'feature_fraction': 0.40000000000000002

[120]	valid_0's auc: 0.980475
[130]	valid_0's auc: 0.981197
[140]	valid_0's auc: 0.981564
[150]	valid_0's auc: 0.982116
[160]	valid_0's auc: 0.982613
[170]	valid_0's auc: 0.982914
[180]	valid_0's auc: 0.983252
[190]	valid_0's auc: 0.983477
[200]	valid_0's auc: 0.983685
[210]	valid_0's auc: 0.983873
[220]	valid_0's auc: 0.984043
[230]	valid_0's auc: 0.984212
[240]	valid_0's auc: 0.984381
[250]	valid_0's auc: 0.98453
[260]	valid_0's auc: 0.984648
[270]	valid_0's auc: 0.984744
[280]	valid_0's auc: 0.984842
[290]	valid_0's auc: 0.984896
[300]	valid_0's auc: 0.98498
[310]	valid_0's auc: 0.98504
[320]	valid_0's auc: 0.985088
[330]	valid_0's auc: 0.985128
[340]	valid_0's auc: 0.985177
[350]	valid_0's auc: 0.985207
[360]	valid_0's auc: 0.985241
[370]	valid_0's auc: 0.985271
[380]	valid_0's auc: 0.985307
[390]	valid_0's auc: 0.98533
[400]	valid_0's auc: 0.985348
[410]	valid_0's auc: 0.985376
[420]	valid_0's auc: 0.985395
[430]	valid_0's auc: 0.98541
[440]	valid_0's auc: 0.985421
[450]	valid_0's

[380]	valid_0's auc: 0.985264
[390]	valid_0's auc: 0.985289
[400]	valid_0's auc: 0.985283
[410]	valid_0's auc: 0.985284
[420]	valid_0's auc: 0.985294
[430]	valid_0's auc: 0.985304
[440]	valid_0's auc: 0.985311
[450]	valid_0's auc: 0.985311
[460]	valid_0's auc: 0.985305
[470]	valid_0's auc: 0.985297
[480]	valid_0's auc: 0.985299
[490]	valid_0's auc: 0.985308
[500]	valid_0's auc: 0.985313
[510]	valid_0's auc: 0.985312
[520]	valid_0's auc: 0.985309
[530]	valid_0's auc: 0.985323
[540]	valid_0's auc: 0.985323
[550]	valid_0's auc: 0.985324
[560]	valid_0's auc: 0.985327
[570]	valid_0's auc: 0.985329
[580]	valid_0's auc: 0.98533
[590]	valid_0's auc: 0.985332
[600]	valid_0's auc: 0.985335
[610]	valid_0's auc: 0.98533
[620]	valid_0's auc: 0.985337
[630]	valid_0's auc: 0.985332
[640]	valid_0's auc: 0.985328
[650]	valid_0's auc: 0.985323
[660]	valid_0's auc: 0.985311
[670]	valid_0's auc: 0.985307
[680]	valid_0's auc: 0.985307
[690]	valid_0's auc: 0.98531
Early stopping, best iteration is:
[625]	va

In [32]:
# Rank the features of the most recently trained `model` by importance.
# importance_type (string, optional (default="split"))
# - How the importance is calculated.
# If "split", result contains numbers of times the feature is used in a model.
# If "gain", result contains total gains of splits which use the feature.
gain = model.feature_importance(importance_type='gain')

# Validate the length BEFORE building the Series: pd.Series itself fails when
# the data and index lengths differ, so a check placed after it would never
# get the chance to raise this more descriptive error.
if len(gain) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(gain), len(feature_cols)))

importance = pd.Series(gain, index=feature_cols).sort_values(ascending=False)
importance

matrixFact_user_ip_item_appdeviceos       3.446741e+08
matrixFact_user_iposdeviceapp_item_app    8.417493e+07
channel                                   1.668758e+07
device_os                                 1.108526e+07
ip_app_device_os_time2nextclick           8.144030e+06
app                                       6.917718e+06
ip_app_device_os_lasttimediff             6.707789e+06
ip_day_hour_count                         4.397688e+06
ip_app_device_os_countfromfuture          3.394890e+06
os                                        2.623705e+06
ip_device_os_count                        2.545699e+06
ip_app_os_day_hour_count                  2.149142e+06
hour                                      1.961198e+06
ip_app_device_time2nextclick              1.530539e+06
ip_device_os_lasttimediff                 1.250323e+06
ip_app_device_mean                        8.831188e+05
ip_app_device_regression                  8.715329e+05
ip_app_device_lasttimediff                8.461638e+05
ip_device_