In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc

In [2]:
# Directory and file-name pattern of the per-split feature matrices.
load_path = '/home/kai/data/kaggle/talkingdata/wl/data/equalhour/'
file_format = '{}_features_matrixregV4_norm.csv'
day_list = ['day7', 'day8', 'day9']

# Read each split (the three days plus 'test') into a dict keyed by split name,
# echoing the file name once it has been loaded.
df_dict = {}
for split in day_list + ['test']:
    csv_name = file_format.format(split)
    df_dict[split] = pd.read_csv(load_path + csv_name)
    print(csv_name)

day7_features_matrixregV4_norm.csv
day8_features_matrixregV4_norm.csv
day9_features_matrixregV4_norm.csv
test_features_matrixregV4_norm.csv


# Define parameters

In [4]:
# Name of the label column.
target = 'is_attributed'

day_list = ['day7', 'day8', 'day9']

# NOTE(review): `combine` is not read by any cell visible here; kept as-is.
combine = 0

# Base LightGBM parameter set.
params_raw = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=2000,
    learning_rate=0.1,
    num_leaves=61,
    # Best speed: set to the number of real CPU cores, which is vCPU/2.
    num_threads=4,
    device='cpu',
    # -1 means no depth limit; a limit is mainly useful against over-fitting on small data.
    max_depth=-1,
    # Minimal number of samples in one leaf; raising it can reduce over-fitting.
    min_data_in_leaf=390,
    # Fraction of features selected before training each tree (speed / over-fitting).
    feature_fraction=0.7,
    feature_fraction_seed=1,
    early_stopping_round=70,
    # Randomly select this fraction of the data without resampling.
    bagging_fraction=0.7,
    # Bagging frequency: 0 disables bagging, k performs bagging every k iterations
    # (bagging_fraction must be set as well for bagging to take effect).
    bagging_freq=1,
    bagging_seed=1,
    verbose=0,
    scale_pos_weight=400,
    metric=['auc'],
)

# Columns handed to LightGBM as categorical features.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']

# Start from every column of the test matrix.
feature_cols = [col for col in df_dict['test'].columns.values]

In [5]:
# get rid of extra features
# extra = set(['matrixFact_user_iposdeviceapp_item_channel', 'matrixFact_user_iposdeviceapp_item_device'])
extra = set(['app_device_os_mean',
             'app_device_os_time2nextclick', 
             'app_device_os_time2previousclick',
             'app_device_os_countfromfuture',
             'app_device_os_countfrompast', 
             'app_device_os_lasttimediff', 
             'app_device_os_regression', 'app_device_os_channel', 'app_device_os', 'app_os', 'app_device', 'device_os',
             ])
# extra = set(['ip_app_device_mean',
#              'ip_app_device_time2nextclick', 
#              'ip_app_device_time2previousclick',
#              'ip_app_device_countfromfuture',
#              'ip_app_device_countfrompast', 
#              'ip_app_device_lasttimediff', 
#              'ip_app_device_regression'])
feature_cols = list(set(feature_cols) - extra)

# Create the list of parameter combinations to search

In [6]:
# Candidate values for the grid. Each `combine1` entry is a
# (min_data_in_leaf, scale_pos_weight) pair that is varied together.
combine1 = [(390,400), (100, 99.7), (190, 200), (1000, 400), (1999, 400)]
learning_rate = [0.1, 0.03]
feature_fraction = [0.7, 0.8, 0.6]
num_leaves = [31, 61]

# One dict per combination; 'ROC' and 'num_rounds' start at 0 and act as
# "not evaluated yet" placeholders.
list_of_parameter = [
    {
        'min_data_in_leaf': leaf_min,
        'scale_pos_weight': pos_weight,
        'learning_rate': rate,
        'feature_fraction': fraction,
        'ROC': 0,
        'num_rounds': 0,
        'max_depth': -1,
        'num_leaves': leaves,
    }
    for leaves in num_leaves
    for leaf_min, pos_weight in combine1
    for rate in learning_rate
    for fraction in feature_fraction
]
# df_grid = pd.DataFrame(list_of_parameter)
# df_grid.to_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv', index=False)

In [7]:
# Load Parameters
# The grid CSV is read here as the starting state of the search. If it does not
# exist yet (fresh checkout / first run), build it from `list_of_parameter` and
# save it, so the notebook survives Restart & Run All.
grid_path = '/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv'
try:
    df_grid = pd.read_csv(grid_path)
except FileNotFoundError:
    df_grid = pd.DataFrame(list_of_parameter)
    df_grid.to_csv(grid_path, index=False)

# Get parameters

In [8]:
def get_parameters(df ,param):
    """Pick one not-yet-evaluated grid row at random and overlay it on `param`.

    A row counts as pending while its 'ROC' value equals 0.

    Args:
        df: grid DataFrame with a 'ROC' column, a 'num_rounds' column and one
            column per tunable parameter (including 'max_depth',
            'min_data_in_leaf' and 'num_leaves').
        param: base parameter dict; it is copied, never modified.

    Returns:
        (params, num): the merged parameter dict and the index label of the
        chosen row, or (None, None) when no pending row is left.
    """
    pending = df[df.ROC == 0]
    if len(pending) == 0:
        return (None, None)

    # Shuffle a copy of the pending labels and take the first one.
    candidates = pending.index.values.copy()
    np.random.shuffle(candidates)
    num = candidates[0]

    overrides = pending.loc[num].to_dict()
    # Result columns are bookkeeping, not model parameters.
    for bookkeeping_key in ('ROC', 'num_rounds'):
        overrides.pop(bookkeeping_key)
    # The row is read back as a single Series, so integer-valued columns may
    # arrive as floats; cast the integer parameters back to int.
    for int_key in ('max_depth', 'min_data_in_leaf', 'num_leaves'):
        overrides[int_key] = int(overrides[int_key])

    params = param.copy()
    params.update(overrides)
    return (params, num)

In [None]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM model on `x_train`, using `x_val` as the validation set.

    Args:
        x_train: training frame containing `feature_cols` and the `target` column.
        x_val: validation frame with the same columns.
        feature_cols: names of the columns used as model inputs.
        categorical_feature: column names passed to LightGBM as categorical.
        params: LightGBM parameter dict; it is copied, never modified.
        best_round: when given, train for exactly this many rounds and drop
            any early-stopping setting from the parameters.
        target: name of the label column.

    Returns:
        The object returned by `lgb.train`.
    """
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values

    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature=categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature=categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        # Remove early stopping under any of its documented aliases. pop() with
        # a default avoids a KeyError when the caller's params never set it.
        for stop_key in ('early_stopping_round', 'early_stopping_rounds',
                         'early_stopping', 'n_iter_no_change'):
            param.pop(stop_key, None)
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

# Train on days 8 and 9, validate on day 7

In [None]:
from sklearn.metrics import roc_auc_score
%env JOBLIB_TEMP_FOLDER=/tmp
day = 'day7'
params, index = get_parameters(df_grid, params_raw)
counter = 0

train_day = list(set(day_list)-set([day]))
trainset = pd.concat([df_dict[train_day[0]],df_dict[train_day[1]]])
print('building train set done!')
valset = df_dict[day]
print('building train val done!')

del df_dict
gc.collect()

while index is not None:
    counter += 1
    print('=================================================')
    print(counter)
    print(index)
    print(params)
    model = train_lightgbm(trainset, valset, feature_cols, categorical_col, params)
    best_round = model.best_iteration
    # Calculate ROC-AUC
    ROC = roc_auc_score(valset[target].values, model.predict(valset[feature_cols]))
    df_grid.loc[index, 'ROC'] = ROC
    df_grid.loc[index, 'num_rounds'] = best_round
    df_grid.to_csv('/home/kai/data/kaggle/talkingdata/wl/report/grid_search.csv', index=False)
    params, index = get_parameters(df_grid, params_raw)
    gc.collect()
    
print('done!')
    


env: JOBLIB_TEMP_FOLDER=/tmp
building train set done!
building train val done!
1
29
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 2000, 'learning_rate': 0.029999999999999999, 'num_leaves': 31, 'num_threads': 4, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 1999, 'feature_fraction': 0.59999999999999998, 'feature_fraction_seed': 1, 'early_stopping_round': 70, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 400.0, 'metric': ['auc']}
start training




Training until validation scores don't improve for 70 rounds.
[10]	valid_0's auc: 0.975154
[20]	valid_0's auc: 0.977537
[30]	valid_0's auc: 0.978037
[40]	valid_0's auc: 0.979128
[50]	valid_0's auc: 0.979548
[60]	valid_0's auc: 0.980128
[70]	valid_0's auc: 0.980587
[80]	valid_0's auc: 0.981058
[90]	valid_0's auc: 0.981434
[100]	valid_0's auc: 0.981733
[110]	valid_0's auc: 0.982016
[120]	valid_0's auc: 0.982369
[130]	valid_0's auc: 0.982675
[140]	valid_0's auc: 0.982942
[150]	valid_0's auc: 0.983179
[160]	valid_0's auc: 0.983415
[170]	valid_0's auc: 0.983579
[180]	valid_0's auc: 0.983761
[190]	valid_0's auc: 0.983875
[200]	valid_0's auc: 0.984
[210]	valid_0's auc: 0.984078
[220]	valid_0's auc: 0.984161
[230]	valid_0's auc: 0.984213
[240]	valid_0's auc: 0.984275
[250]	valid_0's auc: 0.984336
[260]	valid_0's auc: 0.98436
[270]	valid_0's auc: 0.984398
[280]	valid_0's auc: 0.984432
[290]	valid_0's auc: 0.98445
[300]	valid_0's auc: 0.984485
[310]	valid_0's auc: 0.984525
[320]	valid_0's auc: 0

[180]	valid_0's auc: 0.983477
[190]	valid_0's auc: 0.983621
[200]	valid_0's auc: 0.983757
[210]	valid_0's auc: 0.983886
[220]	valid_0's auc: 0.983979
[230]	valid_0's auc: 0.98405
[240]	valid_0's auc: 0.984111
[250]	valid_0's auc: 0.984173
[260]	valid_0's auc: 0.984224
[270]	valid_0's auc: 0.984265
[280]	valid_0's auc: 0.984304
[290]	valid_0's auc: 0.984336
[300]	valid_0's auc: 0.984388
[310]	valid_0's auc: 0.984416
[320]	valid_0's auc: 0.984432
[330]	valid_0's auc: 0.984457
[340]	valid_0's auc: 0.984464
[350]	valid_0's auc: 0.984489
[360]	valid_0's auc: 0.984508
[370]	valid_0's auc: 0.984526
[380]	valid_0's auc: 0.984542
[390]	valid_0's auc: 0.984561
[400]	valid_0's auc: 0.984578
[410]	valid_0's auc: 0.984589
[420]	valid_0's auc: 0.984597
[430]	valid_0's auc: 0.984611
[440]	valid_0's auc: 0.984619
[450]	valid_0's auc: 0.984627
[460]	valid_0's auc: 0.984632
[470]	valid_0's auc: 0.984635
[480]	valid_0's auc: 0.984649
[490]	valid_0's auc: 0.984658
[500]	valid_0's auc: 0.984662
[510]	valid

[80]	valid_0's auc: 0.984311
[90]	valid_0's auc: 0.98439
[100]	valid_0's auc: 0.984453
[110]	valid_0's auc: 0.984513
[120]	valid_0's auc: 0.984552
[130]	valid_0's auc: 0.984598
[140]	valid_0's auc: 0.9846
[150]	valid_0's auc: 0.984588
[160]	valid_0's auc: 0.984639
[170]	valid_0's auc: 0.984638
[180]	valid_0's auc: 0.984656
[190]	valid_0's auc: 0.984625
[200]	valid_0's auc: 0.984628
[210]	valid_0's auc: 0.984597
[220]	valid_0's auc: 0.984587
[230]	valid_0's auc: 0.984564
[240]	valid_0's auc: 0.984541
Early stopping, best iteration is:
[176]	valid_0's auc: 0.98466
10
28
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 2000, 'learning_rate': 0.029999999999999999, 'num_leaves': 31, 'num_threads': 4, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 1999, 'feature_fraction': 0.80000000000000004, 'feature_fraction_seed': 1, 'early_stopping_round': 70, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 400.0, 'metric': ['auc']}
start 

In [32]:
# importance_type (string, optional (default="split"))
# - How the importance is calculated.
# If "split", result contains numbers of times the feature is used in a model.
# If "gain", result contains total gains of splits which use the feature.
gain_importance = model.feature_importance(importance_type='gain')

# Validate the length before building the Series: pd.Series raises its own,
# less specific error on an index/value length mismatch, which would make this
# check unreachable if it came afterwards.
if len(gain_importance) != len(feature_cols):
    raise ValueError('Feature importance has length: {}, \n while feature number is {}'.
                     format(len(gain_importance), len(feature_cols)))

# Label each gain value with its feature name and rank from most to least important.
importance = pd.Series(gain_importance, index=feature_cols).sort_values(ascending=False)

importance

matrixFact_user_ip_item_appdeviceos       3.446741e+08
matrixFact_user_iposdeviceapp_item_app    8.417493e+07
channel                                   1.668758e+07
device_os                                 1.108526e+07
ip_app_device_os_time2nextclick           8.144030e+06
app                                       6.917718e+06
ip_app_device_os_lasttimediff             6.707789e+06
ip_day_hour_count                         4.397688e+06
ip_app_device_os_countfromfuture          3.394890e+06
os                                        2.623705e+06
ip_device_os_count                        2.545699e+06
ip_app_os_day_hour_count                  2.149142e+06
hour                                      1.961198e+06
ip_app_device_time2nextclick              1.530539e+06
ip_device_os_lasttimediff                 1.250323e+06
ip_app_device_mean                        8.831188e+05
ip_app_device_regression                  8.715329e+05
ip_app_device_lasttimediff                8.461638e+05
ip_device_