In [10]:
import gc
import itertools
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [11]:
# Load the full feature table and decide which columns are model inputs.
df_all = pd.read_feather('/home/kai/talkingdata/data/ALL_features_supplementV3_feature42.ftr')
target = 'is_attributed'
# Columns present in the file that are deliberately kept out of the model.
extra = ['ip_app_device_os_channel_regression']
# Filter the frame's own column list instead of going through set arithmetic:
# set iteration order for strings changes between interpreter sessions (hash
# randomisation), which would shuffle the feature order from run to run and
# make seeded feature sampling in LightGBM non-reproducible.
excluded_cols = set([target]) | set(extra)
feature_cols = [col for col in df_all.columns if col not in excluded_cols]
# Columns handed to LightGBM as categorical features.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']

In [12]:
# Build, for each day, the list of row positions used for training/validation.
# df_hour is looked up by day label (its index comes from the CSV's
# 'Unnamed: 0' column) and by boundary column name; the values are used as
# half-open positional ranges [start, end).
# NOTE(review): the column names suggest hour boundaries (4-6, 9-11, 13-15) --
# confirm against how hourdistri.csv was generated.
path ='/home/kai/talkingdata/data/'
df_hour = pd.read_csv(path+'hourdistri.csv', index_col='Unnamed: 0')

# (start column, end column) pairs, concatenated in this order for each day.
hour_windows = [('4start', '6end0sec'), ('9start', '11end0sec'), ('13start', '15end0sec')]

index = {}
for day in ['day7', 'day8','day9']:
    day_rows = []
    for start_col, end_col in hour_windows:
        day_rows.extend(range(df_hour.loc[day, start_col], df_hour.loc[day, end_col]))
    index[day] = day_rows

In [13]:

# Positional split: day7 + day8 rows are used for training, day9 rows for validation.
train_rows = index['day7'] + index['day8']
val_rows = index['day9']
trainset = df_all.iloc[train_rows]
valset = df_all.iloc[val_rows]

In [15]:

# Hyper-parameter grid for the search.
# Each entry of combine1 is a (min_data_in_leaf, scale_pos_weight) pair; the
# remaining lists are crossed with it to form the full grid.
combine1 = [(390,400), (100, 99.7), (400, 99.7), (100, 400), (999, 400)]
learning_rate = [0.1]
feature_fraction = [0.7, 0.6, 0.5, 0.4]
bagging_fraction = [0.7, 0.6, 0.8, 0.9, 1]
num_leaves = [31, 61,11]
lambda_l2 = [1,3,5,7,9]

# itertools.product varies the right-most argument fastest, so the rows come
# out in the same order as nesting the loops num_leaves -> combine1 ->
# learning_rate -> feature_fraction -> bagging_fraction -> lambda_l2.
# 'ROC' and 'num_rounds' start at 0; ROC == 0 marks a combination as not yet
# evaluated.
list_of_parameter = [
    {'min_data_in_leaf': com[0],
     'scale_pos_weight': com[1],
     'learning_rate': lr,
     'feature_fraction': ff,
     'bagging_fraction': bf,
     'ROC': 0,
     'num_rounds': 0,
     'max_depth': -1,
     'lambda_l2': l2,
     'num_leaves': nleave}
    for nleave, com, lr, ff, bf, l2 in itertools.product(
        num_leaves, combine1, learning_rate, feature_fraction, bagging_fraction, lambda_l2)
]

grid_csv = '/home/kai/talkingdata/data/girdsearchparams-result.csv'
if os.path.exists(grid_csv):
    # The search loop records its results in this same file. Re-creating it
    # here would reset every ROC to 0 and throw away finished runs, so keep
    # the existing file and load it instead. Delete the file by hand to start
    # a fresh search.
    print('keeping existing grid file: ' + grid_csv)
    df_grid = pd.read_csv(grid_csv)
else:
    df_grid = pd.DataFrame(list_of_parameter)
    df_grid.to_csv(grid_csv, index=False)

# LightGBM

In [19]:
# Baseline LightGBM parameters; the grid search overrides a subset of these
# per run.
params_raw = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=4000,
    learning_rate=0.1,
    num_leaves=61,
    # Best speed: set to the number of real CPU cores, which is vCPU/2.
    num_threads=62,
    device='cpu',
    # -1 means no depth limit; a limit is a way to fight over-fitting when
    # the data set is small.
    max_depth=-1,
    # Minimal number of rows in one leaf; can be used against over-fitting.
    min_data_in_leaf=390,
    # E.g. 0.8 selects 80% of the features before training each tree
    # (faster training / less over-fitting).
    feature_fraction=0.8,
    feature_fraction_seed=1,
    early_stopping_round=50,
    # Randomly select part of the data without resampling.
    bagging_fraction=0.7,
    # Bagging frequency: 0 disables bagging, k performs bagging every k
    # iterations. bagging_fraction must be set as well to enable bagging.
    bagging_freq=1,
    bagging_seed=1,
    verbose=0,
    scale_pos_weight=400,
    metric=['auc'],
)
target = 'is_attributed'

In [20]:
def get_parameters(df ,param):
    """Pick one not-yet-evaluated grid row at random and merge it into ``param``.

    A row counts as not yet evaluated when its ``ROC`` column equals 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Grid table; must contain ``ROC``, ``num_rounds``, ``max_depth``,
        ``min_data_in_leaf`` and ``num_leaves`` columns.
    param : dict
        Base parameters. Not modified; a copy is updated and returned.

    Returns
    -------
    tuple
        ``(params, num)`` where ``params`` is the merged parameter dict and
        ``num`` is the index label of the chosen row, or ``(None, None)`` when
        no row with ``ROC == 0`` is left.

    Notes
    -----
    The row is chosen by shuffling the candidate labels with NumPy's global
    random state, so the pick depends on that state.
    """
    merged = param.copy()
    pending = df[df.ROC == 0]
    if len(pending) == 0:
        return (None, None)

    labels = pending.index.values.copy()
    np.random.shuffle(labels)
    num = labels[0]

    chosen = pending.loc[num].to_dict()
    # Bookkeeping columns are not model parameters.
    for bookkeeping_key in ('ROC', 'num_rounds'):
        chosen.pop(bookkeeping_key)
    # Taking a single row out of a mixed int/float frame yields floats; these
    # three must be handed on as plain ints.
    for int_key in ('max_depth', 'min_data_in_leaf', 'num_leaves'):
        chosen[int_key] = int(chosen[int_key])

    merged.update(chosen)
    return (merged, num)
    

In [21]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM model on ``x_train`` with ``x_val`` as validation set.

    Parameters
    ----------
    x_train, x_val : pandas.DataFrame
        Frames containing both the feature columns and the ``target`` column.
    feature_cols : list
        Columns of the frames used as model inputs.
    categorical_feature : list
        Passed to ``lgb.Dataset`` as ``categorical_feature`` for both sets.
    params : dict
        LightGBM parameters. Not modified; a copy is passed to ``lgb.train``.
    best_round : int, optional
        When given, ``num_rounds`` is set to this value and the
        ``early_stopping_round`` entry is removed, so training runs for
        exactly that many rounds.
    target : str
        Name of the label column.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values

    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        # pop() with a default instead of `del`: parameter dicts that never
        # enabled early stopping must not raise KeyError here.
        param.pop('early_stopping_round', None)
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

In [None]:
# Grid-search driver: repeatedly pick a random grid row whose ROC is still 0,
# train with it, and record the validation AUC and best round in the CSV.
# NOTE(review): `index` below rebinds the name an earlier cell uses for the
# per-day row-position dict; re-running the train/val split cell after this
# one will fail. The name is kept because a later scratch cell still reads it.
grid_csv = '/home/kai/talkingdata/data/girdsearchparams-result.csv'
df_grid = pd.read_csv(grid_csv)
params, index = get_parameters(df_grid, params_raw)
counter = 0
while index is not None:
    counter += 1
    print('=================================================')
    print(counter)
    print(index)
    print(params)

    model = train_lightgbm(trainset, valset, feature_cols, categorical_col, params)
    best_round = model.best_iteration

    # Score with the best iteration explicitly, so the stored ROC matches the
    # stored round count on LightGBM versions whose predict() otherwise uses
    # every tree, including those grown after the best round.
    val_pred = model.predict(valset[feature_cols], num_iteration=best_round)
    ROC = roc_auc_score(valset[target].values, val_pred)

    # Reload the table right before recording the result: the copy read before
    # training may be stale by now, and writing it back whole would overwrite
    # anything stored in the file in the meantime (presumably by another
    # notebook/process sharing this CSV -- confirm).
    df_grid = pd.read_csv(grid_csv)
    df_grid.loc[index, 'ROC'] = ROC
    df_grid.loc[index, 'num_rounds'] = best_round
    df_grid.to_csv(grid_csv, index=False)

    params, index = get_parameters(df_grid, params_raw)

    gc.collect()

print('done!')

1
388
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 4000, 'learning_rate': 0.10000000000000001, 'num_leaves': 31, 'num_threads': 62, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 100, 'feature_fraction': 0.40000000000000002, 'feature_fraction_seed': 1, 'early_stopping_round': 50, 'bagging_fraction': 0.80000000000000004, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 400.0, 'metric': ['auc'], 'lambda_l2': 7.0}
start training




Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.972605
[20]	valid_0's auc: 0.976008
[30]	valid_0's auc: 0.978318
[40]	valid_0's auc: 0.979554
[50]	valid_0's auc: 0.980303
[60]	valid_0's auc: 0.981164
[70]	valid_0's auc: 0.98164
[80]	valid_0's auc: 0.981888
[90]	valid_0's auc: 0.981975
[100]	valid_0's auc: 0.982028
[110]	valid_0's auc: 0.982066
[120]	valid_0's auc: 0.982103
[130]	valid_0's auc: 0.982152
[140]	valid_0's auc: 0.982197
[150]	valid_0's auc: 0.982254
[180]	valid_0's auc: 0.982297
[190]	valid_0's auc: 0.982296
[200]	valid_0's auc: 0.982298
[210]	valid_0's auc: 0.982287
[220]	valid_0's auc: 0.982294
[230]	valid_0's auc: 0.982303
[240]	valid_0's auc: 0.982302
[280]	valid_0's auc: 0.982322
[290]	valid_0's auc: 0.982315
[300]	valid_0's auc: 0.98232
Early stopping, best iteration is:
[258]	valid_0's auc: 0.982336
2
2151
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 4000, 'learning_rate': 0.10000000000000001, 'num_leaves': 61, 'num_th

[80]	valid_0's auc: 0.981725
[90]	valid_0's auc: 0.981724
[100]	valid_0's auc: 0.98177
[110]	valid_0's auc: 0.981809
[120]	valid_0's auc: 0.981815
[130]	valid_0's auc: 0.981795
[140]	valid_0's auc: 0.981843
[150]	valid_0's auc: 0.981821
[160]	valid_0's auc: 0.98179
[170]	valid_0's auc: 0.981767
[180]	valid_0's auc: 0.981749
[190]	valid_0's auc: 0.981739
Early stopping, best iteration is:
[147]	valid_0's auc: 0.981854
7
1345
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 4000, 'learning_rate': 0.10000000000000001, 'num_leaves': 11, 'num_threads': 62, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 100, 'feature_fraction': 0.59999999999999998, 'feature_fraction_seed': 1, 'early_stopping_round': 50, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 400.0, 'metric': ['auc'], 'lambda_l2': 1.0}
start training
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.967438
[20]	valid_0's auc: 0.971773
[

In [51]:
# Scores the current `model` on the validation set and stores the result in
# df_grid, then writes the table to girdsearch-result.csv.
# NOTE(review): this looks like a leftover from an earlier version of the
# search -- `cur_feature` is not defined in any visible cell, the columns
# written here ('roc', 'best_rount') differ from the 'ROC'/'num_rounds'
# columns used by the search loop, and the output file is a different CSV.
# Confirm whether this cell is still needed.
ROC = roc_auc_score(valset[target].values, model.predict(valset[cur_feature]))
df_grid.loc[index, 'roc'] = ROC
df_grid.loc[index, 'best_rount'] = best_round
df_grid.to_csv('/home/kai/talkingdata/data/girdsearch-result.csv', index=False)

gc.collect()
print(ROC)

0.981877586913


In [26]:
# NOTE(review): leftover debugging cell -- `aa` is not defined in any visible
# cell, so this fails on a fresh kernel. Presumably safe to delete; confirm.
aa

[2, 3, 4]