In [1]:
import gc
import json

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
# Load the full feature table and derive the column lists used for training.
df_all = pd.read_feather('/home/kai/talkingdata/data/ALL_features_supplementV3_feature42.ftr')
target = 'is_attributed'
# Columns present in the feature file that are deliberately kept out of the model.
extra = ['ip_app_device_os_channel_regression']
# Filter in file order instead of going through set arithmetic: iterating a set
# of strings follows the per-process hash seed, so the previous
# list(set(...) - set(...)) could hand LightGBM a different column order after
# every kernel restart (which changes feature_fraction sampling and the saved
# feature list).
excluded_cols = set([target] + extra)
feature_cols = [col for col in df_all.columns if col not in excluded_cols]
categorical_col = ['app', 'device', 'os', 'channel', 'hour']

# LightGBM

In [3]:
# Baseline LightGBM settings. get_parameters / get_best_parm copy this dict and
# overlay the values of one grid-search row on top of it.
params_raw = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=2400,
    learning_rate=0.1,
    num_leaves=11,
    # Best speed: set to the number of real CPU cores, which is vCPU/2.
    num_threads=62,
    device='cpu',
    # -1 means no limit. Used to deal with over-fitting when #data is small.
    max_depth=-1,
    # Minimal number of data in one leaf. Can be used to deal with over-fitting.
    min_data_in_leaf=390,
    # E.g. 0.8 selects 80% of the features before training each tree;
    # speeds up training / deals with over-fitting.
    feature_fraction=0.7,
    feature_fraction_seed=1,
    early_stopping_round=60,
    # Randomly select part of the data without resampling.
    bagging_fraction=0.9,
    # Bagging frequency: 0 disables bagging, k performs bagging at every k-th
    # iteration. bagging_fraction must be set as well to enable bagging.
    bagging_freq=1,
    bagging_seed=1,
    verbose=0,
    scale_pos_weight=400,
    metric=['auc'],
    lambda_l2=1,
)

# Same values as declared in the data-loading cell; repeated here so this cell
# can be run on its own.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']
target = 'is_attributed'

In [9]:
def get_parameters(df ,param):
    """Draw one random grid row whose ROC is still 0 and merge it into ``param``.

    Args:
        df: grid-search table with at least the columns ``ROC``, ``num_rounds``,
            ``max_depth``, ``min_data_in_leaf`` and ``num_leaves``.
            Rows with ``ROC == 0`` are the candidates (presumably the rows that
            have not been evaluated yet -- confirm against the grid writer).
        param: base parameter dict; it is copied, never modified.

    Returns:
        ``(params, num)`` where ``params`` is the merged parameter dict and
        ``num`` is the index label of the chosen row, or ``(None, None)`` when
        no row has ``ROC == 0``.
    """
    merged = param.copy()
    candidates = df.copy()
    candidates = candidates[candidates.ROC == 0]
    if len(candidates) == 0:
        return (None, None)

    # Shuffle a copy of the index labels and take the first one as the pick.
    labels = candidates.index.values.copy()
    np.random.shuffle(labels)
    num = labels[0]

    row = candidates.loc[num].to_dict()
    # Bookkeeping columns are not LightGBM parameters.
    for column in ('ROC', 'num_rounds'):
        row.pop(column)
    # A row mixing ints and floats comes back as floats; these must be ints.
    for column in ('max_depth', 'min_data_in_leaf', 'num_leaves'):
        row[column] = int(row[column])

    merged.update(row)
    return (merged, num)

def get_best_parm(df,param, rank=1, extra_rounds=500):
    """Return the parameters of the ``rank``-th best grid row, merged into ``param``.

    Args:
        df: grid-search table with at least the columns ``ROC``, ``num_rounds``,
            ``max_depth``, ``min_data_in_leaf`` and ``num_leaves``.
        param: base parameter dict; it is copied, never modified.
        rank: 1-based position in the table sorted by ``ROC`` descending
            (1 = highest ROC).
        extra_rounds: number of rounds added on top of the row's ``num_rounds``
            when filling ``params['num_rounds']``. Defaults to 500, the value
            that used to be hard-coded.

    Returns:
        ``(params, best_round, roc)``: the merged parameter dict (with
        ``num_rounds = best_round + extra_rounds``), the row's own
        ``num_rounds`` as an int, and the row's ``ROC``.

    Raises:
        ValueError: if ``rank`` resolves to a position before the first row.
            Previously such a rank became a negative ``iloc`` position and
            silently returned one of the *worst* rows.
        IndexError: if ``rank`` is larger than the number of rows.
    """
    position = int(rank - 1)
    if position < 0:
        raise ValueError('rank must be >= 1, got {}'.format(rank))

    params = param.copy()
    # sort_values already returns a new frame, so no defensive copy is needed.
    param_get = df.sort_values('ROC', ascending=False).iloc[position].to_dict()
    roc = param_get.pop('ROC')
    print('ROC is : {}'.format(roc))

    best_round = int(param_get['num_rounds'])
    print('Best round: {}'.format(best_round))

    # A row mixing ints and floats comes back as floats; these must be ints.
    for key in ('max_depth', 'min_data_in_leaf', 'num_leaves'):
        param_get[key] = int(param_get[key])
    param_get['num_rounds'] = int(best_round + extra_rounds)
    params.update(param_get)
    return(params, best_round, roc)



In [10]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM booster on ``x_train`` and report metrics on ``x_val``.

    Args:
        x_train: frame holding the training rows; must contain ``feature_cols``
            and ``target``.
        x_val: frame holding the validation rows, same columns as ``x_train``.
        feature_cols: names of the columns fed to the model.
        categorical_feature: names of the columns LightGBM treats as categorical.
        params: LightGBM parameter dict; it is copied, never modified.
        best_round: when given, train for exactly this many rounds and disable
            early stopping; when ``None`` the rounds / early-stopping settings
            in ``params`` are used unchanged.
        target: name of the label column.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values

    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        # Drop early stopping under every documented spelling. The previous
        # `del param['early_stopping_round']` raised KeyError when the key was
        # missing, and an alias spelling would have kept early stopping active
        # during a fixed-round refit.
        for key in ('early_stopping_round', 'early_stopping_rounds',
                    'early_stopping', 'n_iter_no_change'):
            param.pop(key, None)
    print('start training')
    # verbose_eval=10 prints the validation metric every 10 iterations.
    # NOTE(review): newer LightGBM releases replace `verbose_eval` with the
    # `lgb.log_evaluation` callback -- confirm against the installed version.
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

In [None]:

# Refit the best grid-search configuration on the full data set, then persist
# the model together with the exact feature list it was trained on.
model_path = '/home/kai/talkingdata/data/'
prefix = 'lightgbm_bestparam_bestround_{}_roc_{}'
result_param = pd.read_csv(model_path + 'girdsearchparams-result.csv')

params, best_round, roc = get_best_parm(result_param, params_raw, rank=1)
print(params)

# params['num_rounds'] already holds best_round plus the safety margin added by
# get_best_parm, so it is reused here instead of repeating the "+500".
# NOTE: the validation slice is the tail of the training frame itself, so the
# AUC printed during training is optimistic and only useful as a progress log.
model = train_lightgbm(df_all, df_all.iloc[-1000000:], feature_cols, categorical_col, params, best_round=params['num_rounds'])
model.save_model(model_path + prefix.format(best_round, roc))

# Store the feature order next to the model so predictions can rebuild the
# same input matrix.
feature_file = (model_path + 'lightgbm-featurecolsV3_col38_lightgbm_bestparam_bestround_{}_roc_{}.json').format(best_round, roc)

with open(feature_file, 'w') as outfile:
    json.dump(feature_cols, outfile)
    
    
    

ROC is : 0.9823818964626924
Best round: 498
{'objective': 'binary', 'boosting': 'gbdt', 'num_rounds': 998, 'learning_rate': 0.10000000000000001, 'num_leaves': 11, 'num_threads': 62, 'device': 'cpu', 'max_depth': -1, 'min_data_in_leaf': 100, 'feature_fraction': 0.59999999999999998, 'feature_fraction_seed': 1, 'early_stopping_round': 60, 'bagging_fraction': 1.0, 'bagging_freq': 1, 'bagging_seed': 1, 'verbose': 0, 'scale_pos_weight': 400.0, 'metric': ['auc'], 'lambda_l2': 1.0}
start training




[10]	valid_0's auc: 0.982601
[20]	valid_0's auc: 0.985223
[30]	valid_0's auc: 0.987735
[40]	valid_0's auc: 0.989841
[50]	valid_0's auc: 0.990575
[60]	valid_0's auc: 0.991248
[70]	valid_0's auc: 0.991757
[80]	valid_0's auc: 0.992165
[90]	valid_0's auc: 0.992411
[100]	valid_0's auc: 0.992575
[110]	valid_0's auc: 0.9928
[120]	valid_0's auc: 0.992971
[130]	valid_0's auc: 0.993197
[140]	valid_0's auc: 0.993321
[150]	valid_0's auc: 0.993396
[160]	valid_0's auc: 0.993509
[170]	valid_0's auc: 0.993572
[180]	valid_0's auc: 0.993627
[190]	valid_0's auc: 0.993804
[200]	valid_0's auc: 0.994294
[210]	valid_0's auc: 0.994378
[220]	valid_0's auc: 0.994391
[230]	valid_0's auc: 0.994524
[240]	valid_0's auc: 0.994509
[250]	valid_0's auc: 0.994682
[260]	valid_0's auc: 0.994699
[270]	valid_0's auc: 0.994728
[280]	valid_0's auc: 0.994859
[290]	valid_0's auc: 0.994899
[300]	valid_0's auc: 0.99528
[310]	valid_0's auc: 0.99537
[320]	valid_0's auc: 0.995378
[330]	valid_0's auc: 0.995397
[340]	valid_0's auc: 0.

In [51]:
# Score the current model on `valset` and store the result in the grid table.
# NOTE(review): `valset`, `cur_feature`, `df_grid` and `index` are not defined
# in any cell visible in this notebook -- this cell relies on earlier kernel
# state and will not survive Restart & Run All as-is.
val_labels = valset[target].values
val_scores = model.predict(valset[cur_feature])
ROC = roc_auc_score(val_labels, val_scores)

df_grid.loc[index, 'roc'] = ROC
df_grid.loc[index, 'best_rount'] = best_round

# Persist after every evaluation so finished rows survive a kernel crash.
df_grid.to_csv('/home/kai/talkingdata/data/girdsearch-result.csv', index=False)

gc.collect()
print(ROC)

0.981877586913


In [85]:
# Show the 'drpcol' value stored for grid row 38.
df_grid.loc[38,'drpcol']

'allfeaturesneeded'