In [49]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import gc
from sklearn.metrics import roc_auc_score

In [9]:
# Load the precomputed feature table. Every column except the label and the
# explicitly excluded columns is used as a model feature.
df_all = pd.read_feather('/home/kai/talkingdata/data/ALL_features_supplementV3_feature42.ftr')
target = 'is_attributed'
extra = ['ip_app_device_os_channel_regression']
# Filter the frame's columns in their stored order instead of going through a
# set difference: iteration order of a set of strings is not stable between
# interpreter sessions, so the feature order (and the row order of the search
# table derived from it) used to change from run to run.
_excluded = {target, *extra}
feature_cols = [col for col in df_all.columns if col not in _excluded]
# Subset of the features that are passed to LightGBM as categorical.
categorical_col = ['app', 'device', 'os', 'channel', 'hour']

In [43]:
# Build, for each day, the list of row positions that fall inside the three
# windows described by the columns of hourdistri.csv.
path = '/home/kai/talkingdata/data/'
df_hour = pd.read_csv(path + 'hourdistri.csv', index_col='Unnamed: 0')

# (start column, end column) pairs; the end position is exclusive because it
# is used as the stop value of range().
hour_windows = [
    ('4start', '6end0sec'),
    ('9start', '11end0sec'),
    ('13start', '15end0sec'),
]

index = {}
for day in ['day7', 'day8', 'day9']:
    day_rows = []
    for start_col, end_col in hour_windows:
        day_rows.extend(range(df_hour.loc[day, start_col], df_hour.loc[day, end_col]))
    index[day] = day_rows

In [44]:

# Positional split: day7 + day8 rows for training, day9 rows for validation.
train_positions = index['day7'] + index['day8']
val_positions = index['day9']
trainset = df_all.iloc[train_positions]
valset = df_all.iloc[val_positions]

In [11]:
# One row per candidate feature to drop, plus a final sentinel row
# ('allfeaturesneeded') for the run that keeps every feature.
# 'roc' and 'best_rount' start at 0; a zero 'roc' marks a row as not yet run.
feature_seach = feature_cols + ['allfeaturesneeded']
n_candidates = len(feature_seach)
df_search = pd.DataFrame({
    'drpcol': feature_seach,
    'roc': np.zeros(n_candidates),
    'best_rount': np.zeros(n_candidates),
})

In [14]:
# Persist the empty search table.
# NOTE: this overwrites the file, so re-running this cell discards any
# results already written to it.
search_table_path = '/home/kai/talkingdata/data/girdsearch-result.csv'
df_search.to_csv(search_table_path, index=False)

# LightGBM

In [45]:
# Base LightGBM parameters shared by every run of the feature search.
params_raw = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'num_rounds': 4000,
    'learning_rate': 0.1,
    'num_leaves': 61,
    # Best speed: set to the number of real CPU cores, which is vCPU / 2.
    'num_threads': 62,
    'device': 'cpu',
    # -1 means no depth limit; a limit is mainly useful against over-fitting
    # when the data set is small.
    'max_depth': -1,
    # Minimal number of rows in one leaf; can be used against over-fitting.
    'min_data_in_leaf': 390,
    # Fraction of features selected before training each tree (0.8 -> 80%);
    # speeds up training and helps against over-fitting.
    'feature_fraction': 0.8,
    'feature_fraction_seed': 1,
    'early_stopping_round': 50,
    # Randomly select this fraction of the rows, without resampling.
    'bagging_fraction': 0.7,
    # Bagging frequency: 0 disables bagging, k performs bagging every k
    # iterations. bagging_fraction must be set as well for it to take effect.
    'bagging_freq': 1,
    'bagging_seed': 1,
    'verbose': 0,
    'scale_pos_weight': 400,
    'metric': ['auc'],
}
# Name of the label column.
target = 'is_attributed'

In [87]:
def get_parameters(df):
    """Pick one not-yet-evaluated row of the search table at random.

    A row counts as pending while its ``roc`` value is still 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Search table with at least the columns ``roc`` and ``drpcol``.
        It is not modified.

    Returns
    -------
    tuple
        ``(drpcol value, index label)`` of the chosen pending row, or
        ``(None, None)`` when no row has ``roc == 0``.
    """
    pending = df[df.roc == 0]
    if len(pending) == 0:
        return (None, None)

    # Shuffle a private copy of the pending labels and take the first one.
    candidate_labels = pending.index.values.copy()
    np.random.shuffle(candidate_labels)
    chosen = candidate_labels[0]
    return (pending.loc[chosen, 'drpcol'], chosen)
    
def feature_catg(feature_cols,categorical_col,removal):
    """Return copies of the feature lists with one feature dropped.

    Parameters
    ----------
    feature_cols : list
        All feature names.
    categorical_col : list
        Names of the categorical features.
    removal : str
        Feature to drop, or the sentinel ``'allfeaturesneeded'`` to drop
        nothing.

    Returns
    -------
    tuple of (list, list)
        New feature list and new categorical list; the inputs are left
        untouched.

    Raises
    ------
    ValueError
        If ``removal`` is neither the sentinel nor present in
        ``feature_cols``.
    """
    kept_features = list(feature_cols)
    kept_categorical = list(categorical_col)

    if removal == 'allfeaturesneeded':
        return kept_features, kept_categorical

    if removal not in kept_features:
        raise ValueError('{} is not in the feature list'.format(removal))

    kept_features.remove(removal)
    print('{} is removed!'.format(removal))
    if removal in kept_categorical:
        kept_categorical.remove(removal)
        print('{} is removed from category!'.format(removal))
    return kept_features, kept_categorical

In [47]:
def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM model on ``x_train`` with ``x_val`` as validation set.

    Parameters
    ----------
    x_train, x_val : pandas.DataFrame
        Frames containing the feature columns and the ``target`` column.
    feature_cols : list
        Columns used as model inputs.
    categorical_feature : list
        Subset of ``feature_cols`` handed to LightGBM as categorical.
    params : dict
        LightGBM parameters. A shallow copy is taken, so the caller's dict
        is not modified.
    best_round : int, optional
        When given, ``num_rounds`` is set to this value and every
        early-stopping setting is removed from the parameters.
    target : str
        Name of the label column.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values

    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        # pop() instead of del: the key may be absent, or spelled as one of
        # the aliases LightGBM accepts for the same setting. A bare del would
        # raise KeyError in the first case and leave early stopping active in
        # the second.
        for stop_key in ('early_stopping_round', 'early_stopping_rounds',
                         'early_stopping', 'n_iter_no_change'):
            param.pop(stop_key, None)
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

In [88]:
# Resumable drop-one-feature search: each pass picks a pending row of the
# result table, trains without that feature, records the validation AUC and
# best iteration, and writes the table back to disk before continuing.
grid_result_path = '/home/kai/talkingdata/data/girdsearch-result.csv'
df_grid = pd.read_csv(grid_result_path)
counter = 0
while True:
    removal_col, index = get_parameters(df_grid)
    if index is None:
        # No row with roc == 0 is left.
        break

    counter += 1
    print('=================================================')
    print(counter)
    print(index)

    cur_feature, cur_cat = feature_catg(feature_cols, categorical_col, removal_col)

    model = train_lightgbm(trainset, valset, cur_feature, cur_cat, params_raw)
    best_round = model.best_iteration
    val_pred = model.predict(valset[cur_feature])
    ROC = roc_auc_score(valset[target].values, val_pred)

    df_grid.loc[index, 'roc'] = ROC
    df_grid.loc[index, 'best_rount'] = best_round
    # Saved after every run so an interrupted search can pick up from here.
    df_grid.to_csv(grid_result_path, index=False)

    gc.collect()

print('done!')

1
7
ip_device_os_countfromfuture is removed!
start training




Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.976722
[20]	valid_0's auc: 0.978563
[30]	valid_0's auc: 0.979736
[40]	valid_0's auc: 0.980656
[50]	valid_0's auc: 0.981147
[60]	valid_0's auc: 0.981509
[70]	valid_0's auc: 0.981728
[80]	valid_0's auc: 0.98185
[90]	valid_0's auc: 0.981923
[100]	valid_0's auc: 0.981978
[110]	valid_0's auc: 0.981962
[120]	valid_0's auc: 0.98192
[130]	valid_0's auc: 0.981916
[140]	valid_0's auc: 0.981886
[150]	valid_0's auc: 0.981855
Early stopping, best iteration is:
[102]	valid_0's auc: 0.981986
2
8
ip_app_day_hour_count is removed!
start training
Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.976712
[20]	valid_0's auc: 0.978521
[30]	valid_0's auc: 0.979661
[40]	valid_0's auc: 0.980637
[50]	valid_0's auc: 0.981212
[60]	valid_0's auc: 0.98146
[70]	valid_0's auc: 0.981743
[80]	valid_0's auc: 0.981777
[90]	valid_0's auc: 0.98186
[100]	valid_0's auc: 0.981913
[110]	valid_0's auc: 0.981906


In [51]:
# Manual repeat of the per-run bookkeeping. It relies on `model`,
# `cur_feature`, `index` and `best_round` still being defined in the kernel.
val_pred = model.predict(valset[cur_feature])
ROC = roc_auc_score(valset[target].values, val_pred)
for column, value in (('roc', ROC), ('best_rount', best_round)):
    df_grid.loc[index, column] = value
df_grid.to_csv('/home/kai/talkingdata/data/girdsearch-result.csv', index=False)

gc.collect()
print(ROC)

0.981877586913


In [85]:
# Show the 'drpcol' value stored under row label 38 of the search table.
df_grid.loc[38,'drpcol']

'allfeaturesneeded'