In [1]:
import pandas as pd
import lightgbm as lgb
import gc
import numpy as np

In [2]:
# ---- stacking layout -------------------------------------------------------
# `entry` is the upper bound of the `entry_pressed` index iterated by the
# training loop; the folds are numbered 0..K-1.
entry = 5
folds = list(range(3))
K = len(folds)
# For every fold, the list of the remaining folds: [[1, 2], [0, 2], [0, 1]].
folds_comp = [[other for other in folds if other != held_out] for held_out in folds]

# ---- data locations --------------------------------------------------------
base_path = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/'
base_path_test = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/test/'

# ---- feature functions -----------------------------------------------------
# Complete list of feature functions.
func_pool = [
    'count',
    'mean',
    'reversemean',
    'time2nextclick',
    'time2previousclick',
    'countfromfuture',
    'countfrompast',
    'lasttimediff',
]

# NOTE: this assignment overrides the complete list above, so only 'count'
# is processed in the current run.
func_pool = ['count']

# ---- columns ---------------------------------------------------------------
target = 'is_attributed'
categorical_feature = ['app', 'device', 'os', 'channel', 'hour']

# ---- LightGBM parameters ---------------------------------------------------
# Identifier of this parameter set; it ends up in the saved file names.
combine = 0
params = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=2000,
    learning_rate=0.1,
    num_leaves=31,
    # Best speed: number of real CPU cores, i.e. vCPU / 2.
    num_threads=4,
    device='cpu',
    # -1 means no depth limit; a limit helps against over-fitting on small data.
    max_depth=-1,
    # Minimal number of rows in one leaf; can be raised against over-fitting.
    min_data_in_leaf=390,
    # Fraction of features selected before each tree (speed / over-fitting).
    feature_fraction=0.7,
    feature_fraction_seed=1,
    early_stopping_round=100,
    # Fraction of rows randomly selected, without resampling.
    bagging_fraction=0.7,
    # Bagging every k-th iteration; 0 disables it. Needs bagging_fraction set.
    bagging_freq=1,
    bagging_seed=1,
    verbose=0,
    scale_pos_weight=400,
    metric=['auc'],
)

# Independent list with the same column names as `categorical_feature`.
categorical_col = list(categorical_feature)

In [3]:
from sklearn.model_selection import train_test_split
def get_train_val(df, mode='shuffle', ratio=0.2, seed=19):
    """Split ``df`` into a training part and a validation part.

    Parameters
    ----------
    df
        The data to split; it is handed to ``train_test_split`` unchanged.
    mode : str
        Splitting strategy. Only ``'shuffle'`` is implemented.
    ratio : float
        Forwarded to ``train_test_split`` as ``test_size``.
    seed : int
        Forwarded to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(trainset, valset)`` as returned by ``train_test_split``.

    Raises
    ------
    ValueError
        If ``mode`` is not a supported strategy.
    """
    # Fail loudly on an unknown mode instead of reaching `return` with
    # unassigned locals.
    if mode != 'shuffle':
        raise ValueError(
            "unsupported split mode {!r}; only 'shuffle' is implemented".format(mode)
        )
    trainset, valset = train_test_split(df, test_size=ratio, random_state=seed)
    return (trainset, valset)

def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    """Train a LightGBM model on ``x_train`` and evaluate it on ``x_val``.

    Parameters
    ----------
    x_train, x_val
        Frames that contain both the ``feature_cols`` columns and the
        ``target`` column.
    feature_cols : list
        Columns used as model inputs.
    categorical_feature : list
        Passed to ``lgb.Dataset`` as ``categorical_feature``.
    params : dict
        LightGBM parameters. The dict is copied (shallowly) before any change,
        so the caller's dict is not modified.
    best_round : int, optional
        When given, training runs for exactly this many rounds: it is written
        to ``num_rounds`` and early stopping is switched off.
    target : str
        Name of the label column.

    Returns
    -------
    The object returned by ``lgb.train``.
    """
    param = params.copy()
    if best_round is not None:
        param['num_rounds'] = best_round
        # Fixed-length run: remove early stopping under every alias LightGBM
        # accepts. pop() with a default keeps this safe when the key is absent
        # (a plain `del` would raise KeyError).
        for stop_key in ('early_stopping_round', 'early_stopping_rounds',
                         'early_stopping', 'n_iter_no_change'):
            param.pop(stop_key, None)

    y_train = x_train[target].values
    y_val = x_val[target].values

    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)

    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model
    
    

In [None]:
%env JOBLIB_TEMP_FOLDER=/tmp
combine = 0 # for lightgbm params

for func in func_pool:
    print('starting training function : {}'.format(func))
    for entry_pressed in range(entry):
        print('column pressed: {}'.format(entry_pressed))
        for fold in range(K):
            fold_trainset = folds[fold]
            fold_pred_train0, fold_pred_train1 = folds_comp[fold] 
            trainset_name = 'train_fold{}_{}_{}.csv'.format(fold_trainset, func, entry_pressed)
            pred_dict = {'train0': fold_pred_train0, 'train1': fold_pred_train1, 'test':fold_trainset}
            print('=================')
            df_train = pd.read_csv(base_path + trainset_name)
            print('loading df_train done!')
            print(trainset_name)
            trainset, valset = get_train_val(df_train)
            feature_cols = list(set(trainset.columns) - set([target]))
            model = train_lightgbm(trainset, valset, feature_cols, categorical_feature, params)
#             train on all. comment to save time
#             best_round = model.best_iteration
#             model = train_lightgbm(df_train, valset, feature_cols, categorical_feature, params, best_round)
            del df_train
            gc.collect()
            ###log
            all_str = 'fold: {} \t function: {} \t'.format(fold, func)
            print(all_str)
            with open('feature_all.txt', 'w') as text_file:
                text_file.write(all_str + '\n')
            #!!!
            print('start predicting')
            for each_fold in pred_dict:
                if each_fold == 'test':
                    file_name = 'test_fold{}_{}_{}.csv'.format(pred_dict[each_fold], func, entry_pressed)
                    read_path = base_path_test + file_name
                    save_name = 'test_fold{}_{}_{}_param{}.npy'.format(pred_dict[each_fold], func, entry_pressed, combine)
                    save_path = base_path_test + 'result/' + save_name
                else:
                    file_name = 'train_fold{}_{}_{}.csv'.format(pred_dict[each_fold], func, entry_pressed)
                    read_path = base_path + file_name
                    save_name = 'train_fold{}_predonfold{}_{}_{}_param{}.npy'.format(fold_trainset, pred_dict[each_fold], func, entry_pressed, combine)
                    save_path = base_path + 'result/'+ save_name
                print('loading files: \n{}'.format(read_path))
                df_pred = pd.read_csv(read_path)
                print('predicting...')
                preds = model.predict(df_pred[feature_cols])
                del df_pred
                gc.collect()
                print('saving files: \n{}'.format(save_path))
                np.save(save_path, preds)
                ###log

                with open('feature_all.txt', 'w') as text_file:
                    text_file.write('saving ' + save_path + '\n')
                #!!!
            

env: JOBLIB_TEMP_FOLDER=/tmp
starting training function : count
column pressed: 0
loading df_train done!
train_fold0_count_0.csv
start training




Training until validation scores don't improve for 100 rounds.
[10]	valid_0's auc: 0.970854
[20]	valid_0's auc: 0.972561
[30]	valid_0's auc: 0.974158
[40]	valid_0's auc: 0.975203
[50]	valid_0's auc: 0.975944
[60]	valid_0's auc: 0.976471
[70]	valid_0's auc: 0.976803
[80]	valid_0's auc: 0.976973
[90]	valid_0's auc: 0.977052
[100]	valid_0's auc: 0.977144
[110]	valid_0's auc: 0.977251
[120]	valid_0's auc: 0.977261
[130]	valid_0's auc: 0.977272
[140]	valid_0's auc: 0.977317
[150]	valid_0's auc: 0.97737
[160]	valid_0's auc: 0.977366
[170]	valid_0's auc: 0.977382
[180]	valid_0's auc: 0.977385
[190]	valid_0's auc: 0.977401
[200]	valid_0's auc: 0.977424
[210]	valid_0's auc: 0.977436
[220]	valid_0's auc: 0.977417
[230]	valid_0's auc: 0.97742
[240]	valid_0's auc: 0.977423
[250]	valid_0's auc: 0.977423
[260]	valid_0's auc: 0.977433
[270]	valid_0's auc: 0.977456
[280]	valid_0's auc: 0.977433
[290]	valid_0's auc: 0.977432
[300]	valid_0's auc: 0.977406
[310]	valid_0's auc: 0.977402
[320]	valid_0's au

[150]	valid_0's auc: 0.977996
[160]	valid_0's auc: 0.978002
[170]	valid_0's auc: 0.977984
[180]	valid_0's auc: 0.97799
[190]	valid_0's auc: 0.97798
[200]	valid_0's auc: 0.977971
[210]	valid_0's auc: 0.977995
[220]	valid_0's auc: 0.978028
[230]	valid_0's auc: 0.97803
[240]	valid_0's auc: 0.978067
[250]	valid_0's auc: 0.978105
[260]	valid_0's auc: 0.978134
[270]	valid_0's auc: 0.978118
[280]	valid_0's auc: 0.978097
[290]	valid_0's auc: 0.978111
[300]	valid_0's auc: 0.978079
[310]	valid_0's auc: 0.978064
[320]	valid_0's auc: 0.978048
[330]	valid_0's auc: 0.978036
[340]	valid_0's auc: 0.97802
[350]	valid_0's auc: 0.978013
Early stopping, best iteration is:
[257]	valid_0's auc: 0.978141
fold: 1 	 function: count 	
start predicting
loading files: 
/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/train_fold0_count_1.csv
predicting...
saving files: 
/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/result/train_fold1_predonfold0_count_1_param0.npy
loading files: 
/home/kai/data/

In [6]:
# Manual re-save of the most recent prediction vector. Neither `save_path` nor
# `preds` is defined in this cell: both are whatever the prediction loop in the
# stacking cell last assigned, so this only works after that cell has run in
# the same kernel.
np.save(file=save_path, arr=preds)