In [1]:
import pandas as pd
import numpy as np
import sys
import lightgbm as lgb

In [2]:
entry = 5
K = 3
folds = [0, 1, 2]
folds_comp = [[1, 2], [0, 2], [0, 1]]
combine = 0

base_path = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/train/result/layer2/'
base_path_test = '/home/kai/data/kaggle/talkingdata/wl/data/stacking/test/result/layer2/'

func_pool = ['count', 'mean', 'reversemean', 'time2nextclick', 
             'time2previousclick', 'countfromfuture', 'countfrompast', 'lasttimediff']

func_pool = ['count']
target = 'is_attributed'

filebase = 'train_fold{}_predonfold{}_{}_{}_param{}.npy'
filebase_test = 'test_fold{}_{}_{}_param{}.npy'
label_file = '/home/kai/data/kaggle/talkingdata/data/train_label.npy'

In [11]:
params = {
        'objective': 'binary',
        'boosting': 'gbdt',
        'num_rounds': 2000,
        'learning_rate': 0.1,
        'num_leaves': 2,
        'num_threads': 4, # best speed: set to number of real cpu cores, which is vCPU/2
        'device': 'cpu',
        'max_depth': 2, # no limit. This is used to deal with over-fitting when #data is small.
        'min_data_in_leaf': 2000,  #minimal number of data in one leaf. Can be used to deal with over-fitting
        'feature_fraction': 0.7, #For example, if set to 0.8, will select 80% features before training each tree.  speed up training / deal with over-fitting
        'feature_fraction_seed': 1,
        'early_stopping_round':100,
        'bagging_fraction': 0.7, #Randomly select part of data without resampling
        'bagging_freq': 1, #frequency for bagging, 0 means disable bagging. k means will perform bagging at every k iteration. to enable bagging, bagging_fraction should be set as well
        'bagging_seed': 1,
        'verbose': 0,
        'scale_pos_weight': 401,
        'metric' : [ 'auc']
    }

# Loading Files

# loading training df

In [8]:
def encapsulate_train(func_pool, entry, base_path, label_file, target='is_attributed',combine=0):
    df = pd.DataFrame()
    for func in func_pool:
        for ent in range(entry):
            file_name = 'train_layer2_{}_{}_param{}.npy'.format(func, ent, combine)
            print('loading...\n{}'.format(file_name))
            load_path = base_path + file_name
            pred = np.load(load_path)
            feature_name = 'layer2_{}_{}_param{}'.format(func, ent, combine)
            df[feature_name] = pred
            print(feature_name)
    print('adding label...')
    label = np.load(label_file)
    df[target] = label
    print('done')
    return df

def encapsulate_test(func_pool, entry, base_path, target='is_attributed',combine=0):
    df = pd.DataFrame()
    for func in func_pool:
        for ent in range(entry):
            file_name = 'test_layer2_{}_{}_param{}.npy'.format(func, ent, combine)
            print('loading...\n{}'.format(file_name))
            load_path = base_path + file_name
            pred = np.load(load_path)
            feature_name = 'layer2_{}_{}_param{}'.format(func, ent, combine)
            df[feature_name] = pred
            print(feature_name)
    print('done')
    return df

df = encapsulate_train(func_pool, entry, base_path, label_file, target=target,combine=0)
df_test = encapsulate_test(func_pool, entry, base_path_test,  target=target,combine=0)
    
print('Training set length: {} size:{}'.format(len(df), sys.getsizeof(df)/ 1024**3))
print('Testing set length: {} size:{}'.format(len(df_test), sys.getsizeof(df_test)/ 1024**3))

loading...
train_layer2_count_0_param0.npy
layer2_count_0_param0
loading...
train_layer2_count_1_param0.npy
layer2_count_1_param0
loading...
train_layer2_count_2_param0.npy
layer2_count_2_param0
loading...
train_layer2_count_3_param0.npy
layer2_count_3_param0
loading...
train_layer2_count_4_param0.npy
layer2_count_4_param0
adding label...
done
loading...
test_layer2_count_0_param0.npy
layer2_count_0_param0
loading...
test_layer2_count_1_param0.npy
layer2_count_1_param0
loading...
test_layer2_count_2_param0.npy
layer2_count_2_param0
loading...
test_layer2_count_3_param0.npy
layer2_count_3_param0
loading...
test_layer2_count_4_param0.npy
layer2_count_4_param0
done
Training set length: 184903890 size:8.265848107635975
Testing set length: 18790469 size:0.6999996155500412


# Get Training and Validation set

In [9]:
feature_cols = list(set(df.columns) - set([target]))
categorical_feature = None
feature_cols

['layer2_count_1_param0',
 'layer2_count_2_param0',
 'layer2_count_3_param0',
 'layer2_count_0_param0',
 'layer2_count_4_param0']

## Train on all

### 1. shuffle

In [None]:
from sklearn.model_selection import train_test_split
def get_train_val(df, mode='shuffle', ratio=0.1, seed=19):
    if mode == 'shuffle':
        trainset, valset = train_test_split(df,test_size=ratio, random_state=seed)
    return (trainset, valset)


def train_lightgbm(x_train, x_val, feature_cols, categorical_feature, params, best_round = None, target='is_attributed'):
    param = params.copy()
    y_train = x_train[target].values
    y_val = x_val[target].values
    
    lgb_train = lgb.Dataset(x_train[feature_cols], y_train, categorical_feature = categorical_feature)
    lgb_val = lgb.Dataset(x_val[feature_cols], y_val, categorical_feature = categorical_feature)
    if best_round is not None:
        param['num_rounds'] = best_round
        del param['early_stopping_round']
    print('start training')
    model = lgb.train(param, train_set=lgb_train, valid_sets=lgb_val, verbose_eval=10)
    return model

trainset,valset = get_train_val(df, ratio=0.1)
print('splitting done')
model = train_lightgbm(trainset, valset, feature_cols, categorical_feature, params)
#             train on all. comment to save time
# best_round = model.best_iteration
# print('training on best round')
# model = train_lightgbm(df, valset, feature_cols, categorical_feature, params, best_round)

splitting done
start training




Training until validation scores don't improve for 100 rounds.
[10]	valid_0's auc: 0.959916
[20]	valid_0's auc: 0.969442
[30]	valid_0's auc: 0.973527
[40]	valid_0's auc: 0.976445
[50]	valid_0's auc: 0.976994
[60]	valid_0's auc: 0.977247
[70]	valid_0's auc: 0.977331
[80]	valid_0's auc: 0.977367
[90]	valid_0's auc: 0.977372
[100]	valid_0's auc: 0.977375
[110]	valid_0's auc: 0.97738
[120]	valid_0's auc: 0.977388
[130]	valid_0's auc: 0.977393
[140]	valid_0's auc: 0.977393
[150]	valid_0's auc: 0.977398
[160]	valid_0's auc: 0.977401
[170]	valid_0's auc: 0.9774
[180]	valid_0's auc: 0.977403
[190]	valid_0's auc: 0.977404
[200]	valid_0's auc: 0.97744
[210]	valid_0's auc: 0.977443
[220]	valid_0's auc: 0.977446
[230]	valid_0's auc: 0.977447
[240]	valid_0's auc: 0.977449
[250]	valid_0's auc: 0.97745
[260]	valid_0's auc: 0.977451
[270]	valid_0's auc: 0.977452
[280]	valid_0's auc: 0.977454
[290]	valid_0's auc: 0.977455
[300]	valid_0's auc: 0.977457
[310]	valid_0's auc: 0.977457
[320]	valid_0's auc: 

# Logistic Regression

In [8]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

# trainset,valset = get_train_val(df)

# def logisticReg(df,feature_cols, target):
#     x = df[feature_cols].values
#     y = df[target].values
#     model_log = LogisticRegression(C=1)
#     model_log.fit(x, y)
#     return model_log

# model_log = logisticReg(trainset, feature_cols, target)
# print('training done')
# pred_val = model_log.predict_proba(valset[feature_cols].values)[:,1]
# print('predicting done')
# roc = roc_auc_score(valset[target].values, pred_val)
# print('ROC is: {}'.format(roc))

training done
predicting done
ROC is: 0.9769058528809066


In [30]:
# prediction


df_test_raw = pd.read_csv('/home/kai/data/kaggle/talkingdata/data/test.csv')
print('loading raw file done!')

df_sub = pd.DataFrame()
df_sub['click_id'] = df_test_raw['click_id']
df_sub['is_attributed'] = model.predict(df_test)
print('predicting file done!')

df_sub.to_csv('/home/kai/data/kaggle/talkingdata/wl/data/submission/layer2_combine01234_leaf3.csv.gz', compression='gzip', index=False)
print('saving done')


loading raw file done!
predicting file done!
saving done


In [None]:
valset.head(1000)