In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from tqdm import tqdm; tqdm.pandas()

In [2]:
# Raw train/test frames, read from local pickles.
# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
train = pd.read_pickle('data/train_original.pkl')
test = pd.read_pickle('data/test_original.pkl')

# Partition the training columns by dtype: float64 columns are the model features,
# everything else is treated as a "special" (non-feature) column.
special_cols = [name for name, dtype in train.dtypes.items() if dtype != np.float64]
feature_cols = [name for name, dtype in train.dtypes.items() if dtype == np.float64]

In [3]:
# Work on copies so the frames loaded from disk are never modified by later cells.
train_df, test_df = train.copy(), test.copy()

In [4]:
# One feature-only frame per class label (0 first, then 1), each re-indexed from zero.
train_df_0, train_df_1 = (
    train_df.loc[train_df['target'] == label, feature_cols].reset_index(drop=True)
    for label in (0, 1)
)

In [5]:
# First three class-0 rows, for comparison with the same view after shuffling.
train_df_0.iloc[:3]

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965


In [6]:
import random; random.seed(0)
from random import shuffle


def shuffle_within_rows(df, rng):
    """Return a new frame in which the values of every row are independently permuted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row values are to be scrambled across columns. It is not modified.
    rng : numpy.random.RandomState
        Source of randomness; pass a seeded instance for reproducible output.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``df``; each row holds the same values as the
        corresponding input row, in a random column order.
    """
    values = df.values.copy()  # copy so the caller's data is never permuted in place
    for row in tqdm(values):
        rng.shuffle(row)       # `row` is a 1-D view, so this permutes `values` in place
    return pd.DataFrame(values, index=df.index, columns=df.columns)


# random.seed() above only seeds Python's `random` module (used by the disabled
# column shuffle below). The row shuffle draws from NumPy, so it needs its own
# seeded generator to be reproducible.
row_rng = np.random.RandomState(0)

# shuffle by row
# Selecting feature_cols keeps this cell safe to re-run after 'target' has been added.
train_df_0 = shuffle_within_rows(train_df_0[feature_cols], row_rng)
train_df_1 = shuffle_within_rows(train_df_1[feature_cols], row_rng)

# shuffle by col (alternative experiment, disabled)
# for df in [train_df_0, train_df_1]:
#     for col in tqdm(feature_cols):
#         vals = df[col].values
#         shuffle(vals)
#         df.loc[:,col] = vals

100%|███████████████████████████████████████████████████████████████████| 179902/179902 [00:09<00:00, 19035.23it/s]
100%|█████████████████████████████████████████████████████████████████████| 20098/20098 [00:01<00:00, 19493.70it/s]


'\nfor df in [train_df_0, train_df_1]:\n    for col in tqdm(feature_cols):\n        vals = df[col].values\n        shuffle(vals)\n        df.loc[:,col] = vals\n'

In [7]:
# Same three class-0 rows after the row-wise shuffle: values no longer line up with their columns.
train_df_0.iloc[:3]

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,9.0164,11.2663,-0.2154,11.192,13.3102,12.8287,-9.2834,13.7241,-6.7863,5.8764,...,-1.0914,2.0771,5.3253,4.2218,3.1364,-3.4132,10.535,16.1828,7.8871,11.0924
1,0.8194,34.4014,6.8852,13.7814,13.955,31.5899,40.5632,3.3716,-4.8622,1.7093,...,11.0752,0.1225,-4.821,11.77,7.0118,5.389,11.5006,8.4068,2.2302,13.891
2,5.82,-0.9479,6.8874,9.5413,1.1,12.6317,21.1613,-5.5952,2.3612,9.7905,...,-3.9116,4.5255,5.7033,12.9143,3.1417,10.1852,6.9346,20.5092,6.9427,3.6932


In [8]:
# (rows, columns) of the class-0 and class-1 frames, in that order.
tuple(frame.shape for frame in (train_df_0, train_df_1))

((179902, 200), (20098, 200))

In [9]:
# combine
train_df_0['target'] = 0
train_df_1['target'] = 1
train_df = pd.concat([train_df_0, train_df_1]).reset_index(drop=True)
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [11]:
# (rows, columns) of the combined frame: the feature columns plus 'target'.
len(train_df), train_df.shape[1]

(200000, 201)

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import time

# LightGBM hyper-parameters.
# The original dict also set 'subsample': 0.81. LightGBM documents `subsample` as an
# alias of `bagging_fraction`, so the two entries gave conflicting values for one
# setting. Only the canonical key is kept.
# NOTE(review): confirm 0.8 is the intended bagging fraction.
params = {'num_leaves': 8,
         'min_data_in_leaf': 42,
         'objective': 'binary',
         'max_depth': 16,
         'learning_rate': 0.03,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.7289,
         'reg_lambda': 4.984,
         'random_state': 42,
         'metric': 'binary_logloss',
         'verbosity': -1,
         'min_gain_to_split': 0.01,
         'min_child_weight': 19.428,
         'num_threads': 2}

# Loop-invariant views, built once instead of once per fold.
X_all = train_df[feature_cols]
y_all = train_df.target.values
X_test = test_df[feature_cols]

# Both arrays are indexed by row position in the frames used below, so size them from
# those frames. `oof` follows the (shuffled) row order of train_df.
oof = np.zeros(len(train_df))
prediction = np.zeros(len(test_df))

n_fold = 10
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)
fold_importances = []  # one frame per fold, concatenated once after the loop

# StratifiedKFold only needs y (and the sample count) to build the splits, so a
# zero placeholder stands in for X.
for fold_n, (train_index, valid_index) in enumerate(folds.split(np.zeros(len(y_all)), y_all)):

    print('Fold', fold_n, 'started at', time.ctime())
    # The fold indices are positional, hence .iloc rather than label-based .loc.
    X_train, X_valid = X_all.iloc[train_index], X_all.iloc[valid_index]
    y_train, y_valid = y_all[train_index], y_all[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments of the
    # LightGBM release this notebook was run with; newer releases expect callbacks instead.
    model = lgb.train(params,train_data, num_boost_round=3200,
                    valid_sets = [train_data, valid_data], verbose_eval=100,
                    early_stopping_rounds = 200)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = feature_cols
    fold_importance_df["importance"] = model.feature_importance(importance_type='gain')
    fold_importance_df["fold"] = fold_n + 1
    fold_importances.append(fold_importance_df)

    oof[valid_index] = model.predict(X_valid, num_iteration=model.best_iteration)
    # The test prediction is the plain average over the fold models.
    prediction += model.predict(X_test, num_iteration=model.best_iteration)/n_fold
    gc.collect()
    print(roc_auc_score(y_valid, oof[valid_index]))

# Single concat instead of growing a DataFrame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)

full_auc = roc_auc_score(y_all, oof)
print(full_auc)

Fold 0 started at Sat Mar 16 11:54:23 2019
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.324511	valid_1's binary_logloss: 0.326241
[200]	training's binary_logloss: 0.323008	valid_1's binary_logloss: 0.326287
Early stopping, best iteration is:
[25]	training's binary_logloss: 0.325716	valid_1's binary_logloss: 0.326122
0.5088445687741604
Fold 1 started at Sat Mar 16 11:54:42 2019
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.324512	valid_1's binary_logloss: 0.326323
[200]	training's binary_logloss: 0.323039	valid_1's binary_logloss: 0.326365
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.326138	valid_1's binary_logloss: 0.32617
0.5021421158340365
Fold 2 started at Sat Mar 16 11:55:00 2019
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.324533	valid_1's binary_logloss: 0.326159
[200]	training's binary_logloss: 0.3230

In [13]:
import os

version = 'kh_lgb_10fold_raw_shuffle_by_row'

# Create the output directory up front, so a missing folder cannot discard the
# results of the cross-validation run when the first file is written.
out_dir = 'oof+submission/'
os.makedirs(out_dir, exist_ok=True)

# NOTE: `oof` is ordered like the shuffled `train_df`, not like the original `train`
# frame, so it cannot be joined back to `train` by row position.
pd.to_pickle(oof, out_dir+version+'_oof_train')
pd.to_pickle(prediction, out_dir+version+'_oof_test')

# Submission file: one row per test ID with the fold-averaged prediction. The overall
# out-of-fold AUC is embedded in the file name ('.' replaced by '_').
sub = pd.DataFrame({"ID_code": test.ID_code.values})
sub["target"] = prediction
sub.to_csv(out_dir + version + '_' + str(full_auc).replace('.', '_') + ".csv", index=False)

# Mean gain importance per feature across folds, highest first; max_rows is raised so
# the whole table is displayed.
pd.options.display.max_rows=600
mean_gain = (
    feature_importance_df[['importance', 'Feature']]
    .groupby('Feature')
    .mean()
    .sort_values('importance', ascending=False)
)
mean_gain

Unnamed: 0_level_0,importance
Feature,Unnamed: 1_level_1
var_9,238.430357
var_125,224.617348
var_110,223.31822
var_152,210.297254
var_3,201.775119
var_79,201.602307
var_19,194.435664
var_78,190.340746
var_66,170.956338
var_71,170.448372
