In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Load the labelled training rows and the unlabelled test rows.
train_df = pd.read_csv('input/train.csv')
# train_df = pd.read_csv('input/train_min.csv')  # alternative input; presumably a reduced file for quick runs
test_df = pd.read_csv('input/test.csv')

# When True, rows with target != 1 are randomly undersampled so that both
# groups contribute the same number of training rows.
fix_data_skew = True
# Fixed seed: without it every run draws a different subset, so CV scores
# cannot be reproduced after Restart Kernel -> Run All.
resample_seed = 42

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    falses = train_df.loc[train_df['target'] != 1]
    # Keep as many non-positive rows as there are positives
    # (or all of them if there happen to be fewer).
    falses = falses.sample(n=min(len(trues), len(falses)), random_state=resample_seed)
    # Shuffle so the two groups are interleaved instead of stacked.
    train_df = pd.concat([trues, falses], ignore_index=True).sample(
        frac=1, random_state=resample_seed
    )

# Feature matrices and label vector consumed by the cells below.
X_test = test_df.drop('ID_code', axis=1)
X = train_df.drop(['ID_code', 'target'], axis=1)
y = train_df['target']

In [5]:
# Cross-validation splitter shared by the training loop: stratified folds,
# shuffled with a fixed seed so fold membership is the same on every run.
n_splits = 5
folds = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

In [10]:
# LightGBM parameters consumed by the training loop.
#
# Experiment log (notes only; none of these configurations is active):
#   1. num_leaves=8, min_data_in_leaf=42, max_depth=16, random_state=42,
#      everything else as in `params` below
#        -> cv: 0.89951
#   2. only objective='binary', max_depth=127, metric='auc', boosting='gbdt'
#        -> cv: 0.90775 (full); lb: 0.896 with fold_n=10 => overfit?
#   3. num_leaves=8, min_data_in_leaf=1000, max_depth=-1, random_state=42,
#      max_bin=511, everything else as in `params` below
#        -> 1st round looks good: cv 0.900043; lb: 0.899 with n_splits=10
#        -> try larger max_bin?
#
# The previous version of this cell also set 'subsample': 0.81. `subsample`
# is an alias of `bagging_fraction`; when both are given LightGBM keeps the
# primary name (0.8) and ignores the alias, so the entry was dropped to
# remove the contradiction without changing the trained model.
params = {
    # Task definition
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    # Tree shape and step size
    'max_depth': 2,
    'learning_rate': 0.0123,
    # Row / column subsampling
    'bagging_freq': 5,
    'bagging_fraction': 0.8,
    'bagging_seed': 11,
    'feature_fraction': 0.8201,
    # Regularisation
    'reg_alpha': 1.728910519108444,
    'reg_lambda': 4.9847051755586085,
    'min_gain_to_split': 0.01077313523861969,
    'min_child_weight': 19.428902804238373,
    # Runtime
    'random_state': 1,
    'num_threads': 4,
    'verbosity': -1,
}



In [None]:
# Train one LightGBM model per CV fold and average the folds' predictions
# on the test set.
prediction = np.zeros(len(X_test))
# Take the divisor from the splitter itself so the average stays correct
# even if the separate `n_splits` variable drifts out of sync with `folds`.
fold_count = folds.get_n_splits()

# The `verbose_eval` / `early_stopping_rounds` keyword arguments were removed
# from `lgb.train` in LightGBM 4.0; callbacks are the supported equivalent.
# Releases that predate `log_evaluation` expose the same callback as
# `print_evaluation`, so fall back to it when needed.
make_log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(
        params,
        train_data,
        num_boost_round=200000,
        valid_sets=[train_data, valid_data],
        # Fresh callback objects per fold: the early-stopping callback keeps
        # internal best-score state that must not leak between folds.
        callbacks=[
            lgb.early_stopping(stopping_rounds=2000),
            make_log_callback(period=300),
        ],
    )

    # Predict with the best iteration found by early stopping and add this
    # fold's share of the average.
    prediction += model.predict(X_test, num_iteration=model.best_iteration) / fold_count

Fold 0 started at Sat Mar  2 23:01:10 2019
Training until validation scores don't improve for 2000 rounds.
[300]	training's auc: 0.787228	valid_1's auc: 0.784348
[600]	training's auc: 0.824699	valid_1's auc: 0.816974
[900]	training's auc: 0.846209	valid_1's auc: 0.836109
[1200]	training's auc: 0.861127	valid_1's auc: 0.848328
[1500]	training's auc: 0.873061	valid_1's auc: 0.857866
[1800]	training's auc: 0.881495	valid_1's auc: 0.864845
[2100]	training's auc: 0.888813	valid_1's auc: 0.870391
[2400]	training's auc: 0.894409	valid_1's auc: 0.874912
[2700]	training's auc: 0.899981	valid_1's auc: 0.878709
[3000]	training's auc: 0.903774	valid_1's auc: 0.881353
[3300]	training's auc: 0.907482	valid_1's auc: 0.88411
[3600]	training's auc: 0.91047	valid_1's auc: 0.886042
[3900]	training's auc: 0.913227	valid_1's auc: 0.887677
[4200]	training's auc: 0.915649	valid_1's auc: 0.88924
[4500]	training's auc: 0.917679	valid_1's auc: 0.890449
[4800]	training's auc: 0.919748	valid_1's auc: 0.891721
[51

[11100]	training's auc: 0.942651	valid_1's auc: 0.894523
[11400]	training's auc: 0.943276	valid_1's auc: 0.894652
[11700]	training's auc: 0.94384	valid_1's auc: 0.894635
[12000]	training's auc: 0.944407	valid_1's auc: 0.894754
[12300]	training's auc: 0.944995	valid_1's auc: 0.894857
[12600]	training's auc: 0.94558	valid_1's auc: 0.894808
[12900]	training's auc: 0.946122	valid_1's auc: 0.894878
[13200]	training's auc: 0.946678	valid_1's auc: 0.89488
[13500]	training's auc: 0.947232	valid_1's auc: 0.895025
[13800]	training's auc: 0.947773	valid_1's auc: 0.895107
[14100]	training's auc: 0.948288	valid_1's auc: 0.895182
[14400]	training's auc: 0.94882	valid_1's auc: 0.895114
[14700]	training's auc: 0.949317	valid_1's auc: 0.895227
[15000]	training's auc: 0.949865	valid_1's auc: 0.895179
[15300]	training's auc: 0.950347	valid_1's auc: 0.89512
[15600]	training's auc: 0.95091	valid_1's auc: 0.895058
[15900]	training's auc: 0.9514	valid_1's auc: 0.89507
[16200]	training's auc: 0.951906	valid_1

In [18]:
# Build the submission table (test-set IDs next to the fold-averaged
# predictions) and write it to disk without the index column.
sub = pd.DataFrame(
    {
        "ID_code": test_df.ID_code.values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)