In [1]:
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [22]:
# Load the raw training and test tables.
train_df = pd.read_csv('input/train.csv')
# train_df = pd.read_csv('input/train_min.csv')
test_df = pd.read_csv('input/test.csv')

# When True, rows with target != 1 are randomly downsampled so that both
# groups contribute the same number of rows to training.
fix_data_skew = True
# Fixed seed: the downsampled subset and the shuffle order are identical on
# every run, so cross-validation scores are comparable between runs.
sampling_seed = 42

if fix_data_skew:
    trues = train_df.loc[train_df['target'] == 1]
    falses = train_df.loc[train_df['target'] != 1]
    # Never request more rows than exist (sample(n=...) would raise otherwise).
    n_keep = min(len(trues), len(falses))
    falses = falses.sample(n=n_keep, random_state=sampling_seed)
    # Concatenate, then shuffle so the two groups are interleaved.
    train_df = (
        pd.concat([trues, falses], ignore_index=True)
        .sample(frac=1, random_state=sampling_seed)
    )

# Feature matrices and label vector used by the cells below.
X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [23]:
# Number of cross-validation folds; also used to average the test predictions.
n_splits = 5
# Stratified splitter with seeded shuffling.
folds = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=n_splits,
)

In [38]:
# LightGBM training parameters.
#
# Experiment log:
#   - previous config: cv 0.89951
#       (min_data_in_leaf=42, max_depth=16, no max_bin override;
#        every other value the same as below)
#   - current config, 1st round looks good: cv 0.900043
#       lb: 0.899 with n_splits=10
#   - try larger max_bin?
#
# NOTE: the earlier versions of this dict also set 'subsample': 0.81.
# LightGBM treats 'subsample' as an alias of 'bagging_fraction'; when both are
# given the alias is ignored in favour of 'bagging_fraction' (the warning was
# hidden by verbosity=-1). The alias is removed so the dict states exactly
# what is used for training.
params = {
         'num_leaves': 8,
         'min_data_in_leaf': 1000,
         'objective': 'binary',
         'max_depth': -1,
         'learning_rate': 0.0123,
         'boosting': 'gbdt',
         'bagging_freq': 5,
         'bagging_fraction': 0.8,
         'feature_fraction': 0.8201,
         'bagging_seed': 11,
         'reg_alpha': 1.728910519108444,
         'reg_lambda': 4.9847051755586085,
         'random_state': 42,
         'metric': 'auc',
         'verbosity': -1,
         'min_gain_to_split': 0.01077313523861969,
         'min_child_weight': 19.428902804238373,
         'num_threads': 4,
         'max_bin': 511,
        }




In [39]:
# Cross-validated training: fit one booster per fold, early-stop on the
# held-out part, and average the per-fold test predictions.

# Periodic metric logging is configured through a callback. Newer LightGBM
# releases expose it as `log_evaluation`; older ones as `print_evaluation`.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# Per-fold results are collected first; `prediction` is only assigned after
# every fold has finished, so an interrupted run cannot leave behind a
# partially accumulated vector for the submission cell to write out.
fold_predictions = []
fold_scores = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid)

    # Early stopping and logging are passed as callbacks rather than through
    # the `early_stopping_rounds=` / `verbose_eval=` keyword arguments, which
    # recent LightGBM versions no longer accept.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=2000000,  # effectively unbounded; early stopping ends training
        valid_sets=[train_data, valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),
            log_every(period=300),
        ],
    )

    # 'valid_1' is the name LightGBM gives the second entry of `valid_sets`.
    fold_scores.append(model.best_score['valid_1']['auc'])
    fold_predictions.append(
        model.predict(X_test, num_iteration=model.best_iteration)
    )

print('Validation AUC per fold:', fold_scores, 'mean:', np.mean(fold_scores))

# Equal-weight average over folds.
prediction = np.mean(fold_predictions, axis=0)

Fold 0 started at Sun Mar  3 00:32:37 2019
Training until validation scores don't improve for 1000 rounds.
[300]	training's auc: 0.836794	valid_1's auc: 0.814493
[600]	training's auc: 0.87444	valid_1's auc: 0.846988
[900]	training's auc: 0.893144	valid_1's auc: 0.861779
[1200]	training's auc: 0.905651	valid_1's auc: 0.871825
[1500]	training's auc: 0.914176	valid_1's auc: 0.878512
[1800]	training's auc: 0.92058	valid_1's auc: 0.883108
[2100]	training's auc: 0.925791	valid_1's auc: 0.886559
[2400]	training's auc: 0.930053	valid_1's auc: 0.888989
[2700]	training's auc: 0.93378	valid_1's auc: 0.891062
[3000]	training's auc: 0.936858	valid_1's auc: 0.892567
[3300]	training's auc: 0.939478	valid_1's auc: 0.893768
[3600]	training's auc: 0.94195	valid_1's auc: 0.894715
[3900]	training's auc: 0.944005	valid_1's auc: 0.895464
[4200]	training's auc: 0.945919	valid_1's auc: 0.895992
[4500]	training's auc: 0.947581	valid_1's auc: 0.89638
[4800]	training's auc: 0.949291	valid_1's auc: 0.896875
[5100

KeyboardInterrupt: 

In [None]:
# Pair every test ID with its predicted score and write the submission file.
sub = pd.DataFrame(
    {
        "ID_code": test_df.ID_code.values,
        "target": prediction,
    }
)
sub.to_csv("submission.csv", index=False)