In [14]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold,StratifiedKFold

In [4]:
# Read the training and test tables from the local input directory.
# The original author noted that 'input/train_min.csv' is a small-data
# variant that can be read in place of the full training file.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv')
)

In [5]:
# Split the raw frames into model inputs: the label column becomes `y`, and
# the identifier / label columns are removed from the feature matrices.
y = train_df['target']
X = train_df.drop(columns=['ID_code', 'target'])
X_test = test_df.drop(columns='ID_code')

In [6]:
# Preview the first five rows of the test feature matrix.
X_test.head(n=5)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,5.9739,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [7]:
# Preview the first five rows of the training feature matrix.
X.head(n=5)

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [8]:
# Zero-filled prediction buffers: one slot per training row and one per test row.
pred_cv = np.zeros(X.shape[0])
pred = np.zeros(X_test.shape[0])

In [None]:
# stacking
#
# Build level-1 ("stacked") features. For every base model this cell produces
#   * cv_out[name]   -- out-of-fold predictions for each training row
#   * test_out[name] -- test predictions averaged over the fold models
# Each training row is scored only by the fold model that did NOT see it, so
# the stacked training features carry no target leakage.

lgbm_params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'min_data_in_leaf': 2881,
    'max_depth': 0,
    'num_leaves': 3,
    'learning_rate': 0.01,
    'bagging_freq': 3,
    #'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
    'feature_fraction': 0.8453828656355421,
    'bagging_seed': 11,
    'reg_alpha':  1.1173044727720816,
    'reg_lambda': 6.9285776442737514,
    'random_state': 42,
    'verbosity': -1,
    'subsample':0.8421287738494433,
    'min_child_weight': 36.93038816860224,
    'num_threads': 4,
    'max_bin': 483
}

# (name, estimator) pairs. 'lgbm' carries no estimator object because it is
# trained through the native lgb.train API in _fit_predict_fold.
# Stochastic estimators are seeded so that a re-run reproduces the same output.
models = [
    ('lgbm', None),
    ('lg', SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3, random_state=42)),
    ('mlp', MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(5, 2), random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)),
    ('gnb', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis(tol=1e-12)),
]


def _fit_predict_fold(name, cls, X_train, y_train, X_valid, y_valid, X_test):
    """Fit one base model on a single fold and score the held-out and test rows.

    Parameters
    ----------
    name : str
        Model key from `models`; 'lgbm' selects the native LightGBM path.
    cls : estimator or None
        scikit-learn style classifier exposing fit / predict_proba
        (ignored when name == 'lgbm').
    X_train, y_train : training part of the fold.
    X_valid, y_valid : held-out part of the fold.
    X_test : rows of the test set to score.

    Returns
    -------
    (valid_pred, test_pred) : tuple of 1-D arrays
        Predicted positive-class scores for X_valid and X_test.
    """
    if name == 'lgbm':
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid)
        # NOTE(review): newer LightGBM releases expect callbacks
        # (lgb.early_stopping / lgb.log_evaluation) instead of the
        # verbose_eval / early_stopping_rounds keywords -- confirm against
        # the installed version.
        res = lgb.train(lgbm_params, train_data, num_boost_round=2000000,
                        valid_sets=[train_data, valid_data],
                        verbose_eval=300, early_stopping_rounds=800)
        return (res.predict(X_valid, num_iteration=res.best_iteration),
                res.predict(X_test, num_iteration=res.best_iteration))

    # The estimator must be fitted on this fold before it can predict;
    # calling fit again on the next fold discards the previous fit.
    cls.fit(X_train, y_train)
    return cls.predict_proba(X_valid)[:, 1], cls.predict_proba(X_test)[:, 1]


# Output frames are indexed by the row identifier of the respective data set.
cv_out = pd.DataFrame(index=train_df['ID_code'])
test_out = pd.DataFrame(index=test_df['ID_code'])

# add target to cv_out
cv_out['target'] = train_df['target'].values

# k-fold: the splitter is seeded, so every model is evaluated on the same folds.
n_splits = 5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

for name, cls in models:
    print('working on ' + name)

    pred_cv = np.zeros(len(X))
    pred = np.zeros(len(X_test))

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print("fold: " + str(fold_n))
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        valid_pred, test_pred = _fit_predict_fold(
            name, cls, X_train, y_train, X_valid, y_valid, X_test)

        # Out-of-fold: only the held-out rows receive this fold's prediction.
        pred_cv[valid_index] = valid_pred
        # Test rows are scored by every fold model and averaged.
        pred += test_pred / n_splits

    print(name + ' out-of-fold AUC: ' + str(roc_auc_score(y, pred_cv)))
    print(pred_cv)

    cv_out[name] = pred_cv
    test_out[name] = pred

working on lgbm
fold: 0
Training until validation scores don't improve for 800 rounds.
[300]	training's auc: 0.763894	valid_1's auc: 0.755421
[600]	training's auc: 0.803638	valid_1's auc: 0.796102
[900]	training's auc: 0.824243	valid_1's auc: 0.816561
[1200]	training's auc: 0.838573	valid_1's auc: 0.830135
[1500]	training's auc: 0.848948	valid_1's auc: 0.839926
[1800]	training's auc: 0.856728	valid_1's auc: 0.84737
[2100]	training's auc: 0.86311	valid_1's auc: 0.853287
[2400]	training's auc: 0.868888	valid_1's auc: 0.858499
[2700]	training's auc: 0.873314	valid_1's auc: 0.862411
[3000]	training's auc: 0.877112	valid_1's auc: 0.865793
[3300]	training's auc: 0.880634	valid_1's auc: 0.868908
[3600]	training's auc: 0.883577	valid_1's auc: 0.871514
[3900]	training's auc: 0.886398	valid_1's auc: 0.873991
[4200]	training's auc: 0.888777	valid_1's auc: 0.876039
[4500]	training's auc: 0.890779	valid_1's auc: 0.877823
[4800]	training's auc: 0.892801	valid_1's auc: 0.879531
[5100]	training's auc:

[14100]	training's auc: 0.915085	valid_1's auc: 0.897843
[14400]	training's auc: 0.915357	valid_1's auc: 0.898004
[14700]	training's auc: 0.915583	valid_1's auc: 0.898145
[15000]	training's auc: 0.915868	valid_1's auc: 0.898274
[15300]	training's auc: 0.916147	valid_1's auc: 0.898473
[15600]	training's auc: 0.916368	valid_1's auc: 0.898552
[15900]	training's auc: 0.916608	valid_1's auc: 0.898611
[16200]	training's auc: 0.916838	valid_1's auc: 0.898739
[16500]	training's auc: 0.917066	valid_1's auc: 0.898843
[16800]	training's auc: 0.917311	valid_1's auc: 0.898919
[17100]	training's auc: 0.917533	valid_1's auc: 0.899009
[17400]	training's auc: 0.917778	valid_1's auc: 0.899074
[17700]	training's auc: 0.917991	valid_1's auc: 0.899171
[18000]	training's auc: 0.918199	valid_1's auc: 0.899244
[18300]	training's auc: 0.918417	valid_1's auc: 0.899285
[18600]	training's auc: 0.918618	valid_1's auc: 0.899348
[18900]	training's auc: 0.918817	valid_1's auc: 0.899397
[19200]	training's auc: 0.91900

In [None]:
# add statistics
#
# Row-wise mean / min / max over ALL base-model prediction columns.
# The columns are selected by model name rather than by a two-element
# positional list, so every model contributes and neither the 'target'
# column nor the statistic columns themselves are ever included.
num = len(models)
model_cols = [name for name, _ in models]

# Take the prediction columns once, before any statistic column is added.
cv_preds = cv_out[model_cols]
cv_out['mean'] = cv_preds.mean(axis=1)
cv_out['min'] = cv_preds.min(axis=1)
cv_out['max'] = cv_preds.max(axis=1)

test_preds = test_out[model_cols]
test_out['mean'] = test_preds.mean(axis=1)
test_out['min'] = test_preds.min(axis=1)
test_out['max'] = test_preds.max(axis=1)

In [None]:
# Preview the first five rows of the stacked training frame.
cv_out.head(n=5)

In [None]:
# Preview the first five rows of the stacked test frame.
test_out.head(n=5)

In [None]:
# Persist the stacked training frame (index included) as CSV.
cv_out.to_csv(path_or_buf='input/train_stack.csv')

In [None]:
# Persist the stacked test frame (index included) as CSV.
test_out.to_csv(path_or_buf='input/test_stack.csv')