In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold,StratifiedKFold

In [23]:
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [6]:
X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [7]:
X_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,5.9739,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [8]:
X.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,-4.92,5.747,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,3.1468,8.0851,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,-4.9193,5.9525,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,-5.8609,8.245,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,6.2654,7.6784,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [9]:
pred_cv = np.zeros(len(X))
pred = np.zeros(len(X_test))

In [10]:
# results from other models
models = [
    ('lg', SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)),
    ('mlp', MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(5, 2), random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy')),
    ('gnb', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis(tol=1e-12)),
]

train_models_out = pd.DataFrame(index=train_df.iloc[:,0])
test_models_out = pd.DataFrame(index=test_df.iloc[:,0])

for model in models:
    name = model[0]
    cls = model[1]
    print('working on ' + name)
    
    # k-fold
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    pred_cv = np.zeros(len(X))
    pred = np.zeros(len(X_test))
    
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X,y)):
        print("fold: " + str(fold_n))
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        cls.fit(X_train, y_train)
        pred_cv += cls.predict_proba(X)[:,1] / n_splits
        pred += cls.predict_proba(X_test)[:,1] / n_splits
    
    print(pred_cv)
        
    train_models_out[name] = pred_cv
    test_models_out[name] = pred

working on lg
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.057966   0.63569632 0.05510681 ... 0.13198316 0.038409   0.01368934]
working on mlp
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.07233561 0.32337534 0.08845907 ... 0.09807362 0.06570075 0.06218938]
working on rf
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.02 0.06 0.04 ... 0.   0.06 0.02]
working on gnb
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.01148899 0.78415556 0.00722842 ... 0.05493569 0.05551181 0.00223653]
working on qda
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[1.37436641e-02 8.43686252e-01 6.99730267e-03 ... 4.00821058e-02
 2.18645552e-02 5.90055029e-04]


In [12]:
# add statistics
num = len(models);
train_models_out['models_mean'] = train_models_out.iloc[:,[0,num-1]].mean(axis=1)
train_models_out['models_min'] = train_models_out.iloc[:,[0,num-1]].min(axis=1)
train_models_out['models_max'] = train_models_out.iloc[:,[0,num-1]].max(axis=1)
test_models_out['models_mean'] = test_models_out.iloc[:,[0,num-1]].mean(axis=1)
test_models_out['models_min'] = test_models_out.iloc[:,[0,num-1]].min(axis=1)
test_models_out['models_max'] = test_models_out.iloc[:,[0,num-1]].max(axis=1)

In [13]:
train_models_out.head()

Unnamed: 0_level_0,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
train_0,0.057966,0.072336,0.02,0.011489,0.013744,0.035855,0.013744,0.057966
train_1,0.635696,0.323375,0.06,0.784156,0.843686,0.739691,0.635696,0.843686
train_2,0.055107,0.088459,0.04,0.007228,0.006997,0.031052,0.006997,0.055107
train_3,0.135678,0.115099,0.08,0.229947,0.128887,0.132283,0.128887,0.135678
train_4,0.152084,0.130008,0.04,0.051531,0.014644,0.083364,0.014644,0.152084


In [14]:
test_models_out.head()

Unnamed: 0_level_0,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test_0,0.118176,0.100218,0.2,0.109373,0.488102,0.303139,0.118176,0.488102
test_1,0.234275,0.131037,0.1,0.186684,0.222647,0.228461,0.222647,0.234275
test_2,0.075918,0.080238,0.18,0.199758,0.198338,0.137128,0.075918,0.198338
test_3,0.169123,0.153624,0.02,0.109123,0.095385,0.132254,0.095385,0.169123
test_4,0.080079,0.10478,0.16,0.030233,0.022205,0.051142,0.022205,0.080079


In [31]:
# add to data
for col in train_models_out:
    train_df[col] = train_models_out[col].values
    test_df[col] = test_models_out[col].values

In [32]:
train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_198,var_199,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,12.7803,-1.0914,0.057966,0.072336,0.02,0.011489,0.013744,0.035855,0.013744,0.057966
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,18.356,1.9518,0.635696,0.323375,0.06,0.784156,0.843686,0.739691,0.635696,0.843686
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,14.7222,0.3965,0.055107,0.088459,0.04,0.007228,0.006997,0.031052,0.006997,0.055107
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,17.9697,-8.9996,0.135678,0.115099,0.08,0.229947,0.128887,0.132283,0.128887,0.135678
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,17.9974,-8.8104,0.152084,0.130008,0.04,0.051531,0.014644,0.083364,0.014644,0.152084


In [33]:
test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_198,var_199,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,15.4722,-8.7197,0.118176,0.100218,0.2,0.109373,0.488102,0.303139,0.118176,0.488102
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,19.1293,-20.976,0.234275,0.131037,0.1,0.186684,0.222647,0.228461,0.222647,0.234275
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,19.8956,-23.1794,0.075918,0.080238,0.18,0.199758,0.198338,0.137128,0.075918,0.198338
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,13.0168,-4.2108,0.169123,0.153624,0.02,0.109123,0.095385,0.132254,0.095385,0.169123
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,13.926,-9.1846,0.080079,0.10478,0.16,0.030233,0.022205,0.051142,0.022205,0.080079


In [None]:
train_df.to_csv("input/train_more_features.csv", index=False)
test_df.to_csv("input/test_more_features.csv", index=False)