In [88]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import KFold,StratifiedKFold

In [89]:
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [90]:
X_test = test_df.drop('ID_code',axis=1)
X = train_df.drop(['ID_code','target'],axis=1)
y = train_df['target']

In [91]:
X_tophalf, _, y_tophalf, _ = train_test_split(X, y, test_size=0.5, random_state=10)

In [92]:
X_test.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,8.81,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,5.9739,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,8.3442,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,7.4578,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,7.1437,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [93]:
X_tophalf.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
57514,11.3461,2.0185,8.2617,8.9897,12.2989,-5.205,4.0389,18.181,3.2612,9.2327,...,-0.0927,6.6907,1.5493,6.331,24.6415,3.1709,0.1617,8.6137,16.7621,-23.8231
91806,19.0252,-5.0797,13.1919,8.4464,9.1799,12.9264,4.2622,12.9846,3.2476,6.5997,...,6.6991,15.6887,1.7,-2.5501,16.9392,-1.8475,-3.2033,8.4966,11.2089,5.3368
174169,12.8794,3.0144,13.1454,10.4689,11.0107,-9.0041,5.7038,20.1085,3.5308,6.9209,...,10.4993,7.3249,2.6845,5.6977,13.4284,-1.8297,-2.1189,10.5226,13.4775,-1.0358
165416,8.0732,4.5286,9.7117,4.3824,10.878,-12.1528,4.1867,15.9875,4.9062,8.2363,...,4.4917,8.1056,4.3216,4.5972,15.1782,-1.5221,3.1989,7.6876,16.4864,-23.2289
172314,13.2469,1.157,15.931,3.4848,13.2649,-4.9624,5.2726,12.1206,-3.0408,7.6301,...,7.8434,3.6831,1.5791,0.9119,23.1969,0.8487,14.2175,9.7496,20.5372,12.8501


In [94]:
pred_cv = np.zeros(len(X))
pred = np.zeros(len(X_test))

In [95]:
# results from other models
models = [
    ('lg', SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)),
    ('mlp', MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(5, 2), random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=10, criterion='entropy')),
    ('gnb', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis(tol=1e-12)),
]

train_models_out = pd.DataFrame(index=train_df.iloc[:,0])
test_models_out = pd.DataFrame(index=test_df.iloc[:,0])

for model in models:
    name = model[0]
    cls = model[1]
    print('working on ' + name)
    
    # k-fold
    n_splits = 5
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    pred_cv = np.zeros(len(X))
    pred = np.zeros(len(X_test))
    
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_tophalf,y_tophalf)):
        print("fold: " + str(fold_n))
        X_train, X_valid = X_tophalf.iloc[train_index], X_tophalf.iloc[valid_index]
        y_train, y_valid = y_tophalf.iloc[train_index], y_tophalf.iloc[valid_index]

        cls.fit(X_train, y_train)
        pred_cv += cls.predict_proba(X)[:,1] / n_splits
        pred += cls.predict_proba(X_test)[:,1] / n_splits
    
    print(pred_cv)
        
    train_models_out[name] = pred_cv
    test_models_out[name] = pred

working on lg
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.05770728 0.63103167 0.05979834 ... 0.13395441 0.03926918 0.01202621]
working on mlp
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.08635527 0.22083387 0.08885832 ... 0.09694185 0.08392684 0.08130912]
working on rf
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.04 0.2  0.06 ... 0.04 0.22 0.08]
working on gnb
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[0.01151796 0.7876045  0.0075814  ... 0.06032313 0.06251733 0.00190182]
working on qda
fold: 0
fold: 1
fold: 2
fold: 3
fold: 4
[3.34560620e-03 6.70071675e-01 7.62274382e-03 ... 3.03113588e-02
 7.05183030e-04 3.59180896e-04]


In [96]:
# add statistics
num = len(models);
train_models_out['models_mean'] = train_models_out.iloc[:,[0,num-1]].mean(axis=1)
train_models_out['models_min'] = train_models_out.iloc[:,[0,num-1]].min(axis=1)
train_models_out['models_max'] = train_models_out.iloc[:,[0,num-1]].max(axis=1)
test_models_out['models_mean'] = test_models_out.iloc[:,[0,num-1]].mean(axis=1)
test_models_out['models_min'] = test_models_out.iloc[:,[0,num-1]].min(axis=1)
test_models_out['models_max'] = test_models_out.iloc[:,[0,num-1]].max(axis=1)

In [97]:
train_models_out.head()

Unnamed: 0_level_0,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
train_0,0.057707,0.086355,0.04,0.011518,0.003346,0.030526,0.003346,0.057707
train_1,0.631032,0.220834,0.2,0.787605,0.670072,0.650552,0.631032,0.670072
train_2,0.059798,0.088858,0.06,0.007581,0.007623,0.033711,0.007623,0.059798
train_3,0.152145,0.11196,0.08,0.255134,0.134539,0.143342,0.134539,0.152145
train_4,0.147465,0.111052,0.16,0.046184,0.007116,0.077291,0.007116,0.147465


In [98]:
test_models_out.head()

Unnamed: 0_level_0,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
ID_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
test_0,0.11891,0.102051,0.22,0.102747,0.791759,0.455334,0.11891,0.791759
test_1,0.23615,0.11798,0.14,0.168987,0.107739,0.171945,0.107739,0.23615
test_2,0.081722,0.094173,0.16,0.230615,0.069075,0.075399,0.069075,0.081722
test_3,0.163446,0.11919,0.12,0.106442,0.056532,0.109989,0.056532,0.163446
test_4,0.08058,0.094155,0.1,0.035033,0.046353,0.063467,0.046353,0.08058


In [99]:
# add to data
for col in train_models_out:
    train_df[col] = train_models_out[col].values
    test_df[col] = test_models_out[col].values

In [100]:
train_df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_198,var_199,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,12.7803,-1.0914,0.057707,0.086355,0.04,0.011518,0.003346,0.030526,0.003346,0.057707
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,18.356,1.9518,0.631032,0.220834,0.2,0.787605,0.670072,0.650552,0.631032,0.670072
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,14.7222,0.3965,0.059798,0.088858,0.06,0.007581,0.007623,0.033711,0.007623,0.059798
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,17.9697,-8.9996,0.152145,0.11196,0.08,0.255134,0.134539,0.143342,0.134539,0.152145
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,17.9974,-8.8104,0.147465,0.111052,0.16,0.046184,0.007116,0.077291,0.007116,0.147465


In [101]:
test_df.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_198,var_199,lg,mlp,rf,gnb,qda,models_mean,models_min,models_max
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,15.4722,-8.7197,0.11891,0.102051,0.22,0.102747,0.791759,0.455334,0.11891,0.791759
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,19.1293,-20.976,0.23615,0.11798,0.14,0.168987,0.107739,0.171945,0.107739,0.23615
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,19.8956,-23.1794,0.081722,0.094173,0.16,0.230615,0.069075,0.075399,0.069075,0.081722
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,13.0168,-4.2108,0.163446,0.11919,0.12,0.106442,0.056532,0.109989,0.056532,0.163446
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,13.926,-9.1846,0.08058,0.094155,0.1,0.035033,0.046353,0.063467,0.046353,0.08058


In [102]:
train_df.to_csv("input/train_more_features.csv", index=False)
test_df.to_csv("input/test_more_features.csv", index=False)