In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Load the training set. Swap in the commented-out line for a small sample
# when iterating quickly.
df = pd.read_csv('input/train.csv')
# df = pd.read_csv('input/train_min.csv')  # small data

# Set to True to undersample the non-positive rows down to the number of
# positive (target == 1) rows.
fix_data_skew = False
# Fixed seed so the undersampled subset is identical on every
# Restart & Run All instead of changing from run to run.
skew_seed = 0

if fix_data_skew:
    trues = df.loc[df['target'] == 1]
    # Shuffle the remaining rows, then keep as many as there are positives.
    falses = df.loc[df['target'] != 1].sample(frac=1, random_state=skew_seed)[:len(trues)]
    # Concatenate and shuffle once more so the two classes are interleaved.
    data = pd.concat([trues, falses], ignore_index=True).sample(frac=1, random_state=skew_seed)
else:
    # No rebalancing: `data` is the same object as `df`, not a copy.
    data = df

data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [3]:
# Positional split of the training frame: column 1 becomes the label vector,
# columns 2 onward become the feature matrix (both as plain NumPy arrays).
y = data.iloc[:, 1].values
X = data.iloc[:, 2:].values

# std scaling
# Fit the scaler on the training features and transform them in one step;
# `scaler` is kept so other data can be scaled with the same statistics.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Load the test set; column 0 is taken as the row identifier and every
# remaining column as a feature.
test = pd.read_csv('input/test.csv')
test_ids, test_x = test.iloc[:,0], test.iloc[:,1:]

# Apply the scaler that was fitted on the training features. The scaler was
# fitted on a NumPy array, so pass a NumPy array here as well: this keeps the
# train and test paths consistent and avoids the feature-name mismatch warning
# newer scikit-learn releases emit when a DataFrame is given to an estimator
# fitted without feature names.
test_x_scaled = scaler.transform(test_x.values)

In [None]:
# stacking
#
# Build the level-0 ("base model") prediction frames used as input for a
# second-stage model:
#   * cv_out   - one row per training ID: the target plus, per model, the
#                OUT-OF-FOLD probability of class 1.
#   * test_out - one row per test ID: per model, the probability of class 1
#                from the model fitted on the full training set.
#
# The training-side columns must be out-of-fold. Predicting on the very rows a
# model was fitted on leaks the target into the meta-features (a random forest
# is close to perfect in-sample), so a second-stage model trained on them would
# learn to trust over-fitted columns that do not behave the same on test data.

# Number of cross-validation folds used for the out-of-fold predictions.
N_FOLDS = 5

models = [
    ('lg', SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)),
    ('mlp', MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(5, 2), random_state=1)),
    ('rf', RandomForestClassifier(n_estimators=100, criterion='entropy')),
    ('gnb', GaussianNB()),
    ('qda', QuadraticDiscriminantAnalysis(tol=1e-12)),
]

cv_out = pd.DataFrame(index=data.iloc[:,0])
test_out = pd.DataFrame(index=test_ids)

# add target to cv_out
cv_out['target'] = data.iloc[:,1].values

for name, cls in models:
    print('working on ' + name)

    # Out-of-fold probabilities: every training row is scored by a clone of
    # the model that never saw that row during fitting. cross_val_predict
    # works on clones, so `cls` itself is still unfitted afterwards.
    cv_y = cross_val_predict(cls, X_scaled, y, cv=N_FOLDS, method='predict_proba')

    # Refit on the complete training set for the test-set predictions.
    cls.fit(X_scaled, y)
    test_y = cls.predict_proba(test_x_scaled)

    # Column 1 of predict_proba is the second class in sorted label order,
    # i.e. the positive class for 0/1 targets.
    cv_out[name] = cv_y[:, 1]
    test_out[name] = test_y[:, 1]

working on lg
working on mlp
working on rf


In [None]:
# add statistics
#
# Append row-wise mean / min / max of the per-model probabilities to both
# stacking frames.
#
# The model columns are selected by name. The earlier positional form
# `iloc[:, [1, num]]` (and `[0, num-1]` for the test frame) is a two-element
# list, not a range, so it only ever looked at the first and the last model
# and silently ignored every model in between.
num = len(models)
model_cols = [name for name, _ in models]

# Take the model columns once, before any summary column is added, so the
# statistics never feed on each other and the cell can be re-run safely.
cv_preds = cv_out[model_cols]
cv_out['mean'] = cv_preds.mean(axis=1)
cv_out['min'] = cv_preds.min(axis=1)
cv_out['max'] = cv_preds.max(axis=1)

test_preds = test_out[model_cols]
test_out['mean'] = test_preds.mean(axis=1)
test_out['min'] = test_preds.min(axis=1)
test_out['max'] = test_preds.max(axis=1)

In [None]:
# Preview the first rows of the training-side stacking frame.
cv_out.head()

In [None]:
# Preview the first rows of the test-side stacking frame.
test_out.head()

In [None]:
# Write the training-side stacking frame to CSV; the frame's index is written
# as the first column (to_csv default).
cv_out.to_csv('input/train_stack.csv')

In [None]:
# Write the test-side stacking frame to CSV; the frame's index is written as
# the first column (to_csv default).
test_out.to_csv('input/test_stack.csv')