In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
df = pd.read_csv('input/train.csv')
# df = pd.read_csv('input/train_min.csv')  # small data

fix_data_skew = True

if fix_data_skew:
    trues = df.loc[df['target'] == 1]
    falses = df.loc[df['target'] != 1].sample(frac=1)[:len(trues)]
    data = pd.concat([trues, falses], ignore_index=True).sample(frac=1)
else:
    data = df
    
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
11859,train_118481,1,11.4441,-3.0915,11.0087,8.8328,9.39,-16.91,4.8323,18.9352,...,-1.2465,10.6506,-0.8165,-0.2405,17.9137,0.533,10.4117,8.9796,14.0923,1.8749
22987,train_149145,0,8.0947,1.1614,8.7245,5.6639,10.3463,-13.0351,4.4251,15.5809,...,-1.1997,7.1187,0.8186,4.0727,16.243,0.2047,5.596,9.3615,10.7585,7.5026
986,train_9913,1,16.6009,-6.73,12.429,8.4804,13.5026,1.941,4.742,20.701,...,6.3938,11.3078,-0.9638,-1.5045,23.7133,-1.8739,7.6578,8.7819,21.6391,-19.4144
27259,train_158431,0,14.4241,-2.5992,10.322,6.7461,13.7528,-8.245,5.7568,18.2593,...,3.2648,6.6875,4.6267,5.981,11.0566,-0.9591,8.6425,8.0553,15.8434,8.1109
29205,train_181823,0,5.4817,1.3942,10.8116,4.4325,9.8978,0.2387,6.0674,14.4902,...,5.0274,7.5158,1.8741,4.1429,12.8867,-0.14,3.1719,9.4963,16.9671,-15.4862


In [3]:
X, y = data.iloc[:,2:].values, data.iloc[:,1].values

# std scaling
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [4]:
#cv: 0.8597059678850029
#      0.8584394316060081 (with data skew fix)
lg = SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)

#cv: 0.8543658603036868
#      0.8466847795424428 (with data skew fix)
mlp = MLPClassifier(solver='lbfgs', alpha=0.001,
                    hidden_layer_sizes=(5, 2), random_state=1)

# cv: 0.83 (with data skew fix)
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')

# try grid-search
#params = { 'n_estimators': [ 3000,10000,30000 ] }
#cls = GridSearchCV(estimator=cls, param_grid=params, cv=5, scoring='roc_auc')
#cls.fit(X_scaled, y)
#print(cls.best_params_)
#print(cls.best_score_)

# single
#cv_scores = cross_val_score(cls, X_scaled, y, cv=5, scoring='roc_auc')
#print(cv_scores)
#print(np.average(cv_scores))

# voting
cls = VotingClassifier(estimators=[('lg', lg), ('mlp', mlp), ('rf', rf)], voting='hard')
for clf, label in zip([lg, mlp, rf, cls], ['Logistic Regression', 'Multi-layer perceptron', 'Random forest', 'Ensemble']):
    scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='roc_auc')
    print("Score: %0.2f (+/- %0.16f) [%s]" % (scores.mean(), scores.std(), label))

Score: 0.86 (+/- 0.0035904325772501) [Logistic Regression]
Score: 0.85 (+/- 0.0036313174814808) [Multi-layer perceptron]
Score: 0.83 (+/- 0.0028857763113783) [Random forest]


AttributeError: predict_proba is not available when voting='hard'

In [None]:
cls.fit(X_scaled, y)

In [None]:
# create final output
test = pd.read_csv('input/test.csv')
test_ids, test_x = test.iloc[:,0], test.iloc[:,1:]

test_x_scaled = scaler.transform(test_x)
test_y = cls.predict_proba(test_x_scaled)

In [None]:
pd.DataFrame(pd.DataFrame(test_y))

In [None]:
# output the result
sub_ids = pd.DataFrame(test_ids)
sub_y = pd.DataFrame(pd.DataFrame(test_y).iloc[:,1])

sub = pd.DataFrame(np.hstack((sub_ids, sub_y)))
sub.to_csv('output.csv', header=['ID_code', 'target'], index=False)

output_check = pd.read_csv('output.csv')
output_check.head()