In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [16]:
df = pd.read_csv('input/train.csv')
# df = pd.read_csv('input/train_min.csv')  # small data

fix_data_skew = True

if fix_data_skew:
    trues = df.loc[df['target'] == 1]
    falses = df.loc[df['target'] != 1].sample(frac=1)[:len(trues)]
    data = pd.concat([trues, falses], ignore_index=True).sample(frac=1)
else:
    data = df
    
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
36797,train_61453,0,13.2436,-6.5741,10.6925,6.7935,8.6233,-21.4398,5.9569,20.496,...,6.0697,9.4453,3.2494,7.0239,15.9781,0.5302,5.2766,7.9107,19.6704,-15.994
33815,train_188482,0,12.7426,3.6071,9.3626,7.3927,10.4472,0.6793,5.9063,13.9624,...,5.6268,11.7358,3.6914,-0.1375,16.7893,2.1196,0.3551,8.6532,15.7453,-16.147
21465,train_17183,0,10.1024,-5.9566,12.4587,7.3993,8.8282,3.2796,6.5689,18.8125,...,-3.1709,6.2639,1.3712,6.4835,16.8696,-2.6564,-4.9816,10.8511,16.2587,12.3038
21758,train_67078,0,7.9004,1.3109,6.1864,4.9681,12.2709,-17.5565,6.3439,18.5405,...,2.3476,5.3638,3.521,1.8689,18.5687,-0.2031,2.8688,8.7329,15.8691,10.9652
28814,train_157470,0,5.8174,6.6683,12.4757,7.3148,12.6124,-7.4812,4.2887,9.9485,...,2.2002,9.8007,1.4207,7.8151,22.584,-0.1439,5.4137,10.1074,17.1048,13.4404


In [17]:
X, y = data.iloc[:,2:].values, data.iloc[:,1].values

# std scaling
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [18]:
#cv: 0.8597059678850029
#      0.8584394316060081 (with data skew fix)
lg = SGDClassifier(loss='log', max_iter=5000, tol=1e-7, alpha=0.3)

#cv: 0.8543658603036868
#      0.8466847795424428 (with data skew fix)
mlp = MLPClassifier(solver='lbfgs', alpha=0.001,
                    hidden_layer_sizes=(5, 2), random_state=1)

# cv: 0.83 (with data skew fix)
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')

# try grid-search
#params = { 'n_estimators': [ 3000,10000,30000 ] }
#cls = GridSearchCV(estimator=cls, param_grid=params, cv=5, scoring='roc_auc')
#cls.fit(X_scaled, y)
#print(cls.best_params_)
#print(cls.best_score_)

# single
#cv_scores = cross_val_score(cls, X_scaled, y, cv=5, scoring='roc_auc')
#print(cv_scores)
#print(np.average(cv_scores))

# voting
cls = VotingClassifier(estimators=[('lg', lg), ('mlp', mlp), ('rf', rf)], voting='soft')
for clf, label in zip([lg, mlp, rf, cls], ['Logistic Regression', 'Multi-layer perceptron', 'Random forest', 'Ensemble']):
    scores = cross_val_score(clf, X_scaled, y, cv=5, scoring='roc_auc')
    print("Score: %0.2f (+/- %0.16f) [%s]" % (scores.mean(), scores.std(), label))

Score: 0.86 (+/- 0.0031626706705202) [Logistic Regression]
Score: 0.85 (+/- 0.0045031426288045) [Multi-layer perceptron]
Score: 0.83 (+/- 0.0043657462341541) [Random forest]
Score: 0.87 (+/- 0.0032999281758156) [Ensemble]


In [19]:
cls.fit(X_scaled, y)

VotingClassifier(estimators=[('lg', SGDClassifier(alpha=0.3, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=5000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       po...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [20]:
# create final output
test = pd.read_csv('input/test.csv')
test_ids, test_x = test.iloc[:,0], test.iloc[:,1:]

test_x_scaled = scaler.transform(test_x)
test_y = cls.predict_proba(test_x_scaled)

In [21]:
# output the result
sub_ids = pd.DataFrame(test_ids)
sub_y = pd.DataFrame(pd.DataFrame(test_y).iloc[:,1])

sub = pd.DataFrame(np.hstack((sub_ids, sub_y)))
sub.to_csv('output.csv', header=['ID_code', 'target'], index=False)

output_check = pd.read_csv('output.csv')
output_check.head()

Unnamed: 0,ID_code,target
0,test_0,0.567813
1,test_1,0.594392
2,test_2,0.465717
3,test_3,0.567965
4,test_4,0.460915
