In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import time
from sklearn import *



In [2]:
# a helper function to generate a Kaggle submission file
def to_kaggle(prediction):
    """Write column 1 of ``prediction`` to a timestamped submission file.

    The file is created as ``results/<HH-MM-SS_Day_Mon_DD>.txt`` relative to
    the current working directory. It starts with the header ``ID,Prob1`` and
    has one ``<row index>, <value with 2 decimals>`` line per row.

    Parameters
    ----------
    prediction : array-like, 2-D with at least two columns
        Only column 1 is written. Presumably this is the ``predict_proba``
        output of a binary classifier -- confirm against the caller.

    Returns
    -------
    str
        Path of the file that was written.
    """
    # Local import: keeps this cell self-contained without touching the import cell.
    import os

    # Accept plain nested lists as well as ndarrays.
    prediction = np.asarray(prediction)

    current_time_str = time.strftime("%H-%M-%S_%a_%b_%d", time.localtime())
    file_name = "results/{}.txt".format(current_time_str)

    # np.savetxt does not create missing directories; without this the call
    # fails with FileNotFoundError on a fresh checkout that has no results/.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # Two columns per row: running row index, then the column-1 value.
    rows = np.vstack((np.arange(len(prediction)), prediction[:, 1])).T
    np.savetxt(file_name, rows,
               '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')
    return file_name


In [3]:
# Read the training features, training labels and test features from data/.
# delimiter=None is passed through unchanged to np.genfromtxt for every file.
x_data_all, y_data_all, x_test = (
    np.genfromtxt(source_path, delimiter=None)
    for source_path in ("data/X_train.txt", "data/Y_train.txt", "data/X_test.txt")
)

In [4]:
# Use the full data set for modelling.
# For quicker experiments, slice both arrays to the same leading rows instead,
# e.g. x_data_all[:10000, :] and y_data_all[:10000].
x_data, y_data = x_data_all, y_data_all

In [5]:
# Hold out 20% of the rows as a validation set; random_state=42 pins the shuffle
# so that every run produces the same partition.
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    x_data,
    y_data,
    random_state=42,
    test_size=0.2,
)

In [97]:
# Logistic regression on a degree-2 polynomial expansion of the inputs
# (squares and pairwise products, since interaction_only is False).
logistic_classifier = pipeline.Pipeline(steps=[
    ('poly', preprocessing.PolynomialFeatures(interaction_only=False, degree=2)),
    ('linear', linear_model.LogisticRegression()),
])

print("training started")
logistic_classifier.fit(x_train, y_train)
print("training finished")

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
logistic_validation_scores = logistic_classifier.predict_proba(x_validation)[:, 1]
logistic_classifier_roc = metrics.roc_auc_score(y_validation, logistic_validation_scores)
print(logistic_classifier_roc)

training started
training finished
0.662912115548


In [95]:
# 10-nearest-neighbour classifier with distance-weighted votes, using every
# available core (n_jobs=-1).
knn_classifier = neighbors.KNeighborsClassifier(
    weights="distance",
    n_neighbors=10,
    n_jobs=-1,
)

print("training started")
starting_time = time.time()
knn_classifier.fit(x_train, y_train)
end_time = time.time()
# Only the fit call is timed; the predict_proba call below is outside the timer.
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
knn_validation_scores = knn_classifier.predict_proba(x_validation)[:, 1]
knn_classifier_roc = metrics.roc_auc_score(y_validation, knn_validation_scores)
print(knn_classifier_roc)

training started
training finished, took 2.519016981124878 seconds
0.698577652125


In [68]:
# Random forest: 500 trees, at least 4 samples per leaf, built on 2 workers.
# oob_score=True additionally computes the out-of-bag estimate during fit.
# random_state is pinned (same seed value as the gradient-boosting cell) so the
# validation AUC printed below can be reproduced on a re-run; without it the
# forest is built from a different random stream every time.
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=500, min_samples_leaf=4, n_jobs=2, oob_score=True,
    random_state=0)

print("training started")
starting_time = time.time()
random_forest_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
random_forest_classifier_roc = metrics.roc_auc_score(
    y_validation, random_forest_classifier.predict_proba(x_validation)[:,1])
print(random_forest_classifier_roc)

training started
training finished, took 84.45092701911926 seconds
0.754645674456


In [65]:
# Standardise the inputs, then train a two-hidden-layer MLP (280 and 140 units).
# NOTE(review): the 14 in the layer sizes presumably is the number of input
# features -- confirm against the data.
#
# The scaler is handed to the pipeline unfitted: Pipeline.fit fits every step
# itself, so pre-fitting it on x_train was redundant work and wrongly suggested
# that the pre-fitted statistics were the ones being used.
# random_state is pinned (same seed value as the gradient-boosting cell) so the
# weight initialisation and mini-batch shuffling, and therefore the printed
# AUC, are repeatable.
neural_network_classifier = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("nn", neural_network.MLPClassifier(
        hidden_layer_sizes=(14 * 20, 14 * 10),
        max_iter=200,
        random_state=0))])

print("training started")
starting_time = time.time()
neural_network_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
neural_network_classifier_roc = metrics.roc_auc_score(
    y_validation, neural_network_classifier.predict_proba(x_validation)[:,1])
print(neural_network_classifier_roc)

training started
training finished, took 524.6228878498077 seconds
0.713479734646


0.7703875 0.71085


In [67]:
# Gradient-boosted trees: 1000 boosting stages at learning rate 0.5, with each
# tree limited by max_depth=3 and max_leaf_nodes=13. Seeded for repeatability.
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(
    learning_rate=0.5,
    n_estimators=1000,
    max_leaf_nodes=13,
    max_depth=3,
    random_state=0,
)

print("training started")
starting_time = time.time()
gradient_boosting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
gradient_boosting_validation_scores = gradient_boosting_classifier.predict_proba(x_validation)[:, 1]
gradient_boosting_classifier_roc = metrics.roc_auc_score(
    y_validation, gradient_boosting_validation_scores)
print(gradient_boosting_classifier_roc)

training started
training finished, took 224.91048502922058 seconds
0.750307905393


In [98]:
# AdaBoost over depth-3 decision trees, 1000 boosting rounds at learning rate 0.5.
# Each tree considers only 7 randomly chosen features per split (max_features=7),
# so the fit is stochastic. random_state is pinned on the ensemble (same seed
# value as the gradient-boosting cell); AdaBoost uses it to seed the base
# estimators, which makes the printed AUC repeatable.
ada_classifier = ensemble.AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3, max_features=7),
    n_estimators=1000, learning_rate=0.5, random_state=0)

print("training started")
starting_time = time.time()
ada_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
ada_classifier_roc = metrics.roc_auc_score(
   y_validation, ada_classifier.predict_proba(x_validation)[:,1])
print(ada_classifier_roc)

training started
training finished, took 161.67395114898682 seconds
0.730548694976


In [100]:
# Soft-voting ensemble over the five classifiers configured in the cells above.
# The logistic-regression pipeline is not included.
voting_members = [
    ("k_nearest_neighbor", knn_classifier),
    ("random_forest", random_forest_classifier),
    ("neural_network", neural_network_classifier),
    ("gradient_boosting", gradient_boosting_classifier),
    ("ada_boosting", ada_classifier),
]
voting_classifier = ensemble.VotingClassifier(voting_members, n_jobs=-1, voting="soft")

print("training started")
starting_time = time.time()
voting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC on the held-out split, scored with column 1 of predict_proba.
voting_validation_scores = voting_classifier.predict_proba(x_validation)[:, 1]
voting_classifier_roc = metrics.roc_auc_score(y_validation, voting_validation_scores)
print(voting_classifier_roc)

training started
training finished, took 722.5826351642609 seconds
0.760270938534
