In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import time
from sklearn import *

from itertools import product




In [26]:
def to_kaggle(prediction):
    current_time_str = time.strftime("%H-%M-%S_%a_%b_%d", time.localtime())
    file_name = "{}.txt".format(current_time_str)
    np.savetxt(file_name,
           np.vstack( (np.arange(len(prediction)) , prediction) ).T,
           '%d, %.2f',header='ID,Prob1',comments='',delimiter=',');

In [3]:
x_data_all = np.genfromtxt("X_train.txt", delimiter=None)
y_data_all = np.genfromtxt("Y_train.txt", delimiter=None)
x_test = np.genfromtxt("X_test.txt", delimiter=None)

In [5]:
x_data = x_data_all
y_data = y_data_all

In [6]:
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    x_data, y_data, test_size=0.2, random_state=42)

In [7]:
logistic_classifier = pipeline.Pipeline([
    ('poly', preprocessing.PolynomialFeatures(degree=2, interaction_only=False)),
    ('logistic', linear_model.LogisticRegression())])

print("training started")
logistic_classifier.fit(x_train, y_train)
print("training finished")

logistic_classifier_roc = metrics.roc_auc_score(y_validation, logistic_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", logistic_classifier_roc)

print("training error:", 1 - logistic_classifier.score(x_train, y_train))
print("validation error:", 1 - logistic_classifier.score(x_validation, y_validation))

training started
training finished
('validation roc:', 0.6678297915568554)
('training error:', 0.30789374999999997)
('validation error:', 0.30530000000000002)


In [8]:
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1)

print("training started")
starting_time = time.time()
knn_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

y_validation_hat = knn_classifier.predict_proba(x_validation)[:,1]

knn_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
print(knn_classifier_roc)

print("training error:", 1 - knn_classifier.score(x_train, y_train))
print("validation error:", 1 - knn_classifier.score(x_validation, y_validation))

training started
training finished, took 11.412612915 seconds
0.722975348052
('training error:', 0.051062500000000011)
('validation error:', 0.28749999999999998)


In [9]:
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=4, n_jobs=-1, oob_score=True)

print("training started")
starting_time = time.time()
random_forest_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

random_forest_classifier_roc = metrics.roc_auc_score(
    y_validation, random_forest_classifier.predict_proba(x_validation)[:,1])
print(random_forest_classifier_roc)

print("training error:", 1 - random_forest_classifier.score(x_train, y_train))
print("validation error:", 1 - random_forest_classifier.score(x_validation, y_validation))

training started
training finished, took 226.163815975 seconds
0.786231409184
('training error:', 0.12881874999999998)
('validation error:', 0.25797499999999995)


In [None]:
neural_network_classifier = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler().fit(x_train)),
    ("nn", neural_network.MLPClassifier(
        hidden_layer_sizes=(14*20, 14*10),
        warm_start=True))])

print("training started")
starting_time = time.time()
neural_network_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

neural_network_classifier_roc = metrics.roc_auc_score(
    y_validation, neural_network_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", neural_network_classifier_roc)

training started


In [19]:
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=500, max_depth=50, min_samples_split=10, min_samples_leaf=2, max_features='log2', n_jobs=-1)

print("training started")
starting_time = time.time()
extra_tree_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

extra_tree_classifier_roc = metrics.roc_auc_score(
    y_validation, extra_tree_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", extra_tree_classifier_roc)

training started
training finished, took 50.4556488991 seconds
('validation roc:', 0.78362188463172111)


In [20]:
naive_bayes_classifier = naive_bayes.GaussianNB()

print("training started")
starting_time = time.time()
naive_bayes_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

naive_bayes_classifier_roc = metrics.roc_auc_score(
    y_validation, naive_bayes_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", naive_bayes_classifier_roc)

training started
training finished, took 0.123184204102 seconds
('validation roc:', 0.61240952891841061)


In [21]:
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
     max_depth=10, max_leaf_nodes=15, random_state=0)

print("training started")
starting_time = time.time()
gradient_boosting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = metrics.roc_auc_score(
   y_validation, gradient_boosting_classifier.predict_proba(x_validation)[:,1])
print(gradient_boosting_classifier_roc)

print("training error:", 1 - gradient_boosting_classifier.score(x_train, y_train))
print("validation error:", 1 - gradient_boosting_classifier.score(x_validation, y_validation))

training started
training finished, took 50.8728189468 seconds
0.744006779983
('training error:', 0.25843125)
('validation error:', 0.27507499999999996)


In [22]:
ada_classifier = ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=3, max_features=7),
               n_estimators=1000, learning_rate=0.5)

print("training started")
starting_time = time.time()
ada_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

ada_classifier_roc = metrics.roc_auc_score(
   y_validation, ada_classifier.predict_proba(x_validation)[:,1])
print(ada_classifier_roc)

training started
training finished, took 277.880982161 seconds
0.76433083815


In [23]:
classifier_list = [
    ("logistic_regression", logistic_classifier, 3),
    ("k_nearest_neighbor", knn_classifier, 1),
    ("random_forest", random_forest_classifier, 15),
    ("neural_network", neural_network_classifier, 4),
    ("gradient_boosting", gradient_boosting_classifier, 9),
#     ("ada_boosting", ada_classifier, 9),
]


y_validation_hat_list = []
for classifier in classifier_list:
    y_validation_hat = classifier[1].predict_proba(x_validation)[:,1]
    for i in range(classifier[2]):
        y_validation_hat_list.append(y_validation_hat)
        
y_validation_hat_average = np.mean(np.array(y_validation_hat_list), axis=0)

voting_roc = metrics.roc_auc_score(y_validation, y_validation_hat_average)
print("roc:", voting_roc)
print("weight:", [classifier[2] for classifier in classifier_list])

('roc:', 0.78682868185895594)
('weight:', [3, 1, 15, 4, 9])


In [24]:
y_test_hat_list = []
for classifier in classifier_list:
    y_test_hat = classifier[1].predict_proba(x_test)[:,1]
    for i in range(classifier[2]):
        y_test_hat_list.append(y_test_hat)
        
y_test_hat_average = np.mean(np.array(y_test_hat_list), axis=0)

In [27]:
to_kaggle(y_test_hat_average)
print("finished")

finished
