In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import time
from sklearn import *

from itertools import product



In [2]:
# a helper function to generate a file
def to_kaggle(prediction):
    """Write predictions to a timestamped submission file under ``results/``.

    The file starts with the header line ``ID,Prob1`` and then holds one row
    per prediction: the zero-based row index followed by the predicted value
    printed with two decimals.

    Parameters
    ----------
    prediction : 1-D sequence of float
        One predicted value per row, in the order the rows should be numbered.
    """
    import os  # local import: only this helper needs it

    # np.savetxt does not create missing directories, so a fresh checkout
    # without results/ would otherwise fail right at the final step.
    os.makedirs("results", exist_ok=True)

    current_time_str = time.strftime("%H-%M-%S_%a_%b_%d", time.localtime())
    file_name = "results/{}.txt".format(current_time_str)

    # Two columns: row id, prediction.
    rows = np.vstack((np.arange(len(prediction)), prediction)).T

    # fmt is a full row template (one specifier per column), so no separate
    # delimiter argument is needed.
    # NOTE(review): two decimals collapse nearby probabilities into ties --
    # confirm this precision is what the leaderboard metric should see.
    np.savetxt(file_name, rows, fmt='%d, %.2f', header='ID,Prob1', comments='')


In [3]:
# load data (delimiter=None lets genfromtxt split columns on whitespace)
x_data_all, y_data_all, x_test = [
    np.genfromtxt(data_path, delimiter=None)
    for data_path in ("data/X_train.txt", "data/Y_train.txt", "data/X_test.txt")
]

In [4]:
# Number of leading rows to use for modelling. None keeps every row; set a
# small integer (e.g. 10000) for a quick experiment instead of editing code.
SUBSET_SIZE = None

# Slicing with [:None] selects the whole array, so the default is a no-op.
x_data = x_data_all[:SUBSET_SIZE]
y_data = y_data_all[:SUBSET_SIZE]

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same from run to run.
holdout_split = model_selection.train_test_split(
    x_data, y_data, test_size=0.2, random_state=42)
x_train, x_validation, y_train, y_validation = holdout_split

In [6]:
# Print the held-out labels as a quick look at what the split produced.
print(y_validation)

[ 0.  1.  0. ...,  0.  0.  1.]


In [9]:
# Degree-2 polynomial feature expansion feeding a logistic regression.
poly_expansion = preprocessing.PolynomialFeatures(degree=2, interaction_only=False)
logistic_model = linear_model.LogisticRegression()
logistic_classifier = pipeline.Pipeline(
    steps=[('poly', poly_expansion), ('logistic', logistic_model)])

print("training started")
logistic_classifier.fit(x_train, y_train)
print("training finished")

# ROC AUC is computed from column 1 of predict_proba.
logistic_validation_proba = logistic_classifier.predict_proba(x_validation)[:, 1]
logistic_classifier_roc = metrics.roc_auc_score(y_validation, logistic_validation_proba)
print("validation roc:", logistic_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
logistic_train_accuracy = logistic_classifier.score(x_train, y_train)
logistic_validation_accuracy = logistic_classifier.score(x_validation, y_validation)
print("training error:", 1 - logistic_train_accuracy)
print("validation error:", 1 - logistic_validation_accuracy)

training started
training finished
validation roc: 0.662912115548
training error: 0.310375
validation error: 0.30025


0.666027054393


In [16]:
# 10 nearest neighbours, each vote weighted by inverse distance; n_jobs=-1
# asks scikit-learn to use every available core.
knn_classifier = neighbors.KNeighborsClassifier(
    n_neighbors=10,
    weights="distance",
    n_jobs=-1,
)

print("training started")
starting_time = time.time()
knn_classifier.fit(x_train, y_train)
end_time = time.time()
knn_fit_seconds = end_time - starting_time
print("training finished, took {} seconds".format(knn_fit_seconds))

# Column 1 of predict_proba is the score fed to ROC AUC.
y_validation_hat = knn_classifier.predict_proba(x_validation)[:, 1]
knn_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
print(knn_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
knn_train_accuracy = knn_classifier.score(x_train, y_train)
knn_validation_accuracy = knn_classifier.score(x_validation, y_validation)
print("training error:", 1 - knn_train_accuracy)
print("validation error:", 1 - knn_validation_accuracy)

training started
training finished, took 1.9437119960784912 seconds
0.698577652125
training error: 0.032875
validation error: 0.2981


In [11]:
# Random forest of 1000 trees with at least 4 samples per leaf.
# random_state pins the bootstrap and feature sampling so the reported scores
# (and the ensemble built from this model) are reproducible; 0 matches the
# seed already used for the gradient boosting model in this notebook.
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=4, n_jobs=-1, oob_score=True,
    random_state=0)

print("training started")
starting_time = time.time()
random_forest_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC is computed from column 1 of predict_proba.
random_forest_classifier_roc = metrics.roc_auc_score(
    y_validation, random_forest_classifier.predict_proba(x_validation)[:,1])
print(random_forest_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
print("training error:", 1 - random_forest_classifier.score(x_train, y_train))
print("validation error:", 1 - random_forest_classifier.score(x_validation, y_validation))

training started
training finished, took 115.00101113319397 seconds
0.755056462307
training error: 0.12035
validation error: 0.2688


In [20]:
# Standardise the features, then train a two-hidden-layer MLP (280 and 140 units).
# The scaler is passed unfitted: Pipeline.fit fits every step itself, so
# pre-fitting it on x_train was redundant work.
# random_state pins weight initialisation and batch shuffling so the reported
# score is reproducible; 0 matches the gradient boosting model's seed.
neural_network_classifier = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("nn", neural_network.MLPClassifier(
        hidden_layer_sizes=(14*20, 14*10),
        # warm_start only matters if fit() is called again on this same object
        warm_start=True,
        random_state=0))])

print("training started")
starting_time = time.time()
neural_network_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC is computed from column 1 of predict_proba.
neural_network_classifier_roc = metrics.roc_auc_score(
    y_validation, neural_network_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", neural_network_classifier_roc)

training started
training finished, took 631.049479007721 seconds
validation roc: 0.704233635354


In [21]:
# score() is mean accuracy, so 1 - score is the misclassification rate.
neural_network_train_accuracy = neural_network_classifier.score(x_train, y_train)
neural_network_validation_accuracy = neural_network_classifier.score(x_validation, y_validation)
print("training error:", 1 - neural_network_train_accuracy)
print("validation error:", 1 - neural_network_validation_accuracy)

training error: 0.21175
validation error: 0.29475


In [58]:
# Extremely randomised trees: 500 trees, depth capped at 50, log2(n_features)
# candidate features per split.
# random_state pins the random split thresholds and feature subsets so the
# reported score is reproducible; 0 matches the gradient boosting model's seed.
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=500, max_depth=50, min_samples_split=10, min_samples_leaf=2,
    max_features='log2', n_jobs=-1, random_state=0)

print("training started")
starting_time = time.time()
extra_tree_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC is computed from column 1 of predict_proba.
extra_tree_classifier_roc = metrics.roc_auc_score(
    y_validation, extra_tree_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", extra_tree_classifier_roc)

training started
training finished, took 25.13662600517273 seconds
validation roc: 0.956336506031


In [59]:
# Gaussian naive Bayes with default settings.
naive_bayes_classifier = naive_bayes.GaussianNB()

print("training started")
starting_time = time.time()
naive_bayes_classifier.fit(x_train, y_train)
end_time = time.time()
naive_bayes_fit_seconds = end_time - starting_time
print("training finished, took {} seconds".format(naive_bayes_fit_seconds))

# ROC AUC is computed from column 1 of predict_proba.
naive_bayes_validation_proba = naive_bayes_classifier.predict_proba(x_validation)[:, 1]
naive_bayes_classifier_roc = metrics.roc_auc_score(
    y_validation, naive_bayes_validation_proba)
print("validation roc:", naive_bayes_classifier_roc)

training started
training finished, took 0.06046605110168457 seconds
validation roc: 0.602394200656


In [42]:
# Gradient-boosted trees; hyper-parameters are collected in one place so they
# are easy to scan and tweak. random_state=0 keeps the fit repeatable.
gradient_boosting_params = dict(
    n_estimators=100,
    learning_rate=0.5,
    max_depth=10,
    max_leaf_nodes=15,
    random_state=0,
)
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(**gradient_boosting_params)

print("training started")
starting_time = time.time()
gradient_boosting_classifier.fit(x_train, y_train)
end_time = time.time()
gradient_boosting_fit_seconds = end_time - starting_time
print("training finished, took {} seconds".format(gradient_boosting_fit_seconds))

# ROC AUC is computed from column 1 of predict_proba.
gradient_boosting_validation_proba = gradient_boosting_classifier.predict_proba(x_validation)[:, 1]
gradient_boosting_classifier_roc = metrics.roc_auc_score(
    y_validation, gradient_boosting_validation_proba)
print(gradient_boosting_classifier_roc)

# score() is mean accuracy, so 1 - score is the misclassification rate.
gradient_boosting_train_accuracy = gradient_boosting_classifier.score(x_train, y_train)
gradient_boosting_validation_accuracy = gradient_boosting_classifier.score(x_validation, y_validation)
print("training error:", 1 - gradient_boosting_train_accuracy)
print("validation error:", 1 - gradient_boosting_validation_accuracy)

training started
training finished, took 22.75811004638672 seconds
0.722670601296
training error: 0.2485625
validation error: 0.2793


In [38]:
# Misclassification rate (1 - mean accuracy) of the gradient boosting model
# on the training and validation splits.
gb_train_accuracy = gradient_boosting_classifier.score(x_train, y_train)
gb_validation_accuracy = gradient_boosting_classifier.score(x_validation, y_validation)
print("training error:", 1 - gb_train_accuracy)
print("validation error:", 1 - gb_validation_accuracy)

training error: 0.262225
validation error: 0.2847


In [25]:
# AdaBoost over depth-3 trees that each consider 7 candidate features per split.
# Both the base tree (random feature subsets) and the booster are seeded so
# the reported score is reproducible; 0 matches the gradient boosting model's seed.
ada_classifier = ensemble.AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=3, max_features=7, random_state=0),
    n_estimators=1000, learning_rate=0.5, random_state=0)

print("training started")
starting_time = time.time()
ada_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

# ROC AUC is computed from column 1 of predict_proba.
ada_classifier_roc = metrics.roc_auc_score(
   y_validation, ada_classifier.predict_proba(x_validation)[:,1])
print(ada_classifier_roc)

training started
training finished, took 147.051344871521 seconds
0.729018364083


In [36]:
# Ensemble members as (name, fitted classifier, voting weight).
classifier_list = [
    ("logistic_regression", logistic_classifier, 3),
    ("k_nearest_neighbor", knn_classifier, 1),
    ("random_forest", random_forest_classifier, 15),
    ("neural_network", neural_network_classifier, 4),
    ("gradient_boosting", gradient_boosting_classifier, 9),
]

ensemble_weights = [weight for _, _, weight in classifier_list]

# One row of predict_proba column-1 scores per member, on the validation split.
y_validation_hat_list = [
    model.predict_proba(x_validation)[:, 1] for _, model, _ in classifier_list]

# A weighted average gives the same blend as averaging `weight` copies of
# each member's predictions, without materialising the copies.
y_validation_hat_average = np.average(
    np.array(y_validation_hat_list), axis=0, weights=ensemble_weights)

voting_roc = metrics.roc_auc_score(y_validation, y_validation_hat_average)
print("roc:", voting_roc)
print("weight:", ensemble_weights)

roc: 0.762889233293
weight: [3, 1, 15, 4, 9]


In [84]:
# One row of predict_proba column-1 scores per ensemble member, on the test set.
# Each classifier_list entry is (name, fitted classifier, voting weight).
y_test_hat_list = [
    entry[1].predict_proba(x_test)[:, 1] for entry in classifier_list]
test_vote_weights = [entry[2] for entry in classifier_list]

# A weighted average gives the same blend as averaging `weight` copies of
# each member's predictions, without materialising the copies.
y_test_hat_average = np.average(
    np.array(y_test_hat_list), axis=0, weights=test_vote_weights)


In [85]:
# Write the blended test-set predictions to a timestamped file under results/.
to_kaggle(y_test_hat_average)

In [86]:
# Visible marker that execution reached this point.
print("finished")

finished


In [None]:
# weights = [0, 1, 3, 6]
# weight_list = list(product(weights, repeat=len(classifier_list)))

# roc_search_list = []

# for weight_i in range(len(weight_list)):
#     print("weight #", weight_i)
#     # find the voting_roc
#     y_validation_hat_list = []
#     for classifier_i in range(len(classifier_list)):
#         y_validation_hat = classifier_list[classifier_i][1].predict_proba(x_validation)[:,1]
#         for i in range(weight_list[weight_i][classifier_i]):
#             y_validation_hat_list.append(y_validation_hat)
#     y_validation_hat_average = np.mean(np.array(y_validation_hat_list), axis=0)
#     voting_roc = metrics.roc_auc_score(y_validation, y_validation_hat_average)
#     # add the voting_roc to search list
#     roc_search_list.append(voting_roc)
            

In [None]:
# Grid-search the ensemble voting weights on the validation split and report
# the best combination. The search is done here so this cell runs on a fresh
# kernel instead of relying on variables from a disabled cell.
weights = [0, 1, 3, 6]

# predict_proba does not depend on the weights, so call it once per member.
# Each classifier_list entry is (name, fitted classifier, voting weight).
validation_probabilities = np.array([
    entry[1].predict_proba(x_validation)[:, 1] for entry in classifier_list])

# Skip the all-zero combination: a weighted average is undefined for it.
weight_list = [
    combination
    for combination in product(weights, repeat=len(classifier_list))
    if any(combination)]

roc_search_list = [
    metrics.roc_auc_score(
        y_validation,
        np.average(validation_probabilities, axis=0, weights=combination))
    for combination in weight_list]

# argmax returns the first maximum, i.e. the same entry list.index(max(...)) picks.
best_index = int(np.argmax(roc_search_list))
max_roc = roc_search_list[best_index]
max_roc_weight = weight_list[best_index]

print("max roc:", max_roc)
print("weight:", max_roc_weight)

In [None]:
# Display the (name, classifier, weight) entries that make up the ensemble.
classifier_list