In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import time
from sklearn import *

from itertools import product



In [2]:
# load data
x_data_all = np.genfromtxt("data/X_train.txt", delimiter=None)
y_data_all = np.genfromtxt("data/Y_train.txt", delimiter=None)
x_test = np.genfromtxt("data/X_test.txt", delimiter=None)

In [3]:
# x_data = x_data_all[:10000, :]
# y_data = y_data_all[:10000]

x_data = x_data_all
y_data = y_data_all

In [4]:
# split training / validation data
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    x_data, y_data, test_size=0.2, random_state=42)

In [5]:
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
     max_depth=15, max_leaf_nodes=150, random_state=0)

print("training started")
starting_time = time.time()
gradient_boosting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

gradient_boosting_classifier_roc = metrics.roc_auc_score(
   y_validation, gradient_boosting_classifier.predict_proba(x_validation)[:,1])
print("ROC:", gradient_boosting_classifier_roc)

training started
training finished, took 454.1300449371338 seconds
ROC: 0.775414018092


In [6]:
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=5, n_jobs=-1, oob_score=True)

print("training started")
starting_time = time.time()
random_forest_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

random_forest_classifier_roc = metrics.roc_auc_score(
    y_validation, random_forest_classifier.predict_proba(x_validation)[:,1])
print("ROC:", random_forest_classifier_roc)

training started
training finished, took 414.6953058242798 seconds
ROC: 0.786627537826


In [7]:
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=1000, max_depth=50, min_samples_split=10, min_samples_leaf=2, max_features='log2', n_jobs=-1)

print("training started")
starting_time = time.time()
extra_tree_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

extra_tree_classifier_roc = metrics.roc_auc_score(
    y_validation, extra_tree_classifier.predict_proba(x_validation)[:,1])
print("validation roc:", extra_tree_classifier_roc)

training started
training finished, took 161.22091603279114 seconds
validation roc: 0.783247667724


In [9]:
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1)

print("training started")
starting_time = time.time()
knn_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

y_validation_hat = knn_classifier.predict_proba(x_validation)[:,1]

knn_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
print("ROC:",knn_classifier_roc)

training started
training finished, took 18.43257713317871 seconds
ROC: 0.722975348052


In [8]:
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
     max_depth=15, max_leaf_nodes=150, random_state=0)
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=5, n_jobs=-1, oob_score=True)
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=1000, max_depth=50, min_samples_split=10, min_samples_leaf=2, max_features='log2', n_jobs=-1)
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1)


voting_classifier = ensemble.VotingClassifier(
    estimators=[('GB',gradient_boosting_classifier), ('RF',random_forest_classifier),
                ('Extra',extra_tree_classifier), ('KNN',knn_classifier)], voting='soft',
    weights=[77,78,78,72], flatten_transform=True)

print("training started")
starting_time = time.time()
voting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

y_validation_hat = voting_classifier.predict_proba(x_validation)[:,1]
voting_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
print("ROC:",voting_classifier_roc)

training started
training finished, took 902.1342298984528 seconds
ROC: 0.785299689502


In [9]:
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
     max_depth=15, max_leaf_nodes=150, random_state=0)
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=5, n_jobs=-1, oob_score=True)
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=1000, max_depth=50, min_samples_split=10, min_samples_leaf=2, max_features='log2', n_jobs=-1)
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1)


voting_classifier = ensemble.VotingClassifier(
    estimators=[('GB',gradient_boosting_classifier), ('RF',random_forest_classifier),
                ('Extra',extra_tree_classifier), ('KNN',knn_classifier)], voting='soft',
    weights=[7,15,8,2], flatten_transform=True)

print("training started")
starting_time = time.time()
voting_classifier.fit(x_train, y_train)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

y_validation_hat = voting_classifier.predict_proba(x_validation)[:,1]
voting_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
print("ROC:",voting_classifier_roc)

training started
training finished, took 921.9069209098816 seconds
ROC: 0.79136950903


In [10]:
gradient_boosting_classifier = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.5,
     max_depth=15, max_leaf_nodes=150, random_state=0)
random_forest_classifier = ensemble.RandomForestClassifier(
    n_estimators=1000, min_samples_leaf=5, n_jobs=-1, oob_score=True)
extra_tree_classifier = ensemble.ExtraTreesClassifier(
    n_estimators=1000, max_depth=50, min_samples_split=10, min_samples_leaf=2, max_features='log2', n_jobs=-1)
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=10, weights="distance", n_jobs=-1)


voting_classifier = ensemble.VotingClassifier(
    estimators=[('GB',gradient_boosting_classifier), ('RF',random_forest_classifier),
                ('Extra',extra_tree_classifier), ('KNN',knn_classifier)], voting='soft',
    weights=[7,15,8,2], flatten_transform=True)

print("training started")
starting_time = time.time()
voting_classifier.fit(x_data, y_data)
end_time = time.time()
print("training finished, took {} seconds".format(end_time - starting_time))

y_test_hat = voting_classifier.predict_proba(x_test)[:,1]
#voting_classifier_roc = metrics.roc_auc_score(y_validation, y_validation_hat)
#print("ROC:",voting_classifier_roc)

training started
training finished, took 1356.8677637577057 seconds


In [11]:
Yte = np.vstack((np.arange(len(y_test_hat)), y_test_hat)).T
np.savetxt('Y_submit_Final1.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

## Test Result ROC=0.79721