# Kaggle Competition

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.metrics import roc_auc_score

np.random.seed(0)
%matplotlib inline

## Loading Data

In [2]:
# Training features and labels, read from the text files under data/
X, Y = (np.genfromtxt(path, delimiter=None)
        for path in ('data/X_train.txt', 'data/Y_train.txt'))

# Test features (no labels are loaded for these)
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

In [3]:
# Sanity check: row/column counts of the three loaded arrays
print(*[arr.shape for arr in (X, Y, Xte)])

(200000, 14) (200000,) (200000, 14)


In [4]:
# Split into 80% training / 20% validation first, then shuffle only the
# training part. (Shuffling X, Y before the split was tried as an alternative
# and is not used here.)
(Xtr, Xva, Ytr, Yva) = ml.splitData(X, Y, train_fraction=0.8)
(Xtr, Ytr) = ml.shuffleData(Xtr, Ytr)

In [5]:
# Confirm the sizes of the training and validation feature matrices
print(*[part.shape for part in (Xtr, Xva)])

(160000, 14) (40000, 14)


In [6]:
# Keep only the first 4000 (already shuffled) training rows so that experiments
# run faster. The final Kaggle model should be trained on the whole data set.
Xt = Xtr[:4000]
Yt = Ytr[:4000]

## kNN Classifier

#### Approach #1

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Bagged kNN: fit nBag classifiers, each on nChoice training rows drawn without
# replacement, and average their predicted probabilities.
nBag = 20
k = 6
nChoice = 100000

nTrainData = Xtr.shape[0]
nVaData = Xva.shape[0]
# Column i holds learner i's predicted P(Y=1) for every row.
Ytr_prob_mat = np.zeros((nTrainData, nBag))
Yva_prob_mat = np.zeros((nVaData, nBag))

for i in range(nBag):
    ind = np.random.choice(nTrainData, nChoice, replace=False)
    Xi, Yi = Xtr[ind, :], Ytr[ind]
    neigh = KNeighborsClassifier(n_neighbors=k, weights="distance", leaf_size=60)
    neigh.fit(Xi, Yi)
    # Column 1 of predict_proba is taken as the positive class (labels are
    # assumed to be 0/1, as in the rest of this notebook).
    Ytr_prob_mat[:, i] = neigh.predict_proba(Xtr)[:, 1]
    Yva_prob_mat[:, i] = neigh.predict_proba(Xva)[:, 1]

# Soft ensemble score: mean probability across the bagged learners.
Ytr_prob = np.mean(Ytr_prob_mat, axis=1)
Yva_prob = np.mean(Yva_prob_mat, axis=1)
# Hard labels by thresholding at 0.5. The earlier `> 0` test on averaged votes
# labelled a row positive as soon as a single learner voted for it.
Ytr_hat = (Ytr_prob > 0.5).astype(int)
Yva_hat = (Yva_prob > 0.5).astype(int)

# ROC AUC measures ranking quality, so it is computed on the probabilities
# rather than on the thresholded 0/1 labels.
print('Training AUC:', roc_auc_score(Ytr, Ytr_prob))
print('Validation AUC:', roc_auc_score(Yva, Yva_prob))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Bagged kNN with two learners, each fit on half of the training rows (drawn
# without replacement); their predicted probabilities are averaged.
nBag = 2
k = 6

nTrainData = Xtr.shape[0]
nVaData = Xva.shape[0]
nChoice = nTrainData // nBag
# Column i holds learner i's predicted P(Y=1) for every row.
Ytr_prob_mat = np.zeros((nTrainData, nBag))
Yva_prob_mat = np.zeros((nVaData, nBag))

for i in range(nBag):
    ind = np.random.choice(nTrainData, nChoice, replace=False)
    Xi, Yi = Xtr[ind, :], Ytr[ind]
    neigh = KNeighborsClassifier(n_neighbors=k, weights="distance", leaf_size=60)
    neigh.fit(Xi, Yi)
    # Column 1 of predict_proba is taken as the positive class (labels are
    # assumed to be 0/1, as in the rest of this notebook).
    Ytr_prob_mat[:, i] = neigh.predict_proba(Xtr)[:, 1]
    Yva_prob_mat[:, i] = neigh.predict_proba(Xva)[:, 1]

# Soft ensemble score: mean probability across the bagged learners.
Ytr_prob = np.mean(Ytr_prob_mat, axis=1)
Yva_prob = np.mean(Yva_prob_mat, axis=1)
# Hard labels by thresholding at 0.5. The earlier `> 0` test on averaged votes
# labelled a row positive as soon as a single learner voted for it.
Ytr_hat = (Ytr_prob > 0.5).astype(int)
Yva_hat = (Yva_prob > 0.5).astype(int)

# ROC AUC measures ranking quality, so it is computed on the probabilities
# rather than on the thresholded 0/1 labels.
print('Training AUC:', roc_auc_score(Ytr, Ytr_prob))
print('Validation AUC:', roc_auc_score(Yva, Yva_prob))

#### Approach #2

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# A single 7-nearest-neighbour classifier fit on the whole training split.
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(Xtr, Ytr)

# Score with the predicted probability of the positive class (column 1, labels
# assumed to be 0/1): ROC AUC needs a ranking, and hard 0/1 predictions
# collapse it to a single operating point.
Ytr_prob = neigh.predict_proba(Xtr)[:, 1]
Ytr_hat = (Ytr_prob > 0.5).astype(int)   # hard labels, kept for inspection
print('Training AUC:', roc_auc_score(Ytr, Ytr_prob))
Yva_prob = neigh.predict_proba(Xva)[:, 1]
Yva_hat = (Yva_prob > 0.5).astype(int)   # hard labels, kept for inspection
print('Validation AUC:', roc_auc_score(Yva, Yva_prob))

## Neural Network

#### Approach #1

In [None]:
# mltools neural-network classifier with logistic activations.
# The size list is presumably [inputs, hidden units, outputs] - TODO confirm
# against the mltools nnet documentation.
n_features = Xtr.shape[1]
n_classes = len(np.unique(Ytr))

nn = ml.nnet.nnetClassify()
nn.init_weights([n_features, 2000, n_classes], 'random', Xtr, Ytr)
nn.setActivation('logistic')

nn.train(Xtr, Ytr, stopTol=1e-8, stepsize=.25, stopIter=30)

# Report AUC on the training and validation splits in aligned columns
auc_fmt = "{0:>15}: {1:.4f}"
print(auc_fmt.format('Train AUC', nn.auc(Xtr, Ytr)))
print(auc_fmt.format('Validation AUC', nn.auc(Xva, Yva)))

#### Approach #2

In [None]:
from sklearn.neural_network import MLPClassifier

# Three-hidden-layer MLP trained with SGD and an adaptive learning rate.
clf_nnet = MLPClassifier(learning_rate='adaptive', solver='sgd', activation="logistic",
                         learning_rate_init=0.01, hidden_layer_sizes=(6000, 400, 20), max_iter=100)
clf_nnet.fit(Xtr, Ytr)

# Score with the predicted probability of the positive class (column 1, labels
# assumed to be 0/1): ROC AUC needs a ranking, and hard 0/1 predictions
# collapse it to a single operating point.
Ytr_prob = clf_nnet.predict_proba(Xtr)[:, 1]
Yva_prob = clf_nnet.predict_proba(Xva)[:, 1]
# Hard labels, kept for inspection
Ytr_hat = (Ytr_prob > 0.5).astype(int)
Yva_hat = (Yva_prob > 0.5).astype(int)
print('Train AUC:', roc_auc_score(Ytr, Ytr_prob))
print('Validation AUC:', roc_auc_score(Yva, Yva_prob))

## Random Forest

In [7]:
class BaggedTree(ml.base.classifier):
    """Ensemble classifier whose soft prediction is the mean over its learners."""

    def __init__(self, learners):
        """Store the collection of learners that make up the ensemble."""
        self.learners = learners

    def predictSoft(self, X):
        """Call ``predictSoft(X)`` on every learner and return the element-wise mean."""
        per_learner = [learner.predictSoft(X) for learner in self.learners]
        return np.mean(per_learner, axis=0)

In [8]:

# Grow n_bags decision trees, each on its own bootstrap sample of the training
# split; the sample size is a quarter of the row count of the full data set X.
n_bags = 30
bags = []
while len(bags) < n_bags:
    Xi, Yi = ml.bootstrapData(Xtr, Ytr, X.shape[0] // 4)
    bags.append(ml.dtree.treeClassify(Xi, Yi, minLeaf=8, minParent=16, maxDepth=28))

# BaggedTree.__init__ only stores the learners, so the class labels are
# attached by hand here.
bt = BaggedTree(bags)
bt.classes = np.unique(Y)

# Averaged soft predictions on the test features; the submission cell at the
# end of the notebook reads column 1 of this array.
probs = bt.predictSoft(Xte)
print(probs)

auc_fmt = "{0:>15}: {1:.4f}"
print(auc_fmt.format('Train AUC', bt.auc(Xtr, Ytr)))
print(auc_fmt.format('Validation AUC', bt.auc(Xva, Yva)))

[[0.34959964 0.65040036]
 [0.56193982 0.43806018]
 [0.79910184 0.20089816]
 ...
 [0.69210961 0.30789039]
 [0.77858499 0.22141501]
 [0.85783268 0.14216732]]
      Train AUC: 0.8757
 Validation AUC: 0.7617


## Submitting Predictions

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Refit the bagged kNN on ALL labelled data (X, Y) and predict the test set.
# X and Y are read directly instead of rebinding Xtr/Ytr: overwriting the
# training split would make every earlier cell train on the validation rows
# if it were re-run afterwards.
# NOTE(review): the submission cell at the end of the notebook writes `probs`
# (from the bagged trees), not `Yte_prob` - confirm which model is meant to be
# submitted.
nBag = 2
k = 6

nTrainData = X.shape[0]
nTestData = Xte.shape[0]
nChoice = nTrainData // nBag
# Column i holds learner i's predicted P(Y=1) for every row.
Ytr_prob_mat = np.zeros((nTrainData, nBag))
Yte_prob_mat = np.zeros((nTestData, nBag))

for i in range(nBag):
    ind = np.random.choice(nTrainData, nChoice, replace=False)
    Xi, Yi = X[ind, :], Y[ind]
    neigh = KNeighborsClassifier(n_neighbors=k, weights="distance", leaf_size=60)
    neigh.fit(Xi, Yi)
    # Column 1 of predict_proba is taken as the positive class (labels are
    # assumed to be 0/1, as in the rest of this notebook).
    Ytr_prob_mat[:, i] = neigh.predict_proba(X)[:, 1]
    Yte_prob_mat[:, i] = neigh.predict_proba(Xte)[:, 1]

# Soft ensemble score: mean probability across the bagged learners.
Ytr_prob = np.mean(Ytr_prob_mat, axis=1)
Yte_prob = np.mean(Yte_prob_mat, axis=1)
# Hard labels by thresholding at 0.5. The earlier `> 0` test on averaged votes
# labelled a row positive as soon as a single learner voted for it.
Ytr_hat = (Ytr_prob > 0.5).astype(int)
Yte_hat = (Yte_prob > 0.5).astype(int)

# ROC AUC measures ranking quality, so it is computed on the probabilities
# rather than on the thresholded 0/1 labels.
print('Training AUC:', roc_auc_score(Y, Ytr_prob))

In [None]:
# Shapes of the hard-label and probability predictions for the test set
print(Yte_hat.shape, Yte_prob.shape, sep='\n')

# First ten entries of each, as a quick sanity check
print(Yte_hat[:10], Yte_prob[:10], sep='\n')

In [None]:
# Build the submission table: a running row index in the first column and the
# P(Y=1) column of `probs` in the second.
row_ids = np.arange(Xte.shape[0])
Y_sub = np.column_stack([row_ids, probs[:, 1]])

# The header names the two columns (ID, Prob1). comments='' stops savetxt from
# prefixing the header line with '# '.
np.savetxt('data/Y_sub.txt', Y_sub, '%d, %.5f', header='ID,Prob1', comments='', delimiter=',')