In [None]:
# Settings
# NN_USE_LAST: when truthy, the classifier-creation cell passes PTH_PATH to
# NNClassifier, which then loads saved weights and skips training entirely.
NN_USE_LAST = True
# PTH_PATH: location of the saved network state dict on the mounted Drive.
PTH_PATH = '/content/drive/My Drive/CS 178/net.pth'

In [None]:
# Use gdrive
# Mount Google Drive so the data files and saved weights under
# '/content/drive' are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Add path
# Put the course folder on the module search path before the imports below;
# presumably this is where the `mltools` package lives -- TODO confirm.
import sys
sys.path.insert(1, '/content/drive/My Drive/CS 178')

# Imports
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.utils.estimator_checks import check_estimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import time

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Classifiers

Each classifier must have these functions:

In [None]:
class Classifier():
  """Minimal interface shared by every classifier wrapper in this notebook.

  Subclasses are expected to override `train` and `predict`; `auc` is a
  shared scoring helper.
  """

  def __init__(self):
    """Nothing to set up in the base class."""
    return None

  def train(self, X, Y):
    """Fit on features X and labels Y. The base class does nothing."""
    return None

  def predict(self, X):
    """Return soft predictions for X. The base class returns None."""
    return None

  def auc(self, Y, Ypred):
    """Area under the ROC curve of the scores in column 1 of Ypred against Y."""
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(Y, Ypred[:, 1])
    return metrics.auc(false_pos_rate, true_pos_rate)

## Random Forest Classifier

In [None]:
class RFClassifier(Classifier):
  """Bagged ensemble of `ml.dtree.treeClassify` trees.

  nBags     -- number of trees to train
  maxDepth  -- forwarded to each tree as `maxDepth`
  minParent -- forwarded to each tree as `minParent`
  """

  def __init__(self, nBags, maxDepth, minParent):
    super().__init__()
    self.nBags = nBags
    self.maxDepth = maxDepth
    self.minParent = minParent
    # One slot per tree; filled in by train().
    self.classifiers = [None for _ in range(nBags)]

  def train(self, X, Y):
    """Train every tree on its own bootstrap draw of (X, Y)."""
    n_samples = X.shape[0]
    for bag in range(self.nBags):
      X_boot, Y_boot = ml.bootstrapData(X, Y, n_samples)
      tree = ml.dtree.treeClassify(
          X_boot, Y_boot, maxDepth=self.maxDepth, minParent=self.minParent)
      self.classifiers[bag] = tree

  def predict(self, X):
    """Average the `predictSoft` output of all trees (mean over the tree axis)."""
    per_tree = []
    for bag in range(self.nBags):
      per_tree.append(self.classifiers[bag].predictSoft(X))
    return np.mean(per_tree, axis=0)

## kNN Classifier

In [None]:
class KNNClassifier(Classifier):
  """Thin wrapper exposing sklearn's KNeighborsClassifier through `Classifier`."""

  def __init__(self, n_neighbors, weights):
    super().__init__()
    knn_model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    self.classifier = knn_model

  def train(self, X, Y):
    """Fit the wrapped estimator and keep whatever `fit` returns."""
    fitted = self.classifier.fit(X, Y)
    self.classifier = fitted

  def predict(self, X):
    """Return the wrapped estimator's `predict_proba` output for X."""
    class_probs = self.classifier.predict_proba(X)
    return class_probs

## Neural Network Classifier

In [None]:
class Net(torch.nn.Module):
  """Fully connected network: Linear+Tanh hidden layers, then Linear+Softmax.

  features -- width of the input vector
  layers   -- list of layer widths; every entry but the last is a hidden
              layer, the last entry is the number of outputs

  `forward` returns class probabilities (rows sum to 1), not logits.

  Layers are referenced through `torch.nn` rather than the short `nn` alias:
  a later cell in this notebook rebinds the global name `nn`, and the class
  must keep working (and be re-definable) after that happens.
  """

  def __init__(self, features, layers):
    super().__init__()

    # Assemble layers.
    hidden_sizes, n_outputs = layers[:-1], layers[-1]
    all_layers = []
    input_size = features
    for width in hidden_sizes:
      all_layers.append(torch.nn.Linear(input_size, width))
      all_layers.append(torch.nn.Tanh())
      input_size = width

    all_layers.append(torch.nn.Linear(input_size, n_outputs))
    # dim=1 normalizes across classes for a (batch, classes) input.
    all_layers.append(torch.nn.Softmax(dim=1))

    self.layers = torch.nn.Sequential(*all_layers)

  def forward(self, x):
    """Run x through the stacked layers and return the softmax output."""
    return self.layers(x)

class NNClassifier(Classifier):
  """Neural-network classifier built on `Net`, trained with a class-weighted loss.

  n_features    -- number of input features
  layers        -- hidden-layer multipliers; hidden layer k has
                   n_features * layers[k] units
  output        -- number of output classes
  class_weights -- per-class weights passed to the loss
  pth           -- optional path to a saved state dict; when given, the
                   weights are loaded and `train` becomes a no-op
  """

  # Lower bound applied to probabilities before taking the log.
  _EPS = 1e-12

  def __init__(self, n_features, layers, output, class_weights, pth=None):
    super().__init__()
    n_l = [n_features * l for l in layers]
    n_l.append(output)
    self.net = Net(n_features, n_l)
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.net.to(self.device)
    self.weights = torch.Tensor(class_weights).float().to(self.device)
    # `torch.nn` is spelled out because a later cell rebinds the global `nn`.
    self.criterion = torch.nn.CrossEntropyLoss(weight=self.weights)
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=0.001)
    # Mean loss of each completed epoch, filled in by train().
    self.loss_history = []

    self.pth = pth
    if self.pth:
      # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
      self.net.load_state_dict(torch.load(self.pth, map_location=self.device))

  def _loss_from_probs(self, probs, target):
    """Class-weighted negative log-likelihood of `target` under `probs`.

    `Net` already ends in Softmax, while CrossEntropyLoss applies
    log_softmax itself and expects logits. Feeding it log-probabilities
    avoids the double softmax: log_softmax(log p) == log p when p sums to 1.
    """
    log_probs = torch.log(probs.clamp_min(self._EPS))
    return self.criterion(log_probs, target)

  def train(self, X, Y, epochs=2000, batch_size=10000):
    """Train with mini-batches taken in order; skipped when weights were loaded.

    X, Y are numpy arrays. Every sample is used each epoch, including a
    final partial batch. The value printed every 100 epochs (and at the
    end) is the mean batch loss of that epoch.

    Raises ValueError if X has no rows.
    """
    if self.pth:
      return

    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).long()
    n_samples = X.shape[0]
    if n_samples == 0:
      raise ValueError("cannot train on an empty dataset")

    self.loss_history = []
    timer = time.time()

    for epoch in range(epochs):
      running_loss = 0.0
      n_batches = 0
      # Stepping by batch_size covers the trailing partial batch as well.
      for start in range(0, n_samples, batch_size):
        end = start + batch_size
        input = X[start:end].to(self.device)
        output = Y[start:end].to(self.device)

        yhat = self.net(input)
        loss = self._loss_from_probs(yhat, output)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # .item() stores a plain float so no autograd graph is kept alive.
        running_loss += loss.item()
        n_batches += 1

      epoch_loss = running_loss / n_batches
      self.loss_history.append(epoch_loss)
      if epoch % 100 == 0:
        print(f"[{epoch}] {epoch_loss:10.8f}")

    if self.loss_history:
      print(f"[{epochs}] {self.loss_history[-1]:10.8f}")
    print('Finished training in', str(time.time() - timer) + 's')

  def predict(self, X):
    """Return the network's class probabilities for numpy array X as a numpy array."""
    with torch.no_grad():
      input = torch.from_numpy(X).float().to(self.device)
      yhat = self.net(input)
      return yhat.cpu().numpy()

  def loss(self, X, Y):
    """Return the class-weighted loss on (X, Y) as a Python float, without gradients."""
    with torch.no_grad():
      input = torch.from_numpy(X).float().to(self.device)
      output = torch.from_numpy(Y).long().to(self.device)
      yhat = self.net(input)
      return self._loss_from_probs(yhat, output).item()

# Training

Load training data.

In [None]:
# Load the training features and labels from Drive.
X = np.genfromtxt('/content/drive/My Drive/CS 178/X_train.txt', delimiter=None)
Y = np.genfromtxt('/content/drive/My Drive/CS 178/Y_train.txt', delimiter=None)

# Normalize
# The same fitted scaler is reused later to transform the test features.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# A fixed seed keeps the train/validation split, and therefore every score
# reported below, identical across "Restart & Run All".
SPLIT_SEED = 0
Xtr, Xva, Ytr, Yva = train_test_split(
    X, Y, test_size=.25, shuffle=True, random_state=SPLIT_SEED)

Create classifiers.

In [None]:
# Random Forest (arguments are nBags, maxDepth, minParent)
rf = RFClassifier(100, 4, 1024)

# kNN (arguments are n_neighbors, weights)
knn = KNNClassifier(10, 'distance')

# Neural Network
classes = np.unique(Y)
n_i = X.shape[1]
n_o = len(classes)
# Number of samples per class, then inverse-frequency class weights.
classes_size = np.array([(Y == label).sum().item() for label in classes], dtype=float)
weights = [1 / count for count in classes_size]
if NN_USE_LAST:
  pth = PTH_PATH
else:
  pth = None
# NOTE(review): this rebinds the name `nn`, which the imports cell uses as the
# alias for `torch.nn`. Anything that later resolves `nn.<layer>` through that
# alias will see this classifier instead. Renaming it needs matching changes
# in the later cells that call nn.train / nn.predict / nn.auc.
nn = NNClassifier(n_features=n_i, layers=[64, 32, 16, 8], output=n_o,
                  class_weights=weights, pth=pth)

Train each classifier.

In [None]:
# Train the three base classifiers in turn on the training split.
training_plan = (
  ('Training random forest...', rf),
  ('Training kNN...', knn),
  ('Training neural network...', nn),
)
for banner, model in training_plan:
  print(banner)
  model.train(Xtr, Ytr)

Training random forest...
Training kNN...
Training neural network...


Get prediction for each classifier.

In [None]:
# Soft predictions of each base classifier on the training and validation splits.
predRFtr, predRFva = rf.predict(Xtr), rf.predict(Xva)
predKNNtr, predKNNva = knn.predict(Xtr), knn.predict(Xva)
predNNtr, predNNva = nn.predict(Xtr), nn.predict(Xva)

Print each classifier score.

In [None]:
# Report the ROC AUC of every base classifier, first on training then on validation.
score_rows = (
  ("TRAINING SCORE", Ytr, predRFtr, predKNNtr, predNNtr),
  ("VALIDATION SCORE", Yva, predRFva, predKNNva, predNNva),
)
for row_idx, (heading, targets, pred_rf, pred_knn, pred_nn) in enumerate(score_rows):
  if row_idx > 0:
    print()  # blank line between the two tables
  print(heading)
  print('Random forest:', rf.auc(targets, pred_rf))
  print('kNN:', knn.auc(targets, pred_knn))
  print('NN:', nn.auc(targets, pred_nn))

TRAINING SCORE
Random forest: 0.6649325596032527
kNN: 0.9941760208037207
NN: 0.8997748078966752

VALIDATION SCORE
Random forest: 0.6584797879452038
kNN: 0.716388364479713
NN: 0.898101000076494


# Ensemble

In [None]:
def auc(Y, Ypred):
  """Area under the ROC curve for 1-D scores `Ypred` against labels `Y`."""
  false_pos_rate, true_pos_rate, _ = metrics.roc_curve(Y, Ypred)
  return metrics.auc(false_pos_rate, true_pos_rate)

## Majority Voting

In [None]:
def majorityVote(Ypred, weights, size):
  """Weighted majority vote over the class-1 scores of several classifiers.

  Ypred   -- list of prediction arrays, one per classifier; column 1 is used
  weights -- list of weights, one per classifier (same order as Ypred)
  size    -- number of rows to process

  For each row, every classifier votes for class 0 (score < 0.5) or class 1
  (otherwise) with its weight. The returned value is the weighted mean score
  of the classifiers on the winning side; ties go to class 1.
  """
  Yfin = np.zeros(size)

  for row in range(size):
    scores = [preds[row, 1] for preds in Ypred]
    vote_mass = [0, 0]       # total weight voting for class 0 / class 1
    weighted_score = [0, 0]  # weight-scaled scores accumulated per side
    for clf_idx, score in enumerate(scores):
      side = 0 if score < 0.5 else 1
      vote_mass[side] += weights[clf_idx]
      weighted_score[side] += score * weights[clf_idx]

    winner = 0 if vote_mass[0] > vote_mass[1] else 1
    Yfin[row] = weighted_score[winner] / vote_mass[winner]

  return Yfin

Test out some combinations.

In [None]:
# Inputs (from the prediction cell above):
#   RF:  predRFtr / predRFva
#   kNN: predKNNtr / predKNNva
#   NN:  predNNtr / predNNva

# One vote weight per classifier, in the order RF, kNN, NN: its validation AUC.
weights = [
  model.auc(Yva, val_pred)
  for model, val_pred in ((rf, predRFva), (knn, predKNNva), (nn, predNNva))
]

Ypredtr = [predRFtr, predKNNtr, predNNtr]
YpredVa = [predRFva, predKNNva, predNNva]

# Each combination uses the first `n_models` classifiers of the lists above.
combos = (('RF + KNN', 2), ('RF + KNN + NN', 3))
for position, (title, n_models) in enumerate(combos):
  if position > 0:
    print()
  print(title)

  # Ensemble result on the training split
  ensembleTr = majorityVote(Ypredtr[:n_models], weights, len(Ytr))
  print('Training:', auc(Ytr, ensembleTr))

  # Ensemble result on the validation split
  ensembleVa = majorityVote(YpredVa[:n_models], weights, len(Yva))
  print('Validation:', auc(Yva, ensembleVa))

RF + KNN
Training: 0.9940122971370583
Validation: 0.7182815038507545

RF + KNN + NN
Training: 0.9753933073436409
Validation: 0.8408107774566507


In [None]:
# Score the test set with the weighted majority vote and write the submission file.
Xte = np.genfromtxt('/content/drive/My Drive/CS 178/X_test.txt', delimiter=None)
Xte = scaler.transform(Xte)  # same scaler that was fitted on the training features
Ypred = [model.predict(Xte) for model in (rf, knn, nn)]
n_test = Xte.shape[0]
# Two columns: row id, ensemble score.
Yte = np.column_stack((np.arange(n_test), majorityVote(Ypred, weights, n_test)))
np.savetxt('/content/drive/My Drive/CS 178/Y_Predictions.txt', Yte,
           fmt='%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

## Majority Voting 2

This version uses the weighted-average-probabilities (soft voting) idea described in the [scikit-learn ensemble guide](https://scikit-learn.org/stable/modules/ensemble.html#weighted-average-probabilities-soft-voting).

In [None]:
def majorityVote2(Ypred, weights):
  """Soft voting: weighted average of the classifiers' class probabilities.

  Ypred   -- array-like of shape (n_classifier, n_data, n_class) holding the
             predictions of each classifier
  weights -- one weight per classifier, in the same order as Ypred

  Returns an array of shape (n_data, n_class). The weighted sum is divided by
  the sum of the weights (not by the number of classifiers), so when every
  input row sums to 1 each output row sums to 1 as well.

  Raises ValueError if Ypred is not 3-D or the number of weights does not
  match the number of classifiers, and ZeroDivisionError (from np.average)
  if the weights sum to zero.
  """
  Ypred = np.asarray(Ypred, dtype=float)
  if Ypred.ndim != 3:
    raise ValueError("Ypred must have shape (n_classifier, n_data, n_class)")

  weights = np.asarray(weights, dtype=float)
  if weights.shape != (Ypred.shape[0],):
    raise ValueError("weights must contain exactly one entry per classifier")

  # Average over the classifier axis; np.average normalizes by sum(weights).
  return np.average(Ypred, axis=0, weights=weights)

In [None]:
aucs = [rf.auc(Yva, predRFva), knn.auc(Yva, predKNNva), nn.auc(Yva, predNNva)]
# Weight each classifier in proportion to its validation AUC, scaled so the
# best model gets 1.0. (The previous max/auc ratio was inverted: it gave the
# largest weight to the classifier with the lowest AUC.)
best_auc = max(aucs)
weights = [score / best_auc for score in aucs]

print('RF + KNN + NN')
# Result of RF + KNN + NN on Training
YpredTr = np.array([predRFtr, predKNNtr, predNNtr])
ensembleTr = majorityVote2(YpredTr, weights)
print('Training:', auc(Ytr, ensembleTr[:, 1]))

# Results of RF + KNN + NN on Validation
YpredVa = np.array([predRFva, predKNNva, predNNva])
ensembleVa = majorityVote2(YpredVa, weights)
print('Validation:', auc(Yva, ensembleVa[:, 1]))

RF + KNN + NN
Training: 0.9856361422101106
Validation: 0.8731616079713447


In [None]:
# Score the test set with the soft-voting ensemble of this section.
Xte = np.genfromtxt('/content/drive/My Drive/CS 178/X_test.txt', delimiter=None)
Xte = scaler.transform(Xte)
# Stack to (n_classifier, n_data, n_class), the layout majorityVote2 expects.
Ypred = np.array([rf.predict(Xte), knn.predict(Xte), nn.predict(Xte)])
# This cell previously called majorityVote (the first voting scheme) by
# copy-paste; use majorityVote2 and keep its class-1 column as the score.
Yte = majorityVote2(Ypred, weights)[:, 1]
Yte = np.vstack((np.arange(Xte.shape[0]), Yte)).T
# NOTE(review): this writes to the same file as the first voting section and
# overwrites it -- confirm that is intended.
np.savetxt('/content/drive/My Drive/CS 178/Y_Predictions.txt', Yte, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',')

## Stacking

Create stacking classifier

In [None]:
# Base learners for the stack, each registered under a short name.
knn_base = KNeighborsClassifier(n_neighbors=10)
# NOTE(review): 1024 is the value passed as minParent to RFClassifier above,
# while here it is max_depth -- confirm this is the intended setting.
rf_base = RandomForestClassifier(max_depth=1024)
mlp_base = MLPClassifier(hidden_layer_sizes=(8, 4), activation='tanh')

estimators = [('knn', knn_base), ('rf', rf_base), ('nn', mlp_base)]
# The stack consumes each base learner's predict_proba output.
sk = StackingClassifier(estimators=estimators, stack_method='predict_proba')


In [None]:
# Fit on the training split only. Fitting on the full X would include the
# validation rows, so the "Stack Validation" score in the next cell would be
# measured on data the model has already seen.
sk.fit(Xtr, Ytr)

StackingClassifier(cv=None,
                   estimators=[('knn',
                                KNeighborsClassifier(algorithm='auto',
                                                     leaf_size=30,
                                                     metric='minkowski',
                                                     metric_params=None,
                                                     n_jobs=None,
                                                     n_neighbors=10, p=2,
                                                     weights='uniform')),
                               ('rf',
                                RandomForestClassifier(bootstrap=True,
                                                       ccp_alpha=0.0,
                                                       class_weight=None,
                                                       criterion='gini',
                                                       max_depth=1024,
                                      

In [None]:
# ROC AUC of the stacked model's class-1 probability on each split.
for label, features, targets in (('Stack Training:', Xtr, Ytr),
                                 ('Stack Validation:', Xva, Yva)):
  yhat = sk.predict_proba(features)
  print(label, auc(targets, yhat[:, 1]))

Stack Training: 0.9699776811410254
Stack Validation: 0.9696968496017537


In [None]:
# Score the test set with the stacked model and write its submission file.
Xte = scaler.transform(
    np.genfromtxt('/content/drive/My Drive/CS 178/X_test.txt', delimiter=None))
stack_probs = sk.predict_proba(Xte)
# Two columns: row id, class-1 probability.
Yte = np.column_stack((np.arange(Xte.shape[0]), stack_probs[:, 1]))
np.savetxt('/content/drive/My Drive/CS 178/Y_PredictionsStack.txt', Yte,
           fmt='%d, %.2f', header='ID,Prob1', comments='', delimiter=',')