# CS178 Final Project
### Joseph Zheng
### Shiqi Wu
### Qi Hong Chen

In [2]:
from __future__ import division

import numpy as np

import sys
# Extend the module search path BEFORE importing mltools: the import used to run
# first and the recorded run failed with "No module named 'mltools'".
# NOTE(review): presumably the mltools package lives under ./code -- confirm.
sys.path.append('code')

import mltools as ml

import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots
plt.set_cmap('jet');
%matplotlib inline
import warnings
warnings.filterwarnings('ignore'); # for deprecated matplotlib functions

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import mutual_info_classif
import itertools

from sklearn.neighbors import KNeighborsRegressor

from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import AdaBoostRegressor

ModuleNotFoundError: No module named 'mltools'

# KNN Classifier

In [2]:
class KNN(ml.base.classifier):
    """Adapter that exposes an already-fitted regressor as an ml.base.classifier.

    The wrapped learner's prediction is used as the score of class 1, so the
    methods inherited from ml.base.classifier (e.g. auc) can be applied to it.
    """

    def __init__(self, learner, features, k):
        """Wrap a fitted learner.

        learner  : fitted model exposing predict(X)
        features : column indices of X that are passed on to the learner
        k        : neighbor count the learner was built with (stored for reporting)
        """
        self.learner = learner
        self.classes = [0,1]
        self.features = features
        self.k = k
        # AUC bookkeeping; filled in by the caller through the setters below.
        self.train_auc = None
        self.valid_auc = None
    
    def __str__(self):
        return 'k = {}, features = {}, training_auc = {}, validation_auc = {}'.format(self.k, self.features, self.train_auc, self.valid_auc)
    
    def set_train_auc(self, train_auc):
        """Record the training AUC measured by the caller."""
        self.train_auc = train_auc
        
    def set_valid_auc(self, valid_auc):
        """Record the validation AUC measured by the caller."""
        self.valid_auc = valid_auc
        
    def get_train_auc(self):
        """Return the recorded training AUC (None until set)."""
        return self.train_auc
        
    def get_valid_auc(self):
        """Return the recorded validation AUC (None until set)."""
        return self.valid_auc
    
    def predictSoft(self, X):
        """Return two-column soft predictions: column 0 is 1 - p, column 1 is p.

        p is the learner's prediction on the selected feature columns of X.
        """
        # Predict once and reuse the result; the learner used to be queried twice
        # on the same input.
        p = self.learner.predict(X[:,self.features])
        return np.vstack((1-p, p)).T

In [3]:
class BaggedKNN(ml.base.classifier):
    """Ensemble that averages member soft predictions, weighted by validation AUC."""

    def __init__(self, learners):
        """Constructs a BaggedKNN class with a set of learners.

        learners : iterable of members exposing predictSoft(X) and get_valid_auc()
        """
        # Snapshot the members. Keeping the caller's list object meant that a later
        # append/sort on that list silently changed an ensemble that had already
        # been scored, so its recorded AUC no longer described its members.
        self.learners = list(learners)
        self.classes = [0,1]
        # AUC bookkeeping; filled in by the caller through the setters below.
        self.train_auc = None
        self.valid_auc = None
        self.total_auc = None
    
    def __str__(self):
        return 'Bagged KNN, training_auc = {}, validation_auc = {}'.format(self.train_auc, self.valid_auc)
    
    def set_train_auc(self, train_auc):
        """Record the training AUC measured by the caller."""
        self.train_auc = train_auc
        
    def set_valid_auc(self, valid_auc):
        """Record the validation AUC measured by the caller."""
        self.valid_auc = valid_auc
        
    def get_train_auc(self):
        """Return the recorded training AUC (None until set)."""
        return self.train_auc
        
    def get_valid_auc(self):
        """Return the recorded validation AUC (None until set)."""
        return self.valid_auc
    
    def predictSoft(self, X):
        """Return the weighted average of the members' soft predictions.

        Each member is weighted by its recorded validation AUC, so every member
        must have had set_valid_auc() called before this is used.
        """
        member_predictions = [learner.predictSoft(X) for learner in self.learners]
        member_weights = [learner.get_valid_auc() for learner in self.learners]
        return np.average(member_predictions, weights=member_weights, axis=0)

In [4]:
# Read the comma-separated training features and labels.
X, Y = (
    np.genfromtxt('data/X_train.txt', delimiter=','),
    np.genfromtxt('data/Y_train.txt', delimiter=','),
)
# Initial train/validation split, taken without shuffling the rows.
Xtr, Xva, Ytr, Yva = train_test_split(X, Y, shuffle=False)

In [5]:
def _score_bagged_knn(learners, Xtr, Ytr, Xva, Yva):
    """Build a BaggedKNN over a copy of `learners` and record its train/validation AUC."""
    # Pass a copy so that later edits to the caller's list cannot alter this ensemble.
    bagged = BaggedKNN(list(learners))
    bagged_train_auc = bagged.auc(Xtr,Ytr)
    bagged_valid_auc = bagged.auc(Xva,Yva)
    bagged.set_train_auc(bagged_train_auc)
    bagged.set_valid_auc(bagged_valid_auc)
    return bagged


# Random search over feature subsets and neighbor counts.
# best_ten_knn    : the ten KNN wrappers with the highest validation AUC so far,
#                   always kept sorted best-first (so [0] is the best single model)
# best_bagged_knn : the ensemble of a top-ten snapshot with the highest validation AUC
best_ten_knn = []
best_bagged_knn = None

trial = 0
empty_run_count = 0
# Keep running trials until at least 70% of them left the top ten unchanged.
# The `== 0` test also protects the division on the first pass (trial is 0 then).
while empty_run_count == 0 or empty_run_count / trial < 0.7:
    flag = False  # becomes True when this trial changes the top ten
    print('Trial {}:'.format(trial+1))
    mutual_info = mutual_info_classif(X, Y)
    mutual_info_rank = np.argsort(mutual_info)[::-1]  # feature indices, highest score first
    top_features = mutual_info_rank[:3]
    print('Top 3 Features:', top_features)
    
    # Every non-empty subset of the three top-ranked features.
    feature_combo = []
    for i in range(1,4):
        feature_combo += list(itertools.combinations(top_features, i))
    
    for idx in feature_combo:
        idx = np.array(idx)
        # Only subsets that contain the single top-ranked feature are tried.
        if mutual_info_rank[0] not in idx:
            continue
        for k in range(1,101):
            # NOTE(review): a fresh random split is drawn for every k, so models
            # already in the top ten were fitted on rows that can fall into this
            # split's validation part. AUCs from different splits are compared
            # with each other and the ensemble's validation AUC is optimistic.
            Xtr, Xva, Ytr, Yva = train_test_split(X,Y)
            knn = KNeighborsRegressor(n_neighbors=k)
            knn.fit(Xtr[:,idx], Ytr)
            ml_knn = KNN(knn, idx, k)
            ml_train_auc = ml_knn.auc(Xtr,Ytr)
            ml_valid_auc = ml_knn.auc(Xva,Yva)
            ml_knn.set_train_auc(ml_train_auc)
            ml_knn.set_valid_auc(ml_valid_auc)

            # A candidate enters while the list is not full, or when it beats at
            # least one current member.
            enters_top_ten = len(best_ten_knn) < 10 or any(
                top_knn.get_valid_auc() < ml_valid_auc for top_knn in best_ten_knn
            )
            if not enters_top_ten:
                continue
            flag = True
            # Build a NEW list instead of appending in place: the previous list may
            # still be referenced by an ensemble that was already scored.
            best_ten_knn = sorted(best_ten_knn + [ml_knn], key = lambda x : x.get_valid_auc(), reverse = True)[:10]
            bagged_knn = _score_bagged_knn(best_ten_knn, Xtr, Ytr, Xva, Yva)
            if best_bagged_knn is None or bagged_knn.get_valid_auc() > best_bagged_knn.get_valid_auc():
                best_bagged_knn = bagged_knn
                print('New Best Bagged KNN:\n', best_bagged_knn)
    if not flag:
        empty_run_count += 1
    else:
        print('Best 10 KNN:\n', '\n '.join(str(x) for x in best_ten_knn))
        print('Best Bagged KNN:\n', best_bagged_knn)
    trial += 1
    print('Empty Run Count:', empty_run_count)
    print('Empty Run Ratio:', empty_run_count / trial)

print('\nTotal Trials:', trial)
print('Very Best KNN:\n', best_ten_knn[0])
print('Very Best Bagged KNN:\n', best_bagged_knn)

Trial 1:
Top 3 Features: [28  7 56]
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.5619284329137958, validation_auc = 0.556074549307632
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6356348480138472, validation_auc = 0.6375989335612366
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6510063116319892, validation_auc = 0.6566020766713765
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6686671416641174, validation_auc = 0.6587748003945062
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6708871902728488, validation_auc = 0.6683883901556288
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6671670628243889, validation_auc = 0.6856963416548908
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.682528837415837, validation_auc = 0.6958209412811244
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6880143736132678, validation_auc = 0.6980242578141851
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.6820670660785217, validation_auc = 0.7156204277851321
New Best Bagged KNN:


Top 3 Features: [28 56 70]
Best 10 KNN:
 k = 90, features = [28 56], training_auc = 0.6880618357482935, validation_auc = 0.7285546292846498
 k = 45, features = [28 56], training_auc = 0.6872326252396351, validation_auc = 0.7244099105532738
 k = 52, features = [28 56 70], training_auc = 0.702597460096019, validation_auc = 0.7178756479692994
 k = 64, features = [28 56 96], training_auc = 0.6986343403699621, validation_auc = 0.7175059350420679
 k = 80, features = [28 56], training_auc = 0.696669549279777, validation_auc = 0.716873419721871
 k = 87, features = [28 56], training_auc = 0.6889406974156577, validation_auc = 0.7160781171059738
 k = 100, features = [28 56 96], training_auc = 0.6950981685428007, validation_auc = 0.7143833873757904
 k = 42, features = [28 95], training_auc = 0.6893317277038207, validation_auc = 0.712277545118633
 k = 85, features = [28 56], training_auc = 0.6932863101152321, validation_auc = 0.7121850379708189
 k = 25, features = [28 56], training_auc = 0.70037641

Top 3 Features: [28 56  7]
Empty Run Count: 11
Empty Run Ratio: 0.4583333333333333
Trial 25:
Top 3 Features: [28  7 70]
Empty Run Count: 12
Empty Run Ratio: 0.48
Trial 26:
Top 3 Features: [ 28  89 103]
New Best Bagged KNN:
 Bagged KNN, training_auc = 0.7005317912806371, validation_auc = 0.7454762402452619
Best 10 KNN:
 k = 90, features = [28 56], training_auc = 0.6880618357482935, validation_auc = 0.7285546292846498
 k = 45, features = [28 56], training_auc = 0.6872326252396351, validation_auc = 0.7244099105532738
 k = 52, features = [28 56 70], training_auc = 0.702597460096019, validation_auc = 0.7178756479692994
 k = 64, features = [28 56 96], training_auc = 0.6986343403699621, validation_auc = 0.7175059350420679
 k = 78, features = [28 89], training_auc = 0.6858115852981647, validation_auc = 0.7168859750998792
 k = 80, features = [28 56], training_auc = 0.696669549279777, validation_auc = 0.716873419721871
 k = 85, features = [ 28  89 103], training_auc = 0.6782933231080676, validat

Top 3 Features: [28 95  7]
Empty Run Count: 36
Empty Run Ratio: 0.6666666666666666
Trial 55:
Top 3 Features: [28 56 59]
Empty Run Count: 37
Empty Run Ratio: 0.6727272727272727
Trial 56:
Top 3 Features: [28 56 67]
Best 10 KNN:
 k = 90, features = [28 56], training_auc = 0.6880618357482935, validation_auc = 0.7285546292846498
 k = 45, features = [28 56], training_auc = 0.6872326252396351, validation_auc = 0.7244099105532738
 k = 16, features = [28 56 42], training_auc = 0.7052510526088874, validation_auc = 0.7220933149379155
 k = 93, features = [28 60], training_auc = 0.6821959673704454, validation_auc = 0.7201809240130758
 k = 24, features = [28 56], training_auc = 0.6920439247022809, validation_auc = 0.7197167103282983
 k = 99, features = [28 73 56], training_auc = 0.6893167313808913, validation_auc = 0.7190585795195232
 k = 33, features = [28 56], training_auc = 0.6832026560968122, validation_auc = 0.7187128044879052
 k = 52, features = [28 56 70], training_auc = 0.702597460096019, va

In [6]:
# Score the test set with the first entry of the top-ten list and write "Id,Predicted" rows.
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
print('Predicting using:', best_ten_knn[0])
row_ids = np.arange(Xte.shape[0])
class1_scores = best_ten_knn[0].predictSoft(Xte)[:,1]  # column 1 of the soft predictions
Yte = np.vstack((row_ids, class1_scores)).T
np.savetxt('Y_KNN_submit.txt', Yte, fmt='%d, %.2f', delimiter=',', header='Id,Predicted', comments='')

Predicting using: k = 58, features = [28 56], training_auc = 0.6926510852462182, validation_auc = 0.7289553257900898


In [7]:
# Score the test set with the best bagged ensemble and write "Id,Predicted" rows.
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
print('Predicting using:', best_bagged_knn)
row_ids = np.arange(Xte.shape[0])
class1_scores = best_bagged_knn.predictSoft(Xte)[:,1]  # column 1 of the soft predictions
Yte = np.vstack((row_ids, class1_scores)).T
np.savetxt('Y_Bagged_KNN_submit.txt', Yte, fmt='%d, %.2f', delimiter=',', header='Id,Predicted', comments='')

Predicting using: Bagged KNN, training_auc = 0.7057581989113381, validation_auc = 0.7505352933454004


# Neural Network

In [1]:
class NN(ml.base.classifier):
    """Adapter that exposes an already-fitted regressor as an ml.base.classifier.

    The wrapped learner's prediction is used as the score of class 1, so the
    methods inherited from ml.base.classifier (e.g. auc) can be applied to it.
    """

    def __init__(self, learner, features, layers):
        """Wrap a fitted learner.

        learner  : fitted model exposing predict(X)
        features : column indices of X that are passed on to the learner
        layers   : hidden-layer sizes the learner was built with (stored for reporting)
        """
        self.learner = learner
        self.classes = [0,1]
        self.features = features
        self.layers = layers
        # AUC bookkeeping; filled in by the caller through the setters below.
        self.train_auc = None
        self.valid_auc = None
        self.total_auc = None
        
    def __str__(self):
        return 'layers = {}, features = {}, training_auc = {}, validation_auc = {}'.format(self.layers, self.features, self.train_auc, self.valid_auc)
    
    def set_train_auc(self, train_auc):
        """Record the training AUC measured by the caller."""
        self.train_auc = train_auc
        
    def set_valid_auc(self, valid_auc):
        """Record the validation AUC measured by the caller."""
        self.valid_auc = valid_auc
        
    def get_train_auc(self):
        """Return the recorded training AUC (None until set)."""
        return self.train_auc
        
    def get_valid_auc(self):
        """Return the recorded validation AUC (None until set)."""
        return self.valid_auc
    
    def predictSoft(self, X):
        """Return two-column soft predictions: column 0 is 1 - p, column 1 is p.

        p is the learner's prediction on the selected feature columns of X.
        """
        # Predict once and reuse the result; the network used to be evaluated
        # twice on the same input.
        p = self.learner.predict(X[:,self.features])
        return np.vstack((1-p, p)).T

NameError: name 'ml' is not defined

In [9]:
# Read the comma-separated training features and labels.
X, Y = (
    np.genfromtxt('data/X_train.txt', delimiter=','),
    np.genfromtxt('data/Y_train.txt', delimiter=','),
)
# Train/validation split used by the search below, taken without shuffling the rows.
Xtr, Xva, Ytr, Yva = train_test_split(X, Y, shuffle=False)

In [10]:
# Search over feature subsets and two-layer architectures; best_nn is the wrapper
# with the highest validation AUC seen so far.
best_nn = None

trial = 0
empty_run_count = 0
# Stop after the first trial that fails to improve best_nn.
while empty_run_count < 1:
    flag = False  # becomes True when this trial improves best_nn
    print('Trial {}:'.format(trial+1))
    mutual_info = mutual_info_classif(X, Y)
    mutual_info_rank = np.argsort(mutual_info)[::-1]  # feature indices, highest score first
    top_features = mutual_info_rank[:3]
    print('Top 3 Features:', top_features)

    # Every non-empty subset of the three top-ranked features, smallest subsets first.
    feature_combo = [
        combo
        for size in range(1,4)
        for combo in itertools.combinations(top_features, size)
    ]
    # Every ordered pair of distinct hidden-layer sizes drawn from the list.
    layers_combo = list(itertools.combinations([800, 500, 300, 200, 100], 2))

    for idx in map(np.array, feature_combo):
        print('Features:', idx)
        # Only subsets that contain the single top-ranked feature are trained.
        if mutual_info_rank[0] not in idx:
            continue
        for layers in layers_combo:
            nn = MLPRegressor(hidden_layer_sizes=layers, activation='logistic', solver='adam')
            nn.fit(Xtr[:,idx], Ytr)
            ml_nn = NN(nn, idx, layers)
            ml_train_auc, ml_valid_auc = ml_nn.auc(Xtr,Ytr), ml_nn.auc(Xva,Yva)
            ml_nn.set_train_auc(ml_train_auc)
            ml_nn.set_valid_auc(ml_valid_auc)
            if best_nn is None or best_nn.get_valid_auc() < ml_nn.get_valid_auc():
                best_nn = ml_nn
                print('New Best NN:\n', ml_nn)
                flag = True
    if not flag:
        empty_run_count += 1
    trial += 1
    print('Empty Run Count:', empty_run_count)
    print('Empty Run Ratio:', empty_run_count / trial)

print('\nTotal Trials:', trial)
print('Very Best NN:\n', best_nn)

Trial 1:
Top 3 Features: [28 59 56]
Features: [28]
New Best NN:
 layers = (800, 500), features = [28], training_auc = 0.5337634131306185, validation_auc = 0.526341017873266
New Best NN:
 layers = (800, 300), features = [28], training_auc = 0.6724490786346073, validation_auc = 0.6546053204951665
New Best NN:
 layers = (800, 200), features = [28], training_auc = 0.6839802148579076, validation_auc = 0.6741873596116013
New Best NN:
 layers = (500, 300), features = [28], training_auc = 0.6882044733152014, validation_auc = 0.6760239565212844
Features: [59]
Features: [56]
Features: [28 59]
Features: [28 56]
New Best NN:
 layers = (800, 500), features = [28 56], training_auc = 0.6926245980707395, validation_auc = 0.6794200049037893
New Best NN:
 layers = (500, 300), features = [28 56], training_auc = 0.6942306371804959, validation_auc = 0.6800672586074865
Features: [59 56]
Features: [28 59 56]
Empty Run Count: 0
Empty Run Ratio: 0.0
Trial 2:
Top 3 Features: [28  7 87]
Features: [28]
Features: 

In [11]:
# Score the test set with the best network and write "Id,Predicted" rows.
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
print('Predicting using:', best_nn)
row_ids = np.arange(Xte.shape[0])
class1_scores = best_nn.predictSoft(Xte)[:,1]  # column 1 of the soft predictions
Yte = np.vstack((row_ids, class1_scores)).T
np.savetxt('Y_NN_submit.txt', Yte, fmt='%d, %.2f', delimiter=',', header='Id,Predicted', comments='')

NameError: name 'best_ten_nn' is not defined

# AdaBoost

In [4]:
class AB(ml.base.classifier):
    """Adapter that exposes an already-fitted regressor as an ml.base.classifier.

    The wrapped learner's prediction is used as the score of class 1, so the
    methods inherited from ml.base.classifier (e.g. auc) can be applied to it.
    """

    def __init__(self, learner, learning_rate):
        """Wrap a fitted learner.

        learner       : fitted model exposing predict(X); it receives all columns of X
        learning_rate : learning rate the learner was built with (stored for reporting)
        """
        self.learner = learner
        self.classes = [0,1]
        self.learning_rate = learning_rate
        # AUC bookkeeping; filled in by the caller through the setters below.
        self.train_auc = None
        self.valid_auc = None
        self.total_auc = None
        
    def __str__(self):
        return 'learning rate = {}, training_auc = {}, validation_auc = {}'.format(self.learning_rate, self.train_auc, self.valid_auc)
    
    def set_train_auc(self, train_auc):
        """Record the training AUC measured by the caller."""
        self.train_auc = train_auc
        
    def set_valid_auc(self, valid_auc):
        """Record the validation AUC measured by the caller."""
        self.valid_auc = valid_auc
        
    def get_train_auc(self):
        """Return the recorded training AUC (None until set)."""
        return self.train_auc
        
    def get_valid_auc(self):
        """Return the recorded validation AUC (None until set)."""
        return self.valid_auc
    
    def predictSoft(self, X):
        """Return two-column soft predictions: column 0 is 1 - p, column 1 is p.

        p is the learner's prediction on X.
        """
        # Predict once and reuse the result; the ensemble used to be evaluated
        # twice on the same input.
        p = self.learner.predict(X)
        return np.vstack((1-p, p)).T

In [5]:
# Read the comma-separated training features and labels.
X, Y = (
    np.genfromtxt('data/X_train.txt', delimiter=','),
    np.genfromtxt('data/Y_train.txt', delimiter=','),
)
# Train/validation split used by the search below, taken without shuffling the rows.
Xtr, Xva, Ytr, Yva = train_test_split(X, Y, shuffle=False)

In [6]:
# Sweep the AdaBoost learning rate; best_ab is the wrapper with the highest
# validation AUC seen so far.
best_ab = None

trial = 0
empty_run_count = 0
# Keep running trials until at least 70% of them failed to improve best_ab.
# The `== 0` test also protects the division on the first pass (trial is 0 then).
while empty_run_count == 0 or empty_run_count / trial < 0.7:
    flag = False  # becomes True when this trial improves best_ab
    print('Trial {}:'.format(trial+1))
    for lr in np.arange(0.01,1.01,0.03):
        ab = AdaBoostRegressor(n_estimators=300, loss='exponential', learning_rate=lr)
        ab.fit(Xtr, Ytr)
        ml_ab = AB(ab, round(lr,2))  # rounded only for the printed report
        ml_train_auc, ml_valid_auc = ml_ab.auc(Xtr,Ytr), ml_ab.auc(Xva,Yva)
        ml_ab.set_train_auc(ml_train_auc)
        ml_ab.set_valid_auc(ml_valid_auc)
        if best_ab is None or best_ab.get_valid_auc() < ml_ab.get_valid_auc():
            best_ab = ml_ab
            print('New Best AB:\n', best_ab)
            flag = True
    if not flag:
        empty_run_count += 1
    trial += 1
    print('Empty Run Count:', empty_run_count)
    print('Empty Run Ratio:', empty_run_count / trial)

print('\nTotal Trials:', trial)
print('Very Best AB:\n', best_ab)

Trial 1:
New Best AB:
 learning rate = 0.01, training_auc = 0.7403634426622225, validation_auc = 0.710876883514088
New Best AB:
 learning rate = 0.04, training_auc = 0.7596630170353987, validation_auc = 0.7320323185278406
New Best AB:
 learning rate = 0.07, training_auc = 0.7709950601680617, validation_auc = 0.7351947978000346
New Best AB:
 learning rate = 0.1, training_auc = 0.770866311151588, validation_auc = 0.7353824665130456
New Best AB:
 learning rate = 0.13, training_auc = 0.7730929269743323, validation_auc = 0.7354678761130846
Empty Run Count: 0
Empty Run Ratio: 0.0
Trial 2:
New Best AB:
 learning rate = 0.46, training_auc = 0.7650584720595919, validation_auc = 0.7361174538875312
Empty Run Count: 0
Empty Run Ratio: 0.0
Trial 3:
New Best AB:
 learning rate = 0.1, training_auc = 0.7783931916229371, validation_auc = 0.7377681251372654
New Best AB:
 learning rate = 0.19, training_auc = 0.783222215510494, validation_auc = 0.7383177678694894
Empty Run Count: 0
Empty Run Ratio: 0.0
Tr

In [7]:
# Score the test set with the best AdaBoost model and write "Id,Predicted" rows.
Xte = np.genfromtxt('data/X_test.txt', delimiter=',')
print('Predicting using:', best_ab)
row_ids = np.arange(Xte.shape[0])
class1_scores = best_ab.predictSoft(Xte)[:,1]  # column 1 of the soft predictions
Yte = np.vstack((row_ids, class1_scores)).T
np.savetxt('Y_AB_submit.txt', Yte, fmt='%d, %.2f', delimiter=',', header='Id,Predicted', comments='')

Predicting using: learning rate = 0.61, training_auc = 0.7780091387923432, validation_auc = 0.7384281612300841
