### 1. Logistic Regression
### 2. KNN
### 3. Support Vector Machine

Unbalanced training data (class counts):
- class 1: 171 samples
- class 0: 77 samples

Use a confusion matrix to evaluate the classifiers on this imbalanced data \
(https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28)

4. Compare model performance: compare the average cross-validation accuracy of the different models and see which one is highest.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# method:
# PREPROCESS
# 1. read csv file
# 2. normalize data (delete unimportant features) [standardize, binning]
# 3. split data into train and val set
# TRAIN

In [3]:
# tool functions


def accuracy(y_pred, Y):
    """Return the fraction of positions where the prediction equals the label.

    Every index of ``Y`` is compared against the same index of ``y_pred``;
    the number of matches is divided by ``len(Y)``.
    """
    total = len(Y)
    hits = sum(1 for idx in range(total) if y_pred[idx] == Y[idx])
    return float(hits) / total

def normalize(X):
    """Min-max scale each column of ``X`` into the range [0, 1].

    Args:
        X: array-like whose first axis indexes samples.

    Returns:
        ``(X - column_min) / column_range`` as a NumPy array. A column whose
        values are all equal has a range of zero, so the division yields NaN
        for that column.
    """
    X = np.asarray(X)
    # np.ptp() is used because the ndarray.ptp() method was removed in NumPy 2.0.
    return (X - X.min(axis=0)) / np.ptp(X, axis=0)
    
def standardize(X): # standardize x_data
    """Return ``X`` with each column shifted to zero mean and scaled to unit
    standard deviation (column statistics are taken along axis 0)."""
    centered = X - np.mean(X, axis=0)
    return centered / np.std(X, axis=0)

def remove_outliers(X, Y, threshold=3): # identify and remove outliers
    """Drop every sample that has at least one outlying feature.

    A feature value is treated as outlying when its absolute standardized
    value (as computed by ``standardize``) is greater than ``threshold``.
    The matching entries of ``Y`` are dropped as well.

    Returns:
        The filtered ``(X, Y)`` pair.
    """
    z_scores = np.abs(standardize(X))

    # np.where returns one index array per axis; the first one holds the row
    # numbers (a row appears several times if several features exceed).
    flagged_rows = list(np.where(z_scores > threshold)[0])

    return np.delete(X, flagged_rows, axis=0), np.delete(Y, flagged_rows, axis=0)

    

In [4]:
class Preprocess:
    """Hold a feature matrix with its label vector and derive shuffled,
    train/test and cross-validation splits from them."""

    def __init__(self, x_data, y_data):
        self.x_data = x_data
        self.y_data = y_data # label array of data

    def get_data(self):
        """Return the current ``(x_data, y_data)`` pair."""
        return self.x_data, self.y_data

    def concat_labels(self): # concatenate labels to data
        """Return a new 2-D array made of ``x_data`` with the labels appended
        as the last column. ``self.y_data`` is left untouched."""
        label_column = np.asarray(self.y_data).reshape(-1, 1)
        return np.concatenate((self.x_data, label_column), axis=1)

    def peel_labels(self, dataset): # peel labels from data
        """Split ``dataset`` into ``(features, labels)``, taking the last
        column as the labels and flattening it to 1-D."""
        y_data = dataset[:, -1:].flatten()
        x_data = dataset[:, :-1]
        return x_data, y_data

    def shuffle_data(self): # shuffles dataset x and y
        """Shuffle the samples in place on this object, keeping every label
        paired with its feature row. Labels are cast to ``int`` afterwards."""
        dataset = self.concat_labels()
        np.random.shuffle(dataset) # in-place shuffle of the rows
        self.x_data, self.y_data = self.peel_labels(dataset)
        self.y_data = self.y_data.astype(int)

    def remove_outliers(self):
        """Placeholder; currently does nothing."""
        pass

    def train_test_split(self, test_size=0.25, shuffle=True):
        """Split the data into a training part and a test part.

        Args:
            test_size: fraction of the samples placed in the test set.
            shuffle: shuffle the stored data before splitting.

        Returns:
            ``(x_train, y_train), (x_test, y_test)`` where the test set holds
            ``int(test_size * n_samples)`` samples and the training set the
            remainder.
        """
        n_samples = len(self.x_data)
        test_len = int(test_size * n_samples)
        train_len = n_samples - test_len

        if shuffle:
            self.shuffle_data()

        x_train = self.x_data[:train_len]
        y_train = self.y_data[:train_len]
        x_test = self.x_data[train_len:]
        y_test = self.y_data[train_len:]

        return (x_train, y_train), (x_test, y_test)

    def cv_fold_split(self, folds=10, shuffle=True): # folds=0 indicates doing LOOCV (leave-one-out cross validation)
        """Partition the data into ``folds`` consecutive folds.

        Args:
            folds: number of folds; ``0`` means one fold per sample (LOOCV).
            shuffle: shuffle the stored data before splitting.

        Returns:
            ``(x_fold_set, y_fold_set)``: two lists of arrays. Every sample
            lands in exactly one fold; when the sample count is not a
            multiple of ``folds`` the leading folds are one sample larger.
        """
        if shuffle:
            self.shuffle_data()

        if folds == 0:
            folds = len(self.x_data) # doing LOOCV

        # array_split (unlike fixed-size slicing) keeps the remainder samples.
        x_fold_set = np.array_split(self.x_data, folds)
        y_fold_set = np.array_split(self.y_data, folds)

        return x_fold_set, y_fold_set # lists of nparrays
            

cross validation
1. shuffle data randomly
2. split the dataset into k groups
3. for each unique group:\
    i. take the group as a test data set\
    ii. take the remaining groups as a training data set\
    iii. fit a model on the training set and evaluate it on the test set\
    iv. retain the evaluation score and discard the model
4. summarize the skill of the model using the sample of model evaluation scores

In [5]:
from statistics import mode

class KNearestNeighbors:
    """k-nearest-neighbours classifier using Euclidean distance and a
    majority vote among the ``k_neighbors`` closest reference points."""

    def __init__(self, k_neighbors=5):
        self.k_neighbors = k_neighbors

    def euclidean_dist(self, pointA, pointB):
        """Return the Euclidean (L2) distance between two points."""
        return np.linalg.norm(pointA - pointB)

    def predict(self, base_dataset, base_datalabel, X):
        """Predict a label for every row of ``X``.

        Args:
            base_dataset: reference samples the neighbours are drawn from.
            base_datalabel: label of each reference sample, same order.
            X: samples to classify.

        Returns:
            A list with one predicted label per row of ``X``. When several
            labels are equally common among the neighbours, the one whose
            first occurrence is nearest to the query wins.

        Raises:
            ValueError: if no neighbour is available (empty reference set or
                ``k_neighbors`` of zero).
        """
        from collections import Counter

        y_pred = []
        for target_x in X:
            # (distance, index) tuples sort by distance first, index second.
            ranked = sorted(
                (self.euclidean_dist(datapoint, target_x), index)
                for index, datapoint in enumerate(base_dataset)
            )
            neighbor_labels = [
                base_datalabel[index] for _, index in ranked[:self.k_neighbors]
            ]
            if not neighbor_labels:
                raise ValueError("no neighbors available to vote on a label")

            # most_common() lists equal counts in first-encountered order, so
            # ties resolve towards the closest neighbour instead of a random
            # 0/1 guess.
            y_pred.append(Counter(neighbor_labels).most_common(1)[0][0])
        return y_pred

In [6]:
class LogisticRegression:
    """Binary logistic-regression classifier trained with batch gradient
    descent.

    After ``fit`` the learned parameters are available as ``self.w``
    (one weight per feature) and ``self.b`` (bias).
    """

    def __init__(self, max_iter=100):
        self.max_iter = max_iter

    def sigmoid(self, z):
        """Return ``1 / (1 + exp(-z))`` element-wise.

        ``z`` is clipped to [-500, 500] first so that ``exp`` cannot
        overflow; within that range the result is unchanged.
        """
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def cost_func(self, y_hat, y):
        """Return the mean binary cross-entropy between the predicted
        probabilities ``y_hat`` and the 0/1 labels ``y``.

        Probabilities are clipped away from exactly 0 and 1 so the logarithm
        stays finite.
        """
        eps = 1e-15
        y_hat = np.clip(y_hat, eps, 1 - eps)
        loss = -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)).mean()
        return loss

    def fit(self, X, Y, LEARNING_RATE=0.001):
        """Learn weights and bias with ``max_iter`` gradient-descent steps.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            Y: 0/1 labels, one per sample.
            LEARNING_RATE: step size applied to every gradient update.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)
        data_size, n_features = X.shape

        self.w = np.zeros(n_features)  # initialize weights
        self.b = 0.0

        for _ in range(self.max_iter):
            # The loss value is not needed for the update, so it is not
            # evaluated here; call cost_func explicitly to monitor training.
            error = self.sigmoid(np.dot(X, self.w) + self.b) - Y

            w_gradient = np.dot(X.T, error) / data_size
            b_gradient = np.sum(error) / data_size

            self.w -= LEARNING_RATE * w_gradient
            self.b -= LEARNING_RATE * b_gradient

    def predict(self, X): # use the resulting y_hat array to make predicted array
        """Return an int array with 1 where the predicted probability is
        greater than 0.5 and 0 elsewhere."""
        y_hat = self.sigmoid(np.dot(X, self.w) + self.b)
        return (y_hat > 0.5).astype(int)
        
    
        

# Read Data

In [7]:
# Load features and labels. CSV column 0 is skipped in every file
# (presumably a row ID -- TODO confirm); features are CSV columns 1-12.
feature_columns = list(range(1, 13))

path = "./nctu-ml-2020-lab2/X_train.csv"
x_train = pd.read_csv(path, usecols=feature_columns).to_numpy()

path = "./nctu-ml-2020-lab2/y_train.csv"
y_train = pd.read_csv(path, usecols=[1]).to_numpy().flatten() # 2d -> 1d

path = "./nctu-ml-2020-lab2/X_test.csv"
x_test = pd.read_csv(path, usecols=feature_columns).to_numpy()

# Standardize each set with its own column statistics.
# NOTE(review): the test set is scaled with its own mean/std rather than the
# training set's -- confirm this is intended.
x_train = standardize(x_train)
x_test = standardize(x_test)

# Scatter-plot the labels against every individual feature.
from matplotlib import pyplot as plt
for i, feature_values in enumerate(x_train.T):
    fig, ax = plt.subplots(figsize=(8,3))
    ax.scatter(feature_values, y_train)
    ax.set_xlabel(f"Feature {i + 1} of X")
    ax.set_ylabel('Y label')
    plt.show()

# Logistic Regression

standardized train and test, 500 iter, lr=0.1 ---> accuracy: 0.83720\
standardized train and test, 500 iter, lr=0.1, remove outliers ---> accuracy: 0.79069\
standardized train and test, 5000000 iter, lr=0.001, remove outliers ---> accuracy: 0.81395

In [8]:
# Train logistic regression on the full training set and report the accuracy
# on that same training data.
# (Disabled alternative: hold out part of the training data via Preprocess.)
# preprocess = Preprocess(x_train, y_train)
# (x_train, y_train), (x_test, y_test) = preprocess.train_test_split()
# x_train, y_train = preprocess.get_data()

# Drop training rows containing a feature more than 3 standard deviations
# from its column mean, then re-standardize both sets.
x_train, y_train = remove_outliers(x_train, y_train)
x_train = standardize(x_train)
x_test = standardize(x_test)
# 5000000, 0.0001
# (presumably a max_iter / learning-rate pair that was tried -- TODO confirm)
LR = LogisticRegression(max_iter=5000000)
LR.fit(x_train, y_train, LEARNING_RATE=0.001)
y_train_pred = LR.predict(x_train)
print(accuracy(y_train_pred, y_train)) # training accuracy

0.8916666666666667


# Predict on the test features and compare against the test labels.
# NOTE(review): y_test is not assigned anywhere in the visible code (only in
# the commented-out train_test_split call) -- confirm where it comes from
# before running this.
y_test_pred = LR.predict(x_test)
accuracy(y_test_pred, y_test)

In [9]:
# Predict 0/1 class labels for the test features with the trained model.
y_pred = LR.predict(x_test)

In [10]:
# Write the predictions to CSV: an "ID" index counting from 0 and a single
# "Class" column.
row_ids = list(range(len(y_pred)))
y_pred = pd.DataFrame(data=y_pred, index=row_ids, columns=["Class"])
y_pred.index.name = "ID"
y_pred.to_csv("0610101_result.csv", sep=',')

## cross validation

# k-fold cross validation of logistic regression on the training data.
x_train = standardize(x_train)
preprocess = Preprocess(x_train, y_train)

x_fold_set, y_fold_set = preprocess.cv_fold_split()
n_folds = len(x_fold_set)

avg_train_accuracy = 0
avg_valid_accuracy = 0
for i in range(n_folds):
    # Fold i is the validation set; all other folds form the training set.
    x_valid_set = x_fold_set[i]
    x_train_set = np.concatenate(x_fold_set[:i] + x_fold_set[i + 1:], axis=0)

    y_valid_set = y_fold_set[i]
    y_train_set = np.concatenate(y_fold_set[:i] + y_fold_set[i + 1:], axis=0)

    LR = LogisticRegression(max_iter=5000)
    LR.fit(x_train_set, y_train_set, LEARNING_RATE=0.1)
    y_train_pred = LR.predict(x_train_set)
    y_valid_pred = LR.predict(x_valid_set)

    # Score each split once and reuse the value for printing and summing.
    train_accuracy = accuracy(y_train_pred, y_train_set)
    valid_accuracy = accuracy(y_valid_pred, y_valid_set)

    print("training accuracy ====> " + str(train_accuracy))
    avg_train_accuracy += train_accuracy
    print("validation accuracy ===> " + str(valid_accuracy))
    avg_valid_accuracy += valid_accuracy

# Divide by the actual number of folds rather than a hard-coded 10.
print("average training accuracy: " + str(avg_train_accuracy / n_folds))
print("average validation accuracy: " + str(avg_valid_accuracy / n_folds))

#NORMALIZE THE TESTING SET TOO!

# KNN

Preprocessing techniques:\
    normalization yields the highest accuracy on the testing set (0.7674)\
    no preprocessing (0.69767)\
    standardization (0.62790)\
    \
    with outliers removed (k=10):\
        no preprocessing (0.69767)\
        normalization (0.62790)\
        standardization (0.62790)\
    \
    with outliers removed (k=18):\
        no preprocessing (0.76744)\
        standardization (0.72093)
   
    

# Drop outlying training rows, then standardize both sets.
x_train, y_train = remove_outliers(x_train, y_train)

x_train = standardize(x_train)
x_test = standardize(x_test)

# Classify every test sample by a vote among its 18 nearest training samples.
KNN = KNearestNeighbors(k_neighbors=18)
y_pred = KNN.predict(x_train, y_train, x_test)
print(y_pred)

# Write the predictions to CSV: an "ID" index counting from 0 and a single
# "Class" column.
row_ids = list(range(len(y_pred)))
y_pred = pd.DataFrame(data=y_pred, index=row_ids, columns=["Class"])
y_pred.index.name = "ID"
y_pred.to_csv("0610101_result.csv", sep=',')