In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from collections import Counter

%matplotlib inline 

def read_dataset(feature_file, label_file):
    """Load a feature CSV and a label CSV and return their contents as arrays.

    Args:
        feature_file: Path (or file-like object) handed to ``pd.read_csv``
            for the feature table.
        label_file: Path (or file-like object) handed to ``pd.read_csv``
            for the label table.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays taken from ``DataFrame.values``.
        Both are 2-D, so a single label column comes back with shape (n, 1).
    """
    features = pd.read_csv(feature_file).values
    labels = pd.read_csv(label_file).values
    return features, labels

# Load the training and test splits (features and labels live in separate CSVs).
X_train, y_train = read_dataset(
    feature_file='Digits_X_train.csv',
    label_file='Digits_y_train.csv',
)
X_test, y_test = read_dataset(
    feature_file='Digits_X_test.csv',
    label_file='Digits_y_test.csv',
)


'''
    Normalize the features (z-score standardization):
    subtract the per-feature mean and divide by the per-feature standard deviation
'''
def get_mean_variance(X):
    """Return the per-column mean and population variance of ``X``.

    Args:
        X: Array whose first axis indexes samples.

    Returns:
        Tuple ``(mean, variance)``, both reduced along axis 0. The variance is
        the mean squared deviation, sum((x_i - mu)^2) / N (no N-1 correction).
    """
    mean = np.mean(X, axis=0)  # column-wise average
    centered = X - mean
    variance = np.mean(np.square(centered), axis=0)
    return mean, variance
    
def normalize_features(X_train, X_test):
    """Standardize both splits using statistics of the training split only.

    Args:
        X_train: Training feature array; its column mean and variance define
            the transform.
        X_test: Test feature array, transformed with the training statistics.

    Returns:
        Tuple ``(X_train_norm, X_test_norm)``.
    """
    mean, variance = get_mean_variance(X_train)
    # The tiny offset keeps constant (zero-variance) columns from dividing by 0.
    # np.sqrt is used because math.sqrt does not operate on arrays.
    std = np.sqrt(variance + 1e-15)
    return (X_train - mean) / std, (X_test - mean) / std

# Standardize both splits with the training-set mean and variance.
X_train_norm, X_test_norm = normalize_features(X_train=X_train, X_test=X_test)

## Do not need to add bias

## No predictor

## k-NN model

In [125]:
## find common labels
def find_common_labels(labels):
    """Return the label that occurs most often in ``labels``.

    Args:
        labels: Iterable of hashable labels.

    Returns:
        The most frequent label; ties are resolved by ``Counter.most_common``.

    Raises:
        IndexError: If ``labels`` is empty.
    """
    ranking = Counter(labels).most_common(1)
    winner, _count = ranking[0]
    return winner
# label_list = [0, 1, 0, 3, 2, 0, 1, 0, 2, 3]
# find_common_labels(label_list)

In [126]:
## get Distance
def k_NN_EuclideanDistance(X, point, k):
    """Return the row indices of the ``k`` samples in ``X`` closest to ``point``.

    Args:
        X: 2-D array of samples, one per row.
        point: 1-D feature vector with as many entries as ``X`` has columns.
        k: Number of neighbours to return.

    Returns:
        1-D integer array of at most ``k`` row indices, ordered from nearest
        to farthest by Euclidean distance. Equal distances keep their original
        row order because the sort is stable.
    """
    # Cast to float so integer inputs (e.g. unsigned pixel values) cannot wrap
    # around when subtracted.
    diff = np.asarray(X, dtype=float) - np.asarray(point, dtype=float)
    distance = np.sqrt(np.sum(diff * diff, axis=1))
    k_NN_indices = np.argsort(distance, kind='stable')[:k]
    return k_NN_indices

def predict_label(X_train, y_train, X_test, k):
    """Classify every row of ``X_test`` by majority vote among its k nearest
    training samples.

    Args:
        X_train: 2-D array of training features.
        y_train: Training labels, one per training row; either 1-D or a
            single-column 2-D array.
        X_test: 2-D array of test features.
        k: Number of neighbours that vote.

    Returns:
        1-D integer array with one predicted label per test row.
    """
    # Flatten once: indexing a column vector would yield array rows, which
    # Counter cannot hash.
    train_labels = np.asarray(y_train).ravel()
    n_test_samples = X_test.shape[0]
    y_pred = []
    for i in range(n_test_samples):
        test_point = X_test[i]
        k_NN_indices = k_NN_EuclideanDistance(X_train, test_point, k)
        k_NN_labels = train_labels[k_NN_indices]
        pred_label = find_common_labels(k_NN_labels)
        y_pred.append(pred_label)
    return np.asarray(y_pred, dtype=int)

## Define your own score for k-NN

### One-hot-Encoder

In [2]:
def one_hot_encoder(y_train, y_test):
    """Convert both label arrays to one-hot rows with a shared encoder.

    The encoder is fitted on ``y_train`` only and then applied to both
    splits, so both outputs use the same column layout.

    Args:
        y_train: Training labels used to fit the encoder.
        y_test: Test labels transformed with the fitted encoder.

    Returns:
        Tuple ``(y_train_ohe, y_test_ohe)``.
    """
    from sklearn.preprocessing import LabelBinarizer

    binarizer = LabelBinarizer()
    binarizer.fit(y_train)
    return binarizer.transform(y_train), binarizer.transform(y_test)
# Examples, assuming the ten digit classes 0-9:
# label is 0 -> [1 0 0 0 0 0 0 0 0 0]
# label is 3 -> [0 0 0 1 0 0 0 0 0 0]
# One-hot encode both label sets with an encoder fitted on the training labels.
y_train_ohe, y_test_ohe = one_hot_encoder(y_train=y_train, y_test=y_test)

### Define predictor scores (or probability matrix)

In [None]:
def get_predictor_scores(X_train, y_train, X_test, k):
    """Estimate per-class scores for each test sample from its k nearest
    training neighbours.

    Args:
        X_train: 2-D array of training features.
        y_train: One-hot encoded training labels, shape (n_train, n_classes).
        X_test: 2-D array of test features.
        k: Number of neighbours to consult.

    Returns:
        Array of shape (n_test, n_classes). Entry ``[i, c]`` is the fraction
        of the neighbours of test sample ``i`` whose one-hot entry for class
        ``c`` equals 1, among those whose entry is 0 or 1.
    """
    n_test_samples = X_test.shape[0]
    n_classes = y_train.shape[1]
    probability_matrix = np.zeros((n_test_samples, n_classes))
    for i in range(n_test_samples):
        k_NN_indices = k_NN_EuclideanDistance(X_train, X_test[i], k)
        k_NN_labels = y_train[k_NN_indices]  # shape (k, n_classes)
        # Count ones and zeros for all classes at once instead of looping
        # over the columns.
        num_ones = (k_NN_labels == 1).sum(axis=0)
        num_zeros = (k_NN_labels == 0).sum(axis=0)
        probability_matrix[i] = num_ones / (num_zeros + num_ones)
    return probability_matrix
# Class-score matrix for the normalized test set, using 3 nearest neighbours.
probability_matrix = get_predictor_scores(
    X_train=X_train_norm,
    y_train=y_train_ohe,
    X_test=X_test_norm,
    k=3,
)

In [None]:
# print(y_test_ohe)
# print(probability_matrix)
def predict_binary_label(score, threshold):
    """Binarize scores: 1 where ``score >= threshold``, 0 where it is below.

    Args:
        score: Array-like of scores; it is copied, never modified.
        threshold: Cut-off value.

    Returns:
        Array with the shape and dtype of ``np.copy(score)`` holding 0/1.
        Entries that compare neither ``>=`` nor ``<`` to the threshold
        (i.e. NaN) are left unchanged.
    """
    label = np.copy(score)
    # Take both masks from the untouched scores. Building the "< threshold"
    # mask after writing the 1s would reset them to 0 whenever threshold > 1.
    at_or_above = label >= threshold
    below = label < threshold
    label[at_or_above] = 1
    label[below] = 0
    return label

def get_confusion_matrix(predictor_score, true_labels):
    """Compute confusion-matrix rates over a sweep of decision thresholds.

    The threshold runs from 1.00 down to 0.00 in steps of 0.01 (101 values).
    At each threshold the scores are binarized with ``predict_binary_label``
    and compared with ``true_labels``.

    Args:
        predictor_score: Array of scores for one binary problem (presumably
            probabilities in [0, 1], to be confirmed against the caller).
        true_labels: Array of binary labels (0 and 1) aligned with
            ``predictor_score``.

    Returns:
        Tuple ``(TNR_list, FPR_list, FNR_list, TPR_list)``, each holding one
        rate per threshold in sweep order. A rate whose denominator is zero
        (no negatives or no positives in ``true_labels``) is not guarded
        against.
    """
    # The ground-truth masks do not depend on the threshold, so build them once.
    is_negative = true_labels == 0
    is_positive = true_labels == 1

    TNR_list = []
    TPR_list = []
    FNR_list = []
    FPR_list = []
    for i in range(100, -1, -1):
        threshold = i / 100
        pred_labels = predict_binary_label(predictor_score, threshold)
        pred_negative = pred_labels == 0
        pred_positive = pred_labels == 1
        # Compare predictions with the truth -> TN, FP, FN, TP counts.
        TN = np.sum(np.logical_and(pred_negative, is_negative))
        FP = np.sum(np.logical_and(pred_positive, is_negative))
        FN = np.sum(np.logical_and(pred_negative, is_positive))
        TP = np.sum(np.logical_and(pred_positive, is_positive))
        TNR_list.append(TN / (TN + FP))
        FPR_list.append(FP / (TN + FP))
        FNR_list.append(FN / (FN + TP))
        TPR_list.append(TP / (FN + TP))
    return TNR_list, FPR_list, FNR_list, TPR_list

def plot_ROC_curve(FPR_list, TPR_list):
    """Draw an ROC curve together with the diagonal reference line.

    Args:
        FPR_list: False-positive rates (x coordinates).
        TPR_list: True-positive rates (y coordinates).
    """
    ax = plt.gca()
    ax.plot(FPR_list, TPR_list, color='g', lw=2)  # ROC curve
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # diagonal line
    # Leave a little headroom past 1 so the curve is not clipped at the border.
    ax.set_xlim([0, 1.01])
    ax.set_ylim([0, 1.01])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.show()
    

def ROC_curve(FPR_list, TPR_list):
    """Expand ROC points into the vertices of a staircase curve.

    For each consecutive pair of points the curve first moves horizontally
    (FPR changes at the current TPR) and then vertically.

    Args:
        FPR_list: False-positive rates, one per threshold.
        TPR_list: True-positive rates, aligned with ``FPR_list``.

    Returns:
        Tuple ``(FPR_ROC, TPR_ROC)`` of lists with the staircase vertices.
        Inputs with fewer than two points are returned as plain list copies,
        since there is no step to draw.
    """
    n_points = len(FPR_list)
    if n_points < 2:
        # Indexing [-2] below would raise IndexError for such inputs.
        return list(FPR_list), list(TPR_list)

    FPR_ROC = []
    TPR_ROC = []
    for i in range(n_points - 1):
        FPR_ROC += [FPR_list[i], FPR_list[i + 1]]
        TPR_ROC += [TPR_list[i], TPR_list[i]]
    # Close the final step. The first of these two vertices repeats the last
    # one emitted by the loop; it is kept so the output length is unchanged.
    FPR_ROC += [FPR_list[-1], FPR_list[-1]]
    TPR_ROC += [TPR_list[-2], TPR_list[-1]]
    return FPR_ROC, TPR_ROC  # staircase vertices

## Predict labels

In [134]:
# NOTE(review): k=3 is an assumption (it matches the neighbour count passed to
# get_predictor_scores in this notebook) -- confirm the intended value.
y_pred = predict_label(X_train_norm, y_train, X_test_norm, 3)

## Accuracy

In [135]:
def accuracy(ypred, yexact):
    """Return the fraction of predictions that equal the true labels.

    Args:
        ypred: Predicted labels (array-like); flattened before comparison.
        yexact: True labels (array-like); flattened before comparison.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ValueError: If the inputs are empty or differ in number of elements.
    """
    predicted = np.asarray(ypred).ravel()
    expected = np.asarray(yexact).ravel()
    if predicted.size != expected.size:
        raise ValueError("ypred and yexact must have the same number of elements")
    if predicted.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    return float(np.mean(predicted == expected))
# Report test accuracy; y_test is flattened to match the 1-D predictions.
print("Accuracy = %.3f" % accuracy(ypred=y_pred, yexact=y_test.ravel()))

Accuracy = 0.971
