### KNN.py


In [24]:
def quadratic_knn_search(dataset, testdata, K):
    '''
        Function: brute-force search for the K nearest neighbours of testdata in dataset
        Param:
            dataset: 2-D array of candidate points, one point per row
            testdata: single query vector, broadcast against every row of dataset
            K: number of neighbours wanted (capped at the number of rows in dataset)
        Return: row indices into dataset of the K closest points, nearest first
    '''
    n_rows = dataset.shape[0]
    # Never ask for more neighbours than there are candidate rows.
    if not K < n_rows:
        K = n_rows

    # Squared Euclidean distance is enough here: only the ordering is used.
    offsets = testdata - dataset
    squared_dist = (offsets ** 2).sum(axis=1)

    nearest_first = np.argsort(squared_dist)
    return nearest_first[:K]

def knn_predict(train_data, train_label, test_data, K):
    '''
        Function: predict binary labels with a K-nearest-neighbour majority vote
        Param:
            train_data: training samples, one row per sample
            train_label: 0/1 label of each training row; must support
                         indexing by an array of row indices
            test_data: samples to classify, one row per sample
            K: number of neighbours that take part in the vote
        Return: list holding one predicted label (0 or 1) per row of test_data;
                a tie is resolved to 0
    '''
    knn_predict_label = []
    for sample in test_data:
        knn_idx = quadratic_knn_search(train_data, sample, K)
        # quadratic_knn_search caps K at the training-set size, so the majority
        # threshold must be based on the number of neighbours actually returned
        # rather than on the requested K.
        positive_votes = train_label[knn_idx].sum()
        predict_label = 1 if positive_votes > len(knn_idx) / 2 else 0
        knn_predict_label.append(predict_label)
    return knn_predict_label


def test_knn(train_data, train_label, test_data, test_label, K):
    '''
        Function: run KNN on the test set, then print its scores and the elapsed time
        Param:
            train_data: training samples
            train_label: labels of the training samples
            test_data: samples to classify
            test_label: ground-truth labels of test_data
            K: knn param
        Return: None (results are printed)
    '''
    # time.clock was removed in Python 3.8; perf_counter is the documented
    # replacement for measuring elapsed intervals.
    from time import perf_counter

    t0 = perf_counter()
    predict_label = knn_predict(train_data, train_label, test_data, K)
    F1, P, R = calc_F1_score(predict_label, test_label)
    t1 = perf_counter()

    print(f"======= KNN: K = {K} ====== ")
    print(f"F1 score: {F1}, Precision: {P}, Recall: {R}")
    print(f"time used: {t1 - t0}")



In [17]:
try:
    from time import clock  # only exists on Python < 3.8
except ImportError:
    # time.clock was removed in Python 3.8; perf_counter is its replacement.
    from time import perf_counter as clock

import numpy as np
import pandas as pd

def calc_F1_score(predict_label, truth_label):
    '''
        Function: calculate F1 score for binary (0/1) labels
        Param:
            predict_label: predicted labels (list or array of 0/1)
            truth_label: ground-truth labels (list or array of 0/1), same length
        Return: list: [F1, Precision, Recall]
                A metric whose denominator is zero (e.g. no positive
                predictions) is reported as 0.0 instead of raising.
    '''
    # Accept plain lists as well as arrays.
    predicted = np.asarray(predict_label)
    actual = np.asarray(truth_label)

    TP = int(np.sum((predicted == 1) & (actual == 1)))  # predict = 1, truth = 1
    FP = int(np.sum((predicted == 1) & (actual == 0)))  # predict = 1, truth = 0
    FN = int(np.sum((predicted == 0) & (actual == 1)))  # predict = 0, truth = 1
    # True negatives are not needed for precision, recall or F1.

    P = TP / (TP + FP) if TP + FP else 0.0  # precision
    R = TP / (TP + FN) if TP + FN else 0.0  # recall
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    return [F1, P, R]

def main():
    '''
        Function: load the student maths data, build a pass/fail label from G3,
                  split it 70/30 into train/test sets and scale the features
        Param: none
        Return: None (the scaled training features are printed)
    '''
    math_data = pd.read_csv('data/student/student-mat.csv', sep=';')

    # select feature
    feature_name = ['G1', 'G2', 'G3']
    # Explicit copy: the edits below must modify this frame, not a view of
    # math_data (chained assignment on a slice is unreliable in pandas).
    data = math_data[feature_name].copy()

    # G3 == 1: pass, G3 == 0: fail
    data['G3'] = (data['G3'] >= 10).astype(int)
    # other preprocessing here  #TODO

    # train data and test data partition
    train_data = data.sample(frac=0.7, axis=0)
    test_data = data[~data.index.isin(train_data.index)]

    train_label = train_data['G3'].to_numpy()
    test_label = test_data['G3'].to_numpy()

    train_data = train_data.drop(columns='G3')
    test_data = test_data.drop(columns='G3')

    # Scale both sets with the training-set maxima so that train and test
    # points live on the same scale when distances are computed.
    feature_max = train_data.max(axis=0)
    train_data = (train_data / feature_max).to_numpy()
    test_data = (test_data / feature_max).to_numpy()

    print(train_data)

    # test_knn(train_data, train_label, test_data, test_label, 1)
    
# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

[[0.63157895 0.68421053]
 [0.68421053 0.68421053]
 [0.52631579 0.57894737]
 [0.47368421 0.42105263]
 [0.26315789 0.26315789]
 [0.73684211 0.73684211]
 [0.42105263 0.42105263]
 [0.31578947 0.26315789]
 [0.42105263 0.42105263]
 [0.52631579 0.47368421]
 [0.42105263 0.36842105]
 [0.42105263 0.47368421]
 [0.63157895 0.        ]
 [0.68421053 0.63157895]
 [0.68421053 0.73684211]
 [0.68421053 0.68421053]
 [0.84210526 0.84210526]
 [0.36842105 0.31578947]
 [0.63157895 0.57894737]
 [0.36842105 0.52631579]
 [0.57894737 0.57894737]
 [0.36842105 0.36842105]
 [0.26315789 0.26315789]
 [0.52631579 0.63157895]
 [0.73684211 0.78947368]
 [0.52631579 0.        ]
 [0.57894737 0.68421053]
 [0.47368421 0.42105263]
 [0.52631579 0.52631579]
 [0.52631579 0.47368421]
 [0.63157895 0.68421053]
 [0.42105263 0.47368421]
 [0.47368421 0.47368421]
 [0.42105263 0.52631579]
 [0.52631579 0.47368421]
 [0.68421053 0.68421053]
 [0.36842105 0.36842105]
 [0.63157895 0.57894737]
 [0.63157895 0.63157895]
 [0.78947368 0.78947368]


### SVM