### KNN.py


In [24]:
def quadratic_knn_search(dataset, testdata, K):
    '''
        Function: brute-force search for the rows of dataset closest to testdata
        Param:
            dataset: 2-D array, one sample per row
            testdata: single sample vector, broadcast against every row
            K: number of neighbors requested (capped at the number of rows)
        Return: indices into dataset of the K closest rows, nearest first.
    '''
    # Never ask for more neighbors than there are rows.
    n_neighbors = min(K, dataset.shape[0])
    # Squared Euclidean distance is enough for ranking, so no sqrt is taken.
    diff = testdata - dataset
    squared_dist = (diff ** 2).sum(axis=1)
    nearest_first = np.argsort(squared_dist)
    return nearest_first[:n_neighbors]

def knn_predict(train_data, train_label, test_data, K):
    '''
        Function: predict binary labels with a K-nearest-neighbor majority vote
        Param:
            train_data: training samples, one per row
            train_label: array of 0/1 labels aligned with the rows of train_data
            test_data: samples to classify, one per row
            K: number of neighbors that take part in the vote
        Return: list with one predicted label (0 or 1) per row of test_data
    '''
    knn_predict_label = []
    for sample in test_data:
        knn_idx = quadratic_knn_search(train_data, sample, K)
        # The search caps K at the training-set size, so the majority
        # threshold must be based on the number of neighbors actually
        # returned rather than on the requested K.
        n_votes = len(knn_idx)
        positive_votes = train_label[knn_idx].sum()
        # A tie (exactly half the votes) is resolved as label 0.
        knn_predict_label.append(1 if positive_votes > n_votes / 2 else 0)
    return knn_predict_label


def test_knn(train_data, train_label, test_data, test_label, K):
    '''
        Function: run knn_predict on the test set, score it with
                  calc_F1_score and print the scores and the elapsed time
        Param:
            train_data, train_label: training samples and their labels
            test_data, test_label: samples to classify and their true labels
            K: knn param forwarded to knn_predict
        Return: None (results are printed)
    '''
    # time.clock was removed in Python 3.8; perf_counter is the documented
    # replacement for measuring elapsed time.
    from time import perf_counter

    t0 = perf_counter()
    predict_label = knn_predict(train_data, train_label, test_data, K)
    F1, P, R = calc_F1_score(predict_label, test_label)
    t1 = perf_counter()

    print(f"======= KNN: K = {K} ====== ")
    print(f"F1 score: {F1}, Precision: {P}, Recall: {R}")
    print(f"time used: {t1 - t0}")



In [3]:
import pandas as pd 
import numpy as np 
from time import clock
from sklearn import preprocessing
from SVM import *

def calc_F1_score(predict_label, truth_label):
    '''
        Function: calculate F1 score for binary (0/1) labels
        Param:
            predict_label: predicted labels (list or array of 0/1)
            truth_label: ground-truth labels (list or array of 0/1)
        Return: list: [F1, Precision, Recall]
                Precision, Recall and F1 are reported as 0.0 when their
                denominator is zero.
    '''
    # Accept plain lists as well as arrays (knn_predict returns a list).
    predicted = np.asarray(predict_label)
    actual = np.asarray(truth_label)

    TP = int(np.sum((predicted == 1) & (actual == 1)))  # predict = 1, truth = 1
    FP = int(np.sum((predicted == 1) & (actual == 0)))  # predict = 1, truth = 0
    FN = int(np.sum((predicted == 0) & (actual == 1)))  # predict = 0, truth = 1
    # True negatives are not needed for precision, recall or F1.

    P = TP / (TP + FP) if TP + FP else 0.0  # precision
    R = TP / (TP + FN) if TP + FN else 0.0  # recall
    F1 = 2 * P * R / (P + R) if P + R else 0.0
    return [F1, P, R]



def main():
    '''
        Function: load the student math dataset, derive a pass/fail label
                  from G3, split it into train/test parts and print the
                  scaled training features (raw and multiplied by the label)
    '''
    math_data = pd.read_csv('data/student/student-mat.csv', sep=';')

    # select feature
    feature_name = ['Medu', 'G1', 'G2', 'G3']
    # Take an explicit copy: assigning into a column subset through chained
    # indexing (data.G3[mask] = ...) is unreliable and is a no-op under
    # pandas copy-on-write.
    data = math_data[feature_name].copy()

    # G3 == 1: pass, G3 == 0: fail
    data['G3'] = (data['G3'] >= 10).astype(int)
    # other preprocessing here  #TODO

    # train data and test data partition
    train_data = data.sample(frac=0.7, axis=0)
    test_data = data[~data.index.isin(train_data.index)].copy()

    # pop() removes the label column and returns it in one step
    train_label = train_data.pop('G3').to_numpy()
    test_label = test_data.pop('G3').to_numpy()

    # NOTE(review): only the training features are label-encoded, and each
    # split is scaled by its own column maximum - confirm this is intended.
    train_data = train_data.apply(preprocessing.LabelEncoder().fit_transform)
    print(train_data)

    train_data = train_data / train_data.max(axis=0)
    test_data = test_data / test_data.max(axis=0)
    train_data = train_data.to_numpy()
    test_data = test_data.to_numpy()

    # Pending experiments:
    # test_knn(train_data, train_label, test_data, test_label, 1)
    # SVM_fit(train_data, train_label, kernel_linear, 100)

    # Multiply every feature row by its label (rows labeled 0 become zero).
    ndata = train_data.shape[0]
    print(train_data * train_label.reshape((ndata, 1)))


if __name__ == "__main__":
    main()

Medu  G1  G2
297     4   6   5
16      4   9  11
202     1   5   6
214     4   4   7
242     4   2   0
..    ...  ..  ..
245     2  14  15
220     2   2   3
96      4   7  12
342     3  12  12
181     3   8  10

[276 rows x 3 columns]
[[0.         0.         0.        ]
 [1.         0.6        0.6875    ]
 [0.25       0.33333333 0.375     ]
 [1.         0.26666667 0.4375    ]
 [0.         0.         0.        ]
 [0.         0.         0.        ]
 [0.75       0.6        0.75      ]
 [1.         0.33333333 0.5       ]
 [1.         0.8        0.875     ]
 [0.         0.         0.        ]
 [0.25       0.46666667 0.5625    ]
 [0.         0.         0.        ]
 [0.75       0.53333333 0.5625    ]
 [0.5        0.4        0.4375    ]
 [0.5        0.53333333 0.625     ]
 [0.         0.         0.        ]
 [1.         0.66666667 0.5       ]
 [0.5        0.8        0.8125    ]
 [0.5        0.33333333 0.4375    ]
 [1.         0.6        0.6875    ]
 [0.         0.         0.        ]
 [0.5    

### SVM

In [None]:
# class SVM:
#     def __init__(self, xi, C)
import numpy as np
import qpsolvers as qs

def kernel_linear(x, y):
    '''
        Function: linear kernel
        Param: x, y: operands handed to np.dot in that order
        Return: np.dot(x, y)
    '''
    product = np.dot(x, y)
    return product

def SVM_fit(train_data, train_label, kernel, C):
    '''
        Function: solve the soft-margin SVM dual problem with a QP solver
            minimize    1/2 * a^T P a - sum(a)
            subject to  sum(a_i * y_i) = 0  and  0 <= a_i <= C
            where P[i][j] = y_i * y_j * kernel(x_i, x_j)
        Param:
            train_data: training samples, one per row
            train_label: binary labels; values > 0 are used as +1, all
                         other values as -1
            kernel: function called as kernel(samples_matrix, sample) that
                    returns one kernel value per row of samples_matrix
            C: upper bound applied to every multiplier
        Return: list: [alpha], where alpha is the value returned by
                qs.solve_qp
    '''
    ndata = train_data.shape[0]  # num of samples
    # The dual is written for labels in {-1, +1}; map 0/1 labels onto that.
    y = np.where(np.asarray(train_label) > 0, 1.0, -1.0)

    # Gram matrix, one row at a time: gram[i][j] = kernel(x_j, x_i)
    vkernel = np.vectorize(kernel, signature='(m,n),(n)->(m)')
    gram = np.zeros((ndata, ndata))
    for i in range(ndata):
        gram[i] = vkernel(train_data, train_data[i])

    # The quadratic term must carry the label product y_i * y_j.
    P = gram * np.outer(y, y)
    print(P)
    # solve_qp minimizes 1/2 x^T P x + q^T x, so maximizing sum(alpha)
    # needs q = -1. With q = +1 the optimum is trivially alpha = 0.
    q = -np.ones(ndata)
    # Single equality constraint y^T alpha = 0; no extra inequality needed.
    A = y.reshape((1, ndata))
    b = np.zeros(1)
    # Box constraint 0 <= alpha_i <= C expressed as per-variable bounds.
    lb = np.zeros(ndata)
    ub = np.full(ndata, float(C))
    # NOTE(review): recent qpsolvers releases may require an explicit
    # solver= argument - confirm against the installed version.
    alpha = qs.solve_qp(P, q, A=A, b=b, lb=lb, ub=ub)
    print(alpha)
    return [alpha]
    
