# In[ ]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from sklearn.metrics import roc_auc_score, average_precision_score
from prettytable import PrettyTable
from sklearn.neighbors import KNeighborsClassifier
import random

# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("20000_reduced_featureSelectedAllDataWithY.csv")
# Print (rows, columns) as a quick sanity check on what was loaded.
print(df.shape)
# Bare last expression: presumably relies on notebook cell display to show
# the first rows; it has no visible effect when run as a plain script.
df.head()

# In[ ]:
# Name of the label column; every other column of the data frame is a feature.
_LABEL_COLUMN = "disposition"
# Metric columns, in the order they appear in the result table.
_METRIC_COLUMNS = ('Accuracy', 'auROC', 'auPRC', 'recall', 'precision', 'f1', 'MCC')


def _split_and_scale(train_frame, test_frame):
    """Return (X_train, y_train, X_test, y_test), standardized on the training rows."""
    scaler = StandardScaler()
    # Fit the scaler on the training features only and reuse it for the test
    # features, so both sets live in the same coordinate system (KNN distances
    # are meaningless otherwise) and no test statistics leak into the model.
    X_train = scaler.fit_transform(train_frame.drop(_LABEL_COLUMN, axis=1))
    X_test = scaler.transform(test_frame.drop(_LABEL_COLUMN, axis=1))
    return X_train, train_frame[_LABEL_COLUMN], X_test, test_frame[_LABEL_COLUMN]


def _score_classifier(clf, X_test, y_test):
    """Evaluate a fitted classifier and return one score per metric column."""
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is the score of the second entry of
    # clf.classes_; NOTE(review): this assumes binary 0/1 labels with 1 as the
    # positive class, matching the default pos_label of the metrics below.
    positive_score = clf.predict_proba(X_test)[:, 1]
    return {
        'Accuracy': accuracy_score(y_true=y_test, y_pred=y_pred),
        'auROC': roc_auc_score(y_true=y_test, y_score=positive_score),
        # Average precision must be fed the positive-class score, like auROC.
        'auPRC': average_precision_score(y_true=y_test, y_score=positive_score),
        'recall': recall_score(y_true=y_test, y_pred=y_pred),
        'precision': precision_score(y_true=y_test, y_pred=y_pred),
        'f1': f1_score(y_true=y_test, y_pred=y_pred),
        'MCC': matthews_corrcoef(y_true=y_test, y_pred=y_pred),
    }


def _metrics_table(score_rows, sample_counts=None):
    """Build a PrettyTable with one row per evaluation, rounded to 4 decimals."""
    table = PrettyTable()
    if sample_counts is not None:
        table.add_column('sampleNum', sample_counts)
    for metric in _METRIC_COLUMNS:
        table.add_column(metric, np.round([row[metric] for row in score_rows], 4))
    return table


def _run_one_time(data, nneighbor):
    """Fit once on a random 80/20 train/test split and return the score table."""
    training_data, testing_data = train_test_split(data, test_size=0.2)
    X_train, y_train, X_test, y_test = _split_and_scale(training_data, testing_data)
    clf = KNeighborsClassifier(n_neighbors=nneighbor, metric='minkowski', p=2)
    clf.fit(X_train, y_train)
    return _metrics_table([_score_classifier(clf, X_test, y_test)])


def _run_iterative(data, initialFile, nneighbor, once_add, output_file):
    """Grow the training set by random batches, scoring on the remaining rows."""
    # The first column of the file holds positional row indices of the
    # initial training set.
    initial_df = pd.read_csv(initialFile, sep=" ", header=None)
    initial_index = list(initial_df.iloc[:, 0])
    initial = len(initial_index)
    # Set membership keeps this linear instead of len(data) * len(initial_index).
    already_used = set(initial_index)
    remain_index = [x for x in range(len(data)) if x not in already_used]
    if not remain_index:
        raise ValueError("initialFile covers every row of data; nothing is left to evaluate on")
    current_dat = data.iloc[initial_index]

    model = KNeighborsClassifier(n_neighbors=nneighbor, metric='minkowski', p=2)
    sampleNum = []
    score_rows = []
    round_no = 0
    while True:
        # Train on everything selected so far, evaluate on everything else.
        X_train, y_train, X_test, y_test = _split_and_scale(current_dat, data.iloc[remain_index])
        clf = model.fit(X_train, y_train)
        sampleNum.append(initial + once_add * round_no)
        score_rows.append(_score_classifier(clf, X_test, y_test))

        # Stop once another batch would leave no rows to evaluate on.
        if once_add >= len(remain_index):
            break
        print(len(remain_index))
        new_index = random.sample(remain_index, once_add)
        #################### active learning ####################
        # Other query strategies would pick new_index here.
        #################### active learning ####################
        picked = set(new_index)
        remain_index = [idx for idx in remain_index if idx not in picked]
        current_dat = pd.concat([current_dat, data.iloc[new_index]])
        print(round_no)
        round_no += 1

    table = _metrics_table(score_rows, sampleNum)
    if output_file is None:
        # Legacy default location, kept so existing calls write where they always did.
        filename = "C://CMU/Courses/Automation/Project/KNN_" + str(once_add) + ".csv"
    else:
        filename = output_file
    with open(filename, 'w', newline='') as f_output:
        f_output.write(table.get_csv_string())
    return table


# construct the KNN model
def KNNModel(data, initialFile=None, nneighbor=8, method="one-time",  once_add=1000, query_method = "random",
             output_file=None):
    """Train a k-nearest-neighbours classifier on ``data`` and tabulate its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``disposition`` label column; all other columns are
        used as features. The metrics assume binary labels with 1 as the
        positive class (TODO confirm against the dataset).
    initialFile : path-like, optional
        Space-separated, header-less file whose first column lists the
        positional row indices of the initial training set. Required when
        ``method == "iterative"``.
    nneighbor : int
        Number of neighbours passed to ``KNeighborsClassifier``.
    method : {"one-time", "iterative"}
        ``"one-time"`` fits once on a random 80/20 train/test split.
        ``"iterative"`` starts from the rows in ``initialFile``, scores on all
        remaining rows, then repeatedly moves ``once_add`` randomly chosen
        rows from the remaining pool into the training set and re-scores,
        until fewer than ``once_add + 1`` rows remain.
    once_add : int
        Number of rows added per round in iterative mode; must be positive.
    query_method : str
        Strategy for picking new rows; only ``"random"`` is implemented.
    output_file : path-like, optional
        Where iterative mode writes the table as CSV. ``None`` keeps the
        legacy hard-coded location.

    Returns
    -------
    PrettyTable
        One row per evaluation with Accuracy, auROC, auPRC, recall,
        precision, f1 and MCC (plus ``sampleNum`` in iterative mode), each
        rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``method`` or ``query_method`` is not supported, or, in iterative
        mode, if ``initialFile`` is missing, ``once_add`` is not positive, or
        the initial indices leave no rows to evaluate on.
    """
    # Validate everything up front so a bad argument fails immediately instead
    # of returning None or crashing midway through a long run.
    if method not in ("one-time", "iterative"):
        raise ValueError("method must be 'one-time' or 'iterative', got %r" % (method,))
    if method == "one-time":
        return _run_one_time(data, nneighbor)

    if initialFile is None:
        raise ValueError("initialFile is required when method is 'iterative'")
    if once_add <= 0:
        raise ValueError("once_add must be a positive integer, got %r" % (once_add,))
    if query_method != "random":
        raise ValueError("unsupported query_method %r; only 'random' is implemented" % (query_method,))
    return _run_iterative(data, initialFile, nneighbor, once_add, output_file)