In [None]:
import io
import time
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold,cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve,average_precision_score,precision_score,recall_score,accuracy_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline


# Load the raw data. The original Colab upload/read_csv lines were commented out,
# which left `cleaned_df` undefined on a fresh kernel.
# NOTE(review): assumes BitcoinHeistData.csv sits in the working directory; on Colab,
# upload the file first (or point DATA_PATH at its location).
DATA_PATH = "BitcoinHeistData.csv"
raw_df = pd.read_csv(DATA_PATH)


#clean the data
# Work on a copy so `raw_df` stays intact and this cell can be re-run safely.
cleaned_df = raw_df.copy()
# Binary target: 0 for the 'white' label, 1 for every other label.
cleaned_df['class'] = (cleaned_df.pop('label') != 'white').astype(int)
eps = 0.001  # small offset so log() of a zero value stays finite
cleaned_df['Log income'] = np.log(cleaned_df.pop('income') + eps)
cleaned_df['Log weight'] = np.log(cleaned_df.pop('weight') + eps)
# These columns are not used as model features.
cleaned_df = cleaned_df.drop(columns=["address", "year", "day"])
print(cleaned_df.head(5))


#summarize the data distribution
y = cleaned_df['class'].to_numpy()
# Select features by name. 'class' is inserted *before* the two log columns, so the
# previous positional slice `iloc[:, :-1]` kept the target inside X (label leakage)
# and silently dropped 'Log weight'.
X = cleaned_df.drop(columns='class').to_numpy()
print(Counter(y))

#rescale data
# NOTE(review): the scaler is fitted on the full data set before cross-validation, so
# fold statistics leak slightly; moving it into the pipeline would avoid that.
scaler = StandardScaler()
X = scaler.fit_transform(X)


#define the undersampling method
# sampling_strategy=0.5 -> after resampling, minority/majority ratio is 0.5.
# Seeded so repeated runs draw the same subsample.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=123)




In [None]:
##cross validation to find out the best neighbor number using kd_tree###
# For each candidate neighbor count, run t-fold cross-validation of the
# (undersample -> KNN) pipeline and record the mean average-precision (AP)
# on the training and validation folds.
t=7
cv = KFold(n_splits=t,shuffle=True,random_state=123)  # seeded so the folds are reproducible
AP_train_score_list=[]
AP_test_score_list=[]

neighbors_num = [3,4,5,6,7]
algorithms = ['kd_tree','ball_tree']
# KNeighborsClassifier defaults for reference: n_neighbors=5, weights='uniform',
# algorithm='auto', leaf_size=30, p=2, metric='minkowski'
for neighbors in neighbors_num:
    neigh = KNeighborsClassifier(n_neighbors=neighbors,weights='distance',algorithm='kd_tree')
    steps = [('under',under),('model',neigh)]
    pipeline = Pipeline(steps=steps)
    scores =  cross_validate(pipeline, X, y, scoring=['average_precision'], cv=cv, n_jobs=-1,return_train_score=True)
    # Mean over folds (one entry per fold in each score array).
    AP_train_score_list.append(np.mean(scores['train_average_precision']))
    AP_test_score_list.append(np.mean(scores['test_average_precision']))


# #generate Fig_1.5.
# Plot the AP lists collected above. The previous version referenced
# recall_*_score_list, which does not exist at this point (NameError) and would not
# match the "AP score" axis label anyway.
fig_0, axs = plt.subplots(1, 1, figsize=(5, 5), sharey=True)
axs.plot(neighbors_num,AP_train_score_list,"r^",linestyle = "--", label='training data')
axs.plot(neighbors_num,AP_test_score_list,"b^",linestyle = "--",label='test data')
axs.set_xlabel("neighbors_num")
axs.set_ylabel("AP score")
axs.legend()
fig_0.suptitle("Fig_1.5.0: AP_score(kd_tree)")
fig_0.savefig("KNN_fig_1.5_tunning:AP score.png")




In [None]:
########check if using ball_tree can get higher AP score######
# The neighbor count here was a redacted placeholder (`**********`), which is a
# SyntaxError as written.
# NOTE(review): using the neighbor count with the highest mean test AP from the
# kd_tree sweep above (neighbors_num / AP_test_score_list) — confirm this is the
# value that was intended.
best_neighbors = neighbors_num[int(np.argmax(AP_test_score_list))]
neigh = KNeighborsClassifier(n_neighbors=best_neighbors,weights='distance',algorithm='ball_tree')
steps = [('under',under),('model',neigh)]
pipeline = Pipeline(steps=steps)
# Reuses the `cv` splitter defined by the tuning cell.
scores =  cross_validate(pipeline, X, y, scoring=['average_precision'], cv=cv, n_jobs=-1,return_train_score=True)
# np.mean averages over however many folds `cv` produced, without relying on `t`.
print("AP_score on training set if using ball_tree to store:",np.mean(scores['train_average_precision']))
print("AP_score on test set if using ball_tree to store:",np.mean(scores['test_average_precision']))
    

In [None]:
###############Apply neighbors_num and storing methods to train the model##################
# For each fold count k, cross-validate the (undersample -> KNN) pipeline and record
# mean train/test recall and mean fit time, then plot them against k.
Recall_train_score=[]
Recall_test_score=[]
training_time = []

# The hyper-parameters were redacted placeholders (`******` / '*****') in the original.
# NOTE(review): neighbor count = best mean test AP from the kd_tree sweep — confirm.
best_neighbors = neighbors_num[int(np.argmax(AP_test_score_list))]
# NOTE(review): 'kd_tree' assumed (it is the structure used in the tuning sweep);
# kd_tree and ball_tree are both exact searches, so this should mainly affect fit
# time rather than scores — confirm the intended choice.
best_algorithm = 'kd_tree'

K = range(4,11)
for k in K:
    cv = KFold(n_splits=k,shuffle=True,random_state=123)
    neigh = KNeighborsClassifier(n_neighbors=best_neighbors,weights='distance',algorithm=best_algorithm)
    steps = [('under',under),('model',neigh)]
    pipeline = Pipeline(steps=steps)
    scores =  cross_validate(pipeline, X, y, scoring=['recall'], cv=cv, n_jobs=-1,return_train_score=True)
    Recall_train_score.append(np.mean(scores['train_recall']))
    # Was appended to the undefined `Recall_test_score_list` (NameError), leaving the
    # list that is actually plotted below empty.
    Recall_test_score.append(np.mean(scores['test_recall']))
    training_time.append(np.mean(scores['fit_time']))

fig_1, axs = plt.subplots(1, 2, figsize=(10, 5), sharey=False)
# Left panel: mean fit time per fold count.
axs[0].plot(K,training_time,"r^",linestyle = "--",label="traing_time:seconds")
axs[0].set_xlabel("training size k")
axs[0].set_ylabel("training time")
# Right panel: mean recall on training vs. held-out folds.
axs[1].plot(K,Recall_train_score,"r^",linestyle = "--",label='training data')
axs[1].plot(K,Recall_test_score,"b^",linestyle = "--",label='test data')
axs[1].set_xlabel("training size k")
axs[1].set_ylabel("Recall_score")
axs[0].legend()
axs[1].legend()
fig_1.suptitle("Fig_1.5.1: recall score(KNN)")
fig_1.savefig("NN_fig_1.5_trainnig:learning curves.png")
    

In [None]:
##################prediction results, generate the data in table1.5.0#########
# Cross-validate the chosen KNN configuration and report mean AP and recall.
# Several values in this cell were redacted placeholders in the original (a
# SyntaxError), and cross_validate was never called, so the prints read a stale
# `scores` dict that has no average-precision entries.

# The original averaged the fold scores by a hard-coded 4, so 4 folds are used.
n_folds = 4
cv = KFold(n_splits=n_folds,shuffle=True,random_state=123)
# NOTE(review): test_size was redacted; 0.25 is scikit-learn's default when neither
# size is given — confirm. The hold-out split is not used by the CV scores below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
# NOTE(review): neighbor count = best mean test AP from the kd_tree sweep — confirm.
best_neighbors = neighbors_num[int(np.argmax(AP_test_score_list))]
# NOTE(review): 'kd_tree' assumed; the original value was redacted — confirm.
best_algorithm = 'kd_tree'
neigh = KNeighborsClassifier(n_neighbors=best_neighbors,weights='distance',algorithm=best_algorithm)
steps = [('under',under),('model',neigh)]
pipeline = Pipeline(steps=steps)
# Score both metrics in one pass so the keys read below actually exist.
scores = cross_validate(pipeline, X, y, scoring=['recall','average_precision'], cv=cv, n_jobs=-1,return_train_score=True)
print("The AP scores on training set and test set are",np.mean(scores['train_average_precision']),np.mean(scores['test_average_precision']))
print("The Recall scores on training set and test set are",np.mean(scores['train_recall']),np.mean(scores['test_recall']))

In [None]:
# #generate Fig_0,Fig_1
# The lists plotted here (`fit_time_list`, `recall_*_score_list`) and `m` were not
# defined by any cell above this one, so on a fresh kernel the cell raised NameError.
# Compute the kd_tree recall / AP / fit-time results here so the figures are
# reproducible from top to bottom.
# Relies on `neighbors_num`, `cv`, `under`, `X` and `y` from earlier cells.
m = len(neighbors_num)
recall_train_score_list=[]
recall_test_score_list=[]
AP_train_score_list=[]
AP_test_score_list=[]
fit_time_list=[]
for neighbors in neighbors_num:
    neigh = KNeighborsClassifier(n_neighbors=neighbors,weights='distance',algorithm='kd_tree')
    steps = [('under',under),('model',neigh)]
    pipeline = Pipeline(steps=steps)
    scores =  cross_validate(pipeline, X, y, scoring=['recall','average_precision'], cv=cv, n_jobs=-1,return_train_score=True)
    # Each score array has one entry per fold; store the mean over folds.
    recall_train_score_list.append(np.mean(scores['train_recall']))
    recall_test_score_list.append(np.mean(scores['test_recall']))
    AP_train_score_list.append(np.mean(scores['train_average_precision']))
    AP_test_score_list.append(np.mean(scores['test_average_precision']))
    fit_time_list.append(np.mean(scores['fit_time']))

# Fig 0: mean fit time per neighbor count.
fig_0, axs = plt.subplots(1, 1, figsize=(5, 5), sharey=True)
axs.plot(neighbors_num[:m],fit_time_list[:m],"r^",linestyle = "--",label="fit_time,seconds")
axs.set_xlabel("neighbors_num")
axs.set_ylabel("fit_time")
axs.legend()
fig_0.suptitle("Fig_1.5.0: Fit_time(kd_tree)")

fig_0.savefig("KNN_fig_1.5_kd:Fit_time.png")



# Fig 1: recall (left) and AP (right), training vs. held-out folds.
fig_1, axs = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
axs[0].plot(neighbors_num[:m],recall_train_score_list[:m],"r^",linestyle = "--", label='training data')
axs[0].plot(neighbors_num[:m],recall_test_score_list[:m],"b^",linestyle = "--",label='test data')
axs[0].set_xlabel('neighbors_num')
axs[0].set_ylabel('Recall_score')
axs[1].plot(neighbors_num[:m],AP_train_score_list,"r^",linestyle = "--",label='training data')
axs[1].plot(neighbors_num[:m],AP_test_score_list,"b^",linestyle = "--",label='test data')
axs[1].set_xlabel('neighbors_num')
axs[1].set_ylabel('AP_score')
fig_1.suptitle('Fig_1.5.1:Recall_score,AP_score(kd_tree)')

axs[0].legend()
axs[1].legend()
fig_1.savefig("KNN_fig_1.5_kd:scores.png")



In [None]:
##cross validation to find out the best neighbor number using ball_tree###
# For each candidate neighbor count, cross-validate the (undersample -> KNN)
# pipeline with ball_tree and record mean recall, mean AP and mean fit time.
# Relies on `cv`, `under`, `X` and `y` from earlier cells.
recall_train_score_list=[]
recall_test_score_list=[]
AP_train_score_list=[]
AP_test_score_list=[]
# Was never initialised, so the append in the loop raised NameError on a fresh kernel.
fit_time_list=[]


neighbors_num = [3,5,7]
algorithms = ['kd_tree','ball_tree']
# KNeighborsClassifier defaults for reference: n_neighbors=5, weights='uniform',
# algorithm='auto', leaf_size=30, p=2, metric='minkowski'
for neighbors in neighbors_num:
    neigh = KNeighborsClassifier(n_neighbors=neighbors,weights='distance',algorithm='ball_tree')
    steps = [('under',under),('model',neigh)]
    pipeline = Pipeline(steps=steps)
    scores =  cross_validate(pipeline, X, y, scoring=['recall','average_precision'], cv=cv, n_jobs=-1,return_train_score=True)
    # Average over the folds actually produced by `cv`. The previous code divided by
    # `k`, a loop variable left over from another cell, which need not equal the
    # number of folds.
    recall_train_score_list.append(np.mean(scores['train_recall']))
    recall_test_score_list.append(np.mean(scores['test_recall']))

    AP_train_score_list.append(np.mean(scores['train_average_precision']))
    AP_test_score_list.append(np.mean(scores['test_average_precision']))
    fit_time_list.append(np.mean(scores['fit_time']))
m = len(neighbors_num)
# #generate Fig_2,Fig_3
# Fig 2: mean fit time per neighbor count.
fig_2, axs = plt.subplots(1, 1, figsize=(5, 5),sharey=True)
axs.plot(neighbors_num[:m],fit_time_list[:m],"r^",linestyle = "--",label="fit_time,seconds")
axs.set_xlabel("neighbors_num")
axs.set_ylabel("fit_time")
axs.legend()
fig_2.suptitle("Fig_1.5.2: Fit_time(ball_tree)")

fig_2.savefig("KNN_fig_1.5_ball:Fit_time.png")



# Fig 3: recall (left) and AP (right), training vs. held-out folds.
fig_3, axs = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
axs[0].plot(neighbors_num[:m],recall_train_score_list[:m],"r^",linestyle = "--", label='training data')
axs[0].plot(neighbors_num[:m],recall_test_score_list[:m],"b^",linestyle = "--",label='test data')
axs[0].set_xlabel('neighbors_num with ball_tree')
axs[0].set_ylabel('Recall_score')
# x-axis is the neighbor grid; `n_estimators_range` was an undefined name here.
axs[1].plot(neighbors_num[:m],AP_train_score_list,"r^",linestyle = "--",label='training data')
axs[1].plot(neighbors_num[:m],AP_test_score_list,"b^",linestyle = "--",label='test data')
axs[1].set_xlabel('neighbors_num')
axs[1].set_ylabel('AP_score')
fig_3.suptitle('Fig_1.5.3:Recall_score,AP_score(ball_tree)')

axs[0].legend()
axs[1].legend()
fig_3.savefig("KNN_fig_1.5_ball:scores.png")
    
    

In [None]:
#################using k=3 to train the model#######################
