In [1]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split,KFold,cross_validate
import numpy as np
from sklearn.metrics import precision_score,roc_auc_score,accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# Load the skin / non-skin data set: three tab-separated feature columns
# followed by the class label.
raw_df = pd.read_csv('../Skin_NonSkin.txt', delimiter='\t', names=["val1", "val2", "val3", "class"])
cleaned_df = raw_df.copy()

# Binarise the label: class 1 stays 1, every other label becomes 0.
# Vectorised comparison instead of a Python-level loop over every row.
cleaned_df['class'] = (cleaned_df['class'] == 1).astype(int)
print(cleaned_df.head(5))

# Summarise the class distribution.
y = np.array(cleaned_df['class'])
X = np.array(cleaned_df.iloc[:, :-1])
print(Counter(y))

# Standardise the features.
# NOTE(review): the scaler is fit on the full data set before any
# cross-validation split, so held-out folds contribute to the scaling
# statistics; consider fitting it inside each training fold instead.
scaler = StandardScaler()
X = scaler.fit_transform(X)


   val1  val2  val3  class
0    74    85   123      1
1    73    84   122      1
2    72    83   121      1
3    70    81   119      1
4    70    81   119      1
Counter({0: 194198, 1: 50859})


In [None]:
## Cross-validation to find out the best kernel to train the model ##
t = 10
# A fixed random_state makes the shuffled folds reproducible and ensures that
# every kernel is evaluated on exactly the same splits.
cv = KFold(n_splits=t, shuffle=True, random_state=0)
AUC_train_score_list = []
AUC_test_score_list = []

kernels = ['linear', 'poly']
for kernel in kernels:
    svm = SVC(gamma='auto', kernel=kernel, probability=True)
    scores = cross_validate(svm, X, y, scoring=['roc_auc'], cv=cv, n_jobs=-1, return_train_score=True)

    # Average the per-fold AUC values for this kernel.
    train_auc = np.mean(scores['train_roc_auc'])
    test_auc = np.mean(scores['test_roc_auc'])
    AUC_train_score_list.append(train_auc)
    AUC_test_score_list.append(test_auc)
    print("kernel:", kernel, "train AUC:", train_auc, "test AUC:", test_auc)
m = len(kernels)

# Generate Fig_2.4.0: mean train/test AUC per kernel.
fig_0, axs = plt.subplots(1, 1, figsize=(5, 5), sharey=True)
axs.plot(kernels, AUC_train_score_list, "r^", linestyle="--", label='training data')
axs.plot(kernels, AUC_test_score_list, "b^", linestyle="--", label='test data')
axs.set_xlabel("Kernels")
axs.set_ylabel("AUC_score")
axs.legend()
fig_0.suptitle("Fig_2.4.0: AUC_score(SVM)")
# NOTE(review): the file name says "AP_score" although the figure shows AUC,
# and it contains ':' which is not a valid file-name character on Windows.
fig_0.savefig("SVM_fig_2.4_tunning:AP_score.png")




In [None]:
############### Apply the best kernel to train the model ##################
# The kernel is the one with the highest mean test AUC in the kernel
# comparison (kernels / AUC_test_score_list come from that cell).
best_kernel = kernels[int(np.argmax(AUC_test_score_list))]

Accuracy_train_score = []
Accuracy_test_score = []
Precision_train_score = []
Precision_test_score = []
training_time = []

# Number of folds to try; a larger k means a larger training share per fold.
K = range(4, 11)
for k in K:
    # Fixed random_state so the shuffled folds are reproducible.
    cv = KFold(n_splits=k, shuffle=True, random_state=0)
    # The SVC is cross-validated directly, as in the kernel comparison; no
    # resampling step is defined in this notebook.
    svm = SVC(gamma='auto', kernel=best_kernel, probability=True)
    # Score exactly the metrics that are plotted below.
    scores = cross_validate(svm, X, y, scoring=['accuracy', 'precision'], cv=cv, n_jobs=-1, return_train_score=True)
    Accuracy_train_score.append(np.mean(scores['train_accuracy']))
    Accuracy_test_score.append(np.mean(scores['test_accuracy']))
    Precision_train_score.append(np.mean(scores['train_precision']))
    Precision_test_score.append(np.mean(scores['test_precision']))
    training_time.append(np.mean(scores['fit_time']))

# Generate Fig_2.4.1: training time, accuracy and precision versus k.
fig_3, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=False)
axs[0].plot(K, training_time, "r^", linestyle="--", label="traing_time:seconds")
axs[0].set_xlabel("training size k")
axs[0].set_ylabel("training time")
axs[1].plot(K, Accuracy_train_score, "r^", linestyle="--", label='training data')
axs[1].plot(K, Accuracy_test_score, "b^", linestyle="--", label='test data')
axs[1].set_xlabel("training size k")
axs[1].set_ylabel("accuracy_score")
axs[2].plot(K, Precision_train_score, "r^", linestyle="--", label='training data')
axs[2].plot(K, Precision_test_score, "b^", linestyle="--", label='test data')
axs[2].set_xlabel("training size k")
axs[2].set_ylabel("precision_score")
axs[0].legend()
axs[1].legend()
axs[2].legend()
# Title and file name use the SVM prefix, matching the kernel-comparison
# figure, so this figure does not overwrite one saved for another model.
fig_3.suptitle("Fig_2.4.1: learning curves(SVM)")
fig_3.savefig("SVM_fig_2.4_trainnig:learning curves.png")
    

In [None]:
################## Prediction results: generate the data in table1.4.0 #########
# Fold count is set to 4, the divisor the printed averages were written
# for — TODO confirm against the learning curves.
n_folds = 4
# Kernel with the highest mean test AUC in the kernel comparison
# (kernels / AUC_test_score_list come from that cell).
best_kernel = kernels[int(np.argmax(AUC_test_score_list))]

# Fixed random_state so the shuffled folds are reproducible.
cv = KFold(n_splits=n_folds, shuffle=True, random_state=0)
svm = SVC(gamma='auto', kernel=best_kernel, probability=True)
scores = cross_validate(svm, X, y, scoring=['recall', 'average_precision'], cv=cv, n_jobs=-1, return_train_score=True)

# Report the mean over folds; np.mean stays correct if n_folds is changed.
print("The AP scores on training set and test set are", np.mean(scores['train_average_precision']), np.mean(scores['test_average_precision']))
print("The Recall scores on training set and test set are", np.mean(scores['train_recall']), np.mean(scores['test_recall']))
