In [None]:
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split,KFold,cross_validate
import numpy as np
from sklearn.metrics import precision_score,roc_auc_score,accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
import matplotlib.pyplot as plt
# import seaborn as sns
import io
from google.colab import files


# Colab alternative: upload the file through the browser instead of reading it from disk.
# uploaded = files.upload()
# raw_df = pd.read_csv(io.BytesIO(uploaded['Skin_NonSkin.txt']), delimiter='\t')


# Load the tab-separated Skin/NonSkin data set from the local project folder.
raw_df = pd.read_csv('~/ML_class_projects/a_1/Skin_NonSkin.txt', delimiter='\t')

# Work on a copy so raw_df stays untouched and this cell can be re-run safely.
cleaned_df = raw_df.copy()

# Binarise the label: class value 1 stays 1, every other value becomes 0.
cleaned_df['class'] = (cleaned_df['class'] == 1).astype(int)
print(cleaned_df.head(5))

# Summarize data distribution.
y = cleaned_df['class'].to_numpy()
# Select the features by dropping the label column by name, so the label can
# never leak into X even if 'class' is not the last column of the file.
X = cleaned_df.drop(columns='class').to_numpy()
print(Counter(y))



In [None]:
# Tune AdaBoost's n_estimators with t-fold cross-validation, scored by ROC AUC.
t = 10
# Fixed seed: every candidate is scored on the same folds and the cell gives
# the same numbers after a kernel restart (42 matches the seed already used
# for the hold-out split in this notebook).
cv = KFold(n_splits=t, shuffle=True, random_state=42)
AUC_train_score_list = []
AUC_test_score_list = []
n_estimators_range = range(50, 101, 10)
for n_estimator in n_estimators_range:
    # Weak learner: a decision tree that only splits a node when the split
    # decreases the impurity by at least 0.1.
    clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.1, random_state=42)
    # The base estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in scikit-learn 1.2 (and the old
    # name removed in 1.4); the positional form works on both.
    abc = AdaBoostClassifier(clf, n_estimators=n_estimator, learning_rate=1, random_state=42)
    scores = cross_validate(abc, X, y, scoring=['roc_auc'], cv=cv, n_jobs=-1, return_train_score=True)

    # Average the per-fold scores for this candidate.
    AUC_train_score_list.append(scores['train_roc_auc'].mean())
    AUC_test_score_list.append(scores['test_roc_auc'].mean())


# Generate Fig_2.3.0: tuning curves (mean train/test AUC against n_estimators).
fig_0, axs = plt.subplots(1, 1, figsize=(10, 5))
axs.plot(n_estimators_range, AUC_train_score_list, "r^", linestyle="--", label='training data')
axs.plot(n_estimators_range, AUC_test_score_list, "b^", linestyle="--", label='test data')
axs.set_xlabel("The maximum number of estimators: n_estimators")
axs.set_ylabel("AUC score")
axs.legend()
fig_0.suptitle("Fig_2.3.0: AUC score(Ada tunning)")
# NOTE(review): file name kept unchanged in case the write-up references it;
# the ':' is not a valid file-name character on Windows.
fig_0.savefig("Ada_fig_2.3_tunning:AP score.png")
    
    

In [None]:
# Learning curves for AdaBoost (n_estimators=80): vary the number of CV folds k.
# With k folds each model is trained on (k-1)/k of the data, so a larger k
# means a larger training set.
K = range(4, 11)
Accuracy_train_score = []
Accuracy_test_score = []
Precision_train_score = []
Precision_test_score = []
training_time = []

for k in K:
    # Seeded so the folds, and therefore the curves, are reproducible.
    cv = KFold(n_splits=k, shuffle=True, random_state=42)
    clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.1, random_state=42)
    # Base estimator passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
    abc = AdaBoostClassifier(clf, n_estimators=80, learning_rate=1, random_state=42)
    scores = cross_validate(abc, X, y, scoring=['accuracy', 'precision'], cv=cv, n_jobs=-1, return_train_score=True)

    # Average the per-fold results for this k.
    Accuracy_train_score.append(scores['train_accuracy'].mean())
    Accuracy_test_score.append(scores['test_accuracy'].mean())
    Precision_train_score.append(scores['train_precision'].mean())
    Precision_test_score.append(scores['test_precision'].mean())
    training_time.append(scores['fit_time'].mean())


# Fig_2.3.1: mean fit time, accuracy and precision against k.
fig_1, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=False)
axs[0].plot(K, training_time, "r^", linestyle="--", label="traing_time:seconds")
axs[0].set_xlabel("training size k")
axs[0].set_ylabel("training time")
axs[1].plot(K, Accuracy_train_score, "r^", linestyle="--", label='training data')
axs[1].plot(K, Accuracy_test_score, "b^", linestyle="--", label='test data')
axs[1].set_xlabel("training size k")
axs[1].set_ylabel("accuracy_score")
axs[2].plot(K, Precision_train_score, "r^", linestyle="--", label='training data')
axs[2].plot(K, Precision_test_score, "b^", linestyle="--", label='test data')
axs[2].set_xlabel("training size k")
axs[2].set_ylabel("precision_score")
for ax in axs:
    ax.legend()
fig_1.suptitle("Fig_2.3.1: learning curves(Ada boosting)")

# NOTE(review): file name kept unchanged in case the write-up references it;
# the ':' is not a valid file-name character on Windows.
fig_1.savefig("Ada_fig_2.3_traninig:learning curves.png")

In [None]:
# Compare the decision tree without and with boosting on a single hold-out
# split: generates the data for table_2.3.0 in the write-up.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/8, random_state=42)

# --- Baseline: the single decision tree on its own ---
# Seeded like the split so the reported numbers are reproducible.
clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.1, random_state=42)
clf.fit(X_train, y_train)
print("base decision estimator's tree depth is:", clf.get_depth())

# Predict once per split and reuse the result for both metrics.
tree_train_pred = clf.predict(X_train)
tree_test_pred = clf.predict(X_test)
print("accuracy and precision scores for training data before boosting are:", accuracy_score(y_train, tree_train_pred), precision_score(y_train, tree_train_pred))
print("accuracy and precision for test data before boosting are:", accuracy_score(y_test, tree_test_pred), precision_score(y_test, tree_test_pred))


# --- Boosted: AdaBoost over a fresh, unfitted tree with the same settings ---
clf = tree.DecisionTreeClassifier(min_impurity_decrease=0.1, random_state=42)
# Base estimator passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
abc = AdaBoostClassifier(clf, n_estimators=80, learning_rate=1, random_state=42)
abc.fit(X_train, y_train)

boost_train_pred = abc.predict(X_train)
boost_test_pred = abc.predict(X_test)
print("accuracy and precision scores for training data after boosting are:", accuracy_score(y_train, boost_train_pred), precision_score(y_train, boost_train_pred))
print("accuracy and precision for test data after boosting are:", accuracy_score(y_test, boost_test_pred), precision_score(y_test, boost_test_pred))
