In [1]:
import pandas as pd
import matplotlib
import numpy as np
from datascience import *
from sklearn import datasets
from sklearn.model_selection import StratifiedKFold,cross_val_score,cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,mean_squared_error
from scipy.sparse import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score,auc
from sklearn.preprocessing import LabelBinarizer

Problem 1. Random forest

(a)

In [2]:
cancer = datasets.load_breast_cancer()
x_cancer = cancer.data
y_cancer=cancer.target
avg_accuracy = []
avg_sensitivity = []
avg_false_alarm= []
avg_specificity = []
avg_AUC = []
avg_features = []
skf = StratifiedKFold(n_splits=5,random_state=np.random.seed(1234), shuffle=True)
clf = RandomForestClassifier(n_estimators=100,random_state=np.random.seed(1234))
for train_index, test_index in skf.split(x_cancer,y_cancer):
    x_cancer_train, x_cancer_test,y_cancer_train, y_cancer_test = x_cancer[train_index], x_cancer[test_index],y_cancer[train_index], y_cancer[test_index]
    clf.fit(x_cancer_train,y_cancer_train)
    prediction = clf.predict(x_cancer_test)
    tn, fp, fn, tp = confusion_matrix(y_cancer_test, prediction).ravel()
    n = tn+fp+fn+tp
    accuracy = (tp+tn)/(n)
    sensitivity = (tp)/(tp+fn)
    specificity = (tp)/(tp+fp)
    false_alarm = (fp)/(tn+fp)
    AUC = roc_auc_score(y_cancer_test,prediction)
    avg_accuracy.append(accuracy)
    avg_sensitivity.append(sensitivity)
    avg_specificity.append(specificity)
    avg_false_alarm.append(false_alarm)
    avg_AUC.append(AUC)
    feature_importances = clf.feature_importances_
    avg_features.append(feature_importances)

In [3]:
print("The average accuracy of 5-fold cross-validation is",np.mean(avg_accuracy))
print("The average sensitivity of 5-fold cross-validation is",np.mean(avg_sensitivity))
print("The average specificity of 5-fold cross-validation is",np.mean(avg_specificity))
print("The average false alarm of 5-fold cross-validation is",np.mean(avg_false_alarm))
print("The average AUC of 5-fold cross-validation is",np.mean(avg_AUC))

The average accuracy of 5-fold cross-validation is 0.9595998460946518
The average sensitivity of 5-fold cross-validation is 0.9664710485133021
The average specificity of 5-fold cross-validation is 0.9694269058974943
The average false alarm of 5-fold cross-validation is 0.05204872646733112
The average AUC of 5-fold cross-validation is 0.9572111610229854


(b)

In [4]:
answer= []
for i in range (0,30):
    result = 0
    result=avg_features[0][i]+avg_features[1][i]+avg_features[2][i]+avg_features[3][i]+avg_features[4][i]
    answer.append(result/5)
for i in range(0,30):
    print("The average feature importance of",cancer.feature_names[i],"is",answer[i])

The average feature importance of mean radius is 0.03518229916218503
The average feature importance of mean texture is 0.01403610232604256
The average feature importance of mean perimeter is 0.043777047176623125
The average feature importance of mean area is 0.04696411754510499
The average feature importance of mean smoothness is 0.0067146611569918095
The average feature importance of mean compactness is 0.009572990441750515
The average feature importance of mean concavity is 0.04708229407552539
The average feature importance of mean concave points is 0.09648144458237268
The average feature importance of mean symmetry is 0.0034913746551913163
The average feature importance of mean fractal dimension is 0.00414854943324716
The average feature importance of radius error is 0.012874966526833533
The average feature importance of texture error is 0.004584787264900253
The average feature importance of perimeter error is 0.01677293659942347
The average feature importance of area error is 0.039

Problem 2. Multinomial Naive Bayes Classifier

In [5]:
def multiclass_roc_auc_score(y_test, y_pred, average='macro'):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return roc_auc_score(y_test, y_pred, average=average)

def multiclass_precision_score(y_test, y_pred, average= 'macro'):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return precision_score(y_test, y_pred, average=average)

def multiclass_recall_score(y_test, y_pred, average= 'macro'):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return recall_score(y_test, y_pred, average=average)

def multiclass_accuracy_score(y_test, y_pred):
    lb = LabelBinarizer()
    lb.fit(y_test)
    y_test = lb.transform(y_test)
    y_pred = lb.transform(y_pred)
    return accuracy_score(y_test, y_pred)

In [6]:
categories = ['comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','comp.windows.x',
              'rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey','sci.crypt','sci.electronics',
             'sci.med','sci.space','misc.forsale','talk.politics.misc','talk.politics.guns','talk.politics.mideast',
             'talk.religion.misc','alt.atheism','soc.religion.christian']
news = datasets.fetch_20newsgroups(subset= 'all',categories = categories,shuffle=True,random_state=np.random.seed(1234))
news_x = news.data
news_y = news.target
count_vect = CountVectorizer()
news_x_counts = count_vect.fit_transform(news_x)
tfidf_transformer = TfidfTransformer()
news_x_tfidf = tfidf_transformer.fit_transform(news_x_counts)
skf = StratifiedKFold(n_splits=5,random_state=np.random.seed(1234), shuffle=True)
clf_nb = MultinomialNB()
recall_list = []
accuracy_list = []
precision_list = []
auc_list = []
for train_index, test_index in skf.split(news_x_tfidf,news_y):
    x_news_train, x_news_test,y_news_train, y_news_test = news_x_tfidf[train_index], news_x_tfidf[test_index],news_y[train_index], news_y[test_index]
    clf_nb.fit(x_news_train,y_news_train)
    predict = clf_nb.predict(x_news_test)
    accuracy = multiclass_accuracy_score(y_news_test, predict)
    precision = multiclass_precision_score(y_news_test, predict)
    recall = multiclass_recall_score(y_news_test, predict)
    auc = multiclass_roc_auc_score(y_news_test, predict)
    accuracy_list.append(accuracy)
    recall_list.append(recall)
    precision_list.append(precision)
    auc_list.append(auc)

In [7]:
print("The average accuracy of 5-fold cross-validation is",np.mean(accuracy_list))
print("The average sensitivity of 5-fold cross-validation is",np.mean(recall_list))
print("The average specificity of 5-fold cross-validation is",np.mean(precision_list))
print("The average AUC of 5-fold cross-validation is",np.mean(auc_list))

The average accuracy of 5-fold cross-validation is 0.8543960408824974
The average sensitivity of 5-fold cross-validation is 0.838896709402475
The average specificity of 5-fold cross-validation is 0.879409718634229
The average AUC of 5-fold cross-validation is 0.9156100220805122
