## Calculate an accuracy value in HSLE Dataset

Data 
    
    - Collected from the weight.csv file; only two columns (Weighted_Value, IsHS) are used.
    - Trained four models (RandomForestClassifier, LogisticRegression, AdaBoostClassifier and KNeighborsClassifier).
    - Selected the best model to predict whether a comment is an HS comment or a non-HS comment.
    - Calculated a confusion matrix and the metrics derived from it (Accuracy, Precision, Recall, F1-Score).


In [None]:
import pandas as pd
import numpy as np
from glob import glob 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
import math 
import gspread 
import os 
from oauth2client.service_account import ServiceAccountCredentials

In [None]:
# Scopes passed to the service-account credentials (Sheets feed + Drive).
scope = ["https://spreadsheets.google.com/feeds", 'https://www.googleapis.com/auth/drive']

# Authorise gspread with the local key file and open the 'HS_Accuracy' spreadsheet.
creds = ServiceAccountCredentials.from_json_keyfile_name('hs_data_sheet.json', scope)
client = gspread.authorize(creds)
HS_Accuracy = client.open('HS_Accuracy')

weighted_value_by_sentence = HS_Accuracy.get_worksheet(4)   #annotate_here

# Each record is a mapping from column header to cell value.
weighted_value_by_sentence_val = weighted_value_by_sentence.get_all_records()

# Keep only the three columns used below. Direct key lookup raises KeyError
# when a column is absent, instead of silently carrying over the value read
# from the previous row as the old key-scanning loop did.
weighted_value_by_sentence_list = [
    [record['MsgUniSeg'], record['Weighted_Value'], record['IsHS']]
    for record in weighted_value_by_sentence_val
]

#Create annotate_here dataframe
weighted_val_new = pd.DataFrame(
    weighted_value_by_sentence_list,
    columns=['MsgUniSeg', 'Weighted_Value', 'IsHS'],
)


In [None]:
weighted_val_new.columns  # display the column labels of the loaded frame

In [None]:
# Preprocessing check: count the missing (NaN) entries in the Weighted_Value column.
weighted_val_new['Weighted_Value'].isna().sum()

In [None]:
# Feature matrix: double brackets keep Weighted_Value as a one-column DataFrame.
X=weighted_val_new[['Weighted_Value']]

In [None]:
 weighted_val_new['IsHS'].unique()  # distinct label values present in IsHS

In [None]:
# y is the label vector: single brackets select IsHS as a 1-D Series.
# Double brackets would produce a one-column DataFrame, which scikit-learn
# estimators report with a DataConversionWarning when it is passed to fit().
y = weighted_val_new['IsHS']

In [None]:
# Split X and y into training and testing sets: 30% held out, shuffled with a
# fixed seed, and stratified on y.
split_options = {
    'test_size': 0.3,
    'random_state': 1,
    'shuffle': True,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
## Apply RandomForestClassifier
# Fit with default hyper-parameters, then report ROC-AUC on both splits using
# column 1 of predict_proba as the score.
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

ytrain_pred = rf_model.predict_proba(X_train)
rf_train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])
print('RF train roc-auc: {}'.format(rf_train_auc))

ytest_pred = rf_model.predict_proba(X_test)
rf_test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print('RF test roc-auc: {}'.format(rf_test_auc))

In [None]:
# Logistic regression with default settings; ROC-AUC on train and test splits,
# scored with column 1 of predict_proba.
log_classifier = LogisticRegression()
log_classifier.fit(X_train, y_train)

ytrain_pred = log_classifier.predict_proba(X_train)
log_train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])
print('Logistic train roc-auc: {}'.format(log_train_auc))

ytest_pred = log_classifier.predict_proba(X_test)
log_test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print('Logistic test roc-auc: {}'.format(log_test_auc))

In [None]:
# AdaBoost with default settings; ROC-AUC on train and test splits,
# scored with column 1 of predict_proba.
ada_classifier = AdaBoostClassifier()
ada_classifier.fit(X_train, y_train)

ytrain_pred = ada_classifier.predict_proba(X_train)
ada_train_auc = roc_auc_score(y_train, ytrain_pred[:, 1])
print('Adaboost train roc-auc: {}'.format(ada_train_auc))

ytest_pred = ada_classifier.predict_proba(X_test)
ada_test_auc = roc_auc_score(y_test, ytest_pred[:, 1])
print('Adaboost test roc-auc: {}'.format(ada_test_auc))

In [None]:
# K-nearest-neighbours with default settings; ROC-AUC on train and test splits,
# scored with column 1 of predict_proba.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

ytrain_pred = knn_classifier.predict_proba(X_train)
# Label the output as KNN; it was previously printed as "Adaboost" after a
# copy-paste, which made the two models' results indistinguishable.
print('KNN train roc-auc: {}'.format(roc_auc_score(y_train, ytrain_pred[:, 1])))

ytest_pred = knn_classifier.predict_proba(X_test)
print('KNN test roc-auc: {}'.format(roc_auc_score(y_test, ytest_pred[:, 1])))

In [None]:
# Average the four fitted models' column-1 probabilities row by row on the
# test set and report the ROC-AUC of that averaged score.
fitted_models = [rf_model, log_classifier, ada_classifier, knn_classifier]
pred = [
    pd.Series(fitted.predict_proba(X_test)[:, 1])
    for fitted in fitted_models
]
final_prediction = pd.concat(pred, axis=1).mean(axis=1)
ensemble_auc = roc_auc_score(y_test, final_prediction)
print(' test roc-auc: {}'.format(ensemble_auc))

In [None]:
# Display the row-wise mean of the per-model probability columns collected in `pred`.
pd.concat(pred,axis=1).mean(axis=1)

In [None]:
#### Calculate the ROC curve
# fpr / tpr are the curve coordinates and `thresholds` the matching score
# cut-offs, all computed from the averaged score in `final_prediction`.
fpr, tpr, thresholds = roc_curve(y_test, final_prediction)
# To inspect the cut-offs, evaluate: thresholds

In [None]:
# For every ROC cut-off, label a row 1 when its averaged score is strictly
# greater than the cut-off (else 0) and measure accuracy against y_test.
accuracy_per_cutoff = [
    accuracy_score(y_test, np.where(final_prediction > cutoff, 1, 0), normalize=True)
    for cutoff in thresholds
]

# Tabulate cut-off vs. accuracy, best accuracy first, and show the top rows.
accuracy_ls = pd.DataFrame(
    {'thresholds': thresholds, 'accuracy': accuracy_per_cutoff},
    columns=['thresholds', 'accuracy'],
)
accuracy_ls = accuracy_ls.sort_values(by='accuracy', ascending=False)
accuracy_ls.head()

In [None]:
# Hard class predictions from the random forest alone: a test row is flagged
# True when its column-1 probability exceeds the fixed 0.3 cut-off.
rf_column1_proba = rf_model.predict_proba(X_test)[:, 1]
y_pred_class = (rf_column1_proba > 0.3).astype(bool)

In [None]:
# Build the confusion matrix for the thresholded predictions and pull out its
# four cells. Indexing is [row, column]; the cells are named below on the
# convention that row = actual class and column = predicted class.
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)
TP, TN = confusion[1, 1], confusion[0, 0]
FP, FN = confusion[0, 1], confusion[1, 0]

In [None]:
# Accuracy = correctly classified rows / all rows. The denominator is cast to
# float so the division is a true division.
all_rows = float(TP + TN + FP + FN)
Accuracy = (TP + TN) / all_rows

# The hand-computed value and scikit-learn's value are printed one after the other.
print("Accuracy  :  ", Accuracy)
print("Accuracy  :  ", metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Classification error = misclassified rows / all rows (the complement of accuracy).
misclassified_rows = FP + FN
classification_error = misclassified_rows / float(TP + TN + FP + FN)

print(classification_error)
print(1 - metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Recall (sensitivity, true positive rate) = TP / (FN + TP).
actual_positive_rows = float(FN + TP)
sensitivity_recall = TP / actual_positive_rows

print("Recall           : ", sensitivity_recall)

print("Metric Recall    : ", metrics.recall_score(y_test, y_pred_class))

In [None]:
# Specificity (true negative rate) = TN / (TN + FP).
actual_negative_rows = TN + FP
specificity = TN / actual_negative_rows

print(specificity)

In [None]:
# False positive rate = FP / (TN + FP); printed next to 1 - specificity for comparison.
negative_rows_as_float = float(TN + FP)
false_positive_rate = FP / negative_rows_as_float

print(false_positive_rate)
print(1 - specificity)

In [None]:
# Precision = TP / (TP + FP); printed next to scikit-learn's value for comparison.
predicted_positive_rows = float(TP + FP)
precision = TP / predicted_positive_rows

print(precision)
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# F1 = 2 * (precision * recall) / (precision + recall), i.e. their harmonic mean.
pr_product = precision * sensitivity_recall
pr_sum = precision + sensitivity_recall
F1 = 2 * (pr_product / pr_sum)

In [None]:
# Summary of the hand-computed metrics, one labelled line each.
summary_rows = (
    ("Accuracy     :   ", Accuracy),
    ("Precision    :   ", precision),
    ("Recall       :   ", sensitivity_recall),
    ("F1 Score     :   ", F1),
)
for summary_label, summary_value in summary_rows:
    print(summary_label, summary_value)

In [None]:
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve with a dashed diagonal reference line and show it.

    fpr -- x values of the curve (false positive rates).
    tpr -- y values of the curve (true positive rates).
    """
    diagonal = [0, 1]

    # The curve itself, then the (0, 0)-(1, 1) diagonal for comparison.
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot(diagonal, diagonal, color='darkblue', linestyle='--')

    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    plt.legend()
    plt.show()

In [None]:
# Draw the ROC curve from the fpr/tpr arrays returned by roc_curve.
plot_roc_curve(fpr,tpr)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report of scikit-learn's classification metrics for the thresholded RF predictions.
print(classification_report(y_test,y_pred_class))

In [None]:
# `metrics` is the sklearn.metrics module itself; current scikit-learn has no
# `sklearn.metrics.metrics`, so the previous import raised ImportError.
from sklearn import metrics

In [None]:
# Accuracy, precision, recall and F1 with each scorer's default positive label,
# printed in that order.
default_scorers = (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
)
for scorer in default_scorers:
    print(scorer(y_test, y_pred_class))

In [None]:
# Accuracy again, then precision, recall and F1 computed with label 0 treated
# as the positive class.
print(metrics.accuracy_score(y_test, y_pred_class))
label0_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
for scorer in label0_scorers:
    print(scorer(y_test, y_pred_class, pos_label=0))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the random forest on the full X / y. No `scoring`
# is given, so the estimator's default scorer is used.
scores = cross_val_score(rf_model,X,y,cv=5)
scores

In [None]:
# 5-fold cross-validated recall of the random forest on the full X / y.
recall = cross_val_score(rf_model,X,y,cv=5,scoring='recall')
recall

In [None]:
# 5-fold cross-validated macro-averaged F1 of the random forest on the full X / y.
f1_macro = cross_val_score(rf_model,X,y,cv=5,scoring='f1_macro')
f1_macro

In [None]:
# 5-fold cross-validated precision of the random forest on the full X / y.
# NOTE(review): this rebinds `precision`, which the confusion-matrix cells use
# for the scalar TP / (TP + FP). Re-running the F1 or summary cells after this
# one will pick up the per-fold array instead.
precision = cross_val_score(rf_model,X,y,cv=5,scoring='precision')
precision

In [None]:
weighted_val_new.shape  # (rows, columns) of the loaded frame

In [None]:
weighted_val_new.describe()  # summary statistics of the loaded frame