In [None]:
import numpy as np 
from numpy import argmax
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from scipy.stats import uniform
from scipy import interp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

#predictive model
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

#metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_curve

In [None]:
# Load the topic-annotated reviews. An earlier iteration read
# "../../data/processed/cleaned_reviews.csv" instead.
df = pd.read_csv(
    filepath_or_buffer="../../data/processed/clean_reviews_w_topics.csv"
)
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
# Features: the cleaned review text (an earlier data version exposed this
# column as 'clean_reviews').
X = df['cleaned_text']
# Target: 1 for a 'positive' sentiment label, 0 for anything else.
y = [int(label == 'positive') for label in df['sentiment']]
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

In [None]:
# scale_pos_weight candidate: number of negative training labels divided by
# the number of positive training labels.
n_positive = sum(y_train)
n_negative = len(y_train) - n_positive
print(n_negative / n_positive)

# Parameter Tuning #1

In [None]:
# RandomizedSearchCV over XGBoost hyper-parameters.
# Search space. scipy's uniform(loc, scale) samples from [loc, loc + scale];
# range objects and lists are sampled uniformly by the search.
param_dist = {"learning_rate": uniform(0, 2),
              # NOTE(review): uniform(1, 0.000001) keeps gamma within
              # [1, 1.000001], i.e. effectively constant. If a wider range was
              # intended the arguments may be swapped -- confirm before changing.
              "gamma": uniform(1, 0.000001),
              "max_depth": range(1,50),
              "n_estimators": range(1,300),
              "min_child_weight": range(1,10),
              'subsample':[i/10.0 for i in range(6,10)],
              'colsample_bytree':[i/10.0 for i in range(6,10)]}
# 'n_jobs' was removed from the search space: it only controls how many threads
# XGBoost uses, so sampling it spends search budget without changing the model.
# n_iter=3 samples three parameter combinations (raise it, e.g. to 25, for a
# broader search); random_state makes the sampled combinations reproducible.
rs = RandomizedSearchCV(XGBClassifier(), param_distributions=param_dist, n_iter=3, random_state=101)

In [None]:
# Text pipeline: raw text -> token counts -> TF-IDF weights -> randomized
# hyper-parameter search (rs) as the final estimator.
model = Pipeline(
    steps=[
        ('count', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', rs),
    ]
)

In [None]:
# Fit every pipeline step (vectorizer, TF-IDF, search) on the training split.
model.fit(X=X_train, y=y_train)

# Parameter Tuning #2

In [None]:
# RandomizedSearchCV + stratified K-fold cross-validation.
# Candidate values for each tuned XGBoost hyper-parameter.
params = {
        'n_estimators': list(range(100, 1000)),
        'max_depth': list(range(3, 10)),
        'min_child_weight': list(range(1, 10)),
        'subsample': [i/100 for i in range(60, 100)],
        'learning_rate': [i/1000.0 for i in range(100,1000,25)],
        'colsample_bytree':[i/10.0 for i in range(6,10)]
        }

# NOTE(review): scale_pos_weight=3.5 is hard-coded; presumably it was taken from
# the negative/positive ratio printed in the scale_pos_weight cell -- confirm it
# still matches the current training split.
clf = XGBClassifier(scale_pos_weight=3.5, eval_metric="aucpr")

folds = 15
param_comb = 5
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1002)

# cv receives the splitter itself rather than skf.split(...): the generator
# returned by split() can only be consumed once, so a second fit of the same
# search object would find no folds. The splitter yields the same folds and can
# be reused. random_state makes the sampled parameter combinations reproducible.
random_search = RandomizedSearchCV(clf,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring ='f1',
                                   n_jobs=4,
                                   cv=skf,
                                   random_state=1002,
                                   verbose=3,
                                  )

# Text pipeline: token counts -> TF-IDF weights -> randomized search.
model  = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', random_search)
])

#fit the data
model.fit(X_train, y_train)

In [None]:
# Persist the best estimator found by the randomized search (uncomment to run).
# The "Load Saved Model" cell reads 'xgboost_best_model.pkl', so this must have
# been executed at least once before that cell can succeed.
# joblib.dump(model.named_steps.model.best_estimator_, 'xgboost_best_model.pkl')

# Load Saved Model

In [None]:
# Restore the estimator previously written with joblib.dump.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
saved_model = joblib.load(filename='xgboost_best_model.pkl')

In [None]:
# Wrap the restored estimator in a new text pipeline. The vectorizer and
# TF-IDF steps are freshly constructed, so the whole pipeline is fitted on the
# training split before it is used for prediction.
model = Pipeline(
    steps=[
        ('count', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', saved_model),
    ]
)

model.fit(X=X_train, y=y_train)

# Model Evaluation

In [None]:
# Hard class predictions for the held-out test reviews.
predictions = model.predict(X=X_test)

In [None]:
# ROC AUC measures how well the model ranks samples, so it must be computed
# from continuous scores. Passing hard 0/1 predictions collapses the ROC curve
# to a single operating point and understates the model's ranking quality.
test_scores = model.predict_proba(X_test)[:, 1]
# Stored as roc_auc so the `auc` function imported from sklearn.metrics is not
# shadowed by a float.
roc_auc = roc_auc_score(y_test, test_scores)
print("Model AUC ROC : ", roc_auc)

In [None]:
# Share of test reviews classified correctly, as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
print('Model Accuracy: ', accuracy_pct, '%')

In [None]:
# Text summary of the per-class metrics for the default predictions.
test_report = classification_report(y_test, predictions)
print(test_report)

In [None]:
# Unpack the binary confusion matrix into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
# Error rate: share of misclassified samples.
err_rate = (fp + fn) / (tp + tn + fn + fp)
print("Error rate  : ", err_rate)
# Accuracy: share of correctly classified samples.
acc_ = (tp + tn) / (tp + tn + fn + fp)
print("Accuracy  : ", acc_)
# Sensitivity: true positives over all actual positives.
sens_ = tp / (tp + fn)
print("Sensitivity  : ", sens_)
# Specificity: true negatives over all actual negatives.
sp_ = tn / (tn + fp)
# Fixed: this line previously printed sens_ under the "Specificity" label.
print("Specificity  : ", sp_)
# False positive rate (FPR): false positives over all actual negatives.
FPR = fp / (tn + fp)
print("False positive rate  : ", FPR)

# Optimal Threshold Probability

In [None]:
def Find_Optimal_Threshold(target, predicted):
    """Return the decision threshold that maximises the F-score.

    Args:
        target: True binary labels.
        predicted: Scores for the positive class (e.g. predicted probabilities).

    Returns:
        The threshold on the precision-recall curve with the largest F-score.
        The chosen threshold and its F-score are also printed.
    """
    # calculate the precision-recall curve
    precision, recall, thresholds = precision_recall_curve(target, predicted)
    # The curve's final point (precision=1, recall=0) has no matching threshold,
    # so drop it to keep the arrays aligned with `thresholds`.
    precision, recall = precision[:-1], recall[:-1]
    # convert to f score; where precision + recall == 0 the score is defined as
    # 0 instead of 0/0 = NaN, which argmax would otherwise pick as the "maximum"
    denominator = precision + recall
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )
    # locate the index of the largest f score
    ix = int(np.argmax(fscore))
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))

    return thresholds[ix]

In [None]:
# Positive-class probabilities for both splits.
prob_test = model.predict_proba(X_test)[:,1]
prob_train = model.predict_proba(X_train)[:,1]

# Choose the F-score-maximising threshold on the training split so the test
# split is not used for tuning.
threshold_train = Find_Optimal_Threshold(y_train, prob_train)
print('threshold is: ', threshold_train)

# Apply the threshold that was just computed. Previously a hard-coded 0.65 was
# used here and the computed value was ignored. `>=` matches the convention of
# precision_recall_curve, where a score equal to the threshold counts as positive.
predicted_test = [1 if p >= threshold_train else 0 for p in prob_test]

In [None]:
# ROC AUC needs continuous scores, so it is computed from the positive-class
# probabilities rather than the thresholded 0/1 labels. It does not depend on
# the chosen threshold; use the accuracy and classification report cells to
# judge the effect of the threshold.
# Stored as roc_auc so the `auc` function imported from sklearn.metrics is not
# shadowed by a float.
roc_auc = roc_auc_score(y_test, prob_test)
print("Model AUC ROC : ", roc_auc)

In [None]:
# Accuracy of the thresholded predictions, as a percentage with two decimals.
accuracy_pct = round(accuracy_score(y_test, predicted_test) * 100, 2)
print('Model Accuracy: ', accuracy_pct, '%')

In [None]:
# Text summary of the per-class metrics for the thresholded predictions.
thresholded_report = classification_report(y_test, predicted_test)
print(thresholded_report)

In [None]:
# Unpack the binary confusion matrix of the thresholded predictions.
tn, fp, fn, tp = confusion_matrix(y_test, predicted_test).ravel()
# Error rate: share of misclassified samples.
err_rate = (fp + fn) / (tp + tn + fn + fp)
print("Error rate  : ", err_rate)
# Accuracy: share of correctly classified samples.
acc_ = (tp + tn) / (tp + tn + fn + fp)
print("Accuracy  : ", acc_)
# Sensitivity: true positives over all actual positives.
sens_ = tp / (tp + fn)
print("Sensitivity  : ", sens_)
# Specificity: true negatives over all actual negatives.
sp_ = tn / (tn + fp)
# Fixed: this line previously printed sens_ under the "Specificity" label.
print("Specificity  : ", sp_)
# False positive rate (FPR): false positives over all actual negatives.
FPR = fp / (tn + fp)
print("False positive rate  : ", FPR)

## Basic XGBoost model with pre-processed data and hyper-parameter tuning

Model accuracy 83.93%

Model AUC ROC: 0.768

Next steps:
- After feature engineering, we will have additional features (topic labels, etc.) to train on

## Parameter tuning #1 with clean_reviews_w_topics.csv 

Model accuracy 81.54%

Model AUC ROC: 0.726

## Parameter tuning #2 with clean_reviews_w_topics.csv 
Added StratifiedKFold CV with Random Search CV + scale_pos_weight

Model accuracy: 85.77%

Model AUC ROC: 0.763

## Parameter tuning #2 with clean_reviews_w_topics.csv 
Added StratifiedKFold CV with Random Search CV + Optimal Threshold Probability + scale_pos_weight

Best Threshold: 0.795

Model accuracy: 86.23%

Model AUC ROC: 0.790