In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import uniform
from scipy import interp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

#predictive model
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

#metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score, confusion_matrix

In [None]:
# Load the pre-processed review data and preview the first rows.
reviews_csv = "../../data/processed/cleaned_reviews.csv"
df = pd.read_csv(reviews_csv)
df.head()

In [None]:
# Features are the cleaned review text; the target is 1 for a 'positive'
# sentiment label and 0 for any other label.
X = df['clean_reviews']
y = [int(label == 'positive') for label in df['Sentiment']]

# Hold out 20% of the rows for testing, using a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.2,
)

In [None]:
# Randomized hyper-parameter search (RandomizedSearchCV) for the XGBoost classifier.

# Seed for the search so that the sampled candidates are reproducible across
# "Restart & Run All" (same value the train/test split uses).
SEARCH_SEED = 101

# Distributions / candidate values to sample from.
# Note: scipy.stats.uniform(loc, scale) draws from [loc, loc + scale].
param_dist = {
    # XGBoost documents eta (learning_rate) as lying in [0, 1]; the lower bound
    # is kept above zero because a zero learning rate cannot learn anything.
    "learning_rate": uniform(loc=0.01, scale=0.99),
    # Sample gamma over [0, 1]. A tiny scale (e.g. uniform(1, 0.000001)) would
    # pin gamma to ~1.0 and leave it unexplored.
    # NOTE(review): [0, 1] is an assumed search range - widen if needed.
    "gamma": uniform(loc=0, scale=1),
    "max_depth": range(1, 50),
    "n_estimators": range(1, 300),
    "min_child_weight": range(1, 10),
    # n_jobs is deliberately not part of the search space: it only controls
    # parallelism, so sampling it spends iterations without changing the model.
    "subsample": [i / 10.0 for i in range(6, 10)],         # 0.6 .. 0.9
    "colsample_bytree": [i / 10.0 for i in range(6, 10)],  # 0.6 .. 0.9
}

# n_iter is the number of parameter settings sampled; 3 keeps the notebook
# quick to run - raise it (e.g. to 25) for a more thorough search.
rs = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions=param_dist,
    n_iter=3,
    random_state=SEARCH_SEED,
)

In [None]:
# Text-classification pipeline: raw token counts -> TF-IDF weighting ->
# the randomized-search estimator `rs` as the final step.
pipeline_steps = [
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', rs),
]
model = Pipeline(steps=pipeline_steps)

In [None]:
# Train every step of the pipeline on the training split.
model.fit(X=X_train, y=y_train)

In [None]:
# Class-label predictions for the held-out test split.
predictions = model.predict(X=X_test)

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be scored with the predicted probability of the positive class
# (column 1, since the labels are 0/1) rather than with hard 0/1 predictions,
# which collapse the ROC curve to a single operating point.
positive_proba = model.predict_proba(X_test)[:, 1]

# Stored as `roc_auc` so the `auc` function imported from sklearn.metrics
# is not shadowed by a float.
roc_auc = roc_auc_score(y_test, positive_proba)
print("Model AUC ROC : ", roc_auc)

In [None]:
# Overall accuracy on the test split, expressed as a percentage (2 decimals).
accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 2)
print('Model Accuracy: ', accuracy_pct, '%')

In [None]:
# Per-class precision / recall / F1 summary for the test split.
report_text = classification_report(y_test, predictions)
print(report_text)

In [None]:
# Derive the basic rates from the binary confusion matrix.
# labels=[0, 1] fixes the matrix to 2x2 even if one class is missing from
# y_test/predictions, so the 4-way unpacking below cannot fail on shape.
tn, fp, fn, tp = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
total = tp + tn + fn + fp

# Error rate: share of all samples that were misclassified.
err_rate = (fp + fn) / total
print("Error rate  : ", err_rate)

# Accuracy: share of all samples that were classified correctly.
acc_ = (tp + tn) / total
print("Accuracy  : ", acc_)

# Sensitivity (recall / true positive rate): positives correctly identified.
sens_ = tp / (tp + fn)
print("Sensitivity  : ", sens_)

# Specificity (true negative rate): negatives correctly identified.
sp_ = tn / (tn + fp)
print("Specificity  : ", sp_)  # report sp_ here, not sens_

# False positive rate (FPR): negatives wrongly flagged as positive (1 - specificity).
FPR = fp / (tn + fp)
print("False positive rate  : ", FPR)

Basic XGBoost model with pre-processed data and hyper-parameter tuning

Model accuracy 83.93%

Model AUC ROC: 0.768

Next steps:
- After feature engineering, we will have additional features (topic labels, etc.) to train the model on.