# Baseline models

## Import data and libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the pre-cleaned reviews, parsing the "Time" column as datetimes.
data = pd.read_csv(
    "../../data/processed/clean_reviews.csv",
    parse_dates=["Time"],
)
# Normalise the headers: lower-case them and replace spaces with underscores.
data = data.rename(columns=lambda name: name.lower().replace(" ", "_"))
data.head()

## BOW as Vectorizer

In [None]:
# Bag-of-words features: one column per vocabulary token, values are counts.
# The matrix is kept in the sparse format returned by fit_transform; calling
# .toarray() here would allocate a dense (n_documents x vocabulary_size)
# array that is mostly zeros and can exhaust memory on a large corpus.
# train_test_split and LogisticRegression both accept sparse input directly.
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split, so test-set tokens shape the feature space - consider
# fitting the vectorizer on the training split only (e.g. inside a Pipeline).
cv = CountVectorizer()
X = cv.fit_transform(data['cleaned_text'])
# NOTE(review): the TF-IDF section of this notebook uses data['sentiment'] as
# its target; confirm both columns encode the same labels before comparing
# the two models.
y = data['label']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4263,
)

### Logistic Regression

In [None]:
# Baseline classifier: logistic regression with the L-BFGS solver and
# otherwise default settings, fitted on the bag-of-words training split.
clf = LogisticRegression(solver="lbfgs")
clf.fit(X=X_train, y=y_train)

### Evaluation of logistic regression

In [None]:
# Confidence scores from the decision function (consumed by the ROC cell).
y_score = clf.decision_function(X_test)
# Per-class probability estimates, columns ordered as in clf.classes_.
y_pred_probs = clf.predict_proba(X_test)
# Hard class predictions used by the accuracy / report / confusion matrix.
y_pred = clf.predict(X_test)

In [None]:
# Overall accuracy of the BOW model on the held-out split.
accuracy_bow = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_bow)

# Per-class precision, recall and F1.
report_bow = classification_report(y_test, y_pred)
print("Classification Report:\n", report_bow)

# Confusion matrix: rows are true classes, columns are predicted classes.
# Print the raw counts, then draw the heatmap as its own figure. Passing the
# heatmap call to print() only emitted the repr of the returned Axes object.
confusion_bow = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_bow)
ax = sns.heatmap(confusion_bow, annot=True, fmt='g')
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion matrix (BOW)")
plt.show()

In [None]:
# ROC curve for the BOW model, built from the decision-function scores.
# pos_label is clf.classes_[1], the class that positive scores indicate in
# the binary case.
fpr_bow, tpr_bow, _ = roc_curve(y_test, y_score, pos_label=clf.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr_bow, tpr=tpr_bow)
# plot() draws the figure itself; wrapping it in print() only emitted the
# repr of the returned display object.
roc_display.plot()
roc_display.ax_.set_title("ROC curve (BOW)")
plt.show()

## TF-IDF as Vectorizer

In [None]:
# The raw cleaned text is split here; vectorisation is left to the pipeline
# so that TF-IDF statistics are learned from the training split only.
X_tf = data["cleaned_text"]
# NOTE(review): the BOW section uses data['label'] as its target while this
# one uses data['sentiment'] - confirm the two columns are interchangeable.
y_tf = data["sentiment"]
# Same 80/20 split and seed as the BOW section.
X_tf_train, X_tf_test, y_tf_train, y_tf_test = train_test_split(
    X_tf,
    y_tf,
    test_size=0.2,
    random_state=4263,
)

### Logistic Regression

In [None]:
# Two-step pipeline: TF-IDF features feed a logistic regression classifier
# (L-BFGS solver, otherwise default settings).
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("classifier", LogisticRegression(solver="lbfgs")),
    ]
)

In [None]:
# Fit the TF-IDF vectorizer and the classifier on the training texts.
pipeline.fit(X=X_tf_train, y=y_tf_train)

### Evaluation of logistic regression

In [None]:
# Probability assigned to the second class (pipeline.classes_[1]) per sample.
y_tf_pred_prob = pipeline.predict_proba(X_tf_test)[:, 1]
# Hard class predictions used by the accuracy / report / confusion matrix.
y_tf_pred = pipeline.predict(X_tf_test)

In [None]:
# Overall accuracy of the TF-IDF pipeline on the held-out split.
accuracy_tf = accuracy_score(y_tf_test, y_tf_pred)
print("Accuracy:", accuracy_tf)

# Per-class precision, recall and F1.
report_tf = classification_report(y_tf_test, y_tf_pred)
print("Classification Report:\n", report_tf)

# Confusion matrix: rows are true classes, columns are predicted classes.
# Print the raw counts, then draw the heatmap as its own figure. Passing the
# heatmap call to print() only emitted the repr of the returned Axes object.
confusion_tf = confusion_matrix(y_tf_test, y_tf_pred)
print("Confusion Matrix:\n", confusion_tf)
ax = sns.heatmap(confusion_tf, annot=True, fmt='g')
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion matrix (TF-IDF)")
plt.show()

In [None]:
# ROC curve for the TF-IDF pipeline, built from its decision-function scores.
# pos_label is pipeline.classes_[1], the class that positive scores indicate
# in the binary case.
y_tf_score = pipeline.decision_function(X_tf_test)
fpr_tf, tpr_tf, _ = roc_curve(y_tf_test, y_tf_score, pos_label=pipeline.classes_[1])
# Plot the TF-IDF rates computed above. The display was previously built from
# fpr_bow / tpr_bow, which redrew the BOW curve instead of this model's.
roc_display = RocCurveDisplay(fpr=fpr_tf, tpr=tpr_tf)
# plot() draws the figure itself; wrapping it in print() only emitted the
# repr of the returned display object.
roc_display.plot()
roc_display.ax_.set_title("ROC curve (TF-IDF)")
plt.show()

## Conclusion: 
1. The BOW vectorizer performs better than TF-IDF for sentiment analysis.
2. The baseline logistic regression with BOW has an accuracy of 87%.
3. Our improved models should focus on improving recall for the negative-sentiment class.