In [39]:
import pandas as pd
import numpy as np
from os import getcwd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

Movie Reviews

In [2]:
sw_file = f"{getcwd()}/stopwords.txt"
with open(sw_file, 'r') as file:
    content = file.read()
    sw = content.split(',')

In [3]:
df = pd.read_table('https://raw.githubusercontent.com/esnt/Data/main/MovieReviews/moviereviews.tsv', sep='\t')
df = df.dropna()

In [None]:
vectorizer = CountVectorizer(min_df=100, max_df=.7, ngram_range=(1,2), binary=True, stop_words=sw)
target = df['label']
feature = vectorizer.fit_transform(df['review']) 
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=.3, random_state=713)

Naive Bayes

In [5]:
pipe = Pipeline([
    ('scale', StandardScaler(with_mean=False)),
    ('model', BernoulliNB())
])

params = {'model__alpha': list(np.arange(.1,1.1,.1))}
search = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='f1_weighted')
search.fit(x_train,y_train)
preds = search.predict(x_test)
accuracy = round(accuracy_score(y_true=y_test, y_pred=preds),3)
auc = round(roc_auc_score(y_test, search.predict_proba(x_test)[:,1]),3)
f1 = round(f1_score(y_test, preds, average='weighted'),3)
print(f"Predicted Accuracy: {accuracy}")
print(f"F1: {f1}")
print(f"AUC: {auc}")

Predicted Accuracy: 0.896
F1: 0.896
AUC: 0.956


Logistic

In [6]:
pipe = Pipeline([
    ('scale', StandardScaler(with_mean=False)),
    ('model', LogisticRegression(solver='liblinear', penalty='l1'))
])

params = {'model__C': list(np.arange(.1,1,.1))}
search = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='roc_auc')
search.fit(x_train,y_train)
preds = search.predict(x_test)
accuracy = round(accuracy_score(y_true=y_test, y_pred=preds),3)
auc = round(roc_auc_score(y_test, search.predict_proba(x_test)[:,1]),3)
f1 = round(f1_score(y_test, preds, average='weighted'),3)
print(f"Predicted Accuracy: {accuracy}")
print(f"F1: {f1}")
print(f"AUC: {auc}")

Predicted Accuracy: 0.888
F1: 0.888
AUC: 0.95


Most influential words (indentified by logistic regression)

In [7]:
lr = LogisticRegression(solver='liblinear', penalty='l1', C=search.best_params_.get('model__C'))
lr.fit(x_train,y_train)
cf = pd.Series(lr.coef_[0], index=vectorizer.get_feature_names_out())

In [8]:
# Printing the most inluential negative words
cf.sort_values().head()

worst    -2.051545
waste    -1.846752
bad      -1.290796
awful    -1.249790
boring   -1.009866
dtype: float64

In [9]:
# Printing the most inluential positive words
cf.sort_values(ascending=False).head()

perfect      1.076233
excellent    1.068911
highly       1.043415
10 10        0.920255
great        0.873050
dtype: float64

k-NN

In [11]:
pipe = Pipeline([
    ('scale', StandardScaler(with_mean=False)),
    ('model', KNeighborsClassifier(metric='cosine'))
])

params = {'model__n_neighbors' : list(range(45,66,1))}
search = GridSearchCV(pipe, param_grid=params, n_jobs=-1, scoring='roc_auc')
search.fit(x_train,y_train)
preds = search.predict(x_test)
accuracy = round(accuracy_score(y_true=y_test, y_pred=preds),3)
auc = round(roc_auc_score(y_test, search.predict_proba(x_test)[:,1]),3)
f1 = round(f1_score(y_test, preds, average='weighted'),3)
print(f"Predicted Accuracy: {accuracy}")
print(f"F1: {f1}")
print(f"AUC: {auc}")


Predicted Accuracy: 0.868
F1: 0.868
AUC: 0.944
