In [104]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [83]:
reviews = pd.read_parquet('../data/reviews_sample_proc.parquet')[['partition', 'rating', 'original', 'original_proc', 'original_proc_no_stop', 'translated', 'translated_proc', 'translated_proc_no_stop']]
train = reviews.query('partition == "train"')
test = reviews.query('partition == "test"')

In [97]:
X_train = train['original'].to_numpy()
y_train = train['rating'].to_numpy()

cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

y_preds = []
y_true = []

for i, (train_ix, test_ix) in enumerate(cv.split(X_train, y_train)):
    y_pred = pipe.fit(X_train[train_ix], y_train[train_ix]).predict(X_train[test_ix])
    y_preds.extend(y_pred)
    y_true.extend(y_train[test_ix])

print(metrics.classification_report(y_true, y_preds))

              precision    recall  f1-score   support

           1       0.62      0.69      0.65     10000
           2       0.45      0.43      0.44     10000
           3       0.45      0.39      0.42     10000
           4       0.54      0.57      0.55     10000
           5       0.79      0.78      0.78     10000

    accuracy                           0.57     50000
   macro avg       0.57      0.57      0.57     50000
weighted avg       0.57      0.57      0.57     50000



In [98]:
X_train = train['translated'].to_numpy()
y_train = train['rating'].to_numpy()

cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

y_preds = []
y_true = []

for i, (train_ix, test_ix) in enumerate(cv.split(X_train, y_train)):
    y_pred = pipe.fit(X_train[train_ix], y_train[train_ix]).predict(X_train[test_ix])
    y_preds.extend(y_pred)
    y_true.extend(y_train[test_ix])

print(metrics.classification_report(y_true, y_preds))

              precision    recall  f1-score   support

           1       0.60      0.68      0.64     10000
           2       0.43      0.42      0.43     10000
           3       0.43      0.38      0.40     10000
           4       0.53      0.56      0.54     10000
           5       0.79      0.76      0.78     10000

    accuracy                           0.56     50000
   macro avg       0.56      0.56      0.56     50000
weighted avg       0.56      0.56      0.56     50000



In [87]:
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

y_preds = []
y_true = []

for i, (train_ix, test_ix) in enumerate(cv.split(X_train, y_train)):
    vectorizer = CountVectorizer()
    mx = vectorizer.fit_transform(X_train[train_ix])
    mx[mx >= 1] = 1
    nb = MultinomialNB().fit(mx, y_train[train_ix])
    y_pred = nb.predict(vectorizer.transform(X_train[test_ix]))
    y_preds.extend(y_pred)
    y_true.extend(y_train[test_ix])

print(metrics.classification_report(y_true, y_preds))

              precision    recall  f1-score   support

           1       0.63      0.66      0.65     10000
           2       0.42      0.47      0.45     10000
           3       0.43      0.39      0.41     10000
           4       0.55      0.56      0.55     10000
           5       0.82      0.73      0.78     10000

    accuracy                           0.56     50000
   macro avg       0.57      0.56      0.57     50000
weighted avg       0.57      0.56      0.57     50000

