In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [3]:
# Read the preprocessed review sample, then carve out the two partitions
# using the precomputed `partition` column.
reviews = pd.read_parquet('../data/reviews_sample_proc.parquet')
train = reviews.loc[reviews['partition'] == 'train']
test = reviews.loc[reviews['partition'] == 'test']

#### Polish text, raw (punctuation removed)

In [21]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'original'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.62      0.69      0.65     10000
           2       0.45      0.43      0.44     10000
           3       0.45      0.39      0.42     10000
           4       0.54      0.57      0.55     10000
           5       0.79      0.78      0.78     10000

    accuracy                           0.57     50000
   macro avg       0.57      0.57      0.57     50000
weighted avg       0.57      0.57      0.57     50000

              precision    recall  f1-score   support

           1       0.61      0.69      0.65      2500
           2       0.44      0.41      0.43      2500
           3       0.45      0.42      0.43      2500
           4       0.56      0.59      0.57      2500
           5       0.81      0.79      0.80      2500

    accuracy                           0.58     12500
   macro avg       0.58      0.58      0.58     12500
weighted avg       0.58      0.58      0.58     12500



#### Polish text, lemmatized

In [22]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'original_proc'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.60      0.69      0.64     10000
           2       0.44      0.43      0.43     10000
           3       0.44      0.38      0.41     10000
           4       0.54      0.56      0.55     10000
           5       0.79      0.77      0.78     10000

    accuracy                           0.57     50000
   macro avg       0.56      0.57      0.56     50000
weighted avg       0.56      0.57      0.56     50000

              precision    recall  f1-score   support

           1       0.60      0.68      0.64      2500
           2       0.44      0.42      0.43      2500
           3       0.44      0.39      0.41      2500
           4       0.54      0.58      0.56      2500
           5       0.81      0.77      0.79      2500

    accuracy                           0.57     12500
   macro avg       0.57      0.57      0.57     12500
weighted avg       0.57      0.57      0.57     12500



#### Polish text, lemmatized with stopwords removed

In [23]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'original_proc_no_stop'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.59      0.67      0.63     10000
           2       0.43      0.42      0.42     10000
           3       0.43      0.37      0.40     10000
           4       0.52      0.55      0.54     10000
           5       0.77      0.76      0.77     10000

    accuracy                           0.55     50000
   macro avg       0.55      0.55      0.55     50000
weighted avg       0.55      0.55      0.55     50000

              precision    recall  f1-score   support

           1       0.59      0.67      0.63      2500
           2       0.42      0.40      0.41      2500
           3       0.43      0.38      0.41      2500
           4       0.53      0.56      0.55      2500
           5       0.78      0.76      0.77      2500

    accuracy                           0.55     12500
   macro avg       0.55      0.55      0.55     12500
weighted avg       0.55      0.55      0.55     12500



#### English translation, raw (punctuation removed)

In [24]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'translated'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.60      0.68      0.64     10000
           2       0.43      0.42      0.43     10000
           3       0.43      0.38      0.40     10000
           4       0.53      0.56      0.54     10000
           5       0.79      0.76      0.78     10000

    accuracy                           0.56     50000
   macro avg       0.56      0.56      0.56     50000
weighted avg       0.56      0.56      0.56     50000

              precision    recall  f1-score   support

           1       0.60      0.68      0.64      2500
           2       0.44      0.41      0.42      2500
           3       0.43      0.39      0.41      2500
           4       0.53      0.57      0.55      2500
           5       0.79      0.77      0.78      2500

    accuracy                           0.56     12500
   macro avg       0.56      0.56      0.56     12500
weighted avg       0.56      0.56      0.56     12500



#### English translation, lemmatized

In [25]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'translated_proc'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.59      0.68      0.63     10000
           2       0.43      0.41      0.42     10000
           3       0.43      0.38      0.40     10000
           4       0.53      0.55      0.54     10000
           5       0.79      0.76      0.78     10000

    accuracy                           0.56     50000
   macro avg       0.55      0.56      0.55     50000
weighted avg       0.55      0.56      0.55     50000

              precision    recall  f1-score   support

           1       0.59      0.67      0.63      2500
           2       0.43      0.40      0.41      2500
           3       0.43      0.39      0.41      2500
           4       0.53      0.57      0.55      2500
           5       0.79      0.76      0.78      2500

    accuracy                           0.56     12500
   macro avg       0.56      0.56      0.56     12500
weighted avg       0.56      0.56      0.56     12500



#### English translation, lemmatized with stopwords removed

In [26]:
# Bag-of-words + multinomial Naive Bayes baseline on one text column.
scope = 'translated_proc_no_stop'  # text column used as the model input in this cell
X_train = train[scope].to_numpy()
y_train = train['rating'].to_numpy()

X_test = test[scope].to_numpy()
y_test = test['rating'].to_numpy()

# Shuffled, seeded 5-fold split stratified on the rating labels.
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
# min_df=50: CountVectorizer drops terms found in fewer than 50 documents.
pipe = make_pipeline(CountVectorizer(min_df=50), MultinomialNB())

# Out-of-fold predictions and their true labels, pooled over all folds.
y_preds = []
y_true = []

# The fold number is not needed, so iterate over the index pairs directly.
# `pipe.fit` re-fits both the vectorizer and the classifier on each call,
# so the vocabulary is rebuilt from that fold's fitting rows only.
for fit_ix, valid_ix in cv.split(X_train, y_train):
    pipe.fit(X_train[fit_ix], y_train[fit_ix])
    y_preds.extend(pipe.predict(X_train[valid_ix]))
    y_true.extend(y_train[valid_ix])

# First report: cross-validated performance on the train partition.
print(metrics.classification_report(y_true, y_preds))

# Refit on the whole train partition before scoring the test partition.
pipe.fit(X_train, y_train)

# Second report: performance on the test partition.
y_test_pred = pipe.predict(X_test)
print(metrics.classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           1       0.58      0.67      0.62     10000
           2       0.42      0.41      0.41     10000
           3       0.42      0.35      0.38     10000
           4       0.51      0.53      0.52     10000
           5       0.75      0.75      0.75     10000

    accuracy                           0.54     50000
   macro avg       0.54      0.54      0.54     50000
weighted avg       0.54      0.54      0.54     50000

              precision    recall  f1-score   support

           1       0.57      0.65      0.61      2500
           2       0.41      0.39      0.40      2500
           3       0.43      0.37      0.40      2500
           4       0.52      0.54      0.53      2500
           5       0.76      0.76      0.76      2500

    accuracy                           0.54     12500
   macro avg       0.54      0.54      0.54     12500
weighted avg       0.54      0.54      0.54     12500

