# Preparing environment

### modules loading

In [None]:
!pip3 install --quiet datasets matplotlib numpy sklearn gensim

[K     |████████████████████████████████| 194kB 8.9MB/s 
[K     |████████████████████████████████| 245kB 8.5MB/s 
[K     |████████████████████████████████| 112kB 15.6MB/s 
[?25h

In [None]:
from collections import Counter
from typing import List, Tuple
import re

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from datasets import load_dataset
import gensim.downloader
from gensim.models import KeyedVectors

### dataset downloading 

In [None]:
# Load the 'en' configuration of the 'amazon_reviews_multi' dataset.
dataset = load_dataset(path='amazon_reviews_multi', name='en')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2773.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3624.0, style=ProgressStyle(description…


Downloading and preparing dataset amazon_reviews_multi/en (download: 82.11 MiB, generated: 58.69 MiB, post-processed: Unknown size, total: 140.79 MiB) to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/f3357bd271e187385a38574fe31b8fb10055303f67fa9fce55e84d08c4870efd...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=81989414.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2059600.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2045098.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset amazon_reviews_multi downloaded and prepared to /root/.cache/huggingface/datasets/amazon_reviews_multi/en/1.0.0/f3357bd271e187385a38574fe31b8fb10055303f67fa9fce55e84d08c4870efd. Subsequent calls will reuse this data.


In [None]:
# Show which splits the loaded dataset provides.
dataset.keys()

dict_keys(['train', 'validation', 'test'])

### setting `train_data`, `valid_data` and `test_data`

In [None]:
# Bind each split to its own name for the cells below.
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

# Preprocessing

Remove stop words and punctuation from the text.

In [None]:
STOP_WORDS = [
    'the', 'a', 'an', 'and', 'of', 'to', 'is', 'in', 'that', 'this', 'was', 'as', 'with', 'for', 'you', 'are', 'it', 
    'so', 'be', 'i', 'my', 'they', 'these', 'them', 'just', 'do', 'did', 'at', 'or', 'me', 'too', 'on', 
    'have', 'am', 'if', 'when', 'has', 'your', 'some', 'now', 'also', 'which', 'had', 'what', 'there', 
]

# Frozen copy of STOP_WORDS used for the per-token membership test: a set
# lookup is O(1), whereas scanning the list is repeated for every token.
# It is built once here, so later edits to STOP_WORDS require re-running this cell.
_STOP_WORD_SET = frozenset(STOP_WORDS)

# Matches any character that is neither a word character nor whitespace.
symbols = re.compile(r'[^\w\s]')


def preprocess(text: str) -> List[str]:
    """Lower-case `text`, strip punctuation and drop stop words.

    Every character matched by `symbols` is replaced with a space, the result
    is split on whitespace, and tokens listed in `STOP_WORDS` are removed.

    Args:
        text: raw text to tokenize.

    Returns:
        The remaining tokens, in their original order (possibly empty).
    """
    text = symbols.sub(r' ', text.lower())
    return [token for token in text.split() if token not in _STOP_WORD_SET]

# Universal part

Shared helpers used by most of the experiments below.

In [None]:
def split_dataset(vectorizer, limit: int = 3):
    """Vectorize the review texts and turn star ratings into two classes.

    The vectorizer is fitted on the training reviews and then applied,
    unchanged, to the evaluation reviews.

    Args:
        vectorizer: object providing `fit_transform` and `transform`
            (the notebook passes `CountVectorizer` / `TfidfVectorizer`).
        limit: reviews with `stars <= limit` are labelled 'bad', all
            others 'ok'.

    Returns:
        Tuple `(X_train, y_train, X_valid, y_valid)`.

    NOTE(review): despite the `valid` naming, the evaluation features and
    labels are taken from `test_data`; `valid_data` is not used here.
    Confirm whether that is intended before relying on these scores for
    model selection.
    """
    # Read only the text column, like every other column access in this
    # function, instead of slicing the whole dataset first.
    X_train = vectorizer.fit_transform(train_data["review_body"])
    # Collapse the raw star ratings into the two classes 'bad' / 'ok'.
    y_train = ['bad' if star <= limit else 'ok' for star in train_data["stars"]]

    X_valid = vectorizer.transform(test_data["review_body"])
    y_valid = ['bad' if star <= limit else 'ok' for star in test_data["stars"]]

    return X_train, y_train, X_valid, y_valid

In [None]:
def logistic_regression(X_train, y_train, X_valid, y_valid):
    """Fit an L1-penalised logistic regression and print its accuracy on the evaluation data."""
    classifier = LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    predictions = classifier.fit(X_train, y_train).predict(X_valid)
    accuracy = accuracy_score(y_valid, predictions)
    print(f'Logistic regression: {accuracy}')

In [None]:
def multinomial_bayes(X_train, y_train, X_valid, y_valid):
    """Fit a multinomial naive Bayes classifier and print its accuracy on the evaluation data."""
    classifier = MultinomialNB()
    predictions = classifier.fit(X_train, y_train).predict(X_valid)
    accuracy = accuracy_score(y_valid, predictions)
    print(f'Multinomial Bayes: {accuracy}')

In [None]:
def svm(X_train, y_train, X_valid, y_valid):
    """Fit a linear SVM (capped at 100 iterations) and print its score on the evaluation data."""
    classifier = LinearSVC(max_iter=100)
    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_valid, y_valid)
    print(f'SVM: {accuracy}')

# Text classification

## Bag-of-words

In [None]:
# Plain token counts, vocabulary capped at 5000 features.
vectorizer = CountVectorizer(tokenizer=preprocess, max_features=5000)

X_train, y_train, X_valid, y_valid = split_dataset(vectorizer)

# Run the three classifiers on the same features.
for run_classifier in (logistic_regression, multinomial_bayes, svm):
    run_classifier(X_train, y_train, X_valid, y_valid)

Logistic regression: 0.8378
Multinomial Bayes: 0.8178
SVM: 0.836




## Bag-of-ngrams

### 2-gram

In [None]:
# Counts of unigrams and bigrams, vocabulary capped at 5000 features.
vectorizer = CountVectorizer(tokenizer=preprocess, ngram_range=(1, 2), max_features=5000)

X_train, y_train, X_valid, y_valid = split_dataset(vectorizer)

# Run the three classifiers on the same features.
for run_classifier in (logistic_regression, multinomial_bayes, svm):
    run_classifier(X_train, y_train, X_valid, y_valid)

Logistic regression: 0.855
Multinomial Bayes: 0.8288
SVM: 0.8552




### 3-gram

In [None]:
# Counts of unigrams, bigrams and trigrams, vocabulary capped at 5000 features.
vectorizer = CountVectorizer(tokenizer=preprocess, ngram_range=(1, 3), max_features=5000)

X_train, y_train, X_valid, y_valid = split_dataset(vectorizer)

# Run the three classifiers on the same features.
for run_classifier in (logistic_regression, multinomial_bayes, svm):
    run_classifier(X_train, y_train, X_valid, y_valid)

Logistic regression: 0.8548
Multinomial Bayes: 0.8276
SVM: 0.8504




## Bag-of-ngrams + binarization

In [None]:
# Unigram/bigram presence indicators (binary=True) instead of raw counts.
vectorizer = CountVectorizer(
    tokenizer=preprocess,
    ngram_range=(1, 2),
    binary=True,
    max_features=5000,
)

X_train, y_train, X_valid, y_valid = split_dataset(vectorizer)

# Run the three classifiers on the same features.
for run_classifier in (logistic_regression, multinomial_bayes, svm):
    run_classifier(X_train, y_train, X_valid, y_valid)

Logistic regression: 0.8538
Multinomial Bayes: 0.83
SVM: 0.855




## Bag-of-ngrams + TF-IDF

In [None]:
# TF-IDF weighted unigrams and bigrams, vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(tokenizer=preprocess, ngram_range=(1, 2), max_features=5000)

X_train, y_train, X_valid, y_valid = split_dataset(vectorizer)

# Run the three classifiers on the same features.
for run_classifier in (logistic_regression, multinomial_bayes, svm):
    run_classifier(X_train, y_train, X_valid, y_valid)

Logistic regression: 0.8382
Multinomial Bayes: 0.8284
SVM: 0.8598


## Bag-of-embeddings

In [None]:
def load_glove_subset(max_n: int) -> KeyedVectors:
    """Return a `KeyedVectors` holding at most `max_n` vectors copied from GloVe.

    Loads the "glove-wiki-gigaword-200" model through `gensim.downloader` and
    copies its entries, in the iteration order of `all_glove.vocab`, into a
    fresh `KeyedVectors` until that container holds `max_n` vectors.

    NOTE(review): `.vocab` and `.add` look like the gensim 3.x API; newer
    gensim releases may need different attribute/method names — confirm
    against the installed version.
    NOTE(review): "top" presumably means "most frequent", which relies on the
    vocabulary being frequency-ordered — confirm.
    """
    all_glove = gensim.downloader.load("glove-wiki-gigaword-200")
    subset = KeyedVectors(all_glove.vector_size)  # empty container with the same dimensionality
    for word in all_glove.vocab:
        if len(subset.vectors) >= max_n:  # stop once the subset is full
            break
        subset.add(word, all_glove[word])
    return subset


# glove = load_glove_subset(50_000)  # original (slow) way of building the subset
# Load a pre-built vector file from a remote host instead of rebuilding the subset.
# NOTE(review): this fetches a serialized model over plain HTTP from a bare IP
# address; gensim's `load` is presumably pickle-based, so it should only be
# pointed at a trusted source — confirm, and consider hosting the file somewhere
# verifiable.
glove = KeyedVectors.load("http://134.209.248.229:8081/glove-50k.bin")

In [None]:
def bag_of_embeddings(dataset, limit: int = 3):
    """Represent each review as the mean of the GloVe vectors of its tokens.

    Reviews in which no token is present in `glove` have no vector and are
    left out; their labels are left out as well, so features and labels
    always stay aligned.

    Args:
        dataset: object supporting `dataset['stars']` (the ratings column)
            and iteration over rows that expose a 'review_body' entry.
        limit: reviews with `stars <= limit` are labelled 'bad', all
            others 'ok'.

    Returns:
        Tuple `(X, y)`: `X` is an array with one averaged vector per kept
        review, `y` an array with the matching 'bad' / 'ok' labels.

    Raises:
        ValueError: raised by `np.stack` when no review could be embedded.
    """
    X = []
    y = []
    for doc, stars in zip(dataset, dataset['stars']):
        token_vectors = [
            glove[token]
            for token in preprocess(doc['review_body'])
            if token in glove
        ]
        if not token_vectors:
            # Nothing to average. Checking before taking the mean avoids a
            # NaN result (and the NumPy warning that comes with it), and the
            # label is only appended for rows that are kept, so no index
            # bookkeeping on `y` is needed.
            continue
        X.append(np.array(token_vectors).mean(axis=0))
        y.append('bad' if stars <= limit else 'ok')

    X = np.stack(X)
    y = np.array(y)

    return (X, y)

In [None]:
# Embed both splits, then fit and score a logistic regression on the averaged vectors.
X_train, y_train = bag_of_embeddings(train_data)
X_valid, y_valid = bag_of_embeddings(test_data)

logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)
logreg.score(X_valid, y_valid)

  if sys.path[0] == '':
  ret = ret.dtype.type(ret / rcount)


0.7818

# Hyperparams search

The `get_vectorizer` helper avoids duplicating the vectorizer setup.

In [None]:
def get_vectorizer(vocab_size: int) -> TfidfVectorizer:
    """Build a unigram+bigram TF-IDF vectorizer limited to `vocab_size` features.

    Tokenization is delegated to `preprocess`.
    """
    return TfidfVectorizer(
        tokenizer=preprocess,
        ngram_range=(1, 2),
        max_features=vocab_size,
    )

In [None]:
# Regularisation type and strength to try for every (limit, vocabulary size) pair.
grid_values = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10],
}

for limit in range(1, 5):
    print(f'LIMIT: {limit}')
    for vocab_size in (1000, 5000, 15000):
        vect = get_vectorizer(vocab_size)
        X_train, y_train, X_valid, y_valid = split_dataset(vect, limit=limit)

        # The search is fitted on the training features only; the printed
        # score is GridSearchCV's `best_score_`.
        model = GridSearchCV(
            LogisticRegression(solver='liblinear', max_iter=10000),
            param_grid=grid_values,
        )
        model.fit(X_train, y_train)

        print(f'Vocab-{vocab_size}: best params: {model.best_params_} with {model.best_score_} accuracy')
    print()


LIMIT: 1
Vocab-1000: best params: {'C': 0.5, 'penalty': 'l2'} with 0.8398200000000001 accuracy
Vocab-5000: best params: {'C': 0.5, 'penalty': 'l2'} with 0.84787 accuracy
Vocab-15000: best params: {'C': 0.5, 'penalty': 'l2'} with 0.8495849999999999 accuracy

LIMIT: 2
Vocab-1000: best params: {'C': 1, 'penalty': 'l1'} with 0.78057 accuracy
Vocab-5000: best params: {'C': 1, 'penalty': 'l1'} with 0.797625 accuracy
Vocab-15000: best params: {'C': 1, 'penalty': 'l1'} with 0.8003500000000001 accuracy

LIMIT: 3
Vocab-1000: best params: {'C': 5, 'penalty': 'l1'} with 0.8181350000000001 accuracy
Vocab-5000: best params: {'C': 1, 'penalty': 'l1'} with 0.842045 accuracy
Vocab-15000: best params: {'C': 2, 'penalty': 'l1'} with 0.8471300000000002 accuracy

LIMIT: 4
Vocab-1000: best params: {'C': 2, 'penalty': 'l2'} with 0.8558600000000001 accuracy
Vocab-5000: best params: {'C': 2, 'penalty': 'l2'} with 0.861645 accuracy
Vocab-15000: best params: {'C': 2, 'penalty': 'l2'} with 0.8632799999999999 accu