In [1]:
import numpy as np
import pandas as pd

# Goodreads Books Reviews example

## Preprocessing

In [111]:
BOOK_REVIEW_TRAIN_PATH = 'goodreads_train.csv'

In [112]:
# Read the first 100,000 reviews, keeping only the id, text and rating columns,
# and use the review id as the row index.
data = pd.read_csv(
    BOOK_REVIEW_TRAIN_PATH,
    usecols=['review_id', 'review_text', 'rating'],
    nrows=100000,
).set_index('review_id')

In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, dfdbb7b0eb5a7e4c26d59a937e2e5feb to 1dd15605d2fbfe18a51ff3adeacaf7b9
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   rating       100000 non-null  int64 
 1   review_text  100000 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [114]:
data.rating.value_counts()

4    35387
5    30294
3    20288
2     7714
1     3240
0     3077
Name: rating, dtype: int64

In [115]:
import string
import re
from nltk.stem import SnowballStemmer

# Character classes matching any single ASCII punctuation mark / decimal digit.
punct = re.compile('[' + re.escape(string.punctuation) + ']')
digits = re.compile('[' + re.escape(string.digits) + ']')

def clean (str_):
    """Normalise a raw text for bag-of-words style processing.

    The text is lower-cased; newlines, ASCII punctuation and digits are each
    replaced by a space; runs of whitespace are collapsed to a single space
    and leading/trailing whitespace is removed.

    Parameters
    ----------
    str_ : str
        Raw text.

    Returns
    -------
    str
        The normalised text (possibly empty).
    """
    lowered = str_.lower()
    # Blank out, in order: newlines, punctuation, digits.
    for pattern in ('\n', punct, digits):
        lowered = re.sub(pattern, r' ', lowered)
    return re.sub(r'\s+', r' ', lowered).strip()

def stem(str_):
    """Clean a text and reduce each of its words to an English Snowball stem.

    The text is normalised with ``clean`` and split on single spaces; every
    token is stemmed and the stems are joined back with single spaces.

    Parameters
    ----------
    str_ : str
        Raw text.

    Returns
    -------
    str
        Space-separated stems.
    """
    # Build the stemmer once per call. The previous version constructed a new
    # SnowballStemmer for every single word inside the comprehension.
    stemmer = SnowballStemmer('english')

    # `clean` already strips the text, so it can be split directly.
    words = clean(str_).split(' ')
    return ' '.join(stemmer.stem(word) for word in words)

def remove_stop_words(str_, reduce_funct, stopwords):
    """Preprocess a text with ``reduce_funct`` and drop its stop words.

    Parameters
    ----------
    str_ : str
        Raw text.
    reduce_funct : callable
        Function applied to ``str_`` first; it must return a string.
    stopwords : container of str
        Tokens to discard (membership is tested with ``in``).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    reduced = reduce_funct(str_).strip()
    # Split on a single space (not arbitrary whitespace) to match the
    # tokenisation used by the other helpers.
    kept = filter(lambda token: token not in stopwords, reduced.split(' '))
    return ' '.join(kept)



In [116]:
# Add a cleaned + stemmed copy of every review next to the raw text.
data['review_text_stem'] = data['review_text'].map(stem)
data.head(n=3)

Unnamed: 0_level_0,rating,review_text,review_text_stem
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,this is a special book it start slow for about...
a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,recommend by don katz avail for free in decemb...
2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",a fun fast pace scienc fiction thriller i read...


## Train-test splitting

In [117]:
from sklearn.model_selection import train_test_split

# Stratified split on the rating so both parts keep the same class balance.
features = data.drop(columns=['rating'])
target = data.rating

X_train, X_test, y_train, y_test = train_test_split(
    features, target,
    random_state=42, stratify=target,
)


## TF-IDF pipeline

In [118]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

results = []

class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse feature matrix into a dense array.

    Useful in front of estimators that cannot consume scipy sparse input.
    The step is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()``: the latter returns a
        ``numpy.matrix``, which newer scikit-learn estimators refuse as input.
        Input that is already dense is passed through ``np.asarray``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)

def get_tfidf_results(param_dict, stopwords, clf, X_train, y_train, X_test, y_test):
    """Fit and score a TF-IDF + classifier pipeline for every parameter combination.

    Parameters
    ----------
    param_dict : dict
        Grid passed to ``ParameterGrid``; every combination must provide the
        keys ``min_df``, ``max_df`` and ``max_features``.
    stopwords : collection of str, str or None
        Stop words handed to ``TfidfVectorizer``.
    clf : estimator
        Classifier template. A fresh clone is fitted for each combination, so
        the object passed in is left unfitted.
    X_train, X_test : iterable of str
        Training / evaluation documents.
    y_train, y_test : array-like
        Training / evaluation labels.

    Returns
    -------
    list of dict
        The module-level ``results`` list, with one new entry per combination
        holding ``estimator``, ``parameters``, ``train_f1`` and ``test_f1``
        (micro-averaged F1).

    Notes
    -----
    Entries are appended to the module-level ``results`` list, so results
    accumulate across calls unless the caller resets ``results = []`` first.
    """
    from sklearn.base import clone

    # Newer scikit-learn releases validate `stop_words` and accept a list but
    # not a set, so normalise unordered collections up front.
    if isinstance(stopwords, (set, frozenset, tuple)):
        stopwords = list(stopwords)

    for params in tqdm(ParameterGrid(param_dict)):
        # Fit an independent copy per combination. Reusing the same `clf`
        # object would make every stored pipeline share one classifier that is
        # overwritten by the last fit.
        try:
            step_clf = clone(clf)
        except TypeError:
            # Not a scikit-learn compatible estimator: fall back to the
            # instance itself, as before.
            step_clf = clf

        pipe = Pipeline(steps = [('tfidf', TfidfVectorizer(min_df=params['min_df'], max_df=params['max_df'],
                                                           token_pattern=r'[A-Za-z]{3,}',
                                                           max_features=params['max_features'],
                                                           stop_words=stopwords)),
                                 #('to_dense', DenseTransformer()),
                                 ('clf', step_clf)
                                ]
                           )

        pipe.fit(X_train, y_train)

        # Predict once per split and reuse the predictions for scoring.
        train_preds = pipe.predict(X_train)
        test_preds = pipe.predict(X_test)

        results.append(dict(estimator=pipe,
                            parameters=params,
                            train_f1 = f1_score(y_true=y_train, y_pred=train_preds, average='micro'),
                            test_f1 = f1_score(y_true=y_test, y_pred=test_preds, average='micro')
        ))
    return results

## Baseline - LogisticRegression

In [119]:
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid

from spacy import load 
en = load('en_core_web_sm')

import warnings
warnings.filterwarnings("ignore")

# Stop-word set shipped with the loaded spaCy English pipeline.
spacy_stopwords = en.Defaults.stop_words

# Single-point "grid": one TF-IDF configuration for the baseline.
params = {'min_df': [.01], 'max_df': [.5], 'max_features': [1000]}

# Start from an empty result list; get_tfidf_results appends to it.
results = []

get_tfidf_results(
    param_dict=params,
    stopwords=spacy_stopwords,
    clf=LogisticRegression(max_iter=500),
    X_train=X_train.review_text_stem,
    y_train=y_train,
    X_test=X_test.review_text_stem,
    y_test=y_test,
)

100%|█████████████████████████████████████████████| 1/1 [00:19<00:00, 19.22s/it]


[{'estimator': Pipeline(steps=[('tfidf',
                   TfidfVectorizer(max_df=0.5, max_features=1000, min_df=0.01,
                                   stop_words={"'d", "'ll", "'m", "'re", "'s",
                                               "'ve", 'a', 'about', 'above',
                                               'across', 'after', 'afterwards',
                                               'again', 'against', 'all',
                                               'almost', 'alone', 'along',
                                               'already', 'also', 'although',
                                               'always', 'am', 'among', 'amongst',
                                               'amount', 'an', 'and', 'another',
                                               'any', ...},
                                   token_pattern='[A-Za-z]{3,}')),
                  ('clf', LogisticRegression(max_iter=500))]),
  'parameters': {'max_df': 0.5, 'max_features': 1000, 'min_df':

## Efficient enough model - CatBoost
https://catboost.ai/

In [120]:
from catboost import CatBoostClassifier

# Same single TF-IDF configuration as the baseline, now with a small CatBoost model.
params = {'min_df': [.01], 'max_df': [.5], 'max_features': [1000]}

catboost_clf = CatBoostClassifier(iterations=10, random_state=42)

get_tfidf_results(params, spacy_stopwords, catboost_clf,
                  X_train.review_text_stem, y_train,
                  X_test.review_text_stem, y_test)


  0%|                                                     | 0/1 [00:00<?, ?it/s]

Learning rate set to 0.5
0:	learn: 1.5289701	total: 729ms	remaining: 6.56s
1:	learn: 1.4419874	total: 1.36s	remaining: 5.46s
2:	learn: 1.3965448	total: 2.06s	remaining: 4.8s
3:	learn: 1.3708660	total: 2.57s	remaining: 3.86s
4:	learn: 1.3554182	total: 3.05s	remaining: 3.05s
5:	learn: 1.3401561	total: 3.64s	remaining: 2.43s
6:	learn: 1.3257662	total: 4.17s	remaining: 1.79s
7:	learn: 1.3152249	total: 4.7s	remaining: 1.17s
8:	learn: 1.3088649	total: 5.22s	remaining: 580ms
9:	learn: 1.3011017	total: 5.7s	remaining: 0us


100%|█████████████████████████████████████████████| 1/1 [00:17<00:00, 17.52s/it]


[{'estimator': Pipeline(steps=[('tfidf',
                   TfidfVectorizer(max_df=0.5, max_features=1000, min_df=0.01,
                                   stop_words={"'d", "'ll", "'m", "'re", "'s",
                                               "'ve", 'a', 'about', 'above',
                                               'across', 'after', 'afterwards',
                                               'again', 'against', 'all',
                                               'almost', 'alone', 'along',
                                               'already', 'also', 'although',
                                               'always', 'am', 'among', 'amongst',
                                               'amount', 'an', 'and', 'another',
                                               'any', ...},
                                   token_pattern='[A-Za-z]{3,}')),
                  ('clf', LogisticRegression(max_iter=500))]),
  'parameters': {'max_df': 0.5, 'max_features': 1000, 'min_df':

## Efficient enough model - XGBoost

https://xgboost.readthedocs.io/en/stable/index.html

In [121]:
from xgboost import XGBClassifier

# Same single TF-IDF configuration as the baseline, now with a small XGBoost model.
params = dict(min_df=[.01,], max_df=[.5, ], max_features = [1000,])

# The target has six rating classes, so request a multi-class objective.
# 'binary:logistic' was misleading here; as far as the XGBoost sklearn wrapper
# documents, it switches to a multi-class objective itself when it sees more
# than two classes -- TODO confirm for the installed version.
get_tfidf_results(params, spacy_stopwords, XGBClassifier(n_estimators=10, max_depth=2,
                                                         learning_rate=1,
                                                         objective='multi:softprob'),
                  X_train.review_text_stem, y_train,
                  X_test.review_text_stem, y_test)

100%|█████████████████████████████████████████████| 1/1 [00:13<00:00, 13.39s/it]


[{'estimator': Pipeline(steps=[('tfidf',
                   TfidfVectorizer(max_df=0.5, max_features=1000, min_df=0.01,
                                   stop_words={"'d", "'ll", "'m", "'re", "'s",
                                               "'ve", 'a', 'about', 'above',
                                               'across', 'after', 'afterwards',
                                               'again', 'against', 'all',
                                               'almost', 'alone', 'along',
                                               'already', 'also', 'although',
                                               'always', 'am', 'among', 'amongst',
                                               'amount', 'an', 'and', 'another',
                                               'any', ...},
                                   token_pattern='[A-Za-z]{3,}')),
                  ('clf', LogisticRegression(max_iter=500))]),
  'parameters': {'max_df': 0.5, 'max_features': 1000, 'min_df':

## TF-IDF grid search

In [122]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Pipeline under search: TF-IDF features followed by a small CatBoost model.
tfidf_step = TfidfVectorizer(token_pattern=r'[A-Za-z]{3,}', stop_words=spacy_stopwords)
catboost_step = CatBoostClassifier(random_state=42, iterations=10, verbose=False)
tf_idf_pipe = Pipeline(steps=[('tfidf', tfidf_step), ('clf', catboost_step)])

# Only the CatBoost learning rate actually varies; the TF-IDF settings are fixed.
params = {
    'tfidf__min_df': [.05],
    'tfidf__max_df': [.5],
    'tfidf__max_features': [1000],
    'clf__learning_rate': [.5, 1],
}

# 3-fold CV scored with micro-averaged F1; the best candidate is refitted on all training data.
micro_f1_scorer = make_scorer(f1_score, average='micro')
grid = GridSearchCV(estimator=tf_idf_pipe, param_grid=params, scoring=micro_f1_scorer,
                    cv=3, refit=True, verbose=5)


In [123]:
grid.fit(X_train.review_text_stem, y_train)


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV 1/3] END clf__learning_rate=0.5, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.436 total time=   4.5s
[CV 2/3] END clf__learning_rate=0.5, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.438 total time=   4.5s
[CV 3/3] END clf__learning_rate=0.5, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.437 total time=   4.4s
[CV 1/3] END clf__learning_rate=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.441 total time=   4.4s
[CV 2/3] END clf__learning_rate=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.440 total time=   4.5s
[CV 3/3] END clf__learning_rate=1, tfidf__max_df=0.5, tfidf__max_features=1000, tfidf__min_df=0.05;, score=0.437 total time=   4.4s


In [124]:
grid.cv_results_

{'mean_fit_time': array([3.4266057 , 3.42375731]),
 'std_fit_time': array([0.0436979 , 0.04431355]),
 'mean_score_time': array([1.03686595, 1.03896268]),
 'std_score_time': array([0.00725533, 0.00995766]),
 'param_clf__learning_rate': masked_array(data=[0.5, 1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__max_df': masked_array(data=[0.5, 0.5],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__max_features': masked_array(data=[1000, 1000],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_tfidf__min_df': masked_array(data=[0.05, 0.05],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__learning_rate': 0.5,
   'tfidf__max_df': 0.5,
   'tfidf__max_features': 1000,
   'tfidf__min_df': 0.05},
  {'clf__learning_rate': 1,
   'tfidf__max_df': 0.5,
   'tfidf__max_features': 1000,
   'tfid

In [71]:
pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score']].style.bar(vmin=0, vmax=1)

Unnamed: 0,params,mean_test_score
0,"{'clf__learning_rate': 0.5, 'tfidf__max_df': 0.5, 'tfidf__max_features': 1000, 'tfidf__min_df': 0.05}",0.43708
1,"{'clf__learning_rate': 1, 'tfidf__max_df': 0.5, 'tfidf__max_features': 1000, 'tfidf__min_df': 0.05}",0.43924


In [125]:
# Score the refitted best TF-IDF pipeline on the held-out reviews.
test_pred_gs = grid.best_estimator_.predict(X_test.review_text_stem)
test_f1_gs = f1_score(y_true=y_test, y_pred=test_pred_gs, average='micro')
print(f"Test f1 score: {test_f1_gs}")

Test f1 score: 0.44012


## Pretrained GloVe + CatBoost grid search

In [126]:
import gensim.downloader as api

class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of its word vectors.

    Parameters
    ----------
    word_model_vectors : object
        Word-vector store exposing ``vector_size``, ``key_to_index`` (a
        mapping whose keys are the known words) and ``get_vector(word)``,
        e.g. a gensim ``KeyedVectors`` instance.
    """

    def __init__(self, word_model_vectors):
        self.word_model_vector = word_model_vectors
        self.vector_size = word_model_vectors.vector_size
        # Membership tests go through the word -> index mapping, a constant-time
        # lookup. Testing against the `index_to_key` list scanned the whole
        # vocabulary for every token.
        self._vocabulary = word_model_vectors.key_to_index

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``; the embeddings are pretrained.

        ``X`` and ``y`` are accepted (and ignored) so the object can be used
        where a scikit-learn style ``fit(X, y)`` call is expected.
        """
        return self

    def transform(self, docs):
        """Return an array with one averaged vector per document.

        ``docs`` is an iterable of token sequences. The result has shape
        ``(number of documents, vector_size)``; an empty ``docs`` yields an
        array with zero rows instead of raising.
        """
        rows = [self.word_average(sent) for sent in docs]
        if not rows:
            return np.empty((0, self.vector_size))
        return np.vstack(rows)

    def word_average(self, sent):
        """Average the vectors of the known tokens in ``sent``.

        Unknown tokens are skipped; if no token is known (or ``sent`` is
        empty) a zero vector of length ``vector_size`` is returned.
        """
        vectors = [self.word_model_vector.get_vector(word)
                   for word in sent
                   if word in self._vocabulary]

        if not vectors:  # empty document or only out-of-vocabulary words
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

# Load the pretrained "glove-wiki-gigaword-50" word vectors through gensim's downloader
# (presumably fetched on first use and cached locally afterwards -- confirm in the gensim docs).
glove_vectors = api.load("glove-wiki-gigaword-50")
# Wrap the vectors so each tokenised document can be reduced to one averaged vector.
mev_glove = MeanEmbeddingVectorizer(glove_vectors)

In [127]:
# Stem each raw review, drop stop words, split into tokens and average the
# GloVe vectors of the tokens.
# `remove_stop_words` receives its extra arguments through `args=` only. The
# previous calls also passed `stem` positionally, which landed in
# Series.apply's second parameter (`convert_dtype`) rather than reaching the function.
X_train_vectorized = mev_glove.transform(
    X_train.review_text
    .apply(remove_stop_words, args=(stem, spacy_stopwords))
    .str.split(' ')
)

X_test_vectorized = mev_glove.transform(
    X_test.review_text
    .apply(remove_stop_words, args=(stem, spacy_stopwords))
    .str.split(' ')
)


KeyboardInterrupt: 

In [None]:
# Search over the CatBoost learning rate only; the inputs are the precomputed
# document embeddings, so no vectoriser step is needed.
params = {'learning_rate': [.5, 1]}

glove_catboost = CatBoostClassifier(random_state=42, iterations=10, verbose=False)
micro_f1_scorer = make_scorer(f1_score, average='micro')

grid = GridSearchCV(estimator=glove_catboost, param_grid=params, scoring=micro_f1_scorer,
                    cv=3, refit=True, verbose=5)


In [None]:
grid.fit(X_train_vectorized, y_train)
pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score']].style.bar(vmin=0, vmax=1)

In [None]:
# This grid was fitted on the averaged GloVe vectors, so it has to be scored on
# the vectorised test set, not on the raw stemmed text column.
test_predictions = grid.best_estimator_.predict(X_test_vectorized)
print(f"Test f1 score: {f1_score(y_true=y_test, y_pred=test_predictions, average='micro')}")

## BERT fine-tuning
https://huggingface.co/docs/transformers/training#finetune-a-pretrained-model <br>
https://huggingface.co/docs/transformers/model_doc/distilbert

In [128]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenise the raw (unstemmed) review text with identical settings for both
# splits: pad to the maximum length and truncate longer reviews.
tokenizer_kwargs = dict(padding='max_length', truncation=True)
X_train_tokenized = tokenizer(X_train['review_text'].tolist(), **tokenizer_kwargs)
X_test_tokenized = tokenizer(X_test['review_text'].tolist(), **tokenizer_kwargs)

loading configuration file config.json from cache at /Users/olko/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/olko/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loa

In [129]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    Parameters
    ----------
    encodings : mapping
        Maps each feature name (e.g. ``input_ids``) to a sequence with one
        entry per example, as returned by the tokenizer.
    labels : array-like
        One label per example, in the same order as the encodings. A pandas
        Series is accepted; its index is ignored.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Keep the labels as a plain array so `labels[idx]` is always
        # positional. Indexing a pandas Series with an integer is label-based:
        # it relies on a deprecated fallback for a string index and returns
        # the wrong row for a shuffled integer index.
        self.labels = np.asarray(labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors, including ``labels``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

train_dataset = Dataset(X_train_tokenized, y_train)
test_dataset = Dataset(X_test_tokenized, y_test)

In [130]:
import evaluate

from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

# Pretrained BERT encoder with a 6-way classification head: one class per rating
# value (the value_counts output earlier in this notebook shows ratings 0-5).
bert_model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.TrainingArguments
# Checkpoints are written under 'test_trainer'; evaluation is requested once per epoch.
training_params = TrainingArguments(output_dir='test_trainer', evaluation_strategy='epoch')

# F1 metric from the `evaluate` library, used by compute_f1 below.
f1_metric = evaluate.load("f1")

#f1_metric.compute(predictions=np.argmax(eval_pred[0], axis=-1), references=eval_pred[1])

def compute_f1(eval_pred):
    """Compute micro-averaged F1 for a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the arg-max over the last axis of
    the logits. The result is whatever ``f1_metric.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predicted_classes, references=labels, average = 'micro')


loading configuration file config.json from cache at /Users/olko/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",


In [131]:
from transformers import Trainer

# Fine-tune `bert_model` on the training reviews.
# NOTE(review): the held-out test set doubles as the evaluation set here, so the
# per-epoch metrics are computed on test data -- consider a separate validation split.
trainer = Trainer(model=bert_model,
                  args=training_params,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset,
                  compute_metrics=compute_f1
                  )

In [106]:
trainer.train()

***** Running training *****
  Num examples = 75000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28125
  Number of trainable parameters = 109486854


Epoch,Training Loss,Validation Loss


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin


KeyboardInterrupt: 