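# Goodreads Books Reviews

Fine-tunes the `nlptown/bert-base-multilingual-uncased-sentiment` checkpoint to predict a review's `rating` from its `review_text`, using a small random sample of the labelled Goodreads training data.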

#### Imports

In [42]:
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import load_dataset, load_metric
from huggingface_hub import notebook_login

import torch

#### Load Data

In [43]:
# Read each CSV on its own; both are retrieved from the 'train' key of the returned object,
# including the test file.
train_csv = load_dataset('csv', data_files='./data/goodreads_train.csv')
test_csv = load_dataset('csv', data_files='./data/goodreads_test.csv')

goodreads_train = train_csv['train']
goodreads_test = test_csv['train']

Using custom data configuration default-8d12ff12b322328a
Reusing dataset csv (C:\Users\Terence.Yeung\.cache\huggingface\datasets\csv\default-8d12ff12b322328a\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-ef13ee0a229a23e6
Reusing dataset csv (C:\Users\Terence.Yeung\.cache\huggingface\datasets\csv\default-ef13ee0a229a23e6\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
# Strip the identifier, date and engagement columns so that only the review text
# (plus the rating, where the file has one) is left.

dropped_columns = [
    'user_id',
    'book_id',
    'review_id',
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
    'n_votes',
    'n_comments',
]

goodreads_train = goodreads_train.remove_columns(dropped_columns)
goodreads_test = goodreads_test.remove_columns(dropped_columns)

In [45]:
# Inspect the training data: remaining columns and row count.
goodreads_train

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 900000
})

In [46]:
# Inspect the provided test data: remaining columns and row count.
goodreads_test

Dataset({
    features: ['review_text'],
    num_rows: 478033
})

In [47]:
# Splitting training data into a training and validation set as test dataset provided is not labelled.
# The split shuffles the rows, so a fixed seed is passed to get the same two subsets on every
# Restart & Run All.

goodreads_train_split = goodreads_train.train_test_split(test_size=0.2, seed=42)

In [48]:
# Inspect the two subsets produced by the split and their sizes.
goodreads_train_split

DatasetDict({
    train: Dataset({
        features: ['rating', 'review_text'],
        num_rows: 720000
    })
    test: Dataset({
        features: ['rating', 'review_text'],
        num_rows: 180000
    })
})

In [49]:
# Taking only a small sample of the training data due to computational limitations

def reduce_dataset(dataset, fraction=0.001, seed=42):
    """Return a reproducible random subsample of ``dataset`` without duplicate rows.

    Args:
        dataset: object exposing ``num_rows`` and ``select(indices)``.
        fraction: share of the rows to keep, between 0 and 1 inclusive.
        seed: seed for the random generator, so the same rows are picked on
            every run.

    Returns:
        Whatever ``dataset.select`` returns for the sampled indices;
        ``int(num_rows * fraction)`` distinct rows are requested.

    Raises:
        ValueError: if ``fraction`` lies outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f'fraction must be between 0 and 1, got {fraction!r}')

    total_rows = dataset.num_rows
    sample_size = int(total_rows * fraction)
    rng = np.random.RandomState(seed)
    # replace=False: every index is drawn at most once. Drawing with replacement
    # (as randint does) can repeat rows and silently shrink the effective sample.
    sample_indices = rng.choice(total_rows, size=sample_size, replace=False)
    return dataset.select(sample_indices)

# Shrink both halves of the split; the reduced 'test' half is what the model is evaluated on.
full_train_split = goodreads_train_split['train']
full_eval_split = goodreads_train_split['test']

train_data = reduce_dataset(full_train_split)
test_data = reduce_dataset(full_eval_split)

In [50]:
# Inspect the reduced training sample.
train_data

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 720
})

In [51]:
# Inspect the reduced evaluation sample.
test_data

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 180
})

#### Preprocessing

In [52]:
# Tokenizing review text: load the tokenizer published with the sentiment checkpoint

tokenizer_checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def tokenize_text(data):
    """Tokenize the ``review_text`` field of ``data`` with truncation enabled.

    Uses the notebook-level ``tokenizer`` and returns its output unchanged.
    """
    review_texts = data["review_text"]
    return tokenizer(review_texts, truncation=True)
 
# Tokenize both samples in batches, then expose the target under the column name `label`.
# Trainer discards dataset columns that the model's forward() does not accept; left as `rating`
# the target would be dropped and no loss could be computed. The padding collator passes a
# `label` column to the model as `labels`.
# NOTE(review): the checkpoint defines label ids 0-4 only (see id2label in the config output) —
# confirm the `rating` values fall in that range, or remap them before training.
tokenized_train = train_data.map(tokenize_text, batched=True).rename_column('rating', 'label')
tokenized_test = test_data.map(tokenize_text, batched=True).rename_column('rating', 'label')

loading configuration file https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/config.json from cache at C:\Users\Terence.Yeung/.cache\huggingface\transformers\d9226eeac7b8b96d83ebc327cdd670490866d8c999505c1f83b6ef206ccb1604.a34960b447312b0727cb670d710444fcb41a6156eddcba062a19b3fc05d95251
Model config BertConfig {
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
 

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [53]:
# Batches are assembled by a padding collator built around the same tokenizer, which pads the
# samples of a batch to a common length and returns them as PyTorch tensors

data_collator = DataCollatorWithPadding(tokenizer)

#### Model Building

In [54]:
# Sequence-classification model loaded from the sentiment checkpoint, keeping five output classes
model_checkpoint = 'nlptown/bert-base-multilingual-uncased-sentiment'
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=5)

loading configuration file https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment/resolve/main/config.json from cache at C:\Users\Terence.Yeung/.cache\huggingface\transformers\d9226eeac7b8b96d83ebc327cdd670490866d8c999505c1f83b6ef206ccb1604.a34960b447312b0727cb670d710444fcb41a6156eddcba062a19b3fc05d95251
Model config BertConfig {
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
 

In [55]:
# Defining metrics

def compute_metrics(eval_pred, average='weighted'):
    """Score evaluation predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax
            over the last axis of ``logits``.
        average: how per-class precision/recall/F1 are combined into one number.
            These metrics default to ``'binary'``, which is rejected when the
            targets have more than two classes, so a multi-class mode
            (e.g. ``'weighted'``, ``'macro'``, ``'micro'``) is passed explicitly.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {
        'accuracy': load_metric('accuracy').compute(
            predictions=predictions, references=labels
        )['accuracy']
    }

    # Accuracy takes no averaging mode; the remaining metrics need one for multi-class targets.
    for metric_name in ('precision', 'recall', 'f1'):
        metric = load_metric(metric_name)
        scores[metric_name] = metric.compute(
            predictions=predictions, references=labels, average=average
        )[metric_name]

    return scores

#### Model Training

In [59]:
# Logging in to HuggingFace
# The training arguments set push_to_hub=True, so a Hub token has to be available before the
# Trainer is created; run this cell first (building the Trainer without a login raised a ValueError).

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
# Training hyperparameters, grouped by concern

training_args = TrainingArguments(
    # Output location and Hub publishing
    output_dir='goodreads_books_reviews',
    push_to_hub=True,
    # Schedule: evaluate and save once per epoch
    num_train_epochs=2,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Optimiser settings
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [58]:
# Wire the model, data, batching and metrics together
trainer = Trainer(
    # What to train and how
    model=model,
    args=training_args,
    # Data: the reduced training sample, evaluated against the reduced held-out sample
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # Batch construction
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Evaluation scores
    compute_metrics=compute_metrics,
)

ValueError: You need to pass a valid `token` or login by using `huggingface-cli login`

In [None]:
# Run fine-tuning with the trainer configured above (this cell has no recorded execution yet).
trainer.train()