# Goodreads Books Reviews

#### Imports

In [116]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset, load_metric

import torch

#### Load Data

In [117]:
goodreads_train = load_dataset('csv', data_files='./Data/goodreads_train.csv')['train']
goodreads_test = load_dataset('csv', data_files='./Data/goodreads_test.csv')['train']

Using custom data configuration default-5bf69ea217ec2509
Reusing dataset csv (C:\Users\Terence.Yeung\.cache\huggingface\datasets\csv\default-5bf69ea217ec2509\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-19aa33e7c88f6a34
Reusing dataset csv (C:\Users\Terence.Yeung\.cache\huggingface\datasets\csv\default-19aa33e7c88f6a34\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [118]:
# Dropping all columns except review_text and rating

dropped_columns = ['user_id', 'book_id', 'review_id', 'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes', 'n_comments']

goodreads_train = goodreads_train.remove_columns(dropped_columns)
goodreads_test = goodreads_test.remove_columns(dropped_columns)

In [119]:
goodreads_train

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 900000
})

In [120]:
goodreads_test

Dataset({
    features: ['review_text'],
    num_rows: 478033
})

In [121]:
# Splitting training data into a training and validation set as test dataset provided is not labelled

goodreads_train_split = goodreads_train.train_test_split(test_size=0.2)

In [122]:
goodreads_train_split

DatasetDict({
    train: Dataset({
        features: ['rating', 'review_text'],
        num_rows: 720000
    })
    test: Dataset({
        features: ['rating', 'review_text'],
        num_rows: 180000
    })
})

In [123]:
# Taking only a small sample of the training data due to computational limitations

def reduce_dataset(dataset):
    total_rows = dataset.num_rows
    sample_size = int(total_rows * 0.001)
    sample_indices = np.random.RandomState(42).randint(total_rows, size=sample_size)
    return dataset.select(sample_indices)

train_data = reduce_dataset(goodreads_train_split['train'])
test_data = reduce_dataset(goodreads_train_split['test'])

In [124]:
train_data

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 720
})

In [125]:
test_data

Dataset({
    features: ['rating', 'review_text'],
    num_rows: 180
})

#### Preprocessing

In [126]:
# Tokenizing review text

tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

def tokenize_text(data):
    return tokenizer(data["review_text"], truncation=True)
 
tokenized_train = train_data.map(tokenize_text, batched=True)
tokenized_test = test_data.map(tokenize_text, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [127]:
# Convert training samples to PyTorch tensors and concatenate them with the correct amount of padding to speed up training

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#### Model Building

In [128]:
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment', num_labels=5)

In [131]:
# Defining metrics

def compute_metrics(eval_pred):
    load_accuracy = load_metric('accuracy')
    load_precision = load_metric('precision')
    load_recall = load_metric('recall')
    load_f1 = load_metric('f1')
  
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)['accuracy']
    precision = load_precision.compute(predictions=predictions, references=labels)['precision']
    recall = load_recall.compute(predictions=predictions, references=labels)['recall']
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [36]:
# Training hyperparameters

training_args = TrainingArguments(
    output_dir='test_trainer',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

In [37]:
# Defining metrics

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, logits)
    precision = precision_score(labels, logits, average=None)
    recall = recall_score(labels, logits, average=None)
    f1 = f1_score(labels, logits, average=None)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

In [38]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [40]:
trainer.train()

***** Running training *****
  Num examples = 40
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6


KeyError: 24