In [2]:
import torch
from transformers import BertTokenizer
from datasets import Dataset
import pathlib as pl
import os
os.chdir('/new-stg/home/banghua/Amazon-Rating-Prediction')
from load_dataset import load_dataset
from tqdm import tqdm
import pickle
import random
random.seed(42)
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from balanced_loss import Loss
from datasets import load_metric
import numpy as np

def get_review_text(entry):
    """Return the review body stored under ``'reviewText'``.

    Args:
        entry: mapping-like review record.

    Returns:
        The value of ``entry['reviewText']``, or ``''`` when looking the key
        up raises ``KeyError``.
    """
    try:
        text = entry['reviewText']
    except KeyError:
        # Records without a review body are represented by the empty string.
        text = ''
    return text


def get_review_rating(entry):
    """Return the rating stored under ``'overall'``.

    Args:
        entry: mapping-like review record.

    Returns:
        The value of ``entry['overall']``, or ``-1`` when looking the key up
        raises ``KeyError``.
    """
    try:
        rating = entry['overall']
    except KeyError:
        # -1 is the sentinel for "this record has no rating".
        rating = -1
    return rating


def build_dataset_dict(dataset, encoded_sentences):
    """Assemble a column-oriented dict from raw records and their encodings.

    Args:
        dataset: sized, iterable collection of review records; each record
            must provide ``'overall'``.
        encoded_sentences: mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries (the tokenizer's output naming).

    Returns:
        dict with the columns ``'orig_idx'`` (0..len(dataset)-1),
        ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Raises:
        KeyError: if a record has no ``'overall'`` or an encoding field is
            missing.
    """
    dataset_dict = {
        'orig_idx': list(range(len(dataset))),  # original index
        'input_ids': encoded_sentences['input_ids'],
        # Emitted as 'attention_mask' (singular), the name the tokenizer
        # produces; the previous 'attention_masks' key is not an input name
        # the model recognises.
        'attention_mask': encoded_sentences['attention_mask'],
        # NOTE(review): labels are the raw `overall` values here, whereas
        # convert_to_list_from_big_dict stores int(overall - 1) - confirm
        # which convention the consumer of this dict expects.
        'labels': [entry['overall'] for entry in dataset],
    }
    return dataset_dict


def convert_to_list_from_big_dict(big_dict, big_dict_orig_idx, dataset):
    """Turn column-oriented encodings into a list of per-example dicts.

    Args:
        big_dict: mapping with ``'input_ids'`` and ``'attention_mask'``
            entries that can be indexed by row position.
        big_dict_orig_idx: for each row of ``big_dict``, the index of the
            matching record in ``dataset``.
        dataset: indexable collection of review records; each referenced
            record must provide a numeric ``'overall'``.

    Returns:
        list of dicts, one per row, with the keys ``'orig_idx'``,
        ``'input_ids'``, ``'attention_mask'`` and ``'labels'``. ``'labels'``
        is ``int(overall - 1)``, i.e. the rating shifted to start at 0.

    Raises:
        IndexError: if ``big_dict_orig_idx`` has fewer entries than
            ``big_dict['input_ids']`` has rows.
        KeyError: if a referenced record has no ``'overall'``.

    Note:
        The mask is stored under ``'attention_mask'`` (singular), the name
        the tokenizer emits. Earlier versions wrote ``'attention_masks'``,
        which is not a model input name, so the real padding mask was not
        passed to the model. Datasets saved with the old column name need to
        be regenerated or renamed.
    """
    input_ids = big_dict['input_ids']
    attention_mask = big_dict['attention_mask']
    return [
        {
            'orig_idx': big_dict_orig_idx[row],
            'input_ids': input_ids[row],
            'attention_mask': attention_mask[row],
            'labels': int(dataset[big_dict_orig_idx[row]]['overall'] - 1),
        }
        for row in range(len(input_ids))
    ]


# Load the BERT tokenizer published with the 'LiYuan/amazon-review-sentiment-analysis'
# checkpoint (the same checkpoint name is used for the model later in this notebook).
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('LiYuan/amazon-review-sentiment-analysis')
# NOTE(review): a previous comment here quoted 17,280 / 4,320 / 2,400 rows for the
# train / dev / test sets. Those figures do not match the split sizes this notebook
# prints after loading the data, so they are presumably left over from another dataset.


def get_encoded_sentences(sentences, file_path):
    """Tokenize ``sentences``, caching the result at ``file_path``.

    If ``file_path`` already exists, its content is loaded and returned and
    ``sentences`` is not looked at. Otherwise the module-level ``tokenizer``
    encodes the sentences (padded, truncated to 512 tokens, returned as
    PyTorch tensors) and the result is written to ``file_path``.

    Args:
        sentences: list of strings to encode.
        file_path: ``str`` or path-like location of the cache file.

    Returns:
        The encoded sentences, as produced by the tokenizer or as read back
        from the cache file.
    """
    if os.path.exists(file_path):
        # NOTE: torch.load unpickles the file, so only point this at cache
        # files produced by this notebook. The cache is keyed on the path
        # alone; delete the file after changing `sentences` or the tokenizer.
        encoded_sentences = torch.load(file_path)
        print('Loaded input_ids.')
    else:
        encoded_sentences = tokenizer(
            sentences,
            add_special_tokens=True,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        # torch.save does not create missing directories; make sure the
        # cache folder exists so a first run on a fresh checkout succeeds.
        parent_dir = os.path.dirname(os.fspath(file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        torch.save(encoded_sentences, file_path)
        print('Saved input_ids.')
    return encoded_sentences

  from .autonotebook import tqdm as notebook_tqdm
2023-12-02 17:31:25.706535: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-02 17:31:29.124309: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 17:31:29.124358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 17:31:29.660676: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-02 17:31:30.5

Loading BERT tokenizer...


In [3]:
# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name())
else:
    print('No GPU available, using the CPU instead.')


# All data files are resolved relative to the current working directory.
current_path = pl.Path.cwd()

train_path = current_path.joinpath('dataset', 'train.json.gz')
val_path = current_path.joinpath('dataset', 'val.json.gz')
test_path = current_path.joinpath('dataset', 'test.json.gz')

train_dataset = load_dataset(train_path)
val_dataset = load_dataset(val_path)
test_dataset = load_dataset(test_path)

# Position of every record inside its split, kept so that rows can be traced
# back to the raw data after filtering.
train_dataset_orig_idx = list(range(len(train_dataset)))
val_dataset_orig_idx = list(range(len(val_dataset)))
test_dataset_orig_idx = list(range(len(test_dataset)))


# Review text of every record; records without 'reviewText' contribute ''.
sentences_train = [get_review_text(entry) for entry in train_dataset]
sentences_val = [get_review_text(entry) for entry in val_dataset]
sentences_test = [get_review_text(entry) for entry in test_dataset]

# Positions whose review text is empty (derived from the texts extracted above).
empty_train_indices = [pos for pos, text in enumerate(sentences_train) if text == '']
empty_val_indices = [pos for pos, text in enumerate(sentences_val) if text == '']
empty_test_indices = [pos for pos, text in enumerate(sentences_test) if text == '']

print('Number of empty reviewText in train dataset:', len(empty_train_indices))
print('Number of empty reviewText in val dataset:', len(empty_val_indices))
print('Number of empty reviewText in test dataset:', len(empty_test_indices))

print('Number of training sentences: {:,}'.format(len(sentences_train)))
print('Number of validation sentences: {:,}'.format(len(sentences_val)))
print('Number of testing sentences: {:,}'.format(len(sentences_test)))

# Cache files for the tokenized splits, located under <current working directory>/dataset/input_ids/.
# (No `~` expansion happens here: the paths are built directly from `current_path`.)
file_path_train = current_path / 'dataset' / 'input_ids' / 'input_ids_train.pickle'
file_path_valid = current_path / 'dataset' / 'input_ids' / 'input_ids_valid.pickle'
file_path_test = current_path / 'dataset' / 'input_ids' / 'input_ids_test.pickle'

# get_encoded_sentences loads a cache file when it exists; otherwise it tokenizes
# the sentences and writes the file.
encoded_sentences_train = get_encoded_sentences(sentences_train, file_path_train)
encoded_sentences_valid = get_encoded_sentences(sentences_val, file_path_valid)
encoded_sentences_test = get_encoded_sentences(sentences_test, file_path_test)

There are 2 GPU(s) available.
We will use the GPU: NVIDIA RTX A6000
Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/train.json.gz...
Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/val.json.gz...
Loading dataset from /new-stg/home/banghua/Amazon-Rating-Prediction/dataset/test.json.gz...
Number of empty reviewText in train dataset: 2331
Number of empty reviewText in val dataset: 702
Number of empty reviewText in test dataset: 763
Number of training sentences: 3,925,489
Number of validation sentences: 1,308,497
Number of testing sentences: 1,308,497
Loaded input_ids.
Loaded input_ids.
Loaded input_ids.


In [4]:
def _drop_positions(orig_idx, positions_to_drop):
    """Return the entries of ``orig_idx`` whose position is not in ``positions_to_drop``.

    The positions are put into a set first so that each membership test is
    constant-time; testing against the original list rescans it for every
    row, which is very slow on splits with millions of rows.
    """
    dropped = set(positions_to_drop)
    return [orig for pos, orig in enumerate(orig_idx) if pos not in dropped]


# Original indices of the records that have a non-empty review text.
dataset_train_orig_kept_idx = _drop_positions(train_dataset_orig_idx, empty_train_indices)
dataset_valid_orig_kept_idx = _drop_positions(val_dataset_orig_idx, empty_val_indices)
dataset_test_orig_kept_idx = _drop_positions(test_dataset_orig_idx, empty_test_indices)

In [6]:
def _take_rows(encoded, kept_idx):
    """Return a plain dict holding every field of ``encoded`` indexed by ``kept_idx``."""
    selected = {}
    for field, values in encoded.items():
        selected[field] = values[kept_idx]
    return selected


# Keep only the encodings of the records listed in the *_orig_kept_idx lists.
encoded_sentences_train_dataset_dict = _take_rows(encoded_sentences_train, dataset_train_orig_kept_idx)
encoded_sentences_valid_dataset_dict = _take_rows(encoded_sentences_valid, dataset_valid_orig_kept_idx)
encoded_sentences_test_dataset_dict = _take_rows(encoded_sentences_test, dataset_test_orig_kept_idx)

In [7]:
# Turn the filtered, column-oriented encodings of each split into a list of
# per-example dicts (labels are looked up in the raw split via the kept indices).
encoded_sentences_train_dataset_list = convert_to_list_from_big_dict(
    big_dict=encoded_sentences_train_dataset_dict,
    big_dict_orig_idx=dataset_train_orig_kept_idx,
    dataset=train_dataset,
)
encoded_sentences_valid_dataset_list = convert_to_list_from_big_dict(
    big_dict=encoded_sentences_valid_dataset_dict,
    big_dict_orig_idx=dataset_valid_orig_kept_idx,
    dataset=val_dataset,
)
encoded_sentences_test_dataset_list = convert_to_list_from_big_dict(
    big_dict=encoded_sentences_test_dataset_dict,
    big_dict_orig_idx=dataset_test_orig_kept_idx,
    dataset=test_dataset,
)

In [8]:
# Build one Hugging Face `Dataset` per split from the lists of per-example dicts
# (each dict becomes one row).
dataset_train = Dataset.from_list(encoded_sentences_train_dataset_list)
dataset_valid = Dataset.from_list(encoded_sentences_valid_dataset_list)
dataset_test = Dataset.from_list(encoded_sentences_test_dataset_list)

In [10]:
# NOTE(review): these two statements repeat the validation/test construction
# already performed in the "Build dataset" cell; on a top-to-bottom run they
# rebuild identical objects. Presumably left over from a partial re-run -
# TODO confirm and remove the duplicate cell.
dataset_valid = Dataset.from_list(encoded_sentences_valid_dataset_list)
dataset_test = Dataset.from_list(encoded_sentences_test_dataset_list)

In [11]:
# Persist every split under <current_path>/dataset_huggingface_full/<split folder>.
for _split_dataset, _folder_name in (
    (dataset_train, 'train_dataset'),
    (dataset_valid, 'valid_dataset'),
    (dataset_test, 'test_dataset'),
):
    _split_dataset.save_to_disk(current_path / 'dataset_huggingface_full' / _folder_name)

Saving the dataset (17/17 shards): 100%|██████████████████████████████████████| 1307795/1307795 [01:26<00:00, 15068.29 examples/s]
Saving the dataset (17/17 shards): 100%|██████████████████████████████████████| 1307734/1307734 [01:26<00:00, 15204.68 examples/s]


In [None]:
# Number of rating classes; labels are stored as `overall - 1`, so they start at 0.
NUM_LABELS = 5

# Class frequencies of the training split, used to balance the focal loss below.
samples_per_class = [0] * NUM_LABELS
for entry in dataset_train["labels"]:
    samples_per_class[entry] += 1


model = AutoModelForSequenceClassification.from_pretrained("LiYuan/amazon-review-sentiment-analysis", num_labels=NUM_LABELS)
# Move the model to the device selected earlier instead of calling .cuda()
# unconditionally, so this cell also runs on a machine without a GPU.
model.to(device)

metric_name = "accuracy"
print(metric_name)
model_name = "Amazon-Pet-BERT"
print(model_name)

# Class-balanced focal loss from the `balanced_loss` package, weighted by the
# per-class sample counts computed above.
focal_loss = Loss(
    loss_type="focal_loss",
    samples_per_class=samples_per_class,
    class_balanced=True
)

# The GLUE "mnli" metric is loaded here; `metric_name` above ("accuracy") is
# the key later used to select the best checkpoint.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm the pinned version.
actual_task = "mnli"
metric = load_metric('glue', actual_task)

In [16]:
# Sanity check: shape of the tokenized training tensor, i.e. (number of rows, padded sequence length).
encoded_sentences_train["input_ids"].shape

torch.Size([3925489, 512])

In [13]:
# NOTE(review): same shape check as the previous cell, but it carries an earlier
# execution count and its recorded row count is smaller by exactly the number of
# empty training reviews (2,331). This suggests `encoded_sentences_train` held the
# filtered encodings when this cell ran - the recorded outputs depend on
# out-of-order execution; TODO confirm and drop one of the two cells.
encoded_sentences_train["input_ids"].shape

torch.Size([3923158, 512])

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the arg-max of ``logits`` along the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted classes and
        the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


class CustomTrainer(Trainer):
    """Trainer that computes the loss with the notebook's ``focal_loss`` object
    instead of the loss returned by the model."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and score the logits with ``focal_loss``.

        Args:
            model: the model being trained.
            inputs: dict of batch tensors; ``'labels'`` is removed from it
                and cast to int64 before the forward pass.
            return_outputs: when true, also return the model outputs.
            num_items_in_batch: accepted for compatibility with Trainer
                versions that pass this keyword; not used by this loss.

        Returns:
            ``loss`` or, when ``return_outputs`` is true, ``(loss, outputs)``.
        """
        labels = inputs.pop("labels").to(torch.int64)
        # Forward pass without labels, so the model does not compute its own loss.
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # `focal_loss` is the module-level loss object configured with the
        # per-class sample counts of the training split.
        loss = focal_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [None]:
batch_size = 32

# Evaluate and checkpoint once per epoch; after training, reload the checkpoint
# with the best value of `metric_name` on the evaluation set.
args = TrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=100,
)



trainer = CustomTrainer(
    model,
    args,
    # Train on the training split. This previously pointed at `dataset_test`,
    # which fits the model on the held-out test data and invalidates any later
    # test-set evaluation.
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()