In [3]:
!pip install transformers datasets
# Importing the necessary libraries with customized names
import pandas as amazon_pd
import numpy as amazon_np
import torch as amazon_torch
from sklearn.model_selection import train_test_split as amazon_split
from sklearn.metrics import classification_report as amazon_report, confusion_matrix as amazon_matrix
import matplotlib.pyplot as amazon_plt
import seaborn as amazon_sns
from transformers import BertTokenizer as AmazonBertTokenizer, BertForSequenceClassification as AmazonBertModel, Trainer as AmazonTrainer, TrainingArguments as AmazonTrainingArgs
from datasets import Dataset as AmazonDataset, DatasetDict as AmazonDatasetDict




Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# Mount Google Drive at /content/drive; the reviews CSV is read from this
# mount point by a later cell, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [47]:
def load_and_preprocess_amazon_review_data(file_path):
    """
    Load the Amazon review data from a CSV file and derive binary sentiment labels.

    The CSV must contain an 'overall' column (star rating) and a 'reviewText'
    column (review body). Ratings of 4 and above become label 1 (positive);
    lower ratings (1, 2, 3) become label 0 (negative).

    Rows that cannot be used are dropped instead of being silently mislabeled:
      * rows whose rating is missing or not numeric (they would otherwise fall
        into the "else" branch and be counted as positive),
      * rows whose review text is missing (they would otherwise reach the
        tokenizer as a literal "nan"/"None" string).

    Args:
        file_path: Path to the CSV file.

    Returns:
        A pandas DataFrame with exactly two columns, 'text' and 'label'
        (integer 0/1). The original row index of the kept rows is preserved.

    Raises:
        pandas.errors.ParserError: If the CSV cannot be parsed (the error is
            printed and re-raised).
        KeyError: If a required column is absent from the file.
    """
    try:
        # Load the dataset from a CSV file
        amazon_data = amazon_pd.read_csv(file_path)
    except amazon_pd.errors.ParserError as e:
        print(e)
        # Bare raise keeps the original traceback intact
        raise

    required_columns = ['overall', 'reviewText']
    missing_columns = [name for name in required_columns if name not in amazon_data.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s) in {file_path}: {missing_columns}")

    # Coerce ratings to numbers so textual or garbage values become NaN and are
    # filtered out below rather than being labeled as positive.
    ratings = amazon_pd.to_numeric(amazon_data['overall'], errors='coerce')
    usable_rows = ratings.notna() & amazon_data['reviewText'].notna()

    # Build the result from scratch so no view of the raw frame is mutated.
    # Star ratings: 1,2,3 -> negative (0) and 4,5 -> positive (1)
    return amazon_pd.DataFrame({
        'text': amazon_data.loc[usable_rows, 'reviewText'],
        'label': (ratings[usable_rows] >= 4).astype(int),
    })


def tokenize_amazon_review_data(amazon_dataset, amazon_tokenizer, max_length=128):
    """
    Tokenize the 'text' field of every example in the dataset.

    Args:
        amazon_dataset: Object exposing ``map(function, batched=True)`` whose
            batches are dict-like with a 'text' entry (a list of values).
        amazon_tokenizer: Callable invoked as
            ``amazon_tokenizer(texts, padding=..., truncation=..., max_length=...)``.
        max_length: Length every sequence is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.

    Returns:
        Whatever ``amazon_dataset.map`` returns for the tokenization function.
    """
    def amazon_tokenize_function(examples):
        # Non-string values are stringified; a missing text (None) becomes an
        # empty string rather than the literal word "None".
        texts = ['' if text is None else str(text) for text in examples['text']]

        # Pad/truncate every sequence to the same fixed length
        return amazon_tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

    # Apply the tokenization to the entire dataset, one batch at a time
    return amazon_dataset.map(amazon_tokenize_function, batched=True)





def fine_tune_amazon_bert_model(tokenized_datasets, output_dir='./results'):
    """
    Fine-tune 'bert-base-uncased' for two-class sequence classification.

    The function loads the pre-trained tokenizer and model, trains on
    ``tokenized_datasets['train']``, evaluates on ``tokenized_datasets['test']``
    after every epoch, reloads the checkpoint with the best accuracy, and
    prints a final evaluation.

    NOTE(review): the 'test' split is used both for per-epoch model selection
    and for the final reported score, so that score is optimistic. Consider a
    separate validation split.

    Args:
        tokenized_datasets: Mapping with tokenized 'train' and 'test' splits.
        output_dir: Directory where checkpoints are written.

    Returns:
        Tuple ``(trainer, model, tokenizer)``.
    """
    # Local import: only needed for the version-compatibility lookups below
    import inspect

    # Load the pre-trained BERT tokenizer and model for sequence classification
    amazon_tokenizer = AmazonBertTokenizer.from_pretrained('bert-base-uncased')
    amazon_model = AmazonBertModel.from_pretrained('bert-base-uncased', num_labels=2)

    # Newer Transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`. The notebook installs an unpinned version, so pick the
    # keyword the installed release actually accepts.
    training_arg_names = inspect.signature(AmazonTrainingArgs.__init__).parameters
    if 'eval_strategy' in training_arg_names:
        eval_strategy_key = 'eval_strategy'
    else:
        eval_strategy_key = 'evaluation_strategy'

    # Define the training arguments
    amazon_training_args = AmazonTrainingArgs(
        output_dir=output_dir,
        logging_strategy="steps",
        logging_steps=10,  # Log training loss every 10 steps
        save_strategy="epoch",  # Matches the per-epoch evaluation set below
        learning_rate=1e-5,  # Reduced learning rate for finer adjustments
        per_device_train_batch_size=16,  # Adjust batch size if needed
        per_device_eval_batch_size=16,
        num_train_epochs=5,  # Increased number of epochs
        weight_decay=0.01,
        logging_dir='./logs',
        load_best_model_at_end=True,  # Load the best model based on evaluation metrics
        metric_for_best_model="accuracy",  # Use accuracy as the evaluation metric
        **{eval_strategy_key: "epoch"}
    )

    def compute_accuracy(eval_prediction):
        # Fraction of examples whose arg-max logit matches the reference label
        predicted_labels = amazon_np.argmax(eval_prediction.predictions, axis=1)
        return {"accuracy": (predicted_labels == eval_prediction.label_ids).mean()}

    # Same story for the Trainer: `tokenizer=` was renamed to `processing_class=`
    trainer_arg_names = inspect.signature(AmazonTrainer.__init__).parameters
    if 'processing_class' in trainer_arg_names:
        tokenizer_key = 'processing_class'
    else:
        tokenizer_key = 'tokenizer'

    # Define the Trainer
    amazon_trainer = AmazonTrainer(
        model=amazon_model,
        args=amazon_training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['test'],
        compute_metrics=compute_accuracy,
        **{tokenizer_key: amazon_tokenizer}
    )

    # Train the model
    amazon_trainer.train()

    # Evaluate the model
    results = amazon_trainer.evaluate()
    print("Evaluation Results:")
    print(results)

    return amazon_trainer, amazon_model, amazon_tokenizer


In [44]:
# Location of the reviews CSV on the mounted Google Drive
AMAZON_REVIEWS_CSV = '/content/drive/MyDrive/amazon_reviews.csv'

# Load and preprocess the dataset
amazon_data = load_and_preprocess_amazon_review_data(AMAZON_REVIEWS_CSV)

# Split the data into train and test sets (80/20, stratified on the label so
# both splits keep the same positive/negative ratio; fixed seed for repeatability)
train_df, test_df = amazon_split(amazon_data, test_size=0.2, random_state=42, stratify=amazon_data['label'])

# Convert pandas DataFrames to Hugging Face Datasets.
# preserve_index=False: the split frames carry a shuffled pandas index, which
# would otherwise be stored as an extra '__index_level_0__' column.
train_dataset = AmazonDataset.from_pandas(train_df, preserve_index=False)
test_dataset = AmazonDataset.from_pandas(test_df, preserve_index=False)
amazon_dataset = AmazonDatasetDict({'train': train_dataset, 'test': test_dataset})

# Load the pre-trained BERT tokenizer
amazon_tokenizer = AmazonBertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
tokenized_amazon_datasets = tokenize_amazon_review_data(amazon_dataset, amazon_tokenizer)



Map:   0%|          | 0/3932 [00:00<?, ? examples/s]

Map:   0%|          | 0/983 [00:00<?, ? examples/s]

In [46]:
# Run fine-tuning on the tokenized splits; the call hands back the trainer,
# the fine-tuned model and the tokenizer it loaded.
(
    amazon_trainer,
    amazon_model,
    amazon_tokenizer,
) = fine_tune_amazon_bert_model(tokenized_datasets=tokenized_amazon_datasets)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.119,0.169247,0.948118
2,0.0751,0.155542,0.956256
3,0.0769,0.180761,0.957274
4,0.0915,0.177707,0.953204
5,0.0649,0.197663,0.954222


Evaluation Results:
{'eval_loss': 0.18076112866401672, 'eval_accuracy': 0.9572736520854527, 'eval_runtime': 7.5819, 'eval_samples_per_second': 129.651, 'eval_steps_per_second': 8.177, 'epoch': 5.0}


In [49]:
def evaluate_amazon_bert_model(amazon_trainer, tokenized_datasets, original_texts, num_samples=5):
    """
    Evaluate the fine-tuned model on the 'test' split and print the results.

    Prints a classification report (classes 0 = "Negative", 1 = "Positive")
    followed by a few example reviews with their true and predicted labels.

    Args:
        amazon_trainer: Object whose ``predict(dataset)`` returns a result with
            ``predictions`` (per-class scores) and ``label_ids``.
        tokenized_datasets: Mapping containing the tokenized 'test' split.
        original_texts: Indexable collection of the raw review texts, in the
            same order as the 'test' split.
        num_samples: Maximum number of example predictions to print
            (default 5). Fewer are printed when less data is available.

    Returns:
        None. All results are printed.
    """
    # Make predictions on the test set
    amazon_predictions = amazon_trainer.predict(tokenized_datasets['test'])

    # Generate the classification report
    y_true = amazon_predictions.label_ids
    y_pred = amazon_np.argmax(amazon_predictions.predictions, axis=1)
    print("Classification Report:")
    # labels=[0, 1] pins both classes so the report still works when one class
    # is absent from the split; zero_division=0 reports 0 for undefined scores.
    print(amazon_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
        zero_division=0,
    ))

    # Never index past the end of the predictions or the texts
    samples_to_show = max(0, min(num_samples, len(y_true), len(original_texts)))

    # Show some examples of true labels, predicted labels, and review texts
    print("\nSample Predictions:")
    for i in range(samples_to_show):
        print(f"Review Text: {original_texts[i]}")
        print(f"True Label: {'Positive' if y_true[i] == 1 else 'Negative'}")
        print(f"Predicted Label: {'Positive' if y_pred[i] == 1 else 'Negative'}")
        print("-" * 41)

# Raw review texts of the test split, in the same order as the tokenized
# examples, so sample predictions can be shown next to readable text
original_texts = test_dataset['text']

# Print the classification report and a handful of sample predictions
evaluate_amazon_bert_model(
    amazon_trainer=amazon_trainer,
    tokenized_datasets=tokenized_amazon_datasets,
    original_texts=original_texts,
)


Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.63      0.74        93
    Positive       0.96      0.99      0.98       890

    accuracy                           0.96       983
   macro avg       0.92      0.81      0.86       983
weighted avg       0.96      0.96      0.95       983


Sample Predictions:
Review Text: I formatted this card as NTFS...loaded it up with a bunch of my kids' favorite movie and TV shows. Transfers seem to be pretty speedy, even while playing HD content.
True Label: Positive
Predicted Label: Positive
-----------------------------------------
Review Text: Within an hour of using the memory card, it failed and died. it kept saying &#34;Memory error.&#34;I could never get my dashcam camera to record anything. I went to our walmart and got me just MicroSD not Ultra. It has been working all day so far.That's right. There is no way to tell what you are getting online is OEM or knock offs. When the pric