In [None]:
# ! pip install ipywidgets huggbingface_hub datasets transformers evaluate scikit-learn

In [None]:
import torch

# Check if CUDA is available
cuda_available = torch.cuda.is_available()
print(f"CUDA is available: {cuda_available}")

# If CUDA is available, you can also check additional information
if cuda_available:
    # Current device ID
    current_device = torch.cuda.current_device()
    print(f"Current CUDA device ID: {current_device}")
    
    # Device name
    device_name = torch.cuda.get_device_name(current_device)
    print(f"CUDA device name: {device_name}")
    
    # Number of CUDA devices
    device_count = torch.cuda.device_count()
    print(f"Number of CUDA devices: {device_count}")

CUDA is available: True
Current CUDA device ID: 0
CUDA device name: Tesla V100-PCIE-32GB
Number of CUDA devices: 1


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorWithPadding
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# **Import IMDB Dataset**

In [3]:
imdb = load_dataset("imdb")
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

# **Preprocess Dataset**

Tokenize the dataset and truncate long sequences.

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    "HamsterShiu/BERT_MLM", 
    subfolder="hf_bert_pro_20_epochs"
)

In [7]:
tokenized_imdb = imdb.map(lambda e: tokenizer(e["text"], truncation=True, padding=True), batched=True)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

For some reason, which I should've Googled, transformers perform best when all of the input sentences have the same length. So, we need to dynamically pad the input to match the longest sentence in the batch.

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Load the huggingface's accuracy metric.

In [9]:
accuracy = evaluate.load("accuracy")

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# **Train Model**
We import the unfinetuned DistilBERT and finetune, or load a finetuned model. We finetuned using arbitrarily chosen hyperparameter because why not.

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    "HamsterShiu/BERT_MLM",
    subfolder="hf_bert_pro_20_epochs",
    num_labels=2
)

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HamsterShiu/BERT_MLM and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Probably need to understand what are these configs
training_args = TrainingArguments(
    output_dir="./SC4001/Assignment2/model/hf_pro" ,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
checkpoint_path = "./SC4001/Assignment2/model/hf_pro/checkpoint-1563"  # Change this directory as required

if os.path.exists(checkpoint_path):
    print(f"Finetuned model exists: {checkpoint_path}")
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint_path,
        num_labels=2
    )
else:
    print("Finetuned model does not exist. Finetuning now.")
    trainer.train()

Finetuned model exists: ./SC4001/Assignment2/model/hf_pro/checkpoint-1563


# **Inference**
This should take around 2 minutes in NTU GPU.

In [61]:
# Create evaluation arguments
eval_args = TrainingArguments(
    output_dir="./SC4001/Assignment2/eval_results",
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# Set up the evaluation trainer
eval_trainer = Trainer(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    eval_dataset=tokenized_imdb["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run evaluation
eval_results = eval_trainer.evaluate()
print(f"Evaluation results: {eval_results}")

  eval_trainer = Trainer(


Evaluation results: {'eval_loss': 0.16339823603630066, 'eval_model_preparation_time': 0.0022, 'eval_accuracy': 0.94204, 'eval_f1': 0.9414474481755364, 'eval_precision': 0.9511717155221687, 'eval_recall': 0.93192, 'eval_runtime': 238.957, 'eval_samples_per_second': 104.621, 'eval_steps_per_second': 6.541}
