# Introduction

This homework assignment aims to provide hands-on experience with three different approaches in natural language processing: training an RNN model, prompting a pretrained language model, and fine-tuning a pretrained language model. The task is to classify the news category of a text using the AG News dataset available on Hugging Face.

https://huggingface.co/datasets/ag_news

## Data set

* Utilize the AG News dataset from Hugging Face.
* You will apply three approaches to classify each text into one of four news categories: **world**, **sports**, **sci/tech**, and **business**.
* More details about the dataset can be found at the provided link.


## Three approaches

In this assignment, you will apply three distinct NLP approaches to classify news category from textual data. Each approach should be executable within the Google Colab environment.

# 1. Train an RNN model (>85% accuracy on test set for full credit)

* Introduction: Recurrent Neural Networks (RNNs) are powerful for sequence modeling and have been extensively used in NLP for tasks like text classification.
* Task: Train an RNN to classify the news category.
* Details: Implement and train an RNN using PyTorch. The architecture should include an embedding layer, one or more RNN layers, and a dense output layer for classification.
* Model Flexibility: You are free to choose or modify any RNN architecture (e.g., LSTM, GRU) as long as it is compatible with Colab.

## Install dependency

In [None]:
!pip install datasets

## Load and prepare the dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from torch.utils.data import DataLoader, random_split
import torch.nn as nn
from tqdm import tqdm

# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("You are currently using: ", device)

# Load and tokenize the dataset
def load_and_preprocess_data():
    """Load AG News, tokenize it, and hold out part of the training data.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple. The first two
        are the 90% / 10% ``random_split`` parts of the tokenized training
        split; the third is the tokenized test split.
    """
    dataset = load_dataset('ag_news')
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    ##### Your implementation starts here #####
    # tokenization
    # (this section must define `tokenized_datasets`, which the code below reads)

    ##### Your implementation ends here #####

    # Hold out 10% of the tokenized training split for validation
    full_train = tokenized_datasets['train']
    num_train = int(0.9 * len(full_train))
    num_val = len(full_train) - num_train
    train_dataset, val_dataset = random_split(full_train, [num_train, num_val])

    return train_dataset, val_dataset, tokenized_datasets['test']

train_dataset, val_dataset, test_dataset = load_and_preprocess_data()

# Data loaders
# Pinned (page-locked) host memory only speeds up host-to-GPU copies. On a
# CPU-only runtime PyTorch warns that the flag has no effect, so request it
# only when CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=use_pinned_memory)
validation_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=use_pinned_memory)

## Define the RNN model

In [None]:
# Define the RNN Classifier
class RNNClassifier(nn.Module):
    """Embedding -> dropout -> recurrent encoder -> linear classifier.

    Args:
        input_dim: Vocabulary size (number of rows in the embedding table).
        embedding_dim: Dimension of each token embedding.
        hidden_dim: Hidden size of every recurrent layer, per direction.
        output_dim: Number of target classes.
        rnn_type: Recurrent cell to use: "GRU", "LSTM" or "RNN"
            (case-insensitive).
        num_layers: Number of stacked recurrent layers.
        bidirectional: Whether the encoder reads the sequence in both
            directions.
        dropout: Dropout probability applied to the embeddings and, when
            ``num_layers > 1``, between recurrent layers.

    Raises:
        ValueError: If ``rnn_type`` is not one of the supported names.
    """

    # Maps the user-facing name to the torch recurrent module to instantiate.
    _RNN_CLASSES = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, rnn_type="GRU", num_layers=2, bidirectional=True, dropout=0.5):
        super().__init__()
        rnn_cls = self._RNN_CLASSES.get(str(rnn_type).upper())
        if rnn_cls is None:
            raise ValueError(
                f"Unsupported rnn_type {rnn_type!r}; expected one of {sorted(self._RNN_CLASSES)}"
            )
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        # Inter-layer dropout is only meaningful (and only accepted without a
        # warning) when there is more than one recurrent layer.
        self.rnn = rnn_cls(embedding_dim, hidden_dim, num_layers=num_layers,
                           batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

    def forward(self, text):
        """Return class logits of shape ``(batch, output_dim)``.

        Args:
            text: Integer tensor of token ids, ``(batch, seq_len)`` because the
                encoder is built with ``batch_first=True``.
        """
        embedded = self.dropout(self.embedding(text))
        _, hidden = self.rnn(embedded)
        if isinstance(hidden, tuple):
            # LSTM returns (h_n, c_n); only the hidden state is classified.
            hidden = hidden[0]
        if self.rnn.bidirectional:
            # h_n is laid out as (num_layers * 2, batch, hidden): the last two
            # entries are the top layer's forward and backward final states.
            # Taking output[:, -1, :] instead would give the backward direction
            # a state that has only seen the final token.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]
        return self.fc(final_state)

##### Your implementation starts here #####
# Define the model hyperparameter

##### Your implementation ends here #####

## Training and evaluation functions

In [None]:
# Training and evaluation functions
def train_model(model, data_loader, optimizer, criterion, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    The per-batch work is left for you to implement. Whatever you write must
    add each batch's scalar loss to ``total_loss``, because the return value
    divides that sum by the number of batches.

    Args:
        model: The network to train; it is switched to training mode first.
        data_loader: Iterable of batches, wrapped in a ``tqdm`` progress bar.
        optimizer: Optimizer for your implementation to step.
        criterion: Loss function for your implementation to apply.
        device: Device the batch tensors should be moved to (see the hint).

    Returns:
        ``total_loss`` divided by ``len(data_loader)``.
    """
    model.train()
    total_loss = 0
    for batch in tqdm(data_loader):
        ##### Your implementation starts here #####
        # batch processing
        # [hint] make sure data are on the correct device (i.e., GPU or CPU)

        ##### Your implementation ends here #####
    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: The network to evaluate; it is switched to evaluation mode.
        data_loader: Iterable of dict batches exposing ``'input_ids'`` and
            ``'label'`` tensors. Any other entries (for example raw text) are
            ignored. It must support ``len()``.
        criterion: Loss function called as ``criterion(predictions, labels)``.
        device: Device the input ids and labels are moved to.

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses, and the
        fraction of evaluated samples whose arg-max prediction equals the label.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for batch in data_loader:
            # Move only the tensors that are used; calling .to() on every entry
            # fails as soon as the batch carries a non-tensor column.
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)
            predictions = model(input_ids)
            total_loss += criterion(predictions, labels).item()
            total_correct += (predictions.argmax(dim=1) == labels).sum().item()
            # Count what was actually evaluated so the accuracy stays correct
            # even if the loader does not yield every sample of its dataset.
            total_samples += labels.size(0)
    avg_loss = total_loss / len(data_loader)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

## Main training loop

In [None]:
# Main training loop

##### Your Optional implementation starts here #####
# You may change the number of training epochs
num_epochs = 3

##### Your Optional implementation ends here #####

# `model`, `optimizer` and `criterion` are not created in this cell; they are
# presumably defined in the hyperparameter section you filled in earlier.
for epoch in range(num_epochs):
    # One pass over the training data, then a no-gradient pass over validation
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = evaluate_model(model, validation_loader, criterion, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# The test split is evaluated once, after the last epoch
test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

# 2. Prompting a pretrained LM (>50% accuracy on test set for full credit)

* Introduction: Prompting involves adapting a pre-trained model to a specific task without extensive retraining, leveraging the model's existing knowledge.
* Task: Use zero-shot learning by prompting a pretrained language model.
* Details: Utilize a pretrained language model to generate predictions based on prompts. Craft three different prompts to evaluate how well the model can infer the correct news category.
* Model Flexibility: Any pretrained model available via libraries like Hugging Face’s Transformers that runs on Google Colab can be used.

## Install dependency

In [None]:
!pip install datasets

## Load data set and the pretrained language model

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Check device: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the AG News dataset
dataset = load_dataset('ag_news')

# Initialize tokenizer and model for masked language model prediction
##### Your Optional implementation starts here #####
# You may choose another language model if you want.
# Keep the tokenizer and the model on the same checkpoint, and keep a
# masked-LM head: the batch-inference cell reads the logits at the mask position.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased').to(device)
##### Your Optional implementation ends here #####

## Prompt engineering

In [None]:
# Collects the token predicted for the mask, one entry per test text
# (filled by the batch-inference cell)
unmasked = []
# Categories for AG News.
# The order matters: the back-mapping cell stores the *index* of the best
# matching category as the prediction, and that index is compared with the
# dataset's integer labels in the evaluation cell.
categories = ['World', 'Sports', 'Business', 'Sci/Tech']

##### Your implementation starts here #####
# Each prompt is built as: prefix + <mask token> + suffix + " " + text
prefix =  # string
suffix =  # string
##### Your implementation ends here #####

## Batch inference

In [None]:
# Number of test texts handled per step; used by the mask-filling loop in this
# cell and again by the back-mapping loop further down
batch_size = 32

# Function to process batches for mask filling
def process_batch(text_batch):
    """Fill the prompt's mask slot for every text and return the predicted tokens.

    Each text is turned into ``prefix + <mask token> + suffix + " " + text``,
    run through the masked language model, and the highest-scoring vocabulary
    entry at the prompt's mask position is decoded.

    Args:
        text_batch: Sequence of raw input strings.

    Returns:
        A list with exactly one decoded, whitespace-stripped token per input
        text, in the same order.

    Raises:
        ValueError: If a tokenized prompt contains no mask token (for example
            because truncation to 128 tokens cut it off).
    """
    if len(text_batch) == 0:
        return []

    # Use the tokenizer's own mask token so that models whose mask is not the
    # literal "[MASK]" still work.
    mask_token = tokenizer.mask_token
    prompts = [prefix + mask_token + suffix + " " + text for text in text_batch]
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    is_mask = inputs.input_ids == tokenizer.mask_token_id
    predicted_tokens = []
    for row, (row_logits, row_mask) in enumerate(zip(logits, is_mask)):
        positions = row_mask.nonzero(as_tuple=True)[0]
        if positions.numel() == 0:
            raise ValueError(f"No mask token left in prompt {row} after tokenization/truncation")
        # The first mask in the row is the prompt's own. A text that happens to
        # contain the mask token adds later ones, which must not shift the
        # row/position pairing.
        token_id = row_logits[positions[0]].argmax(dim=-1).item()
        predicted_tokens.append(tokenizer.decode(token_id).strip())
    return predicted_tokens

# Fetch the text column once. Looking up dataset['test']['text'] inside the
# loop repeats the column access on every iteration.
test_texts = list(dataset['test']['text'])
for i in tqdm(range(0, len(test_texts), batch_size)):
    text_batch = test_texts[i:i+batch_size]
    unmasked.extend(process_batch(text_batch))

## Back-mapping result to the pre-defined categories

In [None]:
# Initialize SentenceTransformer and function for back-mapping
# NOTE(review): 'bert-base-nli-mean-tokens' is, as far as I know, marked as
# deprecated on its model card - confirm whether a newer checkpoint is acceptable.
matching_model = SentenceTransformer('bert-base-nli-mean-tokens').to(device)

def encode_batches(sentences, model, batch_size=64):
    """Encode ``sentences`` in slices of ``batch_size`` and return one tensor.

    Args:
        sentences: Sequence of strings to embed.
        model: Object whose ``encode`` method accepts a list of strings plus
            the ``convert_to_tensor`` and ``device`` keyword arguments.
        batch_size: Number of sentences passed to ``model.encode`` per call.

    Returns:
        The per-slice embeddings concatenated with ``torch.cat``, in input order.
    """
    chunks = [
        model.encode(sentences[start:start + batch_size], convert_to_tensor=True, device=device)
        for start in range(0, len(sentences), batch_size)
    ]
    return torch.cat(chunks)

# Predict and evaluate in batches
# For every test text, embed the prompt filled with each category name and the
# prompt filled with the model's own predicted token, then pick the category
# whose sentence is most similar to the predicted-token sentence.
prediction = []
# Fetch the text column once instead of on every loop iteration
test_texts = list(dataset['test']['text'])
# Each text contributes one sentence per category plus one for the predicted token
num_categories = len(categories) + 1
for i in tqdm(range(0, len(unmasked), batch_size)):
    z_batch = unmasked[i:i+batch_size]
    x_batch = test_texts[i:i+batch_size]

    sentence_batch = []
    for z, x in zip(z_batch, x_batch):
        sentences = [prefix + cat + suffix + " " + x for cat in categories + [z]]
        sentence_batch.extend(sentences)

    # Encode all sentences at once for this batch
    sentence_embeddings = encode_batches(sentence_batch, matching_model, batch_size=512)  # Use a different batch size if necessary

    # Copy the embeddings to the host once per batch and group them per text:
    # (texts, candidates, embedding_dim), with the predicted-token sentence last.
    grouped = sentence_embeddings.cpu().numpy().reshape(-1, num_categories, sentence_embeddings.shape[-1])

    # Calculate predictions using cosine similarity
    for group in grouped:
        back_mapping = cosine_similarity(group[-1:], group[:-1])
        prediction.append(np.argmax(back_mapping))

## Evaluation

In [None]:
# Get the labels and evaluate
label = dataset['test']['label']
# scikit-learn metrics expect the ground truth first and the predictions
# second: metric(y_true, y_pred).
print('F1_macro: ', f1_score(label, prediction, average='macro'))
print('F1: ', f1_score(label, prediction, average=None))
print('Accuracy: ', accuracy_score(label, prediction))

# 3. Fine-tune a pretrained LM (>90% accuracy on test set for full credit)

* Introduction: Fine-tuning adjusts the weights of a pretrained model specifically to the task at hand, improving performance by adapting the model's deep knowledge to your specific dataset.
* Task: Fine-tune a pretrained model on the AG News dataset.
* Details: Choose a transformer model and fine-tune it using the training split of the AG News dataset. Adjust the learning rate, batch size, and other hyperparameters as necessary.
* Model Flexibility: Any transformer-based model that is supported by the Google Colab environment can be used. Ensure the chosen model is manageable within the resource constraints of Colab.


In [None]:
!pip install datasets
!pip install accelerate -U # [Important Note] this package requires to restart runtime/session after install

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import random_split

# Check if a GPU is available and choose device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("You are currently using: ", device)

# Load the AG News dataset
dataset = load_dataset('ag_news')

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

##### Your implementation starts here #####
# tokenization
# (this section must define `tokenized_datasets`; the split below and the final
# evaluation cell both read it)

##### Your implementation ends here #####

# Split the training set to create a validation set (90% train / 10% validation)
train_size = int(0.9 * len(tokenized_datasets['train']))
val_size = len(tokenized_datasets['train']) - train_size
train_dataset, val_dataset = random_split(tokenized_datasets['train'], [train_size, val_size])

##### Your Optional implementation starts here #####
# The training set is much larger than what is needed to get a full score for this assignment.
# To speed up training or to debug, you may discard a portion of the training data here.

##### Your Optional implementation ends here #####
## Load the model and trainer setup

In [None]:
# Load the model (num_labels=4: one output per AG News category)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4).to(device)

##### Your Optional implementation starts here #####
# define the training_args
# NOTE: despite the "Optional" marker, `training_args` has to be defined here;
# the Trainer constructed below receives it as `args=training_args`.

##### Your Optional implementation ends here #####

# Compute metrics
def compute_metrics(p):
    """Return the accuracy of arg-max predictions as ``{'accuracy': value}``.

    Args:
        p: Object exposing ``predictions`` (per-class scores, classes on the
            last axis) and ``label_ids`` (the reference class ids).
    """
    predicted_labels = p.predictions.argmax(-1)
    hits = (predicted_labels == p.label_ids).astype(float)
    return {'accuracy': hits.mean()}

# Initialize the Trainer
# Training uses the 90% training split and evaluates on the 10% validation
# split created above; the test split is only used in the final evaluation cell.
trainer = Trainer(
    model=model,
    args=training_args,  # defined in the training_args section above
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # adds 'accuracy' to the reported metrics
)

In [None]:
# Train the model (fine-tunes on `train_dataset` using `training_args`)
trainer.train()

In [None]:
# Evaluate the model on the tokenized test split, overriding the validation
# split that was passed to the Trainer as `eval_dataset`
results = trainer.evaluate(tokenized_datasets['test'])
print(results)