In [1]:
!pip install datasets rouge evaluate transformers wandb nltk rouge_score



In [2]:
import os
import numpy as np
import pandas as pd
import torch
import evaluate
import nltk
import rouge
import wandb

from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from sklearn.metrics import accuracy_score, f1_score
from nltk.tokenize import sent_tokenize
from torch.utils.data import DataLoader

In [3]:
# Download necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Set up Weights & Biases for tracking experiments (optional but recommended)
wandb.init(project="multiple-dataset-fine-tuning")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msyedanida-khader[0m ([33msyedanida-khader-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [6]:
# Set seeds for reproducibility
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

In [7]:
# Load model and tokenizer
# Using T5 as it can handle multiple tasks in a seq2seq format
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Load datasets
# 1. CNN/DailyMail for summarization
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

In [9]:
# 2. GLUE SST-2 for sentiment classification
sst2_dataset = load_dataset("glue", "sst2")

print("Datasets loaded successfully!")
print(f"CNN/DailyMail - Train: {len(cnn_dataset['train'])}, Validation: {len(cnn_dataset['validation'])}, Test: {len(cnn_dataset['test'])}")
print(f"SST-2 - Train: {len(sst2_dataset['train'])}, Validation: {len(sst2_dataset['validation'])}, Test: {len(sst2_dataset['test'])}")

Datasets loaded successfully!
CNN/DailyMail - Train: 287113, Validation: 13368, Test: 11490
SST-2 - Train: 67349, Validation: 872, Test: 1821


In [13]:
# Preprocess the datasets
cnn_processed = cnn_dataset.map(preprocess_cnn_dailymail, batched=True)  # Preprocess CNN/DailyMail
sst2_processed = sst2_dataset.map(preprocess_sst2, batched=True)      # Preprocess SST-2

cnn_sample_size = min(len(cnn_processed["train"]), 2000)
sst2_sample_size = min(len(sst2_processed["train"]), 2000)

cnn_train_subset = cnn_processed["train"].shuffle(seed=42).select(range(cnn_sample_size))
sst2_train_subset = sst2_processed["train"].shuffle(seed=42).select(range(sst2_sample_size))

# Combine datasets for training
combined_train = concatenate_datasets([cnn_train_subset, sst2_train_subset])
combined_val = concatenate_datasets([
    cnn_processed["validation"].shuffle(seed=42).select(range(min(len(cnn_processed["validation"]), 500))),
    sst2_processed["validation"].shuffle(seed=42).select(range(min(len(sst2_processed["validation"]), 500)))
])

print(f"Combined training set size: {len(combined_train)}")
print(f"Combined validation set size: {len(combined_val)}")

# Create a custom data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="max_length",
    max_length=512
)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Combined training set size: 4000
Combined validation set size: 1000


In [14]:
# Function to preprocess CNN/DailyMail for summarization
def preprocess_cnn_dailymail(examples):
    # Add task prefix to distinguish this as a summarization task
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=256, truncation=True)

    # Tokenize targets (summaries)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # Add task type identifier
    model_inputs["task_type"] = ["summarization"] * len(inputs)

    return model_inputs

In [15]:
# Function to preprocess SST-2 for sentiment classification - optimized version
def preprocess_sst2(examples):
    batch_size = len(examples["sentence"])

    # Add task prefix in a more efficient way
    inputs = [f"classify sentiment: {sentence}" for sentence in examples["sentence"]]

    # Tokenize inputs - use padding=False to avoid unnecessary padding during preprocessing
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding=False  # Change from "max_length" to False
    )

    # Simplify label conversion
    text_labels = ["negative" if label == 0 else "positive" for label in examples["label"]]

    # Tokenize targets with padding=False
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            text_labels,
            max_length=8,
            truncation=True,
            padding=False  # Change from "max_length" to False
        )

    model_inputs["labels"] = labels["input_ids"]

    # Add task type identifier efficiently
    model_inputs["task_type"] = ["classification"] * batch_size

    return model_inputs

In [17]:
# Set up metrics
rouge = evaluate.load("rouge")
accuracy = evaluate.load("accuracy")

In [18]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels with better error handling
    decoded_preds = []
    try:
        # Convert to int32 and clip values to valid token range
        max_id = tokenizer.vocab_size - 1
        clipped_preds = np.clip(predictions, 0, max_id).astype(np.int32)
        decoded_preds = tokenizer.batch_decode(clipped_preds, skip_special_tokens=True)
    except Exception as e:
        # If batch decoding fails, fall back to individual decoding with safeguards
        for pred in predictions:
            try:
                # Clip values to valid token range
                clipped_pred = np.clip(pred, 0, tokenizer.vocab_size - 1).astype(np.int32)
                decoded_pred = tokenizer.decode(clipped_pred, skip_special_tokens=True)
                decoded_preds.append(decoded_pred)
            except Exception as inner_e:
                # If a prediction can't be decoded, use an empty string
                print(f"Warning: Failed to decode prediction: {inner_e}")
                decoded_preds.append("")

    # Process labels (which are usually more stable)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Clean up predictions and labels
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Rest of the function remains the same...
    classification_preds = []
    classification_labels = []
    summarization_preds = []
    summarization_labels = []

    for pred, label in zip(decoded_preds, decoded_labels):
        if "positive" in label[0] or "negative" in label[0]:
            # This is a classification task
            classification_preds.append(pred)
            classification_labels.append(label[0])
        else:
            # This is a summarization task
            summarization_preds.append(pred)
            summarization_labels.append(label[0])

    # Results dictionary
    results = {}

    # Compute ROUGE for summarization if we have any summarization examples
    if summarization_preds:
        rouge_output = rouge.compute(
            predictions=summarization_preds,
            references=[[label] for label in summarization_labels],
            use_stemmer=True
        )
        results.update({k: v for k, v in rouge_output.items()})

    # Compute classification metrics if we have any classification examples
    if classification_preds:
        # Convert text predictions to binary labels for accuracy
        binary_preds = ["positive" in pred for pred in classification_preds]
        binary_labels = ["positive" in label for label in classification_labels]

        results["classification_accuracy"] = accuracy_score(binary_labels, binary_preds)
        results["classification_f1"] = f1_score(binary_labels, binary_preds, average='binary')

    return results

In [20]:
# Define training arguments with corrected steps
training_args = Seq2SeqTrainingArguments(
    fp16=True,
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=200,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=200,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=2,
    predict_with_generate=True,
    generation_max_length=64,
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1" if len(cnn_processed["validation"]) > 0 else "classification_accuracy",
    push_to_hub=False,
    dataloader_num_workers=4,
    optim="adamw_torch",
    gradient_checkpointing=True,
)

In [21]:
# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=combined_train,
    eval_dataset=combined_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [23]:
# Save the fine-tuned model
model_path = "./fine_tuned_multi_task_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Model saved to ./fine_tuned_multi_task_model


In [24]:
# Test the model on both tasks
def test_model_on_both_tasks(model, tokenizer):
    model.eval()

    # Test summarization
    article = """
    The COVID-19 pandemic has dramatically changed the way we live and work.
    Many companies have shifted to remote work, and schools have adopted
    online learning models. Public health measures including social distancing
    and mask-wearing have become commonplace in many regions. Vaccines were
    developed in record time, but distribution challenges and vaccine hesitancy
    remain obstacles to achieving herd immunity.
    """

    summarization_input = tokenizer("summarize: " + article, return_tensors="pt").to(device)
    summary_ids = model.generate(
        summarization_input["input_ids"],
        max_length=75,
        min_length=30,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Test classification
    review = "The movie was absolutely fantastic with great performances and an engaging storyline."
    classification_input = tokenizer("classify sentiment: " + review, return_tensors="pt").to(device)
    sentiment_ids = model.generate(
        classification_input["input_ids"],
        max_length=10,
        early_stopping=True
    )
    sentiment = tokenizer.decode(sentiment_ids[0], skip_special_tokens=True)

    return {
        "summarization_example": article,
        "generated_summary": summary,
        "classification_example": review,
        "predicted_sentiment": sentiment
    }

# Test the model
test_results = test_model_on_both_tasks(model, tokenizer)
print("\nTest Results:")
print(f"Summarization Example: \n{test_results['summarization_example'][:100]}...")
print(f"Generated Summary: \n{test_results['generated_summary']}")
print(f"\nClassification Example: \n{test_results['classification_example']}")
print(f"Predicted Sentiment: {test_results['predicted_sentiment']}")




Test Results:
Summarization Example: 

    The COVID-19 pandemic has dramatically changed the way we live and work.
    Many companies hav...
Generated Summary: 
the COVID-19 pandemic has dramatically changed the way we live and work. many companies have shifted to remote work - and schools have adopted online learning models.

Classification Example: 
The movie was absolutely fantastic with great performances and an engaging storyline.
Predicted Sentiment: Der Film war absolut fantastig mit tollen
