<a href="https://colab.research.google.com/github/vlassner/dsml_4220_project/blob/main/dsml4220_prj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning Project
By Victoria Lassner
DSML 4220

**Goal**: Fine-tune a model for abstractive summarization.

**Model:** T5-Base with its Tokenizer

Websites: https://huggingface.co/docs/transformers/tasks/summarization

**Future Models to Compare:**

https://wandb.ai/mostafaibrahim17/ml-articles/reports/Fine-Tuning-LLaMa-2-for-Text-Summarization--Vmlldzo2NjA1OTAy

https://wandb.ai/mostafaibrahim17/ml-articles/reports/Crafting-Superior-Summaries-The-ChatGPT-Fine-Tuning-Guide--Vmlldzo1Njc5NDI1

**Definitions:**

Abstractive summarization = producing a concise summary of a text by understanding its meaning and creating new sentences, rather than simply extracting phrases from the original text.

*****
**Dataset:**
CNN/DailyMail: https://paperswithcode.com/dataset/cnn-daily-mail-1


In [None]:
# Switch off the Weights & Biases integration through its environment flag.
import os
os.environ.update(WANDB_DISABLED="true")

In [None]:
# downloads packages for model, dataset, rouge and tokenzier
# --Quiet limits output of messages
!pip install transformers datasets evaluate sentencepiece rouge_score --quiet

In [None]:
!pip install --upgrade huggingface-hub

from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Import packages
from datasets import load_dataset, concatenate_datasets
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer, T5Tokenizer
import torch
from torch.utils.data import DataLoader
import torch

In [None]:
# Load the CNN/DailyMail dataset (config "3.0.0") and draw a fixed-size random
# subset of the train and validation splits.

train_sample_limit = 5000
val_sample_limit = 2000

# Fixed shuffle seed: without it every run samples a different subset, so
# losses and ROUGE scores cannot be compared between runs.
shuffle_seed = 42

dataset = load_dataset("cnn_dailymail", "3.0.0")

# min(...) keeps select() in range if a limit is larger than the split.
train_data = dataset["train"].shuffle(seed=shuffle_seed).select(
    range(min(train_sample_limit, len(dataset["train"])))
)
val_data = dataset["validation"].shuffle(seed=shuffle_seed).select(
    range(min(val_sample_limit, len(dataset["validation"])))
)


In [None]:
# Tokenizer and preprocessing settings.

# Hub checkpoint name; the tokenizer is loaded from it here.
model_T5 = "vlassner01/t5_cnn_model_base_v4"

# Token budgets: articles are cut/padded to 512 tokens, summaries to 250.
max_input_length, max_target_length = 512, 250

# Number of examples per slice when the data is preprocessed in chunks.
chunk_size = 1000

tokenizer = T5Tokenizer.from_pretrained(model_T5)


# Tokenize a batch of articles and reference summaries with the T5 tokenizer.
def preprocess(examples):
    """Convert a batch of raw examples into model-ready features.

    Args:
        examples: Batch mapping with an "article" list (source texts) and a
            "highlights" list (reference summaries).

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with an
        added "labels" entry holding the tokenized summaries in which every
        padding position is replaced by -100.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    targets = examples["highlights"]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    # text_target= tokenizes the summaries as targets; it replaces the
    # deprecated tokenizer.as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length"
    )

    # Replace pad tokens with -100 so they are ignored in the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(-100 if token_id == pad_id else token_id) for token_id in label_seq]
        for label_seq in labels["input_ids"]
    ]
    return model_inputs

# Preprocess a dataset slice by slice, then stitch the slices back together.
def process_in_chunks(dataset, chunk_size, preprocess_fn):
    """Apply ``preprocess_fn`` to ``dataset`` in consecutive slices.

    Args:
        dataset: Object supporting ``len()`` and ``select(range)``; each
            selected slice must support ``map``.
        chunk_size: Maximum number of rows per slice.
        preprocess_fn: Batched mapping function handed to ``map``.

    Returns:
        The result of ``concatenate_datasets`` over the mapped slices, kept
        in their original order. The "article", "highlights" and "id"
        columns are removed by ``map``.
    """
    n_rows = len(dataset)

    def _map_slice(start):
        # Rows [start, stop) of the dataset, with stop capped at the end.
        stop = min(start + chunk_size, n_rows)
        return dataset.select(range(start, stop)).map(
            preprocess_fn,
            batched=True,
            remove_columns=["article", "highlights", "id"]
        )

    return concatenate_datasets(
        [_map_slice(start) for start in range(0, n_rows, chunk_size)]
    )

# Tokenize both splits chunk by chunk: training first, then validation.
train_dataset = process_in_chunks(
    dataset=train_data,
    chunk_size=chunk_size,
    preprocess_fn=preprocess,
)
val_dataset = process_in_chunks(
    dataset=val_data,
    chunk_size=chunk_size,
    preprocess_fn=preprocess,
)

In [None]:
#Adds ROUGE to evaluate model
import evaluate
import numpy as np

# ROUGE scorer from the `evaluate` library, used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_preds: Pair ``(preds, labels)``. ``preds`` may be a tuple whose
            first item holds the predictions, a 3-D array of logits, or an
            array of token ids. ``labels`` is an array of token ids in which
            -100 marks ignored positions.

    Returns:
        Dict mapping each key returned by ROUGE to its score multiplied by
        100 and rounded to two decimals.
    """
    preds, labels = eval_preds

    # Some model outputs arrive as a tuple; the predictions are the first item.
    if isinstance(preds, tuple):
        preds = preds[0]

    # A 3-D array holds logits: take the most likely token at each position.
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    pad_id = tokenizer.pad_token_id

    # Negative ids (such as the -100 ignore value) are not real tokens. Map
    # them to the pad token explicitly instead of clipping them to 0, which
    # is only correct when the pad token id happens to be 0.
    preds = np.where(preds < 0, pad_id, preds)
    # Keep ids inside the tokenizer's vocabulary before decoding.
    preds = np.minimum(preds, tokenizer.vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Restore pad tokens where the labels were masked with -100.
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v * 100, 2) for k, v in result.items()}

In [None]:
# Load the checkpoint named by model_T5 (a previously fine-tuned T5 model on
# the Hub, not the stock t5-base weights).
model = T5ForConditionalGeneration.from_pretrained(model_T5)

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_cnn_model_base",
    # eval_steps only takes effect with a step-based evaluation strategy.
    # (Older transformers releases call this argument `evaluation_strategy`.)
    eval_strategy="steps",
    eval_steps=500,                   #evaluates every 500 steps
    per_device_train_batch_size=4,    #batch size for training
    per_device_eval_batch_size=4,     #batch size for evaluation
    predict_with_generate=True,       #decoding
    generation_max_length=128,        #max tokens for generated sequences during eval/prediction
    logging_steps=100,                #reports loss every 100 steps
    save_steps=1000,                  #saves model every 1000 steps
    num_train_epochs=3,
    report_to="none",                 #no external loggers (e.g. Weights & Biases)
    fp16=torch.cuda.is_available()    #mixed precision only when a CUDA GPU is present
)


In [None]:
# Seq2seq batch collator: pads the sequences in a batch to a common length.
from transformers import DataCollatorForSeq2Seq

# The tokenizer is the collator's first argument; the model is passed by keyword.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Fine-tune the model with Hugging Face's Seq2SeqTrainer.
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    # Hand over the DataCollatorForSeq2Seq instance built for this model;
    # without this argument that collator object is never used.
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

In [None]:
# Run a final evaluation pass with the trainer and print the returned metrics.
# The trainer was built with compute_metrics, so the output presumably
# includes the ROUGE scores alongside the evaluation loss.
metrics = trainer.evaluate()
print(metrics)

In [None]:
# Write the current model weights and the tokenizer files to one local folder.
local_model_dir = "/content/t5_cnn_model_base_v4"

model.save_pretrained(local_model_dir)
tokenizer.save_pretrained(local_model_dir)

In [None]:
# Publish the model and tokenizer to the Hugging Face Hub so the web app can
# load them by repository name.
from huggingface_hub import HfApi, HfFolder
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Target Hub repository, shared by the model and the tokenizer.
hub_repo_id = "vlassner01/t5_cnn_model_base_v4"

model.push_to_hub(hub_repo_id, commit_message="Version_4")
tokenizer.push_to_hub(hub_repo_id)

In [None]:
# ChatGPT-suggested workaround for spiece.model not uploading to Hugging Face.
# Not needed for retraining.


# !mkdir -p /content/hf_tokenizer_upload
# !cp -r /content/t5_cnn_model_base_v3/* /content/hf_tokenizer_upload/

# from huggingface_hub import upload_file

# repo_name = "vlassner01/t5_cnn_model_base_v4"

# folder_path = '/content/hf_tokenizer_upload'

# upload_file(
#     path_or_fileobj=f"{folder_path}/spiece.model",  # Replace with actual file path
#     path_in_repo="spiece.model",  # Path in the Hugging Face repo
#     repo_id=repo_name,
#     commit_message="Upload spiece.model"
# )

# upload_file(
#     path_or_fileobj=f"{folder_path}/tokenizer_config.json",  # Replace with actual file path
#     path_in_repo="tokenizer_config.json",  # Path in the Hugging Face repo
#     repo_id=repo_name,
#     commit_message="Upload tokenizer_config.json"
# )

# upload_file(
#     path_or_fileobj=f"{folder_path}/special_tokens_map.json",  # Replace with actual file path
#     path_in_repo="special_tokens_map.json",  # Path in the Hugging Face repo
#     repo_id=repo_name,
#     commit_message="Upload special_tokens_map.json"
# )

# upload_file(
#     path_or_fileobj=f"{folder_path}/tokenizer.json",  # Replace with actual file path
#     path_in_repo="tokenizer.json",  # Path in the Hugging Face repo
#     repo_id=repo_name,
#     commit_message="Upload tokenizer.json"
# )