# NMSU CSCI-5435 Assignment 6 Task 1

## Relevant Information

In [None]:
#Name:               Tianjie Chen
#Email:              tvc5586@nmsu.edu
#File Creation Date: Apr/23/2025
#Purpose of File:    NMSU CSCI-5435 Assignment 6 Task 1
#Last Edit Date:     Apr/23/2025
#Last Edit Note:     File creation
#GenAI used:         False

## Load Libraries

In [1]:
import os

import pandas as pd
import evaluate
import numpy as np
import torch

from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split

## Setup

In [2]:
# Restrict this process to the GPU with index 1. The variable is set before any
# CUDA call below so that the restriction is in place when CUDA is first queried.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Report how many CUDA devices are visible to this process.
print(torch.cuda.device_count())

# Use the first visible CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

1


In [3]:
# Path of the news-summary CSV, resolved relative to the working directory.
DATA_PATH = "news_summary.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

## Preprocessing

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train, test = train_test_split(df, random_state=42, test_size=0.2)

In [5]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
checkpoint = "t5-small"
# Tokenizer matching that checkpoint; used for preprocessing, metric decoding
# and inference below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize articles and headlines into one training record per example.

    Parameters
    ----------
    examples : mapping or DataFrame
        Must expose a "text" entry (source articles) and a "headlines" entry
        (target summaries), each an iterable of strings of equal length.

    Returns
    -------
    list of dict
        One dict per example with the keys "text" (the prefixed input string),
        "input_ids" (token ids of the input, truncated to 1024 tokens) and
        "labels" (token ids of the headline, truncated to 128 tokens).
    """
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=list(examples["headlines"]), max_length=128, truncation=True)

    # Read the ids through the "input_ids" key rather than integer-indexing the
    # encoding: integer indexing is only available with fast tokenizers,
    # whereas the key lookup works with any tokenizer.
    return [
        {"text": source, "input_ids": input_ids, "labels": label_ids}
        for source, input_ids, label_ids in zip(
            inputs, model_inputs["input_ids"], labels["input_ids"]
        )
    ]

In [7]:
# Run the same preprocessing over both splits (train first, then test).
tokenized_train, tokenized_test = (
    preprocess_function(split) for split in (train, test)
)

In [8]:
# Sanity check: show the first preprocessed training record.
print(tokenized_train[0])

{'text': 'summarize: Hollywood actress Kate Winslet has joined the cast of \'Titanic\' director James Cameron\'s upcoming films in the \'Avatar\' franchise, making this their first venture together after 20 years of Titanic\'s release. Cameron said, "Kate and I have been looking for something to do together...since our collaboration on Titanic...I can\'t wait to see her bring the character of Ronal to life."', 'input_ids': [21603, 10, 8426, 15676, 11845, 4871, 7, 1655, 65, 3311, 8, 4061, 13, 3, 31, 382, 155, 9, 2532, 31, 2090, 2549, 18501, 31, 7, 3, 4685, 4852, 16, 8, 3, 31, 188, 900, 2046, 31, 7884, 6, 492, 48, 70, 166, 6086, 544, 227, 460, 203, 13, 13622, 447, 31, 7, 1576, 5, 18501, 243, 6, 96, 439, 342, 11, 27, 43, 118, 479, 21, 424, 12, 103, 544, 233, 27296, 69, 3561, 30, 13622, 447, 233, 196, 54, 31, 17, 1749, 12, 217, 160, 830, 8, 1848, 13, 10297, 138, 12, 280, 535, 1], 'labels': [11845, 4871, 7, 1655, 12, 161, 28, 13622, 447, 13762, 227, 460, 203, 1]}


In [9]:
# Collator that pads inputs and labels per batch.
# The `model` argument is intentionally omitted: it expects a model object (used
# only to call `prepare_decoder_input_ids_from_labels`), and the checkpoint
# name string passed previously has no such method, so it had no effect.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

## Define Metrics

In [10]:
# Load the ROUGE metric via the `evaluate` library; used by compute_metrics.
rouge = evaluate.load("rouge")

In [11]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)``. ``predictions`` is an array of generated
        token ids (or a tuple whose first element is that array); ``labels`` is
        an array of reference token ids in which ignored positions are -100.

    Returns
    -------
    dict
        The values returned by ``rouge.compute`` plus ``"gen_len"`` (mean number
        of non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is the ignore index and not a real token id, so it must be replaced
    # before decoding. Predictions are cleaned as well as labels because padded
    # prediction batches can also contain -100.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, counting only non-padding positions.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Define & Train Model

In [12]:
# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
# Hyperparameters for fine-tuning. Evaluation runs once per epoch and uses
# generation (predict_with_generate) so compute_metrics receives generated ids.
training_args = Seq2SeqTrainingArguments(
    output_dir="NLP_A6",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # so the notebook still runs when no GPU is available.
    fp16=torch.cuda.is_available(),
    # NOTE: requires being logged in to the Hugging Face Hub.
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `tokenizer=` is deprecated on the trainer; `processing_class` is its
    # replacement and receives the same object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.2533,1.699746,0.4904,0.2574,0.4476,0.4477,15.5858
2,1.9219,1.614432,0.5027,0.2685,0.4605,0.4606,15.6704
3,1.8494,1.581562,0.5078,0.2723,0.4652,0.4653,15.6825
4,1.8225,1.573169,0.5087,0.2733,0.4662,0.4663,15.6762


TrainOutput(global_step=2460, training_loss=1.9315718239885036, metrics={'train_runtime': 914.2379, 'train_samples_per_second': 344.418, 'train_steps_per_second': 2.691, 'total_flos': 1.0530844066185216e+16, 'train_loss': 1.9315718239885036, 'epoch': 4.0})

## Application

In [14]:
# Sample passage for trying out the trained model; it already starts with the
# "summarize: " task prefix used during preprocessing.
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [15]:
# Tokenize the passage into a PyTorch tensor of input ids and move it to the
# selected device in a single step.
inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

In [16]:
# Generate a summary of at most 100 new tokens with sampling disabled.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [17]:
# Decode the first generated sequence back to text, dropping special tokens;
# as the cell's last expression, the resulting string is shown as its output.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Inflation Reduction Act lowers prescription drug costs, health care costs'