# NMSU CSCI-5435 Assignment 6 Task 2

## Relevant Information

In [None]:
#Name:               Tianjie Chen
#Email:              tvc5586@nmsu.edu
#File Creation Date: Apr/23/2025
#Purpose of File:    NMSU CSCI-5435 Assignment 6 Task 2
#Last Edit Date:     Apr/23/2025
#Last Edit Note:     File creation
#GenAI used:         False

## Load Libraries

In [1]:
import os
import string

import pandas as pd
import numpy as np
import torch
import nltk
import nltk.data
import evaluate

from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split

## Setup

In [2]:
# Restrict this process to the GPU with index 1; the visible device count
# is printed so the restriction can be checked in the cell output.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

print(torch.cuda.device_count())

# Fall back to the CPU when no CUDA device is available.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

1


In [3]:
# Location of the English/French sentence-pair CSV (relative path).
DATA_PATH = "eng_-french.csv"

# Raw data frame; only its first two columns are used downstream.
df = pd.read_csv(DATA_PATH)

## Preprocessing

In [4]:
# Keep only the first two columns of the frame as a 2-D NumPy array
# (fancy indexing returns a copy, so `df` itself is left untouched).
data = df.to_numpy()[:, [0, 1]]

In [5]:
# Shuffle the rows in place with a fixed seed so that a fresh
# "Restart & Run All" reproduces the same row order (and therefore the
# same train/test split further down).
rng = np.random.default_rng(42)
rng.shuffle(data)
print(data[0])

["Don't say a word to anyone." 'Ne dis mot à personne.']


In [6]:
# Sentence splitters for both languages.
nltk.download("punkt_tab")
tokenizer_en = nltk.data.load("tokenizers/punkt/english.pickle")
tokenizer_fr = nltk.data.load("tokenizers/punkt/french.pickle")

en_list, fr_list = [], []

# Split every pair into sentences; a pair is kept only when both sides
# yield the same number of sentences, so the two lists stay aligned
# position by position.
for en_text, fr_text in data:
    en_sents = tokenizer_en.tokenize(en_text)
    fr_sents = tokenizer_fr.tokenize(fr_text)
    if len(en_sents) != len(fr_sents):
        continue
    en_list.extend(en_sents)
    fr_list.extend(fr_sents)

# Two-column array: English sentence, French sentence.
data = np.column_stack((en_list, fr_list))

[nltk_data] Downloading package punkt_tab to /home/tchen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
# Normalise the text: lower-case, turn hyphens into spaces, then drop
# punctuation. Apostrophes are kept on purpose (they are excluded from
# the removal set below).
data = np.char.lower(data)
data = np.char.replace(data, "-", " ")
# ASCII punctuation (minus the apostrophe) plus the French guillemets.
# The guillemets are stripped from `data` itself; the previous version
# wrote that result to an unrelated variable, so they were never removed.
for x in string.punctuation.replace("'", "") + "«»":
    data = np.char.replace(data, x, "")
data = np.char.strip(data)

In [8]:
# Wrap the cleaned two-column array in a frame with named columns.
data = pd.DataFrame(data, columns=["English", "French"])

In [9]:
# Hold out 20% of the sentence pairs for evaluation; the fixed
# random_state makes the split itself deterministic for a given row order.
train, test = train_test_split(
    data, test_size=0.2, random_state=42
)

In [10]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [11]:
# Task prefix prepended to every English source sentence.
prefix = "translate English to French: "

def preprocess_function(examples, max_length=100):
    """Tokenize English/French sentence pairs into per-example feature dicts.

    Args:
        examples: Mapping-like object (e.g. a DataFrame) exposing an
            "English" and a "French" column of strings.
        max_length: Maximum number of tokens kept for both the source and
            the target; longer sequences are truncated.

    Returns:
        A list with one dict per example holding:
            "text":      the prefixed English source string,
            "input_ids": token ids of the prefixed source,
            "labels":    token ids of the French target.
    """
    inputs = [prefix + doc for doc in examples["English"]]
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    labels = tokenizer(
        text_target=list(examples["French"]), max_length=max_length, truncation=True
    )

    # Read the ids through the "input_ids" key rather than per-example
    # Encoding objects, so this does not depend on a fast tokenizer.
    return [
        {"text": text, "input_ids": source_ids, "labels": target_ids}
        for text, source_ids, target_ids in zip(
            inputs, model_inputs["input_ids"], labels["input_ids"]
        )
    ]

In [12]:
# Tokenize both splits into lists of per-example feature dicts.
tokenized_train = preprocess_function(train)
tokenized_test  = preprocess_function(test)

In [13]:
# Sanity check: show one tokenized training example.
print(tokenized_train[1])

{'text': 'translate English to French: you can only use it once', 'input_ids': [13959, 1566, 12, 2379, 10, 25, 54, 163, 169, 34, 728, 1], 'labels': [3, 17, 76, 3, 29, 15, 3, 16162, 3, 40, 31, 16578, 546, 31, 444, 2529, 1]}


In [14]:
# Collator that batches the per-example feature dicts for seq2seq training.
# NOTE(review): `model` receives the checkpoint name string here, not a
# loaded model object (the model is only created in a later cell) —
# confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Define Metrics

In [15]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds
            the generated token ids.

    Returns:
        Dict of the ROUGE results plus "gen_len" (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple; only the token ids are needed here.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id. It can show
    # up in the predictions as well as the labels when batches of different
    # lengths are gathered, so map it to the pad token on both sides
    # before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Define & Train Model

In [16]:
# Load the pretrained seq2seq model for `checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [17]:
# Training configuration: evaluate once per epoch and generate text during
# evaluation so that compute_metrics receives token ids to decode.
training_args = Seq2SeqTrainingArguments(
    output_dir="NLP_A6",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,  # mixed precision; needs a CUDA device
    push_to_hub=True,  # uploads to the Hugging Face Hub; requires being logged in
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.9551,0.74471,0.6618,0.4803,0.6523,0.6521,12.0749
2,0.8834,0.706512,0.6695,0.4905,0.66,0.6598,12.0851
3,0.8595,0.690237,0.6736,0.4966,0.6643,0.6642,12.0693
4,0.8439,0.685822,0.6751,0.4982,0.6657,0.6656,12.0717


TrainOutput(global_step=4412, training_loss=0.8985115321030535, metrics={'train_runtime': 885.9131, 'train_samples_per_second': 637.354, 'train_steps_per_second': 4.98, 'total_flos': 3922106015023104.0, 'train_loss': 0.8985115321030535, 'epoch': 4.0})

## Application

In [22]:
# Example prompt; it starts with the same task prefix used during training.
text = "translate English to French: I have no idea if this sentence is translated correctly."

In [23]:
# Tokenize the prompt into a PyTorch tensor of ids and move it to the
# selected device in one step.
inputs = tokenizer(text, return_tensors="pt").input_ids.to(device)

In [24]:
# Generate the translation with sampling disabled, capped at 100 new tokens.
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [25]:
# Decode the first generated sequence back to text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

"je n'ai aucune idée si cette phrase est traduite correctement."