In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"
import wandb
wandb.init(mode="disabled")



In [3]:
import numpy as np
import pandas as pd
import nltk, torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, DataCollatorForSeq2Seq
from transformers import IntervalStrategy, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, load_metric, DatasetDict
import evaluate

2024-11-23 17:21:24.075979: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-23 17:21:24.076122: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-23 17:21:24.219282: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Read Data

In [6]:
train_df = pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")[:700]
val_df = pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/validation.csv")[:100]
test_df = pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv")[:100]

In [7]:
train_df['highlights'][0]

'Bishop John Folda, of North Dakota, is taking time off after being diagnosed .\nHe contracted the infection through contaminated food in Italy .\nChurch members in Fargo, Grand Forks and Jamestown could have been exposed .'

In [8]:
train_df['article'][0]

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

In [9]:
train_df.shape

(700, 3)

## Configurations

In [10]:
#Model name
checkpoint = "google-t5/t5-small"
#Model training configs
batch_size = 2
#tokens configs
max_input_length = 512
max_target_length = 128
## select device
device = "cuda" if torch.cuda.is_available() else "cpu"

In [11]:
device

'cuda'

## Prepare Data

In [12]:
train_dataset = Dataset.from_pandas(train_df[['article', 'highlights']])
val_dataset = Dataset.from_pandas(val_df[['article', 'highlights']])

In [13]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Tokenize the data

In [14]:
def tokenize_the_batch(batch, tokenizer, max_input_length, max_output_length):
    x_train, y_train = batch['article'], batch['highlights']
    x_tokenizer = tokenizer(x_train, padding="longest", truncation=True, max_length= max_input_length)
    y_tokenizer = tokenizer(y_train, padding="longest", truncation=True, max_length= max_input_length)
    labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in sequences]
        for sequences in y_tokenizer.input_ids
    ]
    return {
        "input_ids": x_tokenizer.input_ids,
        "attention_mask": x_tokenizer.attention_mask,
        "labels": labels
    }

In [15]:
train_dataset = train_dataset.map(lambda batch: tokenize_the_batch(batch, tokenizer, max_input_length, 
                                                   max_target_length),
                  batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(lambda batch: tokenize_the_batch(batch, tokenizer, max_input_length, 
                                                   max_target_length),
                  batched=True, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## Model Building

### Evaluation Metric Method.

In [50]:
from rouge_score.scoring import AggregateScore


In [58]:
def calculate_metrics(eval_preds):
    preds, labels = eval_preds
    preds = preds[0]
    metric = load_metric("rouge",trust_remote_code=True)
    decoded_preds = tokenizer.batch_decode(np.argmax(preds, axis=1), skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    results = metric.compute(predictions=decoded_preds, references= decoded_labels, use_stemmer=True)
    formatted_results = {}
    for key, value in results.items():
        if isinstance(value, AggregateScore):  # Handle rouge score objects
            formatted_results[key] = {
                "low": value.low.fmeasure,
                "mid": value.mid.fmeasure,
                "high": value.high.fmeasure,
            }
        else:
            formatted_results[key] = value
    
    return formatted_results

### Define Training Arguments

In [18]:
!export WANDB_MODE="disabled"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
training_args = TrainingArguments(
   output_dir="model_t5",
    num_train_epochs=7,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_dir="t5_logs",
    load_best_model_at_end=True,
    evaluation_strategy='steps'
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### Setup Model

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [59]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=calculate_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [60]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
500,0.5468,2.754663,"{'low': 0.00123602384885198, 'mid': 0.0014006019529148447, 'high': 0.0015699750668151395}","{'low': 4.321146785837204e-05, 'mid': 6.251580526259079e-05, 'high': 8.295863959458576e-05}","{'low': 0.0010802876273331649, 'mid': 0.0012070416469904953, 'high': 0.0013389210752098458}","{'low': 0.0010831702171062714, 'mid': 0.0012085466947378129, 'high': 0.0013505888805136774}"
1000,1.432,2.241818,"{'low': 0.0012327147842396285, 'mid': 0.0014029602753873834, 'high': 0.0015973511837506847}","{'low': 4.124332286668053e-05, 'mid': 6.0269218843356063e-05, 'high': 8.433155827700168e-05}","{'low': 0.0010752980315448374, 'mid': 0.0012128896690657117, 'high': 0.0013434830673446432}","{'low': 0.001086847338846601, 'mid': 0.0012069360651804965, 'high': 0.0013553306680767353}"
1500,1.5737,2.121425,"{'low': 0.0012480500604650528, 'mid': 0.001410814479691715, 'high': 0.001594147072375713}","{'low': 4.194900133542802e-05, 'mid': 6.124253019704144e-05, 'high': 8.462378032608126e-05}","{'low': 0.001097871102654706, 'mid': 0.0012337943321394293, 'high': 0.0013570191554520283}","{'low': 0.001105388911797087, 'mid': 0.0012287257636633391, 'high': 0.0013758090284608482}"
2000,1.5066,2.11369,"{'low': 0.0012432816453841178, 'mid': 0.001409401426700475, 'high': 0.001598860893847056}","{'low': 4.597258524733764e-05, 'mid': 6.593164406005207e-05, 'high': 8.977068871331949e-05}","{'low': 0.0010877402025906392, 'mid': 0.001221488258222399, 'high': 0.0013443958449343932}","{'low': 0.001097361635941964, 'mid': 0.0012172564880698663, 'high': 0.0013617666935130377}"


Trainer is attempting to log a value of "{'low': 0.00123602384885198, 'mid': 0.0014006019529148447, 'high': 0.0015699750668151395}" of type <class 'dict'> for key "eval/rouge1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'low': 4.321146785837204e-05, 'mid': 6.251580526259079e-05, 'high': 8.295863959458576e-05}" of type <class 'dict'> for key "eval/rouge2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'low': 0.0010802876273331649, 'mid': 0.0012070416469904953, 'high': 0.0013389210752098458}" of type <class 'dict'> for key "eval/rougeL" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'low': 0.0010831702171062714, 'mid': 0.0012085466947378129, 'high': 0.0013505888805136774}" of t

TrainOutput(global_step=2450, training_loss=1.3020902423469387, metrics={'train_runtime': 739.8932, 'train_samples_per_second': 6.623, 'train_steps_per_second': 3.311, 'total_flos': 663174827212800.0, 'train_loss': 1.3020902423469387, 'epoch': 7.0})

In [61]:
import os

In [70]:
os.listdir("./")

['model_t5', 'model', '.virtual_documents', 't5_logs']

In [71]:
os.makedirs(
    "./model", exist_ok=True
)

In [72]:
trainer.save_model("./model/")

In [73]:
os.listdir("./model")

['tokenizer.json',
 'tokenizer_config.json',
 'spiece.model',
 'model.safetensors',
 'generation_config.json',
 'training_args.bin',
 'config.json',
 'special_tokens_map.json']

In [75]:
trainer.push_to_hub("zeeshanakram992/news-suumarization-t5Small",
                   token="")

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/zeeshanakram992/model_t5/commit/2ebb27c8032e1076f02f5ee21c2e1dab8ec7365f', commit_message='zeeshanakram992/news-suumarization-t5Small', commit_description='', oid='2ebb27c8032e1076f02f5ee21c2e1dab8ec7365f', pr_url=None, pr_revision=None, pr_num=None)