In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/d/iitm21f1000641/logicloom/LABELLED_DEV.csv
/kaggle/input/d/iitm21f1000641/logicloom/LABELLED_TRAIN.csv


In [2]:
!pip install transformers accelerate evaluate rouge_score



In [3]:
from datasets import load_dataset
train = load_dataset("csv",data_files="/kaggle/input/d/iitm21f1000641/logicloom/LABELLED_TRAIN.csv")

In [4]:
print(type(train))

<class 'datasets.dataset_dict.DatasetDict'>


In [5]:
eval = load_dataset("csv",data_files="/kaggle/input/d/iitm21f1000641/logicloom/LABELLED_DEV.csv")

In [6]:
train_dataset = train.filter(lambda example: example["News Article"] is not None and example["Caption"] is not None)

In [7]:
eval_dataset = eval.filter(lambda example: example["News Article"] is not None and example["Caption"] is not None)

In [8]:
import re

def clean_text(text):
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)  # Remove special characters
    return text.strip()

In [9]:
def normalize_whitespace(text):
    return " ".join(text.split())
def to_lowercase(text):
    return text.lower()
def preprocess_text(text):
    text = clean_text(text)
    text = normalize_whitespace(text)
    text = to_lowercase(text)  # Optional, based on use case
    return text


In [10]:


def preprocess_function(examples):
    inputs = [preprocess_text(text) for text in examples["News Article"]]
    targets = [preprocess_text(text) for text in examples["Caption"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
def preprocess_function(examples):
    inputs = examples["News Article"]
    targets = examples["Caption"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True,padding="max_length",
                             return_tensors="pt")
    labels = tokenizer(targets, max_length=128, truncation=True,padding="max_length",
                      return_tensors="pt")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [12]:

tokenized_dataset_eval = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer,DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    logging_dir='./logs',
)

In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    tokenizer=tokenizer,
)
# trainer.train()

In [19]:
from tqdm import tqdm
import torch
from torch.optim import Adam

# Initialize the optimizer
optimizer = Adam(model.parameters(), lr=5e-5)  # Adjust learning rate as needed

# Use the dataset already tokenized and passed to the trainer
# train_dataset = tokenized_dataset_train

# Initialize the data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

# Create DataLoader
train_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset_train["train"],
    batch_size=8,  # Adjust batch size as needed
    collate_fn=lambda batch: {
        key: torch.tensor([item[key] for item in batch], dtype=torch.long) 
        if isinstance(batch[0].get(key), list) else batch[0][key] 
        for key in batch[0]
    }
)
val_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset_eval["train"],
    batch_size=8,  # Adjust batch size as needed
    collate_fn=lambda batch: {
        key: torch.tensor([item[key] for item in batch], dtype=torch.long) 
        if isinstance(batch[0].get(key), list) else batch[0][key] 
        for key in batch[0]
    }
)

model = trainer.model
model.train()


MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [20]:
print(type(tokenized_dataset_train))
tokenized_dataset_train


<class 'datasets.dataset_dict.DatasetDict'>


DatasetDict({
    train: Dataset({
        features: ['ID', 'News Article', 'Caption', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [18]:
print(type(tokenized_dataset_eval))
tokenized_dataset_eval

<class 'datasets.dataset_dict.DatasetDict'>


DatasetDict({
    train: Dataset({
        features: ['ID', 'News Article', 'Caption', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [23]:
# Initialize the best validation loss to a large number
best_val_loss = float('inf')

# Custom training loop
for epoch in range(int(training_args.num_train_epochs)):
    print(f"Epoch {epoch + 1}/{int(training_args.num_train_epochs)}")
    epoch_loss = 0  # To track the epoch loss
    
    for batch in tqdm(train_dataloader):
        # Move data to the appropriate device (GPU/CPU)
        batch = {k: v.to(training_args.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
      
        # Remove any keys that are not needed by the model
        batch = {k: v for k, v in batch.items() if k in ["input_ids", "attention_mask", "labels"]}
   
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backward pass
        loss.backward()

        # Optimizer step
        optimizer.step()
        optimizer.zero_grad()

    # Print training loss after each epoch
    print(f"Training loss after epoch {epoch + 1}: {epoch_loss / len(train_dataloader)}")
    
    # Now calculate validation loss after each epoch
    model.eval()  # Set model to evaluation mode
    val_loss = 0
    with torch.no_grad():  # Disable gradients for validation to save memory
        for batch in val_dataloader:
            batch = {k: v.to(training_args.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}  

            batch = {k: v for k, v in batch.items() if k in ["input_ids", "attention_mask", "labels"]}

            outputs = model(**batch)
            loss = outputs.loss
            val_loss += loss.item()

    # Print validation loss after each epoch
    print(f"Validation loss after epoch {epoch + 1}: {val_loss / len(val_dataloader)}")
    
    # Check if the current validation loss is better (lower) than the best so far
    if val_loss / len(val_dataloader) < best_val_loss:
        best_val_loss = val_loss / len(val_dataloader)
        print(f"Validation loss decreased to {best_val_loss}. Saving model...")

        # Save the model's state_dict and tokenizer
        torch.save(model.state_dict(), "best_model.pth")  # Save the model state_dict
        tokenizer.save_pretrained("best_model")  # Save the tokenizer

        # Optionally, you can also save the optimizer if you want to resume training later
        torch.save(optimizer.state_dict(), "best_model_optimizer.pt")
        print("Best model saved.")

Epoch 1/3


100%|██████████| 375/375 [04:06<00:00,  1.52it/s]


Training loss after epoch 1: 0.9971842927932739
Validation loss after epoch 1: 0.9833337903022766
Validation loss decreased to 0.9833337903022766. Saving model...
Best model saved.
Epoch 2/3


100%|██████████| 375/375 [04:06<00:00,  1.52it/s]


Training loss after epoch 2: 0.8326277588208516
Validation loss after epoch 2: 0.9113809800148011
Validation loss decreased to 0.9113809800148011. Saving model...
Best model saved.
Epoch 3/3


100%|██████████| 375/375 [04:06<00:00,  1.52it/s]


Training loss after epoch 3: 0.8502487783432007
Validation loss after epoch 3: 0.9155899205207825


In [None]:
print(train_dataset[0])  # Replace with your dataset object
