In [None]:
#!pip install transformers torch torchvision pytorch_lightning sentencepiece
#!pip uninstall pytorch-lightning -y
#!pip install pytorch-lightning==2.0.4
## Install this for gradient checkpointing
#!pip install git+https://github.com/PyTorchLightning/pytorch-lightning.git@master --upgrade

In [None]:
import json
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.optim import AdamW # Newer way to do it, but havent tried it yet
from transformers import T5Tokenizer, T5ForConditionalGeneration#, AdamW
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from pytorch_lightning import LightningModule
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ProgressBar
import os

In [None]:
import torch

nltk.download('stopwords')
nltk.download('punkt')

class Process_Dataset(Dataset):
    def __init__(self, dataframe):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-small',model_max_length=2000)
        self.dataframe = dataframe

    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(
        self.dataframe.iloc[idx]['keywords'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
        )
        labels = self.tokenizer.encode_plus(
        self.dataframe.iloc[idx]['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
        )['input_ids']
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

    def __len__(self):
        return len(self.dataframe)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from torch.utils.checkpoint import checkpoint

class New_T5_Trainer(LightningModule):
    def __init__(self, train_dataloader, val_dataloader):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')
        self._train_dataloader = train_dataloader
        self._val_dataloader = val_dataloader

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, attention_mask=attention_mask, labels=labels)


        # Use the checkpoint function to enable gradient checkpointing
        return checkpoint(forward_function, input_ids, attention_mask, labels)

    def training_step(self, batch, batch_idx):
        print(next(self.model.parameters()).device)
        if batch_idx == 0:  # Print only for the first batch of each epoch
            print(f"batch: {batch}")
        outputs = self(**batch)
        print(outputs.loss.grad_fn)
        loss = outputs.loss
        print(f"Epoch: {self.current_epoch}, Batch: {batch_idx}, Loss: {loss.item()}")
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return {"loss": loss, "checkpoint_on": loss}

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']
        with torch.no_grad():
            outputs = self.model.generate(input_ids, attention_mask=attention_mask, max_length=512)
        val_loss = self.model(input_ids, attention_mask=attention_mask, labels=labels).loss
        self.log('val_loss', val_loss, on_step=True, on_epoch=True, prog_bar=True)
        return {"val_loss": val_loss}

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=1e-5)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader






In [None]:
from google.colab import drive
import os

gdrive_dir = '/content/gdrive/'

drive.mount(gdrive_dir, force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Load data and preprocess
with open(os.path.join(gdrive_dir, 'My Drive', 'coco', 'annotations', 'captions_train2017.json')) as t:
    train_raw = json.load(t)
with open(os.path.join(gdrive_dir, 'My Drive', 'coco', 'annotations','captions_val2017.json')) as v:
    test_raw = json.load(v)

caption_data = train_raw['annotations']
train_df = pd.DataFrame(caption_data)

caption_data = test_raw['annotations']
test_df = pd.DataFrame(caption_data)

captions_train = train_df['caption']
captions_test = test_df['caption']

stop_words = stopwords.words('english')
keywords = []
for caption in captions_train:
    words = word_tokenize(caption)
    words = [word for word in words if word.lower() not in stop_words]
    words = [word for word in words if word.isalpha()]
    keywords.append(' '.join(words))

final_df = pd.DataFrame({'keywords': keywords, 'text': captions_train})

train_df, val_df = train_test_split(final_df, test_size=0.2, random_state=42)

train_dataset = Process_Dataset(train_df)
val_dataset = Process_Dataset(val_df)

# it takes about 1 min 5 sec to finish preprocessing

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
final_df

Unnamed: 0,keywords,text
0,bicycle replica clock front wheel,A bicycle replica with a clock as the front wh...
1,room blue walls white sink door,A room with blue walls and a white sink and door.
2,car seems parked illegally behind legally park...,A car that seems to be parked illegally behind...
3,large passenger airplane flying air,A large passenger airplane flying through the ...
4,GOL plane taking partly cloudy sky,There is a GOL plane taking off in a partly cl...
...,...,...
591748,slice bread covered sour cream quacamole,a slice of bread is covered with a sour cream ...
591749,long plate hold fries sliders next,A long plate hold some fries with some sliders...
591750,Two women sit pose stuffed animals,Two women sit and pose with stuffed animals.
591751,White Plate lot guacamole extra large dollop s...,White Plate with a lot of guacamole and an ext...


In [None]:
train_df

Unnamed: 0,keywords,text
374373,Zebras seen eating hay large stall,Zebras are seen eating hay in a large stall.
72296,Luggage piled lobby area two people sit bench ...,Luggage is piled up in a lobby area while two ...
176162,Flowers inside vase top counter,Flowers inside a vase on top of a counter.
234195,Delta jet airplane runway airport,A Delta jet airplane on the runway at an airport.
214993,empty room toilet large window wooden floors,an empty room with a toilet a large window and...
...,...,...
110268,couple bears walking grassy area,A couple of bears walking down a grassy area.
259178,plane flying high air blue sky,a plane flying high in the air below a blue sky
365838,person knelt hand bag umbrella rainy sidewalk,A person knelt down with a hand bag and umbrel...
131932,woman sitting restaurant slice pizza hand,A woman sitting in a restaurant with a slice o...


In [None]:
val_df

Unnamed: 0,keywords,text
264066,Two young men sitting bench lady standing next,Two young men sitting on a bench and a lady st...
31867,Seagulls flight person feeding one lighthouse ...,Seagulls in flight with a person feeding one w...
279034,Three spotted furry animals gazing field,Three spotted furry animals gazing in a field.
258963,white truck lamb sitting flatbed,A white truck with a lamb sitting on it's flat...
91524,plate pasta meat broccoli together,A plate of pasta with meat and broccoli together
...,...,...
58368,Cows pasture near lake mountains distance,Cows in a pasture near a lake with mountains i...
409896,pile vegetables display grocery store,A pile of vegetables on display at a grocery s...
385930,person city street catching frisbee,a person on a city street catching a frisbee
20976,couple snugging together wooden bench,A couple snugging together on a wooden bench.


In [None]:


train_dataloader = DataLoader(train_dataset, batch_size=40, shuffle=True, num_workers=100)
val_dataloader = DataLoader(val_dataset, batch_size=24, num_workers=256)



In [None]:
#import torch
from pytorch_lightning import Trainer

# Create a progress bar that shows progress during sanity checking.
trainer = Trainer(max_epochs=3, devices=1, precision=16, gradient_clip_val=1.0, accumulate_grad_batches=16)


  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
model = New_T5_Trainer(train_dataloader,val_dataloader)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<NllLossBackward0 object at 0x7f6288912e00>
Epoch: 0, Batch: 10169, Loss: 0.053735844790935516
cuda:0
<NllLossBackward0 object at 0x7f628075bf40>
Epoch: 0, Batch: 10170, Loss: 0.0533110573887825
cuda:0
<NllLossBackward0 object at 0x7f6406b0eef0>
Epoch: 0, Batch: 10171, Loss: 0.05130619928240776
cuda:0
<NllLossBackward0 object at 0x7f6406b0dc60>
Epoch: 0, Batch: 10172, Loss: 0.059489745646715164
cuda:0
<NllLossBackward0 object at 0x7f6406b0d3f0>
Epoch: 0, Batch: 10173, Loss: 0.06255593150854111
cuda:0
<NllLossBackward0 object at 0x7f6406b0ec50>
Epoch: 0, Batch: 10174, Loss: 0.05605607479810715
cuda:0
<NllLossBackward0 object at 0x7f6406b0f640>
Epoch: 0, Batch: 10175, Loss: 0.054695647209882736
cuda:0
<NllLossBackward0 object at 0x7f6406b0e290>
Epoch: 0, Batch: 10176, Loss: 0.054339874535799026
cuda:0
<NllLossBackward0 object at 0x7f6406b0da50>
Epoch: 0, Batch: 10177, Loss: 0.05909845978021622
cuda:0
<NllLossBackward0 objec

Validation: 0it [00:00, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<NllLossBackward0 object at 0x7f640681e4a0>
Epoch: 1, Batch: 10169, Loss: 0.026173729449510574
cuda:0
<NllLossBackward0 object at 0x7f640681e3e0>
Epoch: 1, Batch: 10170, Loss: 0.028144728392362595
cuda:0
<NllLossBackward0 object at 0x7f640681efe0>
Epoch: 1, Batch: 10171, Loss: 0.024133743718266487
cuda:0
<NllLossBackward0 object at 0x7f640681eb30>
Epoch: 1, Batch: 10172, Loss: 0.028260868042707443
cuda:0
<NllLossBackward0 object at 0x7f640681f160>
Epoch: 1, Batch: 10173, Loss: 0.027299493551254272
cuda:0
<NllLossBackward0 object at 0x7f640681db40>
Epoch: 1, Batch: 10174, Loss: 0.02591698430478573
cuda:0
<NllLossBackward0 object at 0x7f640681fac0>
Epoch: 1, Batch: 10175, Loss: 0.026711037382483482
cuda:0
<NllLossBackward0 object at 0x7f640681db10>
Epoch: 1, Batch: 10176, Loss: 0.028267350047826767
cuda:0
<NllLossBackward0 object at 0x7f640681fc70>
Epoch: 1, Batch: 10177, Loss: 0.028857961297035217
cuda:0
<NllLossBackward0 

Validation: 0it [00:00, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<NllLossBackward0 object at 0x7f6406a53e80>
Epoch: 2, Batch: 10169, Loss: 0.02734476700425148
cuda:0
<NllLossBackward0 object at 0x7f62899ded40>
Epoch: 2, Batch: 10170, Loss: 0.022494319826364517
cuda:0
<NllLossBackward0 object at 0x7f6406a52e90>
Epoch: 2, Batch: 10171, Loss: 0.024216875433921814
cuda:0
<NllLossBackward0 object at 0x7f64063a6b30>
Epoch: 2, Batch: 10172, Loss: 0.026141613721847534
cuda:0
<NllLossBackward0 object at 0x7f64063a6b30>
Epoch: 2, Batch: 10173, Loss: 0.029082676395773888
cuda:0
<NllLossBackward0 object at 0x7f64064959c0>
Epoch: 2, Batch: 10174, Loss: 0.02211102284491062
cuda:0
<NllLossBackward0 object at 0x7f64063a68c0>
Epoch: 2, Batch: 10175, Loss: 0.02743302844464779
cuda:0
<NllLossBackward0 object at 0x7f6406af44f0>
Epoch: 2, Batch: 10176, Loss: 0.022035421803593636
cuda:0
<NllLossBackward0 object at 0x7f6406af6200>
Epoch: 2, Batch: 10177, Loss: 0.023307302966713905
cuda:0
<NllLossBackward0 ob

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
# Save the model weights to file
torch.save(model.state_dict(), "t5_coco.pt")


In [None]:
state_dict = torch.load("t5_coco.pt")
model = New_T5_Trainer(train_dataloader, val_dataloader)  # initialize The model
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
# Generate Captions function
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')

def generate_caption(model, input_keywords):
    # Prepare the input data
    input_text = ' '.join(input_keywords)
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate output from the model
    with torch.no_grad():
        output = model.generate(input_ids)

    # Decode the output tokens to text
    output_text = tokenizer.decode(output[0])
    return output_text


In [None]:
# Load the trained model
model = New_T5_Trainer()
model.load_state_dict(torch.load("t5_coco.pt"))
model.to('cuda')  # Move the model to GPU

New_T5_Trainer(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_featu

In [None]:
class New_T5_Trainer(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids, input_ids if labels is None else labels, attention_mask=attention_mask, labels=labels)


    def generate(self, input_ids, decoder_input_ids=None, **kwargs):
        return self.model.generate(input_ids, decoder_input_ids=decoder_input_ids, **kwargs)






In [None]:
def generate_caption(model, input_keywords):
    # Prepare the input data
    input_text = 'generate caption: ' + ' '.join(input_keywords)
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Detect the device of the model
    device = next(model.parameters()).device

    # Move the input tensors to the same device as the model
    input_ids = input_ids.to(device)

    # Generate output from the model
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=80,
            min_length=20,
            num_beams=5,
            temperature=0.7,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            early_stopping=True,
        )

    # Decode the output tokens to text
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text




In [None]:
# Instantiate the model
model_instance = New_T5_Trainer()

# Load the model weights from the saved file
model_instance.load_state_dict(torch.load('t5_coco.pt'))

# Move the model to GPU if available
model_instance.to('cuda')

# use loaded model to generate captions
input_keywords = ['Zebras', 'seen', 'eating', 'hay', 'large', 'stall']
caption = generate_caption(model_instance, input_keywords)
print("Generated Caption:", caption)


Generated Caption: A caption: Zebras seen eating hay in a large stall..


In [None]:
# t5_coco.py Module For Streamlit
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import nn
import torch

class New_T5_Trainer(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained('t5-small')

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def generate(self, input_ids, decoder_input_ids=None, **kwargs):
        return self.model.generate(input_ids, decoder_input_ids=decoder_input_ids, **kwargs)

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = New_T5_Trainer()
model.load_state_dict(torch.load('t5_coco.pt'))  # load the weights

def generate_caption(model, input_keywords):
    # Prepare the input data
    input_text = 'generate caption: ' + ' '.join(input_keywords)
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Detect the device of the model
    device = next(model.parameters()).device

    # Move the input tensors to the same device as the model
    input_ids = input_ids.to(device)

    # Generate output from the model
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=80,
            min_length=20,
            num_beams=5,
            temperature=0.7,
            no_repeat_ngram_size=2,
            do_sample=True,
            top_k=50,
            early_stopping=True,
        )

    # Decode the output tokens to text
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text


    # Instantiate the model
model_instance = New_T5_Trainer()

# Load the model weights from the saved file
model_instance.load_state_dict(torch.load('t5_coco.pt'))

# Move the model to GPU if available
model_instance.to('cuda')

# use loaded model to generate captions
input_keywords = ['Zebras', 'seen', 'eating', 'hay', 'large', 'stall']
caption = generate_caption(model_instance, input_keywords)
print("Generated Caption:", caption)



Find the Metrics

In [None]:
class Process_Dataset(Dataset):
    def __init__(self, dataframe):
        self.tokenizer = T5Tokenizer.from_pretrained('t5-small',model_max_length=2000)
        self.dataframe = dataframe

    def __getitem__(self, idx):
        inputs = self.tokenizer.encode_plus(
        self.dataframe.iloc[idx]['keywords'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
        )
        labels = self.tokenizer.encode_plus(
        self.dataframe.iloc[idx]['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
        )['input_ids']
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels.flatten(),
            'lm_labels': labels.flatten()  # Return the true labels
        }

    def __len__(self):
        return len(self.dataframe)


In [None]:
# Metrics for Performance:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from pytorch_lightning import LightningModule

# Instantiate dataloaders with updated configurations
train_dataloader = DataLoader(train_dataset, batch_size=40, shuffle=True, num_workers=100)
val_dataloader = DataLoader(val_dataset, batch_size=24, num_workers=256)

# Instantiate the model
model = New_T5_Trainer()

# Load the state dict
state_dict = torch.load("t5_coco.pt")

# Load the state dict into the model
model.load_state_dict(state_dict)

# Move the model to the desired device
model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Create an instance of the Process_Dataset class with the test data
test_dataset = Process_Dataset(final_df)

# Create a DataLoader for the test data
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=4)

# Make sure model is in evaluation mode
model.eval()

# We'll store the true and predicted labels here
true_labels = []
pred_labels = []

# Perform inference on test data
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to('cuda')
        attention_mask = batch['attention_mask'].to('cuda')
        lm_labels = batch['lm_labels'].to('cuda')  # Extract the true labels from the batch

        # We won't use 'labels' for the forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        true_labels.extend(lm_labels.tolist())
        pred_labels.extend(preds.tolist())


# Now we can calculate metrics
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels, average='macro')
recall = recall_score(true_labels, pred_labels, average='macro')
f1 = f1_score(true_labels, pred_labels, average='macro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")


ValueError: ignored