# Load Imports

In [1]:
!pip install "deeplake<4"
!pip install --upgrade transformers 
!pip install nltk
!pip install torchvision



In [9]:
import torch
import torch.nn as nn

import numpy as np

import deeplake
import pandas as pd
import matplotlib.pyplot as plt

from torchvision import transforms
from transformers import BertTokenizer, BertModel
import nltk
import random

from torch.utils.data import Dataset
from PIL import Image
from torch.utils.data import DataLoader, random_split

# Data Creation 

- Sandbox Cat Dataset 
- Process image : Normalizing and resizing 
- Process captions : Tokenize caption/ pad 
- Bind it in a Dataset Class
- Create Train_Dataloader, Test_Dataloader, Eval_Dataloader... 

In [108]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

SSLError: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1129)')))"), '(Request ID: d5a285fa-ff66-44c8-be22-b9af38a78126)')

In [109]:
caption_max_length = 20
vocab_size = len(tokenizer)

In [110]:
def process_image(image):
  """
  Image transformation functions
  Resizes images to 224x224, converts to tensor, and normalizes
  The normalization parameters are mean, SD of of r, g, b pixel values
  """
  image_transform = transforms.Compose([
      transforms.Resize((5, 5)),  #### CHANGES
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
  ])
  image = image.numpy()
  image = Image.fromarray(image)
  image = image_transform(image)
  return image


def process_caption(caption_text):
  """
  Tokenizes captions
  Returns tensor of tokenized captions, with a max length of 20,
  padding until this length, and truncating if the caption is longer
  The individual entries in the tensor are integers
  """

  tokenized = tokenizer(caption_text,
                        max_length=caption_max_length,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt')

  input_ids = tokenized['input_ids']
  # attention_mask = tokenized['attention_mask']
  return input_ids

def untokenize(tokenized_text):
    """
    Untokenizes a tensor
    Takes a tensor of integers, and according to tokenizer, returns
    original words
    """
    return tokenizer.decode(tokenized_text, skip_special_tokens=True)


In [91]:
class Flickr30kDataset(Dataset):
    def __init__(self, images, captions, image_transform=None, caption_transform = None):
      """
      Initializes dataset
      """
      self.images = images
      self.captions = captions
      self.image_transform = image_transform
      self.caption_transform = caption_transform

    def __len__(self):
      """
      Returns length of the dataset
      """
      return len(self.captions)

    def __getitem__(self, idx):
      """
      Returns a single item (image, and caption processed according to
      caption_transform) from the dataset
      """
      image = self.images[idx]
      image = self.image_transform(image)

      caption = self.captions[idx].numpy()[0]
      caption_embedding = self.caption_transform(caption)
      return image, caption_embedding

In [92]:
#################### CAT DATASET ########################

# Example image and captions

# Custom Tensor class to simulate the original dataset structure
class CustomTensor:
    def __init__(self, key, data):
        self.key = key
        self.data = data

    def numpy(self):
        if isinstance(self.data, str):
            return np.array([self.data], dtype=object)
        return np.array(self.data)

# Create sandbox dataset
image_raw = [
    "./sandbox_dataset/cat.jpg", 
    "./sandbox_dataset/cat.jpg", 
    "./sandbox_dataset/cat2.jpg", 
    "./sandbox_dataset/cat3.jpg"
]

captions = [
    "this is a cat",
    "this cat has fur",
    "this is a furry cat",
    "this is a cat not a dog",
]

# Format images and captions into tensors
images = [CustomTensor("image", np.array(Image.open(img).convert("RGB"))) for img in image_raw]
captions = [CustomTensor("caption_0", cap) for cap in captions]

# Initialize the dataset
cat_dataset = Flickr30kDataset(images, captions, image_transform=process_image, caption_transform=process_caption)
cat_train_dataloader = DataLoader(cat_dataset, batch_size=2, shuffle=False)


# Model Creation (Transformer) 
Encoder: 
1. Feature Extraction using CNN (outputs: )

Decoder: 

2. Word Embedding Layer - Token and Positional Embedding 

3. Masked Self Attention 

4. Cross Attention 

5. FFNN  

6. A liner layer + Softmax 

## Feature Extraction

In [115]:
class CNN_feature_extraction(nn.Module):
  def __init__(self):
    """
    Initializes convolutional neural network
    """
    super(CNN_feature_extraction, self).__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1) # (224,224,64)
    self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)# (224,224,128)
    self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)# (224,224,256)

    self.relu = nn.ReLU()
    self.fc = nn.Linear(256 * 1 * 1, 512)  #### CHANGES 256 * 28 * 28 --> 256 * 1 * 1 
    self.fc2 = nn.Linear(512, 4) ##### CHANGES (512, 256) -> (512, 4) 
    self.flatten = nn.Flatten()

    self.maxPool = nn.MaxPool2d(2, 2)

    self.batchNorm1 = nn.BatchNorm2d(64)
    self.batchNorm2 = nn.BatchNorm2d(128)
    self.batchNorm3 = nn.BatchNorm2d(256)

  def forward(self, x):
    """
    Forward pass of convolutional neural network
    Note: Output is already flattened
    """
    layer1 = self.maxPool(self.relu(self.batchNorm1(self.conv1(x)))) # batch_size, 64, 112, 112 (b, c , h, w )
    layer2 = self.maxPool(self.relu(self.batchNorm2(self.conv2(layer1)))) # batch_size, 128, 56, 56
    # layer3 = self.maxPool(self.relu(self.batchNorm3(self.conv3(layer2)))) # batch_size, 256, 28, 28 ### CHANGES this is original
    layer3 = self.relu(self.batchNorm3(self.conv3(layer2)))
    x1 = self.flatten(layer3)  # 1, 200704 = 256*28*28
    print(layer1.shape)
    print(layer2.shape)
    print(layer3.shape)
    print(x1.shape)
    hidden = self.relu(self.fc(x1))
    x = self.fc2(hidden) # 1, 4
    print(x.shape) # batch, 4 
    return x

In [116]:
data_iter = iter(cat_train_dataloader)
batch = next(data_iter)
images, labels = batch

cnn= CNN_feature_extraction()
cnn(images)

torch.Size([2, 64, 2, 2])
torch.Size([2, 128, 1, 1])
torch.Size([2, 256, 1, 1])
torch.Size([2, 256])
torch.Size([2, 4])


tensor([[-0.0195, -0.0527,  0.0143,  0.0231],
        [-0.0195, -0.0527,  0.0143,  0.0231]], grad_fn=<AddmmBackward0>)

## Embedding Layer 

In [117]:
## Hyperparameter
embed_dim = 4 ##### CHANGES Embed_dim = 4

In [118]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, max_length, dim):
        """
        Combines token and positional embeddings to produce sequence embeddings for Transformers.
        Input:
          vocab_size: size of vocab. Used for token embedding
          max_length: max length of caption. Used for positional embedding
          dim: Dimension of embedding vectors. Dimension of both token and positional Embedding
        """

        super().__init__()
        self.pos_embedding = nn.Embedding(num_embeddings=max_length, embedding_dim=dim) # TODO removes .to(device)
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, padding_idx=0) #.to(device) # ie padding_idx that = 0 will not be trained

        self.dropout = nn.Dropout(0.1)

    def forward(self, caption):
        """
          Input: Caption (batch, max_length = 20)
          Output: Embedding (batch, max_length, dim)
        """
       
        batch_size, max_length = caption.shape
        token_embed = self.token_embedding(caption) 

        positional_indices = torch.arange(max_length, device=caption.device).unsqueeze(0) # 1, 20
        position_embed = self.pos_embedding(positional_indices)
        print("position embed shape 1, seq dim" , position_embed.shape) # 1, seq, dim 

        return self.dropout(token_embed + position_embed) ### CHANGES --> added self.dropout 

## Decoder 

In [96]:
## Hyperparameters
num_heads = 1
dropout =  0.1

self attention

In [97]:
class SelfAttention(nn.Module):
    """
    Self-Attention mechanism for token embeddings. Capture dependencies between words.

    Inputs:
        embed_dim: Embedding dimension = embed_dim
        num_heads: The number of attention heads. Each head learns different aspects of the relationships between tokens.
        dropout: Dropout rate used to prevent overfitting
    """
    def __init__(self,  num_heads = 1, embed_dim = embed_dim, dropout=0.1):
        super(SelfAttention, self).__init__()

        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout) #.to(device)
        self.layernorm = nn.LayerNorm(embed_dim) #.to(device)

    def forward(self, x_token, debug = False):
        """
        Takes in the input from embedding --> 
        input shape : (batch, max_length, dim)
        output shape : (batch, max_length, dim)
        """
        x_token = x_token.transpose(0, 1) # transpose to (seqlen, batch, embed)
        seq_len = x_token.size(0) 
        # assert seq_len == 20 

        # causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(x_token.device) 
        causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).to(x_token.device)  ### CHANGES - added Casual Mask (impt)
        causal_mask = causal_mask.masked_fill(causal_mask == 1, float("-inf"))


        attn_output, attn_output_weights = self.attention(query=x_token, key=x_token, value=x_token , attn_mask=causal_mask)
        attn_output_weights = torch.softmax(attn_output_weights, dim=-1) ### CHANGES


        x = x_token + attn_output  #Residual connection prevent vanishing grad
        x = self.layernorm(x)
        
        # if debug: 
        #     # help to print out the informations 
        #     print("SelfAttention Weights: \n")
        #     print("self attention weights shape is ", attn_output_weights.shape)
        #     print(attn_output_weights)
            
        return x.transpose(0,1)


cross attention

In [98]:
class CrossAttention(nn.Module):
    """
    Cross Attention between caption and image.

    Input:
        Caption Embedding : (batch, seq_length, embed_dim)
        Image Feature Extraction : (batch, 256)
    """

    def __init__(self, embed_dim, num_heads=1, dropout=0.1):
        super(CrossAttention, self).__init__()
        self.mha = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.layernorm = nn.LayerNorm(embed_dim)

    def forward(self, caption, image, debug = False):
        batch_size_c, seq_length, embed_dim = caption.shape
        batch_size_i, image_embed = image.shape
        assert batch_size_c == batch_size_i, "Batch Dimension of image and caption does not match"
        # assert image_embed == 256, "Image Shape Incorrect"
        assert image_embed == embed_dim , "Image dimension does not match Token dimension"
        # create the same image for seq_length
        image_broadcasted = image.unsqueeze(1).repeat(1, seq_length, 1) # batch, seq_length, image_dim ## ORIGINAL
        attn_output, attention_scores = self.mha(query=caption, key=image_broadcasted, value=image_broadcasted)
        caption = caption + attn_output  # Residual connection
        self.last_attention_scores = attention_scores
        if debug: 
            # help to print out the informations 
            print("Cross Attention Attention Weights: \n")
            print("cross attention weights shape is ", attention_scores.shape)
            print(attention_scores)
        return self.layernorm(caption)  #(batch, max_length, image_dim)

ffnn

In [99]:
class FeedForward(nn.Module):
    """
    Initializes feed forward layer, which passed data along to the 
    next layer
    """
    def __init__(self, embed_dim, dropout_rate=0.1):
        super(FeedForward, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(embed_dim, 2 * embed_dim),
            nn.ReLU(),
            nn.Linear(2 * embed_dim, embed_dim),
            nn.Dropout(dropout_rate)
        )
        self.layernorm = nn.LayerNorm(embed_dim)

    def forward(self, x):
        x = x + self.layer(x)
        return self.layernorm(x)


combine

In [100]:
class DecoderLayer(nn.Module):
    """
    Initializes full decoder layer, including self/cross attention layers
    and feed forward network
    """
    def __init__(self, embed_dim, num_heads=1, dropout=0.1):
        super(DecoderLayer, self).__init__()

        self.self_attention = SelfAttention(num_heads, embed_dim, dropout)
        self.cross_attention = CrossAttention(embed_dim, num_heads, dropout)
        self.ffnn = FeedForward(embed_dim, dropout)

    """
    Runs forward pass of the deocder layer
    """
    def forward(self, inputs):
        image, caption = inputs
        caption_1 = self.self_attention(caption, debug = True)
        output_seq = self.cross_attention(caption_1, image, debug = True)
        self.last_attention_scores = self.cross_attention.last_attention_scores
        
        out_seq = self.ffnn(output_seq)

        return out_seq # batch, seq_length, embed_dim

## PostProcessing

In [101]:
import collections
from tqdm import tqdm

In [174]:
class PostProcessing(nn.Module):
    """
    Initializes post processing layer, which biases against some tokens
    and incorporates frequency information about the text
    """
    def __init__(self, embed_dim, tokenizer, banned_tokens=('[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]', 'the', 'a', 'in')):

        super(PostProcessing, self).__init__()
        self.vocab_size = tokenizer.vocab_size

        self.tokenizer = tokenizer
        self.banned_tokens = banned_tokens

        ## DIFFERENCE 
        self.fc = nn.Linear(embed_dim, out_features=tokenizer.vocab_size) ## Goal is to map to vocab size # batch, seq_length, vocab_size

        self.bias = torch.zeros(self.vocab_size)

    def adapt(self, dataset):
        """
            vocab_dict: a dictionary to map vocab word to assigned index (from 0 to vocab_size)
            counts : a dictionary holding frequency of each word(token), key: id value :freq
        """
        ## Get a counter for each vocab in the token
        counts = collections.Counter()
        vocab_dict = {name: id for id, name in enumerate(self.tokenizer.vocab)} # map name to id

        for _, tokens in tqdm(dataset):
            counts.update(tokens.cpu().numpy().flatten())  # Update freq for token in the sentence

        # Turn Frequency into an array of all vocabulary
        # fill vocab array with freq
        # array indices = token_id
        counts_arr = np.zeros(shape=(self.vocab_size,))
        indices = np.array(list(counts.keys()), dtype=np.int32)
        counts_arr[indices] = list(counts.values())

        counts_arr = counts_arr[:]
        for token in self.banned_tokens:
            id = vocab_dict[token]
            counts_arr[id] = 0

        total = counts_arr.sum()
        p = counts_arr / total
        p[counts_arr == 0] = 1.0  # Prevent log(0)
        log_p = np.log(p)  # log(1) == 0

        entropy = -(log_p * p).sum()

        self.bias = torch.tensor(log_p, dtype=torch.float32, device=device)
        self.bias[counts_arr == 0] = -1e9  # Set large negative values for banned tokens

    def forward(self, x):
        """
        input : x batch, seqlength, embed dim 
        """
        # input = input.float()
        x = self.fc(x) 
        x = x + self.bias.to(x.device)  # Ensure bias is on the same device as the input
        return x 

In [175]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [176]:
post_processor= PostProcessing(embed_dim, tokenizer)
post_processor.adapt(cat_dataset) ## EXAMPLE 

100%|██████████| 4/4 [00:00<00:00, 162.67it/s]


In [177]:
banned_tokens=('[UNK]', '[PAD]', '[SEP]', '[CLS]', '[MASK]', 'the', 'a', 'in')

class ImageCaption(nn.Module):
    """
    Combines all portions to initialize entire image captioner
    """
    def __init__(self, tokenizer, vocab_size, train_dataset, num_layers=1, embed_dim=256, max_length=20, num_heads=2, dropout=0.1):
        """
        train_dataset: the dataset not the dataloader
        """
        super(ImageCaption, self).__init__()

        self.feature_extractor_model = CNN_feature_extraction()
        self.embedding = Embedding(vocab_size , max_length, embed_dim)
        self.decoder_layer = DecoderLayer(embed_dim)

        self.post_processing_model = PostProcessing(embed_dim, tokenizer, banned_tokens = banned_tokens)
        self.post_processing_model.adapt(train_dataset)

     

        self.tokenizer = tokenizer

        vocab = tokenizer.vocab
        self.word_to_index = {word: idx for idx, word in enumerate(vocab)}
        self.index_to_word = {idx: word for idx, word in enumerate(vocab)}

        # self.decoder_layers = nn.ModuleList([
        #     DecoderLayer(embed_dim)
        #     for _ in range(num_layers)
        # ])


    def forward(self, inputs):
        """ 
        Expected input shape 
        image: batch, 3, 224, 224 
        caption: batch, maxlength
        """
        image, caption = inputs
        # print("######### STARTING MODEL ###########")
        # print("image is of shape", image.shape)
        # print("caption is of shape", caption.shape)

        # print("##########  IMAGE EXTRACTION  ###############")
        # Step 1: Extract Features
        extracted_image = self.feature_extractor_model(image)
        # print("Post image Extraction shape:", extracted_image.shape)


        # Step 2: Embed the tokens
        # print("##########  CAPTION EMBEDDING  ###############")
        tokens = self.embedding(caption)
        # print("Post Embedding shape: ", tokens.shape)

        # Step 3: Decode
        # print("##########  DECODING LAYER  ###############")
        token_output = self.decoder_layer((extracted_image, tokens))
        # print("token output shape, " , token_output.shape)
        attention_scores = self.decoder_layer.last_attention_scores

        # Debug Prints
        # print("\n==== Debug Information ====")
        # print(f"Attention Scores Shape: {attention_scores.shape}")

        # print(f"Attention Scores: \n{attention_scores}")


        output = self.post_processing_model(token_output)

        ## DO WE NEED a 4th layer of linear  to output to match target 
        

        return output

In [178]:
model = ImageCaption(tokenizer, vocab_size, cat_dataset, embed_dim = 4 ) 
# model.index_to_word[2]


100%|██████████| 4/4 [00:00<00:00, 362.84it/s]


# Train LOOP

In [184]:
import torch
import torch.nn as nn
import torch.optim as optim

# Initialize model, tokenizer, and other components
# model = ImageCaption(tokenizer, vocab_size=tokenizer.vocab_size, train_dataset=cat_dataset)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training Loop
def train_model(model, dataloader, criterion, optimizer, num_epochs=10, device="cuda"):
    model.train()
    model.to(device)

    for epoch in range(num_epochs):
        total_loss = 0
        
        for batch_idx, (images, target_captions) in enumerate(dataloader):

            images = images.to(device)  # Shape: (batch_size, 3, 224, 224)
            batch_size,_,_,_ = images.shape
            target_captions = target_captions.to(device) #.long()  # Shape: (batch_size, max_length)
            target_captions = target_captions.squeeze(1)
            # Full Teacher Forcing: Use entire target captions as decoder input
            decoder_input = target_captions[:, :-1]  # Remove last token (e.g., [SEP] or EOS)
            targets = target_captions[:, 1:]  # Remove first token (e.g., [CLS] or SOS)
            
            pad_column = torch.full((batch_size, 1), tokenizer.pad_token_id, dtype=torch.long)
            decoder_input = torch.cat((decoder_input, pad_column), dim=1)
            targets = torch.cat((targets, pad_column), dim=1)

            # Forward Pass
            outputs = model((images, decoder_input))  # Shape: (batch_size, max_length - 1, vocab_size)



            # Compute Loss
            print("in train loop, outpus shape", outputs.shape)
            print(" PRDEICTED , ", outputs.view(-1, tokenizer.vocab_size))
            print( " expected, " , targets.view(-1))
            print("predicted shape", outputs.view(-1, tokenizer.vocab_size).shape, outputs.shape)
            print("SHAPE< of expected,", targets.view(-1).shape, targets.shape) # batch * 20 

            preds = outputs.view(-1, tokenizer.vocab_size) # 2* 20, cvocabsize
            target = targets.view(-1) # 2* 20 -- index 
            # then the prediction should be index of the argmax 
            prediction = [int(torch.argmax(batch)) for batch in preds] 

            pred_word = [model.index_to_word[i] for i in prediction]
            print("prediction", prediction)
            
            print("pred word", pred_word)
            # loss = criterion(outputs.view(-1, tokenizer.vocab_size), targets.view(-1))
            print("TARGET", target)
            loss = criterion(torch.tensor(prediction, dtype = float), torch.tensor(target))
            total_loss += loss.item()
            print("LoSs ITEM = ", loss.item())

            # Backpropagation and Optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print Progress
            if batch_idx % 10 == 0:
                print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx}/{len(dataloader)}], Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch [{epoch+1}/{num_epochs}] Completed - Average Loss: {avg_loss:.4f}")

# Example: Train the model
train_model(model, cat_train_dataloader, criterion, optimizer, num_epochs=10, device="cpu")


torch.Size([2, 64, 2, 2])
torch.Size([2, 128, 1, 1])
torch.Size([2, 256, 1, 1])
torch.Size([2, 256])
torch.Size([2, 4])
position embed shape 1, seq dim torch.Size([1, 20, 4])
Cross Attention Attention Weights: 

cross attention weights shape is  torch.Size([20, 2, 2])
tensor([[[0.5556, 0.0000],
         [0.5556, 0.5556]],

        [[0.0000, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.0000, 0.5556],
         [0.0000, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

        [[0.5556, 0.5556],
         [0.5556, 0.5556]],

   

  loss = criterion(torch.tensor(prediction, dtype = float), torch.tensor(target))


RuntimeError: Expected floating point type for target with class probabilities, got Long

In [182]:
vocab_size

30522

In [None]:
a = torch.tensor([[11,3,4,10,3],[2,3,4,4,10]])
a

tensor([[11.,  3.,  4., 10.,  3.],
        [ 2.,  3.,  4.,  4., 10.]], dtype=torch.float64)

In [170]:
[(torch.argmax(batch, dtype = float)) for batch in a] 

TypeError: argmax() got an unexpected keyword argument 'dtype'