<a href="https://colab.research.google.com/github/tareq056/Visual-Commonsense-Reasoning-in-Bangla-Text/blob/main/Resnet50_with_ROI_ALLIGN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.parallel
from torchvision.models import resnet


**Flattener**

In [2]:
class Flattener(torch.nn.Module):
    def __init__(self):
        """
        Flattens last 3 dimensions to make it only batch size, -1
        """
        super(Flattener, self).__init__()

    def forward(self, x):
        return x.view(x.size(0), -1)

In [3]:
!pip install torch torchvision --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from torchvision.ops import roi_align

In [5]:
import torch.utils.model_zoo as model_zoo

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os
USE_IMAGENET_PRETRAINED = True # otherwise use detectron, but that doesnt seem to work?!?

In [8]:
import os
VCR_IMAGES_DIR = os.path.join('/content/drive/MyDrive/BERT/VCR', 'vcr1images')

In [9]:
VCR_IMAGES_DIR

'/content/drive/MyDrive/BERT/VCR/vcr1images'

In [10]:
VCR_ANNOTS_DIR = os.path.join('/content/drive/MyDrive/BERT/VCR', 'vcr1annots')

In [11]:
from torch.nn import functional as F

**Pad_Sequence**

In [12]:
def pad_sequence(sequence, lengths):
    """
    :param sequence: [\sum b, .....] sequence
    :param lengths: [b1, b2, b3...] that sum to \sum b
    :return: [len(lengths), maxlen(b), .....] tensor
    """
    output = sequence.new_zeros(len(lengths), max(lengths), *sequence.shape[1:])
    start = 0
    for i, diff in enumerate(lengths):
        if diff > 0:
            output[i, :diff] = sequence[start:(start + diff)]
        start += diff
    return output


In [13]:
import torch
import torchvision.models as models
import torch.hub as hub

**Load_Resnet**

In [14]:
def _load_resnet(pretrained=True):
    # huge thx to https://github.com/ruotianluo/pytorch-faster-rcnn/blob/master/lib/nets/resnet_v1.py
    backbone = models.resnet50(pretrained=False)
    if pretrained:
        state_dict = hub.load_state_dict_from_url(
            'https://s3.us-west-2.amazonaws.com/ai2-rowanz/resnet50-e13db6895d81.th',
            progress=True
        )
        backbone.load_state_dict(state_dict)
    for i in range(2, 4):
        getattr(backbone, 'layer%d' % i)[0].conv1.stride = (2, 2)
        getattr(backbone, 'layer%d' % i)[0].conv2.stride = (1, 1)
    return backbone

_load_resnet()

Downloading: "https://s3.us-west-2.amazonaws.com/ai2-rowanz/resnet50-e13db6895d81.th" to /root/.cache/torch/hub/checkpoints/resnet50-e13db6895d81.th
100%|██████████| 97.7M/97.7M [00:02<00:00, 36.1MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

**load_resnet_imagenet**

In [15]:
def _load_resnet_imagenet(pretrained=True):
    # huge thx to https://github.com/ruotianluo/pytorch-faster-rcnn/blob/master/lib/nets/resnet_v1.py
    backbone = resnet.resnet50(pretrained=pretrained)
    for i in range(2, 4):
        getattr(backbone, 'layer%d' % i)[0].conv1.stride = (2, 2)
        getattr(backbone, 'layer%d' % i)[0].conv2.stride = (1, 1)
    # use stride 1 for the last conv4 layer (same as tf-faster-rcnn)
    backbone.layer4[0].conv2.stride = (1, 1)
    backbone.layer4[0].downsample[0].stride = (1, 1)

    # # Make batchnorm more sensible
    # for submodule in backbone.modules():
    #     if isinstance(submodule, torch.nn.BatchNorm2d):
    #         submodule.momentum = 0.01

    return backbone
_load_resnet_imagenet()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 156MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

**Simple Detector**

In [25]:
import torch
import torchvision.ops as ops

class SimpleDetector(nn.Module):
    def __init__(self, pretrained=True, average_pool=True, semantic=True, final_dim=1024):
        """
        :param average_pool: whether or not to average pool the representations
        :param pretrained: Whether we need to load from scratch
        :param semantic: Whether or not we want to introduce the mask and the class label early on (default Yes)
        """
        super(SimpleDetector, self).__init__()
        # huge thx to https://github.com/ruotianluo/pytorch-faster-rcnn/blob/master/lib/nets/resnet_v1.py
        backbone = _load_resnet_imagenet(pretrained=pretrained) if USE_IMAGENET_PRETRAINED else _load_resnet(
            pretrained=pretrained)

        self.backbone = nn.Sequential(
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
            backbone.layer2,
            backbone.layer3,
            # backbone.layer4
        )
        self.roi_align = ops.RoIAlign((7, 7) if USE_IMAGENET_PRETRAINED else (14, 14),
                                  spatial_scale=1 / 16, sampling_ratio=0)

        if semantic:
            self.mask_dims = 32
            self.object_embed = torch.nn.Embedding(num_embeddings=81, embedding_dim=128)
            self.mask_upsample = torch.nn.Conv2d(1, self.mask_dims, kernel_size=3,
                                                  stride=1,
                                                  padding=1, bias=True)

        else:
            self.object_embed = None
            self.mask_upsample = None

        after_roi_align = [backbone.layer4]
        self.final_dim = final_dim
        if average_pool:
            after_roi_align += [nn.AvgPool2d(7, stride=1), Flattener()]

        self.after_roi_align = torch.nn.Sequential(*after_roi_align)
        print("1211213232142353467568769785653412498776543235687654321458765432")
        self.obj_downsample = torch.nn.Sequential(
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(2048 + (128 if semantic else 0), final_dim),
            torch.nn.ReLU(inplace=True),
        )
        self.regularizing_predictor = torch.nn.Linear(2048, 81)

    def forward(self,
                images: torch.Tensor,
                boxes: torch.Tensor,
                box_mask: torch.LongTensor,
                classes: torch.Tensor = None,
                segms: torch.Tensor = None,
                ):
        """
        :param images: [batch_size, 3, im_height, im_width]
        :param boxes:  [batch_size, max_num_objects, 4] Padded boxes
        :param box_mask: [batch_size, max_num_objects] Mask for whether or not each box is OK
        :return: object reps [batch_size, max_num_objects, dim]
        """
        # [batch_size, 2048, im_height // 32, im_width // 32
       
        img_feats = self.backbone(images)
        box_inds = box_mask.nonzero()
        assert box_inds.shape[0] > 0
        rois = torch.cat((
            box_inds[:, 0, None].type(boxes.dtype),
            boxes[box_inds[:, 0], box_inds[:, 1]],
        ), 1)
        print(rois)
        # Object class and segmentation representations
        roi_align_res = self.roi_align(img_feats, rois.float())  # cast rois to float
        if self.mask_upsample is not None:
            assert segms is not None
            segms_indexed = segms[box_inds[:, 0], None, box_inds[:, 1]] - 0.5
            print("ASASASASASASASSASASASASSASASASASASASASASASSASASASASAS")
            #roi_align_res[:, :self.mask_dims] += self.mask_upsample(segms_indexed)
        

        post_roialign = self.after_roi_align(roi_align_res)

        # Add some regularization, encouraging the model to keep giving decent enough predictions
        obj_logits = self.regularizing_predictor(post_roialign)
        obj_labels = classes[box_inds[:, 0], box_inds[:, 1]]
        cnn_regularization = F.cross_entropy(obj_logits, obj_labels, size_average=True)[None]

        feats_to_downsample = post_roialign if self.object_embed is None else torch.cat((post_roialign, self.object_embed(obj_labels)), -1)
        roi_aligned_feats = self.obj_downsample(feats_to_downsample)

        # Reshape into a padded sequence - this is expensive and annoying but easier to implement and debug...
        obj_reps = pad_sequence(roi_aligned_feats, box_mask.sum(1).tolist())
        return {
            'obj_reps_raw': post_roialign,
            'obj_reps': obj_reps,
            'obj_logits': obj_logits,
            'obj_labels': obj_labels,
            'cnn_regularization_loss': cnn_regularization
        }



In [17]:
import torchvision

In [18]:
print(torchvision.__version__)

0.15.1+cu118


In [19]:
!pip install --upgrade torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
!pip install torch torchvision --upgrade


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
# Example input data
batch_size = 2
im_height = 224
im_width = 224
max_num_objects = 5

images = torch.randn(batch_size, 3, im_height, im_width)
boxes = torch.randn(batch_size, max_num_objects, 4)
box_mask = torch.ones(batch_size, max_num_objects, dtype=torch.long)
classes = torch.randint(0, 80, (batch_size, max_num_objects))
segms = torch.randn(batch_size, max_num_objects, im_height // 32, im_width // 32)


**Load** **Image**

In [22]:
from PIL import Image
import torch
import torchvision.transforms as transforms

# Load image using Pillow
image = Image.open("/content/drive/MyDrive/BERT/FOOD IMAGE/archive (1)/evaluation/food/0.jpg")

# Define image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


# Apply transformation to the image
tensor_image = transform(image)

# Print the shape of the tensor
#print(tensor_image.shape)

# Assuming tensor_image has shape (C, H, W)
tensor_image_batch1 = tensor_image.unsqueeze(0)  # Add batch dimension of 1
tensor_image_batch2 = tensor_image.unsqueeze(0)  # Add batch dimension of 1

# Concatenate the two tensors along the batch dimension
tensor_image_batch2 = torch.cat([tensor_image_batch1, tensor_image_batch2], dim=0)

# Print the shape of the new tensor
print(tensor_image_batch2.shape)


torch.Size([2, 3, 224, 224])


In [23]:
import json

# Open the JSONL file for reading
with open('/content/drive/MyDrive/BERT/VCR/Images/lsmdc_0001_American_Beauty/0001_American_Beauty_00.00.56.224-00.01.03.394@0.json', 'r') as f:
  # Loop through each line in the file
  for line in f:
    # Load the line as a JSON object
    data = json.loads(line)
    # Do something with the data
    print(data)

import tensorflow as tf

# Convert the bounding box list to a tensor
boxes = tf.constant(data["boxes"], dtype=tf.float32)

# Create a boolean mask for valid bounding boxes
valid_boxes = tf.reduce_all(boxes > 0, axis=-1)

# Convert the boolean mask to integer values
valid_boxes = tf.cast(valid_boxes, tf.int32)

# Pad the mask with zeros to match the maximum number of objects
max_num_objects = 5
box_mask = tf.pad(valid_boxes, [[0, max_num_objects - tf.shape(valid_boxes)[0]]])
box_mask

{'boxes': [[966.9033203125, 204.00875854492188, 1899.1513671875, 794.978515625, 0.8180223107337952], [274.30218505859375, 85.06173706054688, 1221.482421875, 818.9290771484375, 0.983723521232605], [5.1060333251953125, 409.7415771484375, 327.98944091796875, 812.7490234375, 0.886288046836853]], 'segms': [[[[1345, 230], [1329, 231], [1314, 232], [1303, 233], [1293, 234], [1284, 235], [1274, 239], [1267, 241], [1263, 242], [1258, 243], [1248, 244], [1244, 245], [1237, 247], [1234, 248], [1231, 249], [1226, 251], [1218, 255], [1215, 257], [1211, 260], [1205, 266], [1202, 270], [1200, 273], [1198, 277], [1196, 283], [1192, 290], [1189, 294], [1179, 304], [1176, 308], [1174, 311], [1170, 319], [1169, 324], [1167, 334], [1166, 340], [1165, 352], [1164, 359], [1163, 362], [1162, 365], [1161, 367], [1158, 372], [1155, 376], [1147, 384], [1145, 387], [1141, 395], [1139, 400], [1138, 403], [1137, 406], [1133, 424], [1132, 431], [1130, 446], [1129, 461], [1126, 482], [1125, 488], [1124, 491], [1115,

<tf.Tensor: shape=(5,), dtype=int32, numpy=array([1, 1, 1, 0, 0], dtype=int32)>

**OUTPUT CODE **

In [27]:
import torch

# define inputs
images = torch.randn(2, 3, 224, 224) # batch of 2 images, each with 3 channels and size 224x224
boxes = torch.tensor([[[10, 20, 50, 100], [200, 300, 400, 500], [0, 0, 0, 0]], 
                      [[50, 60, 100, 200], [0, 0, 0, 0], [0, 0, 0, 0]]]) # boxes for each image with max_num_objects=3
box_mask = torch.tensor([[1, 1, 0], [1, 0, 0]]) # mask for each box to indicate if it is valid or not
classes = torch.tensor([[0, 1, 2], [3, 0, 0]]) # class labels for each box
segms = torch.randn(2, 3, 28, 28) # segmentation masks for each box (optional)

# instantiate model
model = SimpleDetector(pretrained=True, average_pool=True, semantic=True, final_dim=1024)

# run model on inputs
object_reps = model(images, boxes, box_mask, classes, segms)

# print output shape
print(object_reps) # should be [2, 3, 1024]


1211213232142353467568769785653412498776543235687654321458765432
tensor([[  0,  10,  20,  50, 100],
        [  0, 200, 300, 400, 500],
        [  1,  50,  60, 100, 200]])
ASASASASASASASSASASASASSASASASASASASASASASSASASASASAS
{'obj_reps_raw': tensor([[0.3642, 0.4707, 0.8326,  ..., 0.9159, 0.7272, 0.4376],
        [0.0023, 0.1574, 0.0000,  ..., 0.0000, 0.0071, 0.0840],
        [0.7961, 0.8222, 0.5894,  ..., 0.2019, 0.5385, 0.4440]],
       grad_fn=<ViewBackward0>), 'obj_reps': tensor([[[0.0000, 0.1233, 0.3067,  ..., 0.3751, 0.0000, 0.0000],
         [0.1490, 0.0000, 0.1786,  ..., 0.2580, 0.2544, 0.0658]],

        [[0.1525, 0.4346, 0.0000,  ..., 0.3023, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]]],
       grad_fn=<CopySlices>), 'obj_logits': tensor([[-1.7619e-01,  5.9068e-01,  2.2263e-01, -4.3001e-01, -4.4621e-01,
          1.5568e-01,  2.6427e-01, -3.2443e-01, -2.7317e-01, -4.2288e-01,
          8.1489e-01, -2.4572e-01,  2.2680e-01, -7.7052e-01, -2.

**RUN FORWARD**

In [None]:
# create an instance of SimpleDetector
detector = SimpleDetector()

# define input tensors
#images = torch.randn((2, 3, 224, 224))  # batch of 2 images with 3 channels, height=224, and width=224
images =tensor_image_batch2
#boxes = data["boxes"]  # boxes for each image, padded to max_num_objects=2
#box_mask = torch.tensor([[1, 1], [1, 0]])  # mask indicating which boxes are valid for each image
classes = torch.tensor([[2, 3], [4, 1]])  # object class labels for each box
segms = data["segms"]  # object segmentation masks for each box, padded to max_num_objects=2

# pass input to the detector module
output = detector(images, boxes, box_mask, classes, segms)

# print the output
print(output)

In [None]:
import torch

# create an instance of SimpleDetector
detector = SimpleDetector()

# define input tensors
images = torch.randn((2, 3, 224, 224))  # batch of 2 images with 3 channels, height=224, and width=224
#images =tensor_image_batch2
boxes = torch.tensor([[[10, 20, 100, 150], [50, 100, 150, 200]], [[5, 10, 80, 120], [30, 60, 100, 160]]])  # boxes for each image, padded to max_num_objects=2
box_mask = torch.tensor([[1, 1], [1, 0]])  # mask indicating which boxes are valid for each image
classes = torch.tensor([[2, 3], [4, 1]])  # object class labels for each box
segms = torch.randn((2, 2, 28, 28))  # object segmentation masks for each box, padded to max_num_objects=2

# pass input to the detector module
output = detector(images, boxes, box_mask, classes, segms)

# print the output
print(output)


In [None]:
import torch
import torchvision.ops as ops

roi_align = ops.RoIAlign(output_size=(7, 7), spatial_scale=1.0, sampling_ratio=-1)

In [None]:
roi_align

In [None]:
from torch.nn.utils.rnn import pad_sequence

# create an instance of SimpleDetector
detector = SimpleDetector()


batch_size = 2
im_height = 224
im_width = 224
max_num_objects = 5


# create demo inputs
images = tensor_image_batch2
boxes = torch.tensor([[[50, 50, 150, 150], [100, 100, 200, 200]], [[75, 75, 175, 175], [125, 125, 225, 225]]])
box_mask = torch.LongTensor([[1, 1], [1, 0]])
classes = torch.tensor([[3, 2], [1, 0]])
segms = torch.randn(2, 2, 224, 224)

# run forward pass
detector(images, boxes, box_mask, classes, segms)
