In [1]:
# Basic Imports.....
import argparse
import json
import os
from pathlib import Path
from threading import Thread

import numpy as np
import torch
import yaml
from tqdm import tqdm

from models.experimental import attempt_load
from models.yolo_zsd import Model
from utils.datasets import create_dataloader
from utils.general import coco80_to_coco91_class, check_dataset, check_file, check_img_size, check_requirements, \
    box_iou, non_max_suppression, scale_coords, xyxy2xywh, xywh2xyxy, set_logging, increment_path, colorstr
from utils.metrics import ap_per_class, ConfusionMatrix
from utils.plots import plot_images, output_to_target, plot_study_txt
from utils.torch_utils import select_device, time_synchronized, TracedModel
from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, is_parallel

In [2]:
model_weights = "./og_test_yolov7.pt"

In [3]:
model = attempt_load(model_weights, map_location= torch.device('cpu'))

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block


In [4]:
gs = max(int(model.stride.max()), 32) 
# Function that converts the input imgsz to the appropriate size
imgsz = 224
imgsz = check_img_size(imgsz, s=gs)
print(model.training)

False


In [12]:
y = model(torch.zeros(1, 3, imgsz, imgsz).to(torch.device('cpu')))
print(model.training)
y[0].shape

False


torch.Size([1, 3087, 85])

In [14]:
model.train()
y_train = model(torch.zeros(1, 3, imgsz, imgsz).to(torch.device('cpu')))

In [15]:
print(y_train[0].shape)
print(y_train[1].shape)
print(y_train[2].shape)

torch.Size([1, 3, 28, 28, 85])
torch.Size([1, 3, 14, 14, 85])
torch.Size([1, 3, 7, 7, 85])


In [56]:
test_model = Model(cfg='./cfg/training/yolov7.yaml', ch=3, nc = 512)
test_model

Model(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (1): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (2): Conv(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (3): Conv(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (4): Conv(
      (conv): Conv2d(128, 64, kernel_size=(1, 1), 

In [12]:
ckpt = torch.load(model_weights, map_location=torch.device('cpu'))

In [14]:
ckpt['model']

Model(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (1): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (2): Conv(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (3): Conv(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU()
    )
    (4): Conv(
      (conv): Conv2d(128, 64, kernel_size=(1, 1), 

In [17]:
model.state_dict().keys()

odict_keys(['model.0.conv.weight', 'model.0.conv.bias', 'model.1.conv.weight', 'model.1.conv.bias', 'model.2.conv.weight', 'model.2.conv.bias', 'model.3.conv.weight', 'model.3.conv.bias', 'model.4.conv.weight', 'model.4.conv.bias', 'model.5.conv.weight', 'model.5.conv.bias', 'model.6.conv.weight', 'model.6.conv.bias', 'model.7.conv.weight', 'model.7.conv.bias', 'model.8.conv.weight', 'model.8.conv.bias', 'model.9.conv.weight', 'model.9.conv.bias', 'model.11.conv.weight', 'model.11.conv.bias', 'model.13.conv.weight', 'model.13.conv.bias', 'model.14.conv.weight', 'model.14.conv.bias', 'model.15.conv.weight', 'model.15.conv.bias', 'model.17.conv.weight', 'model.17.conv.bias', 'model.18.conv.weight', 'model.18.conv.bias', 'model.19.conv.weight', 'model.19.conv.bias', 'model.20.conv.weight', 'model.20.conv.bias', 'model.21.conv.weight', 'model.21.conv.bias', 'model.22.conv.weight', 'model.22.conv.bias', 'model.24.conv.weight', 'model.24.conv.bias', 'model.26.conv.weight', 'model.26.conv.bia

In [29]:
model = Model("./cfg/training/yolov7.yaml", ch=3, nc=80)
model.state_dict()["model.105.m.0.weight"]

tensor([[[[-0.01902]],

         [[ 0.01580]],

         [[-0.04976]],

         ...,

         [[-0.01917]],

         [[ 0.03670]],

         [[ 0.03920]]],


        [[[ 0.01132]],

         [[ 0.04639]],

         [[-0.04671]],

         ...,

         [[-0.03675]],

         [[ 0.03078]],

         [[ 0.03986]]],


        [[[ 0.03603]],

         [[-0.01845]],

         [[-0.03204]],

         ...,

         [[-0.00340]],

         [[-0.05503]],

         [[-0.03195]]],


        ...,


        [[[-0.04144]],

         [[ 0.01215]],

         [[-0.02212]],

         ...,

         [[-0.01736]],

         [[-0.05861]],

         [[-0.05397]]],


        [[[-0.03542]],

         [[-0.04964]],

         [[ 0.05919]],

         ...,

         [[-0.00202]],

         [[-0.02869]],

         [[ 0.01571]]],


        [[[ 0.04434]],

         [[-0.02619]],

         [[-0.02958]],

         ...,

         [[-0.00756]],

         [[-0.02344]],

         [[ 0.06105]]]])

In [35]:
# LAYER 105 for corresponds to the object detection HEAD that we are trying to modify....
exclude = []
exclude.extend(['model.105.anchors', 'model.105.anchor_grid', 'model.105.m.0.weight', 
                'model.105.m.0.bias', 'model.105.m.1.weight', 'model.105.m.1.bias', 
                'model.105.m.2.weight', 'model.105.m.2.bias'])

# loading only the backbone weights + neck weights for supervised YOLO
state_dict = ckpt['model'].float().state_dict()  # to FP32

# Only loading weights for base & neck.....
state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect

In [39]:
model.load_state_dict(state_dict, strict=False)

_IncompatibleKeys(missing_keys=['model.105.anchors', 'model.105.anchor_grid', 'model.105.m.0.weight', 'model.105.m.0.bias', 'model.105.m.1.weight', 'model.105.m.1.bias', 'model.105.m.2.weight', 'model.105.m.2.bias', 'model.105.ia.0.implicit', 'model.105.ia.1.implicit', 'model.105.ia.2.implicit', 'model.105.im.0.implicit', 'model.105.im.1.implicit', 'model.105.im.2.implicit'], unexpected_keys=[])

In [43]:
model.state_dict()["model.0.conv.weight"]

tensor([[[[-1.10550e-02,  5.22156e-02, -1.50604e-02],
          [ 8.38623e-02,  2.07764e-01,  8.77686e-02],
          [ 2.59399e-02,  9.82666e-02,  2.08282e-02]],

         [[-4.36401e-02,  8.66699e-03, -4.78821e-02],
          [ 1.53580e-02,  1.31592e-01,  2.42767e-02],
          [-3.37219e-02,  3.34473e-02, -3.86963e-02]],

         [[-3.58887e-02,  9.89532e-03, -3.73230e-02],
          [ 3.12805e-03,  9.55811e-02,  1.18027e-02],
          [-3.11737e-02,  1.12381e-02, -3.57971e-02]]],


        [[[-3.35388e-02, -1.30493e-01, -7.70874e-02],
          [-1.79932e-01, -3.63770e-01, -2.70752e-01],
          [-2.58789e-02, -1.71021e-01, -1.04492e-01]],

         [[ 5.33752e-02,  1.16577e-01,  8.00171e-02],
          [ 1.08582e-01,  1.83228e-01,  1.42334e-01],
          [ 6.33545e-02,  1.20850e-01,  9.49097e-02]],

         [[-2.47650e-02,  4.57153e-02, -1.59607e-02],
          [ 8.53271e-02,  1.82983e-01,  1.13281e-01],
          [-1.65100e-02,  4.22363e-02, -1.79291e-03]]],


        [[[-

In [44]:
state_dict["model.0.conv.weight"]

tensor([[[[-1.10550e-02,  5.22156e-02, -1.50604e-02],
          [ 8.38623e-02,  2.07764e-01,  8.77686e-02],
          [ 2.59399e-02,  9.82666e-02,  2.08282e-02]],

         [[-4.36401e-02,  8.66699e-03, -4.78821e-02],
          [ 1.53580e-02,  1.31592e-01,  2.42767e-02],
          [-3.37219e-02,  3.34473e-02, -3.86963e-02]],

         [[-3.58887e-02,  9.89532e-03, -3.73230e-02],
          [ 3.12805e-03,  9.55811e-02,  1.18027e-02],
          [-3.11737e-02,  1.12381e-02, -3.57971e-02]]],


        [[[-3.35388e-02, -1.30493e-01, -7.70874e-02],
          [-1.79932e-01, -3.63770e-01, -2.70752e-01],
          [-2.58789e-02, -1.71021e-01, -1.04492e-01]],

         [[ 5.33752e-02,  1.16577e-01,  8.00171e-02],
          [ 1.08582e-01,  1.83228e-01,  1.42334e-01],
          [ 6.33545e-02,  1.20850e-01,  9.49097e-02]],

         [[-2.47650e-02,  4.57153e-02, -1.59607e-02],
          [ 8.53271e-02,  1.82983e-01,  1.13281e-01],
          [-1.65100e-02,  4.22363e-02, -1.79291e-03]]],


        [[[-

# Understanding BUILD_TARGETS

* 3 scales of output.

In [25]:
from utils.loss import ComputeLoss_CLIP

In [36]:
with open("/Users/vedant_j/Desktop/open_world/yolov7/data/hyp.scratch.p5.yaml") as f:
    hyp = yaml.load(f, Loader=yaml.SafeLoader)

device = torch.device('cpu')
model_weights = "./og_test_yolov7.pt"
model = attempt_load(model_weights, map_location= torch.device('cpu'))

hyp['box'] *= 3. / 3  # scale to layers
hyp['cls'] *= 80 / 80. * 3. / 3  # scale to classes and layers
hyp['obj'] *= (imgsz / 640) ** 2 * 3. / 3  # scale to image size and layers
model.nc = 80  # attach number of classes to model
model.hyp = hyp  # attach hyperparameters to model
model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block


In [34]:
model.train()
random_preds =  model(torch.zeros(12, 3, imgsz, imgsz).to(device))
print("Number of outputs : ",len(random_preds))

Number of outputs :  3


In [35]:
compute_loss = ComputeLoss_CLIP(model, clip_text_vectors = None)

In [37]:
torch.tensor([[0, 0],
                [1, 0], [0, 1], [-1, 0], [0, -1],  # j,k,l,m
                # [1, 1], [1, -1], [-1, 1], [-1, -1],  # jk,jm,lk,lm
                ], device=device).float()*0.5

tensor([[ 0.00000,  0.00000],
        [ 0.50000,  0.00000],
        [ 0.00000,  0.50000],
        [-0.50000,  0.00000],
        [ 0.00000, -0.50000]])

In [38]:
random_preds[0].shape

torch.Size([12, 3, 28, 28, 85])

In [53]:
# creating an anchor index of na x nt anchors....

targets = torch.from_numpy(np.array([[1.0,2.0,0.3,0.3,0.4,0.5,1,0,0,0,0,0],
                                     [1.0,2.0,0.3,0.3,0.4,0.5,1,0,0,0,0,0],]))
na =3 
nt = targets.shape[0]
ai = torch.arange(na, device= device).float().view(na, 1).repeat(1, nt)  # same as .repeat_interleave(nt)
# repeating target labels for each anchor : na x nt x (total_label_dim + 1) & append anchor indices
targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)  


gain = torch.ones(7, device=device).long()
gain[2:6]=torch.tensor(random_preds[0].shape)[[3, 2, 3, 2]]
targets * gain

RuntimeError: The size of tensor a (13) must match the size of tensor b (7) at non-singleton dimension 2