In [1]:
import torch
from torch.utils.data import DataLoader

In [2]:
from pathlib import Path
import yaml

In [3]:
import sys

# Make the parent directory and its source/ folder importable from the
# notebook's working directory. Each entry is only appended when missing, so
# re-running this cell does not stack duplicate entries on sys.path.
for extra_path in ("..", "../source"):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

# Project-local imports that rely on the path entries added above.
from helper import init_model
from datasets.ptz_dataset import PTZImageDataset

INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [4]:
# Read the experiment configuration once, then pull out the settings this
# notebook uses, one config section at a time.
# NOTE(review): yaml.load with FullLoader is kept unchanged; consider
# yaml.safe_load if this file could ever come from an untrusted source.
with open("../configs/Config_file.yaml", 'r') as config_stream:
    args = yaml.load(config_stream, Loader=yaml.FullLoader)

# 'meta' section
meta_cfg = args['meta']
use_bfloat16 = meta_cfg['use_bfloat16']
model_name = meta_cfg['model_name']
pred_depth = meta_cfg['pred_depth']
pred_emb_dim = meta_cfg['pred_emb_dim']
camera_brand = meta_cfg['camera_brand']

# 'mask' section
mask_cfg = args['mask']
patch_size = mask_cfg['patch_size']  # patch size used for model training

# 'data' section
data_cfg = args['data']
crop_size = data_cfg['crop_size']
batch_size = data_cfg['batch_size']


In [31]:
import copy

In [32]:
# Build the encoder and predictor on the CPU from the configured
# hyper-parameters, then deep-copy the encoder to obtain the target encoder.
init_kwargs = {
    "device": "cpu",
    "patch_size": patch_size,
    "crop_size": crop_size,
    "pred_depth": pred_depth,
    "pred_emb_dim": pred_emb_dim,
    "model_name": model_name,
}
encoder, predictor = init_model(**init_kwargs)
target_encoder = copy.deepcopy(encoder)

INFO:root:VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(14, 14), stride=(14, 14))
  )
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
)


In [33]:
import importlib
import source.run_jepa

# Reload the submodule itself: importlib.reload(source) only re-executes the
# package and leaves an already-imported source.run_jepa untouched, so edits
# to run_jepa.py would not be picked up by the from-import below.
importlib.reload(source.run_jepa)
from source.run_jepa import (
    arrange_inputs,
    forward_context,
    forward_target,
    get_position_from_label,
)

In [34]:
import logging
import sys

In [35]:
# Ask for INFO-and-above records on stdout (basicConfig does nothing if the
# root logger already has handlers), then keep a handle on the root logger.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
)
logger = logging.getLogger()

In [41]:
# Smoke-test the logger: emit a single INFO record.
logger.info("Loading dataset")

INFO:root:Loading dataset


In [43]:
# Scratch check of the checkpoint-load log message using placeholder values.
# The values are handed to the logger as arguments instead of being
# pre-formatted with %, so the string is only built if the record is emitted.
logger.info('loaded context encoder from epoch %s with msg: %s', 0, 'haha')

INFO:root:loaded context encoder from epoch 0 with msg: haha


In [36]:
# Checkpoint location, expressed relative to the notebooks directory like the
# other paths in this notebook, rather than as a user-specific absolute path
# (this is the same relative path that is passed to generate_embedding).
r_path = Path("../../model/jepa-latest.pth.tar")

In [37]:
# Deserialize the checkpoint onto the CPU and read the epoch stored in it.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
checkpoint = torch.load(r_path, map_location='cpu')
epoch = checkpoint['epoch']

In [38]:
# Restore the saved weights into each of the three networks, in the same order
# as before: encoder, predictor, target encoder. After the loop,
# `pretrained_dict` and `msg` hold the values from the target-encoder load.
for state_key, network in (
    ('encoder', encoder),
    ('predictor', predictor),
    ('target_encoder', target_encoder),
):
    pretrained_dict = checkpoint[state_key]
    msg = network.load_state_dict(pretrained_dict)

In [8]:
from source.datasets.ptz_dataset import PTZImageDataset

In [9]:
from torchvision.transforms import ToTensor

In [10]:
from source.transforms import make_transforms

In [11]:
# Wrap the collected image folder in the PTZ dataset with the project
# transforms, then batch it in its stored order (no shuffling).
ptz_transform = make_transforms()
data = PTZImageDataset('../../collected_imgs_bu2', transform=ptz_transform)
dataloader = DataLoader(
    dataset=data,
    batch_size=batch_size,
    shuffle=False,
)
# Number of batches in one full pass over the dataloader.
ipe = len(dataloader)

INFO:root:making ptz image data transforms


In [12]:
# Grab the first batch from the dataloader. Later cells index it as
# img_lab[0] (passed on as images) and img_lab[1] (passed on as labels).
img_lab = next(iter(dataloader))

In [15]:
# Turn the label element of the batch into positions.
# NOTE(review): the label format is not visible here — confirm against
# get_position_from_label in source/run_jepa.py.
pos = get_position_from_label(img_lab[1])

In [16]:
# Split the batch images and their positions into context and target parts.
# The last argument is presumably the device to place them on — TODO confirm
# against arrange_inputs.
context_imgs, context_poss, target_imgs, target_poss = arrange_inputs(img_lab[0], pos, "cpu")

In [17]:
# Run the context branch with the encoder and predictor, passing the context
# images/positions, the target positions and the configured camera brand.
# Note: `z` is rebound further down by the forward_target cell.
z = forward_context(context_imgs, context_poss, target_poss, encoder, predictor, camera_brand)

In [45]:
import numpy as np

In [46]:
# Scratch check: build a 1x3 integer array from a nested list literal.
np.array([[10, 20, 30]])

array([[10, 20, 30]])

In [75]:
# Run the target images through the target encoder. This rebinds `z`,
# replacing the value produced by the forward_context cell.
z = forward_target(target_imgs, target_encoder)

In [77]:
# Inspect the shape of the target-encoder output. The bare `z.` was an
# incomplete attribute access (a syntax error); the saved output, a
# torch.Size, shows that `.shape` was intended.
z.shape

torch.Size([16, 256, 192])

In [79]:
# Load the saved embeddings file through a path relative to the notebooks
# directory (the same "../output" directory used elsewhere in this notebook)
# instead of a user-specific absolute path.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files produced by this project.
dat = np.load("../output/embeds_contx_encoder.npy",
              allow_pickle=True)

In [83]:
# Stack the loaded entries along the first axis and show the combined shape.
np.vstack(dat).shape

(176, 256, 192)

In [54]:
# Ensure the parent directory is importable. Skipped when ".." is already on
# sys.path so re-running the cell does not add duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [84]:
# Import gen_embed and immediately reload it so edits to the module on disk
# are picked up without restarting the kernel.
import gen_embed
import importlib
# Kept as the last expression so the cell displays the reloaded module object.
importlib.reload(gen_embed)

<module 'gen_embed' from '/Users/yufengluo/Research/anl/su24/PTZJEPA/notebooks/../source/gen_embed.py'>

In [85]:
from gen_embed import generate_embedding

In [86]:
# Run the embedding-generation entry point with the config, checkpoint, image
# folder and output folder given as paths relative to the notebooks directory.
# NOTE(review): presumably writes its results under output_dir — confirm in
# gen_embed.py. The saved output of this cell ends in KeyboardInterrupt, i.e.
# the recorded run was stopped before it finished.
generate_embedding(config_fpath="../configs/Config_file.yaml",
                   checkpoint_fpath="../../model/jepa-latest.pth.tar",
                   img_dir="../../collected_imgs_bu2",
                   output_dir="../output")

INFO:root:Loading parameters
INFO:root:Loading model
INFO:root:VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(14, 14), stride=(14, 14))
  )
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
)
INFO:ro

KeyboardInterrupt: 