Importing Necessary Libraries

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np 
from common.nets.module import BackboneNet, PoseNet
from common.nets.loss import JointHeatmapLoss, HandTypeLoss, RelRootDepthLoss
from main.config import cfg
from PIL import Image
from torchvision import transforms
from main.model import Model 
from common.utils.vis import vis_keypoints, vis_3d_keypoints
from common.utils.preprocessing import load_img, load_skeleton, process_bbox, generate_patch_image, transform_input_to_output_space, trans_point2d

Checkpoint Loading

In [2]:
# Load the checkpoint
checkpoint = torch.load(r"C:\\Users\\user\\Desktop\\Codes\\SignLanguageProject\\InterHand2.6M\\snapshot.tar", map_location='cpu')

network = checkpoint['network']

Creating an Instance of the Model Architecture

In [3]:
# Create an instance of the model architecture
# You should replace 'ModelClass' with the actual class of your model
backbone_net = BackboneNet()
pose_net = PoseNet(21)
model_instance = Model(backbone_net, pose_net)

Loading the Model's state_dict

In [4]:
# Load the model's state_dict from the 'network' key
model_instance.load_state_dict(network,strict=False)

_IncompatibleKeys(missing_keys=['backbone_net.resnet.conv1.weight', 'backbone_net.resnet.bn1.weight', 'backbone_net.resnet.bn1.bias', 'backbone_net.resnet.bn1.running_mean', 'backbone_net.resnet.bn1.running_var', 'backbone_net.resnet.layer1.0.conv1.weight', 'backbone_net.resnet.layer1.0.bn1.weight', 'backbone_net.resnet.layer1.0.bn1.bias', 'backbone_net.resnet.layer1.0.bn1.running_mean', 'backbone_net.resnet.layer1.0.bn1.running_var', 'backbone_net.resnet.layer1.0.conv2.weight', 'backbone_net.resnet.layer1.0.bn2.weight', 'backbone_net.resnet.layer1.0.bn2.bias', 'backbone_net.resnet.layer1.0.bn2.running_mean', 'backbone_net.resnet.layer1.0.bn2.running_var', 'backbone_net.resnet.layer1.0.conv3.weight', 'backbone_net.resnet.layer1.0.bn3.weight', 'backbone_net.resnet.layer1.0.bn3.bias', 'backbone_net.resnet.layer1.0.bn3.running_mean', 'backbone_net.resnet.layer1.0.bn3.running_var', 'backbone_net.resnet.layer1.0.downsample.0.weight', 'backbone_net.resnet.layer1.0.downsample.1.weight', 'back

Model Moving

In [5]:
# Move the model to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_instance.to(device)

Model(
  (backbone_net): BackboneNet(
    (resnet): ResNetBackbone(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu):

Loading Image of the Hand

In [6]:
# Now you can use the 'model_instance' for inference or further training
imsize = 256
loader = transforms.Compose([transforms.ToTensor()])

img_path = r"C:\Users\user\Desktop\Codes\SignLanguageProject\InterHand2.6M\input.jpg"
image = load_img(img_path)
original_img_height, original_img_width = image.shape[:2]

joint_num = 21 # single hand
root_joint_idx = {'right': 20, 'left': 41}
joint_type = {'right': np.arange(0,joint_num), 'left': np.arange(joint_num,joint_num*2)}
skeleton = load_skeleton(r"C:\Users\user\Desktop\Codes\SignLanguageProject\InterHand2.6M\skeleton.txt", joint_num*2)

Preparing BBOX

In [7]:
bbox = [69, 137, 165, 153] # xmin, ymin, width, height
bbox = process_bbox(bbox, (original_img_height, original_img_width, original_img_height))
img, trans, inv_trans = generate_patch_image(image, bbox, False, 1.0, 0.0, cfg.input_img_shape)
transform = transforms.ToTensor()
img = transform(img.astype(np.float32))/255
img = img[None,:,:,:]
model_instance.eval()
inputs = {'img': img}
targets = {}
meta_info = {}
with torch.no_grad():
    out = model_instance(inputs, targets, meta_info, 'test')
img = img[0].cpu().numpy().transpose(1,2,0) # cfg.input_img_shape[1], cfg.input_img_shape[0], 3
joint_coord = out['joint_coord'][0].cpu().numpy() # x,y pixel, z root-relative discretized depth
rel_root_depth = out['rel_root_depth'][0].cpu().numpy() # discretized depth
hand_type = out['hand_type'][0].cpu().numpy() # handedness probability


In [8]:
# restore joint coord to original image space and continuous depth space
joint_coord[:,0] = joint_coord[:,0] / cfg.output_hm_shape[2] * cfg.input_img_shape[1]
joint_coord[:,1] = joint_coord[:,1] / cfg.output_hm_shape[1] * cfg.input_img_shape[0]
joint_coord[:,:2] = np.dot(inv_trans, np.concatenate((joint_coord[:,:2], np.ones_like(joint_coord[:,:1])),1).transpose(1,0)).transpose(1,0)
joint_coord[:,2] = (joint_coord[:,2]/cfg.output_hm_shape[0] * 2 - 1) * (cfg.bbox_3d_size/2)

In [9]:
# restore right hand-relative left hand depth to continuous depth space
rel_root_depth = (rel_root_depth/cfg.output_root_hm_shape * 2 - 1) * (cfg.bbox_3d_size_root/2)

# right hand root depth == 0, left hand root depth == rel_root_depth
joint_coord[joint_type['left'],2] += rel_root_depth

In [10]:
# handedness
joint_valid = np.zeros((joint_num*2), dtype=np.float32)
right_exist = False
if hand_type[0] > 0.5: 
    right_exist = True
    joint_valid[joint_type['right']] = 1
left_exist = False
if hand_type[1] > 0.5:
    left_exist = True
    joint_valid[joint_type['left']] = 1

print('Right hand exist: ' + str(right_exist) + ' Left hand exist: ' + str(left_exist))


Right hand exist: False Left hand exist: True


In [11]:
# visualize joint coord in 2D space
filename = 'result_2d.jpg'
vis_img = image.copy()[:,:,::-1].transpose(2,0,1)
vis_img = vis_keypoints(vis_img, joint_coord, joint_valid, skeleton, filename, save_path='.')
# visualize joint coord in 3D space
# The 3D coordinate in here consists of x,y pixel and z root-relative depth.
# To make x,y, and z in real unit (e.g., mm), you need to know camera intrincis and root depth.
# The root depth can be obtained from RootNet (https://github.com/mks0601/3DMPPE_ROOTNET_RELEASE)
filename = 'result_3d.jpg'
vis_3d_keypoints(joint_coord, joint_valid, skeleton, filename)
print(len(skeleton))

42


In [12]:
import os
import os.path as osp
import cv2
import numpy as np
import matplotlib
from common.utils.vis import get_keypoint_rgb
matplotlib.use('tkagg')
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib as mpl
from main.config import cfg
from PIL import Image, ImageDraw

In [13]:
# visualize joint coord in 2D space
filename = 'result_2d.jpg'
vis_img = image.copy()[:,:,::-1].transpose(2,0,1)
rgb_dict = get_keypoint_rgb(skeleton)
_img = Image.fromarray(vis_img.transpose(1,2,0).astype('uint8')) 
draw = ImageDraw.Draw(_img)

In [14]:
for i in range(len(skeleton)):
    joint_name = skeleton[i]['name']
    pid = skeleton[i]['parent_id']
    parent_joint_name = skeleton[pid]['name']
    kps_i = (joint_coord[i][0].astype(np.int32), joint_coord[i][1].astype(np.int32))
    kps_pid = (joint_coord[pid][0].astype(np.int32), joint_coord[pid][1].astype(np.int32))
    if  joint_valid[i] >  0.4 and  joint_valid[pid] > 0.4 and pid != -1:
        print([(joint_coord[i][0], joint_coord[i][1]), (joint_coord[pid][0], joint_coord[pid][1])])
        draw.line([(joint_coord[i][0], joint_coord[i][1]), (joint_coord[pid][0], joint_coord[pid][1])], fill=rgb_dict[parent_joint_name], width=3)
    if  joint_valid[i] > 0.4:
        draw.ellipse((joint_coord[i][0]-3, joint_coord[i][1]-3, joint_coord[i][0]+3, joint_coord[i][1]+3), fill=rgb_dict[joint_name])
    if  joint_valid[pid] > 0.4 and pid != -1:
        draw.ellipse((joint_coord[pid][0]-3, joint_coord[pid][1]-3, joint_coord[pid][0]+3, joint_coord[pid][1]+3), fill=rgb_dict[parent_joint_name])
    

[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]
[(48.375, 110.375), (48.375, 110.375)]


In [15]:
_img.save("deneme1.jpg")

In [16]:
bbox = [69, 137, 165, 153] # xmin, ymin, width, height
bbox = process_bbox(bbox, (original_img_height, original_img_width, original_img_height))
img, trans, inv_trans = generate_patch_image(image, bbox, False, 1.0, 0.0, cfg.input_img_shape)
transform = transforms.ToTensor()
img = transform(img.astype(np.float32))/255
img = img[None,:,:,:]

In [17]:
out['joint_coord'][0]

tensor([[  48.3750,  110.3750,  -31.2500],
        [  48.3750,  110.3750,  193.7500],
        [  48.3750,  110.3750,  -93.7500],
        [  48.3750,  110.3750, -118.7500],
        [  48.3750,  110.3750,  -43.7500],
        [  48.3750,  110.3750,  137.5000],
        [  48.3750,  110.3750,   -6.2500],
        [  48.3750,  110.3750,  162.5000],
        [  48.3750,  110.3750,  137.5000],
        [  48.3750,  110.3750,   -6.2500],
        [  48.3750,  110.3750, -193.7500],
        [  48.3750,  110.3750,  106.2500],
        [  48.3750,  110.3750,  -87.5000],
        [  48.3750,  110.3750, -150.0000],
        [  48.3750,  110.3750, -150.0000],
        [  48.3750,  110.3750, -137.5000],
        [  48.3750,  110.3750,  -37.5000],
        [  48.3750,  110.3750, -150.0000],
        [  48.3750,  110.3750,   18.7500],
        [  48.3750,  110.3750,  -43.7500],
        [  48.3750,  110.3750,  -43.7500],
        [  48.3750,  110.3750,  103.3860],
        [  48.3750,  110.3750,   78.3860],
        [  

: 