In [1]:
import keras

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
import zipfile

# Extract the pretrained weights archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zipfile.ZipFile("/gdrive/MyDrive/vpn-weight/weights_88.zip") as zip_ref:
    zip_ref.extractall()

## Load models

**VPN**

In [4]:
!git clone https://github.com/tranminhduc4796/vpn_action_recognition.git
!mv vpn_action_recognition/VPN /content/

Cloning into 'vpn_action_recognition'...
remote: Enumerating objects: 97, done.[K
remote: Counting objects: 100% (97/97), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 97 (delta 52), reused 73 (delta 32), pack-reused 0[K
Unpacking objects: 100% (97/97), done.


In [5]:
cd /content/

/content


In [81]:
from keras.models import load_model
import numpy as np
import cv2


"""Helper functions"""
def compute_adjacent_mat(alpha=5, beta=2):
  """Build the symmetric 14x14 weighted adjacency matrix over joint indices.

  Args:
    alpha: weight written for every pair listed as an intrinsic connection.
    beta: weight written for every pair listed as an extrinsic connection.

  Returns:
    np.ndarray of shape (14, 14) with a zero diagonal.
  """
  n_joints = 14
  intrinsic_pairs = ((0, 13), (1, 13), (2, 1), (3, 2),
                     (4, 13), (5, 4), (6, 5), (7, 13),
                     (8, 7), (9, 8), (10, 13), (11, 10), (12, 11))
  extrinsic_pairs = ((6, 12), (5, 11), (4, 10), (3, 9), (2, 8),
                     (1, 7), (2, 11), (5, 8), (3, 12), (6, 9))

  adj = np.zeros([n_joints, n_joints])
  # Intrinsic pairs are written first, extrinsic second (same order as before).
  for pairs, weight in ((intrinsic_pairs, alpha), (extrinsic_pairs, beta)):
    for src, dst in pairs:
      adj[src][dst] = weight
      adj[dst][src] = weight

  # No self-connections at this stage.
  np.fill_diagonal(adj, 0)
  return adj


def normalize_adj_numpy(adj, symmetric=True):
    """Normalise an adjacency matrix by its row sums.

    With D = diag(row sums of adj):
      * symmetric=True  -> (adj . D^-1/2)^T . D^-1/2
      * symmetric=False -> D^-1 . adj

    Args:
        adj: 2-D array exposing `.sum(1)` and `.dot`.
        symmetric: selects which of the two normalisations above is applied.

    Returns:
        The normalised matrix.
    """
    row_sums = np.array(adj.sum(1))
    if not symmetric:
        inv_degree = np.diag(np.power(row_sums, -1).flatten(), 0)
        return inv_degree.dot(adj)
    inv_sqrt_degree = np.diag(np.power(row_sums, -0.5).flatten(), 0)
    return adj.dot(inv_sqrt_degree).transpose().dot(inv_sqrt_degree)


def preprocess_adj_tensor_with_identity(adj_tensor, symmetric=True):
    """Add self-loops, normalise, and prepend an identity block to each matrix.

    For every matrix A along the first axis of `adj_tensor` this computes
    normalize_adj_numpy(A + I, symmetric) and stacks an identity matrix on top
    of it (concatenation along axis 0), so each output slice has twice as many
    rows as columns.

    Args:
        adj_tensor: array whose first axis enumerates square adjacency matrices.
        symmetric: forwarded to `normalize_adj_numpy`.

    Returns:
        np.ndarray built from the per-matrix results.
    """
    def _expand(adj):
        with_self_loops = adj + np.eye(adj.shape[0])
        normalized = normalize_adj_numpy(with_self_loops, symmetric)
        return np.concatenate([np.eye(normalized.shape[0]), normalized], axis=0)

    return np.array([_expand(adj_tensor[i]) for i in range(adj_tensor.shape[0])])


def compute_aggregated_mat(batch_size):
  """Create the batched, preprocessed adjacency input for the network.

  Args:
    batch_size: number of slices along the first axis of the result.

  Returns:
    Output of `preprocess_adj_tensor_with_identity` on the batched matrix.
  """
  base = compute_adjacent_mat()
  # NOTE(review): np.repeat(..., axis=0) duplicates each ROW batch_size times
  # before the reshape, so for batch_size > 1 the slices are not plain copies
  # of the adjacency matrix. Confirm this matches the preprocessing used at
  # training time before changing it; for batch_size == 1 it is a no-op.
  repeated = np.repeat(base, batch_size, axis=0)
  n_joints = repeated.shape[1]
  batched = np.reshape(repeated, [batch_size, n_joints, n_joints])
  return preprocess_adj_tensor_with_identity(batched)


def resize_imgs(imgs):
  """Resize every frame of every clip to 224x224.

  Args:
    imgs: np.array with batch_size x 16 steps x height x width x 3

  Returns:
    np.ndarray reshaped to (batch_size, 16, 224, 224, 3).
  """
  target_size = (224, 224)
  frames = [cv2.resize(frame, target_size) for clip in imgs for frame in clip]
  return np.asarray(frames).reshape(imgs.shape[0], 16, 224, 224, 3)
  

"""General functions"""

def normalize_imgs(imgs):
  """Map pixel values from [0, 255] to [-1, 1] as float64.

  Args:
    imgs: np.ndarray of pixel values (any numeric dtype).

  Returns:
    A new float64 array equal to imgs / 127.5 - 1; the input is not modified.
  """
  # `np.float` was only an alias of the builtin float (i.e. float64); it was
  # deprecated in NumPy 1.20 and removed in 1.24, so name the dtype explicitly.
  imgs = imgs.astype(np.float64)
  imgs /= 127.5
  imgs -= 1
  return imgs


def normalize_skeleton(skeleton, im_h, im_w):
  """Translate each clip so the last joint of its first step is the origin.

  Args:
    skeleton: np.array with shapes batch_size x 16 steps x 14 joints x 2 coord
    im_h: image height. Currently unused (coordinate scaling is disabled);
      kept so existing callers keep working.
    im_w: image width. Currently unused, see `im_h`.

  Returns:
    The same array object, modified in place, with every clip shifted by its
    own reference joint.
  """
  # Keep the step and joint axes (length 1) so the reference broadcasts per
  # clip. The previous `skeleton[:, 0, -1, :]` had shape (batch, 2), which only
  # broadcast correctly for batch_size == 1. Copy first because the reference
  # is a view into the array being updated.
  origin = skeleton[:, :1, -1:, :].copy()
  skeleton -= origin
  return skeleton


def decode_predict(one_hot_predict):
  """Translate a score vector into its action label.

  Args:
    one_hot_predict: array of class scores; the argmax is taken over the
      flattened array.

  Returns:
    The label string at the index of the highest score.
  """
  labels = ['sit_down', 'stand_up', 'jump_up', 'shake_hand', 'walk_towards', 'walk_apart']
  best = np.argmax(one_hot_predict)
  return labels[best]


def preprocess_vpn_input(video_imgs, skeleton):
  """Assemble the list of 19 arrays passed to `model.predict`.

  Args:
    video_imgs: np.array with batch_size x 16 steps x height x width x 3
    skeleton: np.array with shapes batch_size x 16 steps x 14 joints x 2 coord

  Returns:
    A list of: the 16 per-step skeleton slices, the full normalised skeleton,
    the aggregated adjacency tensor, and the resized + normalised frames.
  """
  batch_size, _, im_h, im_w, _ = video_imgs.shape

  aggregated_mat = compute_aggregated_mat(batch_size)
  skeleton = normalize_skeleton(skeleton, im_h, im_w)
  video_imgs = normalize_imgs(resize_imgs(video_imgs))

  # One (batch, joints, coords) slice per temporal step, in step order.
  per_step_skeletons = [skeleton[:, step, :, :] for step in range(16)]
  return per_step_skeletons + [skeleton, aggregated_mat, video_imgs]

## Inference

In [7]:
import os
import cv2

def load_one_video(video_path):
  """Load every image in a directory as one batch of frames.

  Args:
    video_path: directory containing one image file per frame.

  Returns:
    np.ndarray with a leading batch axis of size 1 followed by the frame axis.
  """
  imgs = []

  # os.listdir returns entries in arbitrary order; sort so the frame order is
  # deterministic. Sorting is lexicographic, so this assumes zero-padded frame
  # names -- TODO confirm against the dataset.
  for img_id in sorted(os.listdir(video_path)):
    img_path = os.path.join(video_path, img_id)
    frame = cv2.imread(img_path)
    # cv2.imread returns None for files it cannot decode; skip those so the
    # stacked result stays a numeric array.
    if frame is None:
      continue
    imgs.append(frame)
  imgs = np.asarray(imgs)
  return np.expand_dims(imgs, axis=0)


def load_one_skeletons(sklt_path):
  """Load a saved skeleton array and shape it as one batch.

  Args:
    sklt_path: path given to `np.load`; the stored array is indexed as 2-D.

  Returns:
    The first 28 columns of the stored array, reshaped to (1, -1, 14, 2).
  """
  stored = np.load(sklt_path)
  first_28_cols = stored[:, :28]
  return first_28_cols.reshape(1, -1, 14, 2)

In [8]:
# Copy one sample video folder from Drive into the working directory
!cp -r /gdrive/MyDrive/NTU_RGB_video/S001C001P001R001A060/ .

In [73]:
cd /content

/content


In [74]:
cd VPN/

/content/VPN


In [75]:
from keras.models import load_model
from embedding_gcnn_attention_model import *
from compute_adjacency import *
from utils import *

In [76]:
model = keras.models.load_model('../best')

In [50]:
imgs = np.load('/gdrive/MyDrive/vpn-weight/image_arr.npy')
imgs = np.expand_dims(imgs, axis=0)
imgs.shape

(1, 79, 1920, 1080, 3)

In [51]:
sklts = load_one_skeletons('/content/skeleton.npy')
sklts.shape

(1, 79, 14, 2)

In [52]:
# Drop trailing frames so the frame count is a multiple of 16 and the clip can
# be split into 16 equal temporal steps.
n_usable_frames = imgs.shape[1] - (imgs.shape[1] % 16)
imgs = imgs[:, :n_usable_frames, :, :, :]
sklts = sklts[:, :n_usable_frames, :, :]

In [53]:
print(imgs.shape)
print(sklts.shape)

(1, 64, 1920, 1080, 3)
(1, 64, 14, 2)


In [54]:
splited_imgs = np.asarray(np.split(imgs, 16, axis=1))
splited_sklts = np.asarray(np.split(sklts, 16, axis=1))
print(splited_imgs.shape)
print(splited_sklts.shape)

(16, 1, 4, 1920, 1080, 3)
(16, 1, 4, 14, 2)


In [55]:
splited_imgs = np.moveaxis(splited_imgs, 0, 1)
splited_sklts = np.moveaxis(splited_sklts, 0, 1)

print(splited_imgs.shape)
print(splited_sklts.shape)

(1, 16, 4, 1920, 1080, 3)
(1, 16, 4, 14, 2)


In [56]:
# Draw one frame (and its matching skeleton) at random from each temporal step.
# A seeded generator makes the sampled clip, and therefore the prediction,
# reproducible across runs.
SAMPLING_SEED = 0
rng = np.random.default_rng(SAMPLING_SEED)

# Axes are (batch, step, frame-within-step, ...); read the sizes from the data
# instead of hard-coding the number of steps.
n_batches, n_steps, n_frames_each_step = splited_imgs.shape[:3]

sample_imgs = []
sample_sklts = []
for batch_idx in range(n_batches):
  # One independent frame index per step of this clip.
  sampling_indices = rng.integers(n_frames_each_step, size=n_steps)
  sample_imgs.append(np.asarray(
      [splited_imgs[batch_idx, step, idx] for step, idx in enumerate(sampling_indices)]))
  sample_sklts.append(np.asarray(
      [splited_sklts[batch_idx, step, idx] for step, idx in enumerate(sampling_indices)]))

sample_imgs = np.asarray(sample_imgs, dtype=np.float32)
sample_sklts = np.asarray(sample_sklts, dtype=np.float32)



In [57]:
print(sample_imgs.shape)
print(sample_sklts.shape)

(1, 16, 1920, 1080, 3)
(1, 16, 14, 2)


In [32]:
# Rescale the skeleton coordinates: channel 0 by 1920 and channel 1 by 1080.
# NOTE(review): presumably the stored skeletons are normalised to [0, 1] --
# confirm, and confirm which image axis each factor corresponds to.
# This updates sample_sklts in place, so re-running the cell scales it again.
sample_sklts[..., 0] *= 1920.
sample_sklts[..., 1] *= 1080

In [58]:
inputs = preprocess_vpn_input(sample_imgs, sample_sklts)

In [37]:
np.argmax(model.predict(inputs)[0])

4

In [39]:
action_id, _ = model.predict(inputs)

In [40]:
decode_predict(action_id)

'walk_towards'

In [None]:
# Summarise each layer by the shapes of its weight arrays. Printing every
# tensor in full floods the notebook output and bloats the saved file.
for layer in model.layers:
    weights = layer.get_weights()
    print(layer.name, [weight.shape for weight in weights])

In [41]:
# Environment setup for the tf-pose-estimation pose extractor.
# NOTE(review): none of these installs is version-pinned, so a fresh run may
# resolve different package versions than the ones used for the saved outputs.
!pip install tensorflow
# NOTE(review): `collection` is a separate PyPI package, not the standard
# library `collections` module this notebook imports later; nothing visible
# in the notebook imports `collection` -- confirm it is actually needed.
!pip install collection
# swig is used further down to build the pafprocess extension.
!apt-get install swig -y
!pip install numpy
!pip install matplotlib
!pip install opencv-python
!git clone https://github.com/gsethi2409/tf-pose-estimation
!pip install -r tf-pose-estimation/requirements.txt
!pip install ffmpeg-python

Collecting collection
  Downloading https://files.pythonhosted.org/packages/81/a7/12577601fd60036732cd5b078ed3aa9a2f888169ea70d84997b3607e63b4/collection-0.1.6.tar.gz
Building wheels for collected packages: collection
  Building wheel for collection (setup.py) ... [?25l[?25hdone
  Created wheel for collection: filename=collection-0.1.6-cp36-none-any.whl size=5116 sha256=c1ddc9623bb8b81503362ae32861c37aac48d5ef0c8e0d28cafe373dae544416
  Stored in directory: /root/.cache/pip/wheels/9e/f2/2b/a611b0dc83b770763e7962500ef158c09dc8161d3fce6e73de
Successfully built collection
Installing collected packages: collection
Successfully installed collection-0.1.6
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [42]:
cd tf-pose-estimation/tf_pose/pafprocess/

/content/VPN/tf-pose-estimation/tf_pose/pafprocess


In [43]:
!swig -python -c++ pafprocess.i && python setup.py build_ext --inplace

running build_ext
building '_pafprocess' extension
swigging pafprocess.i to pafprocess_wrap.cpp
swig -python -c++ -o pafprocess_wrap.cpp pafprocess.i
creating build
creating build/temp.linux-x86_64-3.6
x86_64-linux-gnu-gcc -pthread -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/local/lib/python3.6/dist-packages/numpy/core/include -I. -I/usr/include/python3.6m -c pafprocess.cpp -o build/temp.linux-x86_64-3.6/pafprocess.o
x86_64-linux-gnu-gcc -pthread -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/local/lib/python3.6/dist-packages/numpy/core/include -I. -I/usr/include/python3.6m -c pafprocess_wrap.cpp -o build/temp.linux-x86_64-3.6/pafprocess_wrap.o
x86_64-linux-gnu-g++ -pthread -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-Bsymbolic-functions -Wl,-z,relro -g -fstack-protecto

In [44]:
!pip install git+https://github.com/adrianc-a/tf-slim.git@remove_contrib

Collecting git+https://github.com/adrianc-a/tf-slim.git@remove_contrib
  Cloning https://github.com/adrianc-a/tf-slim.git (to revision remove_contrib) to /tmp/pip-req-build-626lae4c
  Running command git clone -q https://github.com/adrianc-a/tf-slim.git /tmp/pip-req-build-626lae4c
  Running command git checkout -b remove_contrib --track origin/remove_contrib
  Switched to a new branch 'remove_contrib'
  Branch 'remove_contrib' set up to track remote branch 'remove_contrib' from 'origin'.
Building wheels for collected packages: tf-slim
  Building wheel for tf-slim (setup.py) ... [?25l[?25hdone
  Created wheel for tf-slim: filename=tf_slim-1.0-cp36-none-any.whl size=275049 sha256=f8fe34e17da0d8c264b868ab64576a710e767d62e3cbcfadd0a6a6853a1544c5
  Stored in directory: /tmp/pip-ephem-wheel-cache-lbn91r14/wheels/be/41/65/cd259010573e2dd0fba258f9a466059b48e38118aeec65e09f
Successfully built tf-slim
Installing collected packages: tf-slim
Successfully installed tf-slim-1.0


In [45]:
cd ../..

/content/VPN/tf-pose-estimation


In [46]:
import sys
sys.path.append('.')
import time
import logging
import numpy as np
import matplotlib.pyplot as plt
import cv2
from tf_pose import common
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh
from imutils import paths
import os
import random
import ffmpeg
import imutils
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import scipy.ndimage.interpolation as inter
from scipy.signal import medfilt 
from scipy.spatial.distance import cdist
from google.colab.patches import cv2_imshow


2021-01-17 10:05:01,878 INFO Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
2021-01-17 10:05:01,897 INFO Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt


In [47]:
# Name of the pose-estimation graph to load. It lives in its own variable so
# it does not overwrite `model`, which holds the loaded Keras network that the
# inference cells call `model.predict(...)` on.
pose_model_name = 'mobilenet_v2_large'
# Resolution string handed to model_wh, which returns the (w, h) target size.
resolution = '432x368'
w, h = model_wh(resolution)
e = TfPoseEstimator(get_graph_path(pose_model_name), target_size=(w, h))

[2021-01-17 10:05:08,395] [TfPoseEstimator] [INFO] loading graph from /content/VPN/tf-pose-estimation/models/graph/mobilenet_v2_large/graph_opt.pb(default size=432x368)
2021-01-17 10:05:08,395 INFO loading graph from /content/VPN/tf-pose-estimation/models/graph/mobilenet_v2_large/graph_opt.pb(default size=432x368)


TfPoseEstimator/image
TfPoseEstimator/MobilenetV2/Conv/BatchNorm/Const
TfPoseEstimator/MobilenetV2/Conv/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv/depthwise/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv/depthwise/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv/project/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv/project/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv_1/expand/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv_1/expand/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv_1/depthwise/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv_1/depthwise/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv_1/project/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv_1/project/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv_2/expand/BatchNorm/Const
TfPoseEstimator/MobilenetV2/expanded_conv_2/expand/BatchNorm/Const_1
TfPoseEstimator/MobilenetV2/expanded_conv_2/depthwise/

In [63]:
#check rotation
def check_rotation(path_video_file):
  """Return the cv2 rotate code that undoes a video's rotation metadata.

  Reads the `rotate` tag of the first stream reported by `ffmpeg.probe`.

  Args:
    path_video_file: path of the video file to probe.

  Returns:
    cv2.ROTATE_90_CLOCKWISE, cv2.ROTATE_180 or
    cv2.ROTATE_90_COUNTERCLOCKWISE for a tag of 90, 180 or 270 respectively;
    None when the tag is missing, unparsable, or any other value.

  Raises:
    Whatever `ffmpeg.probe` raises when the file cannot be probed.
  """
  # this returns meta-data of the video file in form of a dictionary
  meta_dict = ffmpeg.probe(path_video_file)

  rotation_to_code = {
      90: cv2.ROTATE_90_CLOCKWISE,
      180: cv2.ROTATE_180,
      270: cv2.ROTATE_90_COUNTERCLOCKWISE,
  }

  # Files without rotation metadata have no 'rotate' tag (or no 'tags' at
  # all); treat that as "no rotation" instead of raising KeyError/IndexError.
  streams = meta_dict.get('streams') or []
  if not streams:
    return None
  rotate_tag = (streams[0].get('tags') or {}).get('rotate')
  if rotate_tag is None:
    return None

  try:
    angle = int(rotate_tag)
  except (TypeError, ValueError):
    return None
  return rotation_to_code.get(angle)

def correct_rotation(frame, rotateCode):
  """Rotate `frame` with cv2.rotate using the given rotate code."""
  rotated_frame = cv2.rotate(frame, rotateCode)
  return rotated_frame

def norm_inner_feat(X):
  """Express joints relative to joint 0 of the same frame, then drop joint 0.

  Args:
    X: 4-D array indexed as (sample, frame, joint, coord).

  Returns:
    Array with one fewer entry along the joint axis.
  """
  n_samples, n_frames, n_coords = X.shape[0], X.shape[1], X.shape[-1]
  # Re-insert a length-1 joint axis so the reference broadcasts over joints.
  reference = X[:, :, 0, :].reshape(n_samples, n_frames, 1, n_coords)
  relative = X - reference
  return relative[:, :, 1:, :]

def norm_outer_feat(X, fps=15):
  """Finite-difference velocity between consecutive frames.

  Args:
    X: 4-D array indexed as (sample, frame, joint, coord).
    fps: frame rate; the step between frames is taken as 1 / fps.

  Returns:
    float64 array of shape (n_samples, n_frames - 1, n_joints, 2) holding
    (X[:, f + 1] - X[:, f]) / (1 / fps).
  """
  n_samples, n_frame, n_joint = X.shape[0], X.shape[1], X.shape[2]
  frame_step = 1 / fps
  velocity = np.zeros((n_samples, n_frame - 1, n_joint, 2))
  # Assign into the preallocated buffer so shape and dtype stay fixed.
  velocity[...] = (X[:, 1:] - X[:, :-1]) / frame_step
  return velocity
    
# Display label for each class index.
# NOTE(review): this index order differs from the class list in
# `decode_predict`, and the only visible use of this mapping is in
# commented-out code -- confirm which model it belongs to.
ACTION_MAP_ID = dict([
    (5, 'Sit down'),
    (0, 'Stand up'),
    (1, 'Jump'),
    (2, 'Shaking hands'),
    (3, 'Walking towards'),
    (4, 'Walking apart'),
])

def open2ntu_pose(pose_18_joints):
  """Reorder joints along the first axis into the 14-joint layout used here.

  Args:
    pose_18_joints: array whose FIRST axis indexes joints. Only indices 0-13
      are read, so at least 14 entries are required along that axis.

  Returns:
    Array with 14 entries along the first axis, taken in the order
    [0, 5, 6, 7, 2, 3, 4, 11, 12, 13, 8, 9, 10, 1].
  """
  joint_order = [0, 5, 6, 7, 2, 3, 4, 11, 12, 13, 8, 9, 10, 1]
  reordered = pose_18_joints[joint_order]
  return reordered

In [64]:
def get_skeleton_coor(human, flag=False):
  """Collect flattened 14-joint coordinates for every fully detected body.

  Args:
    human: iterable of detections, each exposing a `body_parts` mapping from
      part id to an object with `x` and `y` attributes.
    flag: initial value of the "at least one complete body" indicator.

  Returns:
    (dict_coor, flag): `dict_coor` maps 'body_<index>' to a flat array of the
    reordered joint coordinates for each body that has all of part ids 0-13;
    `flag` is True if any such body was found, otherwise the value passed in.
  """
  wanted_part_ids = tuple(range(14))
  dict_coor = {}
  for index, body in enumerate(human):
    coords = [[part.x, part.y]
              for part_id, part in body.body_parts.items()
              if part_id in wanted_part_ids]
    # Bodies with any of the 14 parts missing are ignored.
    if len(coords) != 14:
      continue
    dict_coor[f'body_{index}'] = open2ntu_pose(np.array(coords)).flatten()
    flag = True
  return dict_coor, flag

In [82]:
#for save video visualize
# Reads the input video frame by frame, runs pose estimation on each frame,
# accumulates per-body skeletons, classifies the action once a body has 16
# frames of skeleton data, and writes the annotated frames to an output video.
video_path = '/content/IMG_5054.MOV'
showBG = True
cap = cv2.VideoCapture(video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Rotation metadata is optional: if probing fails, process frames unrotated.
try:
  rotateCode = check_rotation(video_path)
except Exception:
  rotateCode = None

if not cap.isOpened():
  print('Error opening video stream or file')

fps_time = 0
# The writer is created from the first processed frame so its frame size always
# matches the frames being written (the previous fixed (height, width) size
# only matched frames rotated by 90/270 degrees).
out = None

body_dict_concat = {}
image_array = []
label_default = ''
try:
  while True:
    ret_val, image = cap.read()
    # Stop cleanly at end of stream / read failure instead of relying on a
    # later call failing on a None frame.
    if not ret_val:
      break
    if rotateCode is not None:
      image = correct_rotation(image, rotateCode)
    try:
      humans = e.inference(image,
                           resize_to_default=(w > 0 and h > 0),
                           upsample_size=4.0)
    except Exception as err:
      print(f'Pose inference failed, stopping: {err}')
      break
    if not showBG:
      # Same shape AND dtype as the frame so it can still be drawn on/written.
      image = np.zeros_like(image)

    image = TfPoseEstimator.draw_humans(image, humans, imgcopy=False)
    cv2.putText(image, "FPS: %f" % (1.0 / (time.time() - fps_time)), (30, 50),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # #get x,y coordinate
    # Position of part 0 for every detected human that has it.
    lst_location = [[human.body_parts[0].x, human.body_parts[0].y]
                    for human in humans if 0 in human.body_parts]

    #get keypoint of every people in a frame
    body_keypoint_coor_dict, flag = get_skeleton_coor(humans)
    if flag:
      image_array.append(image)

    #append these skeleton to a dict of all people skeleton
    for body, val in body_keypoint_coor_dict.items():
      if body not in body_dict_concat:
        body_dict_concat[body] = val
      else:
        try:
          temp_val = np.concatenate([body_dict_concat[body], val], axis=0)
        except ValueError:
          break
        body_dict_concat[body] = temp_val

    #if any body is equal to 16 -> predict label
    count = 0
    for body, val in body_dict_concat.items():
      # 448 = 16 frames x 14 joints x 2 coords (see the reshape below).
      if len(val) == 448:
        val_16frame = val.reshape(1, 16, 14, 2)
        np_image_array = np.asarray(image_array)
        np_image_array = np.expand_dims(np_image_array, axis=0)
        inputs = preprocess_vpn_input(np_image_array, val_16frame)
        action_id, _ = model.predict(inputs)
        label_default = decode_predict(action_id)
        print(label_default)
        # Start a fresh window for this body and for the frame buffer.
        body_dict_concat[body] = []
        image_array = []
        #write label to frames
        cv2.putText(image, f'[{label_default}]', (30, 100), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 0, 0), 3)
      else:
        # Best effort: there may be no stored location for this body index.
        # NOTE(review): width/height come from the capture properties, not
        # from the (possibly rotated) frame -- confirm the label placement.
        try:
          cv2.putText(image, f'[{label_default}]', (int(lst_location[count][0]*width-70), int(lst_location[count][1]*height - int(lst_location[count][1]*height //2))), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 0, 0), 3)
        except (IndexError, ValueError):
          pass
      count += 1

    #write to video
    if out is None:
      frame_h, frame_w = image.shape[:2]
      # VideoWriter takes the frame size as (width, height).
      out = cv2.VideoWriter('/content/bat_tay_output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 30, (frame_w, frame_h))
    out.write(image)
    fps_time = time.time()
    # Bitwise AND masks the key code; the previous `and` compared the constant
    # 0xFF with ord('q'), which is always False.
    if cv2.waitKey(1) & 0xFF == ord('q'):
      break
finally:
  # Release the writer and capture even if a frame raised mid-loop, so the
  # output file is finalised.
  if out is not None:
    out.release()
  cap.release()
  cv2.destroyAllWindows()

walk_towards
shake_hand
walk_towards
walk_apart


In [85]:
image_array[0].shape

(1920, 1080, 3)