In [1]:
#Based on https://www.kaggle.com/code/hengck23/generate-your-own-pose-data-from-video

In [2]:
import cv2
import mediapipe as mp 
import numpy as np
import pandas as pd
from tensorflow.lite.python.interpreter import Interpreter
import json

In [3]:
# Short aliases for the MediaPipe solution modules referenced by later cells.
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [4]:
# Ground-truth label of the clip; printed next to the model's prediction for comparison.
Truth_value = "COW"

In [5]:
# Open the input clip and create the holistic landmark detector used by the frame loop.
video_file = 'sign_mp4/COW.mp4'
cap = cv2.VideoCapture(video_file)
# Fail fast: if the capture did not open, the frame loop would be skipped silently and the
# notebook would only break later, when concatenating an empty list of frames.
if not cap.isOpened():
    raise OSError(f"Could not open video file: {video_file}")
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.1)

In [6]:
# (value of the 'type' column, attribute on the holistic result, landmarks in the group),
# listed in the order the rows are emitted for every frame.
# 468 + 21 + 33 + 21 = 543 rows per frame, which is what ROWS_PER_FRAME expects downstream.
LANDMARK_GROUPS = (
    ('face', 'face_landmarks', 468),
    ('left_hand', 'left_hand_landmarks', 21),
    ('pose', 'pose_landmarks', 33),
    ('right_hand', 'right_hand_landmarks', 21),
)


def _landmark_rows(landmark_list, kind, count, y_scale):
    """Build the row dicts for one landmark group of a single frame.

    Parameters
    ----------
    landmark_list : object or None
        The group's attribute taken from the holistic result; ``None`` when the group
        was not detected in this frame.
    kind : str
        Value stored in the 'type' column of every returned row.
    count : int
        Number of landmarks the group must contain.
    y_scale : float
        Factor applied to each landmark's y coordinate.

    Returns
    -------
    list of dict
        Exactly ``count`` rows with keys 'type', 'landmark_index', 'x', 'y', 'z'.
        When the group is missing, all coordinates are NaN so that every frame still
        contributes the same number of rows.
    """
    if landmark_list is None:
        return [
            {'type': kind, 'landmark_index': i, 'x': np.nan, 'y': np.nan, 'z': np.nan}
            for i in range(count)
        ]

    points = landmark_list.landmark
    assert len(points) == count, f"{kind}: expected {count} landmarks, got {len(points)}"
    return [
        {'type': kind, 'landmark_index': i, 'x': p.x, 'y': p.y * y_scale, 'z': p.z}
        for i, p in enumerate(points)
    ]


video_df = []   # one DataFrame per frame; concatenated in a later cell
frame_no = 0
try:
    while cap.isOpened():
        # Printed before the read, so the last number shown equals the count of frames read.
        print('\r',frame_no,end='')
        success, image = cap.read()
        if not success:
            break

        # Upscale the frame 4x in both directions before detection.
        image = cv2.resize(image, dsize=None, fx=4, fy=4)
        height, width, _ = image.shape

        image.flags.writeable = False
        # OpenCV delivers BGR; convert to RGB before handing the frame to the detector.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        result = holistic.process(image)

        # y is rescaled by the aspect ratio (height / width);
        # presumably so x and y end up in the same unit -- TODO confirm against the model's training data.
        fy = height / width

        data = []
        for kind, attr, count in LANDMARK_GROUPS:
            data.extend(_landmark_rows(getattr(result, attr), kind, count, fy))

        frame_df = pd.DataFrame(data)
        frame_df.loc[:, 'frame'] = frame_no
        frame_df.loc[:, 'height'] = height / width   # normalised so that width == 1.0
        frame_df.loc[:, 'width'] = width / width
        video_df.append(frame_df)

        frame_no += 1
finally:
    # Always give the video handle back, even if detection raised part-way through.
    cap.release()
    # NOTE(review): `holistic` is intentionally left open here because it is created in a
    # different cell; call holistic.close() once no further frames will be processed.

 78

In [7]:
# Stack the per-frame tables into one long table.
# The isinstance guard keeps this cell re-runnable: after the first run video_df is already
# a DataFrame, and passing a DataFrame to pd.concat would raise.
if isinstance(video_df, list):
    if not video_df:
        raise ValueError("No frames were read from the video; nothing to concatenate")
    video_df = pd.concat(video_df)

In [8]:
# Preview the first five landmark rows of the combined table.
video_df.head(n=5)

Unnamed: 0,type,landmark_index,x,y,z,frame,height,width
0,face,0,0.511491,0.176394,-0.009212,0,0.5625,1.0
1,face,1,0.511283,0.162703,-0.023297,0,0.5625,1.0
2,face,2,0.511493,0.166513,-0.011153,0,0.5625,1.0
3,face,3,0.508465,0.143842,-0.018881,0,0.5625,1.0
4,face,4,0.511475,0.157312,-0.025251,0,0.5625,1.0


In [9]:
ROWS_PER_FRAME = 543  # number of landmarks per frame
def load_relevant_data_subset(pq_path):
    """Turn a long landmark table into a (n_frames, ROWS_PER_FRAME, 3) float32 array.

    Parameters
    ----------
    pq_path : pandas.DataFrame
        Despite the name, this is indexed directly as an already-loaded table. It must
        contain the columns 'x', 'y' and 'z', with ROWS_PER_FRAME consecutive rows per
        frame.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_frames, ROWS_PER_FRAME, 3) holding x, y, z.

    Raises
    ------
    ValueError
        If the number of rows is not a whole multiple of ROWS_PER_FRAME.
    """
    data_columns = ['x', 'y', 'z']
    data = pq_path[data_columns]

    # Validate up front so a truncated or padded table fails with a readable message
    # instead of a size mismatch raised from inside reshape.
    n_frames, leftover = divmod(len(data), ROWS_PER_FRAME)
    if leftover:
        raise ValueError(
            f"Expected a multiple of {ROWS_PER_FRAME} rows, got {len(data)} "
            f"({leftover} rows left over after {n_frames} full frames)"
        )

    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

In [10]:
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises
    ------
    FileNotFoundError
        If ``file_path`` does not exist.
    ValueError
        If the file content cannot be decoded as JSON.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the platform's
        # default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError as err:
        # Re-raise with a message naming the path; keep the original error as the cause.
        raise FileNotFoundError(f"File not found: {file_path}") from err
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass; keep the parser details as the cause.
        raise ValueError(f"Invalid JSON data in file: {file_path}") from err

In [11]:
# Load the TFLite model, record the signatures it exposes, and get a callable
# runner for its "serving_default" signature.
interpreter = Interpreter(model_path="model.tflite")
found_signatures = list(interpreter.get_signature_list())
prediction_fn = interpreter.get_signature_runner("serving_default")

In [12]:
# Invert the JSON mapping (key -> value) so a predicted class index looks up its sign name.
sign_to_index = read_json_file("sign_to_prediction_index_map.json")
decoder = {index: sign_name for sign_name, index in sign_to_index.items()}

In [13]:
# Run the model on the extracted landmarks and report the best-scoring sign.
output = prediction_fn(inputs=load_relevant_data_subset(video_df))
scores = output["outputs"]
sign = np.argmax(scores)

print('Truth:',Truth_value)

print("PRED : ", decoder[sign] + '  (Probability: ', max(scores) , ')')

# Indices of the ten highest scores, best first, paired with their sign names.
poss_pred = np.argsort(scores)[-10:][::-1]
poss_predict = [(decoder[idx], scores[idx]) for idx in poss_pred]
print()
print("Top 10 Possible Prediction")
print(poss_predict)

Truth: COW
PRED :  cow  (Probability:  0.03423151 )

Top 10 Possible Prediction
[('cow', 0.03423151), ('owl', 0.0182408), ('drop', 0.013346677), ('drink', 0.011414438), ('balloon', 0.01050549), ('tomorrow', 0.0098453965), ('noisy', 0.0096206935), ('bird', 0.009519571), ('bug', 0.008686943), ('grandpa', 0.008581198)]
