In [1]:
import numpy as np
import matplotlib.pyplot as plt
import mediapipe as mp
import time
import os

In [2]:
import cv2 as cv

In [3]:
# Shortcuts to the MediaPipe drawing helpers and the Holistic solution module.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

# One Holistic model instance, shared by the cells that process frames.
holistic = mp_holistic.Holistic(
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

In [4]:
# Drawing specs handed to mp_drawing.draw_landmarks (both use thickness 1 and circle radius 1).
style1 = mp_drawing.DrawingSpec(color=(131, 255, 0), thickness=1, circle_radius=1)
style2 = mp_drawing.DrawingSpec(color=(255, 221, 0), thickness=1, circle_radius=1)

In [5]:
def mediapipe_process(image, model):
    """Run ``model`` on a BGR frame and return the frame together with the results.

    The frame is converted BGR -> RGB before ``model.process`` is called and
    converted back RGB -> BGR afterwards.

    Args:
        image: frame in BGR channel order.
        model: object exposing ``process(rgb_image)``.

    Returns:
        tuple: ``(image, results)`` - the BGR image after the colour round trip
        and whatever ``model.process`` returned.
    """
    rgb_frame = cv.cvtColor(image, cv.COLOR_BGR2RGB)
    results = model.process(rgb_frame)
    bgr_frame = cv.cvtColor(rgb_frame, cv.COLOR_RGB2BGR)
    return bgr_frame, results
    

In [6]:
def draw_landmarks(image, results):
    """Draw the pose, both hands and the face contours from ``results`` onto ``image``.

    The groups are drawn in the order pose, left hand, right hand, face; every
    call receives ``style1`` and ``style2`` as its two drawing specs.
    """
    groups = (
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS),
    )
    for landmark_list, connections in groups:
        mp_drawing.draw_landmarks(image, landmark_list, connections, style1, style2)

In [7]:
# Live preview: run Holistic on every webcam frame and show the annotated image
# until ESC is pressed or the camera stops delivering frames.
# `image` and `results` of the last processed frame stay available for the cells below.
cap = cv.VideoCapture(0)
try:
    while cap.isOpened():
        ok, frame = cap.read()
        if not ok:  # a failed grab gives frame=None, which cv.cvtColor cannot handle
            break
        image, results = mediapipe_process(frame, holistic)
        draw_landmarks(image, results)
        cv.imshow("frame", image)
        if cv.waitKey(1) == 27:  # 27 == ESC
            break
finally:
    # Always free the camera and close the window, even if processing raised.
    cap.release()
    cv.destroyAllWindows()



In [17]:
results.face_landmarks.landmark  # the list of face landmarks (not displayed: only the last expression of a cell is shown)
results.face_landmarks.landmark[0]  # the first landmark object, printed with its x, y and z fields

x: 0.5417942
y: 0.70010716
z: -0.025448533

In [65]:
# Flatten the face landmarks into a 1-D (x, y, z, x, y, z, ...) vector;
# fall back to 1404 zeros when no face landmarks are available.
if results.face_landmarks:
    face = np.array([[lm.x, lm.y, lm.z] for lm in results.face_landmarks.landmark]).flatten()
else:
    face = np.zeros(1404)

In [66]:
# Length of the flattened face vector: 468 landmarks * 3 coordinates = 1404.
face.shape


(1404,)

In [69]:
# Flatten the pose landmarks into a 1-D (x, y, z, visibility, ...) vector;
# fall back to 132 zeros when no pose landmarks are available.
if results.pose_landmarks:
    pose = np.array(
        [[lm.x, lm.y, lm.z, lm.visibility] for lm in results.pose_landmarks.landmark]
    ).flatten()
else:
    pose = np.zeros(132)

In [70]:
# Compare the flattened pose length with 33 landmarks * 4 values (x, y, z, visibility).
pose.shape,33*4

((132,), 132)

In [36]:
# Number of landmarks reported for the left hand.
len(results.left_hand_landmarks.landmark)

21

In [38]:
# First left-hand landmark, printed with its x, y and z fields.
results.left_hand_landmarks.landmark[0]

x: 0.76958835
y: 0.63348305
z: 2.7013255e-07

In [56]:
# Flatten the left-hand landmarks into a 1-D (x, y, z, ...) vector;
# fall back to 63 zeros when no left-hand landmarks are available.
if results.left_hand_landmarks:
    lh = np.array([[lm.x, lm.y, lm.z] for lm in results.left_hand_landmarks.landmark]).flatten()
else:
    lh = np.zeros(63)

In [57]:
# Show the flattened left-hand vector.
lh

array([ 7.69588351e-01,  6.33483052e-01,  2.70132546e-07,  6.98390722e-01,
        5.93119085e-01, -1.61111150e-02,  6.48746967e-01,  5.14213622e-01,
       -2.08378024e-02,  6.31027758e-01,  4.34459209e-01, -2.63571087e-02,
        6.28490269e-01,  3.70662779e-01, -3.13980430e-02,  6.86570883e-01,
        4.08099413e-01,  2.96997512e-03,  6.62723601e-01,  3.21024448e-01,
       -1.32737849e-02,  6.47231102e-01,  2.66576499e-01, -2.93239672e-02,
        6.35680676e-01,  2.18851924e-01, -4.17023003e-02,  7.25880146e-01,
        3.91054809e-01, -3.71778477e-03,  7.12408900e-01,  2.84705460e-01,
       -1.78524610e-02,  7.01466084e-01,  2.17522681e-01, -3.35482731e-02,
        6.92329586e-01,  1.61942691e-01, -4.48535495e-02,  7.65181482e-01,
        3.94818842e-01, -1.48388082e-02,  7.54871905e-01,  2.95593262e-01,
       -3.22471112e-02,  7.44168520e-01,  2.35549107e-01, -4.45502438e-02,
        7.33477473e-01,  1.85654715e-01, -5.27232215e-02,  8.07887375e-01,
        4.15152311e-01, -

In [54]:
# Flatten the right-hand landmarks into a 1-D (x, y, z, ...) vector;
# fall back to 63 zeros when no right-hand landmarks are available.
# The landmarks must be read from `right_hand_landmarks`, the same attribute the
# condition checks; the previous `left_right_landmarks` attribute does not exist.
rh = (
    np.array([[landmark.x, landmark.y, landmark.z] for landmark in results.right_hand_landmarks.landmark]).flatten()
    if results.right_hand_landmarks
    else np.zeros(63)
)

In [55]:
# Show the right-hand vector (all zeros in the recorded output, i.e. the zero fallback was used).
rh

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
'''
1. If the left or the right hand is not detected, accessing results.left_hand_landmarks.landmark /
   results.right_hand_landmarks.landmark throws an error,
   so we need a blank array with the same length as the flattened landmark array.
   Without flattening we get an array of shape (21,3): 21 landmarks, each with x,y,z.
   After flattening we get an array of length 21*3=63.

   The same holds for the right hand:
   after flattening we get an array of length 21*3=63.

   To avoid the error we create an array of all zeros of length 63 using np.zeros(63).

2. Without flattening, the pose landmarks array has shape (33,4): 33 landmarks, each with x,y,z,visibility.
   Hence after flattening we get a single array of length 33*4=132.

   

3. Using face landmarks we can detect 468 landmarks on the face.
    Before flattening, the shape of the array is (468,3).
    After flattening we get length=468*3=1404.

    When no face is detected, create a blank array of all zeros: np.zeros(1404).

4. np.concatenate([a,b,c]) merges the arrays a,b,c into one single array.
    After concatenating our pose, face, lh and rh arrays we get a single big array of
    length=468*3+33*4+2*(21*3)=1662 total features.

5. Our basic idea is to collect 30 frames per second (30 fps) and to make predictions on the values of those
    30 frames.
    For one class we will have n videos, i.e. n actual data points:
        one video consists of 30 frames
        each frame has 1662 values
        the total number of values collected to make 1 prediction is 30*1662=49860

        So the input shape of our model will be (30,1662).
        Oops, that's too big, isn't it?

6. np.save(filename,array) saves the array to a file with the .npy extension.
    This is useful for storing the data points extracted from each frame.
'''

In [73]:
# Features per frame: face (468*3) + pose (33*4) + two hands (2 * 21*3).
468*3+33*4+2*(21*3)

1662

In [77]:
# Values per video: 30 frames * 1662 features per frame.
30*1662

49860

In [71]:
# Small np.concatenate demo: two 1-D ranges are joined end to end.
a = np.arange(10)
b = np.arange(10, 20)

np.concatenate((a, b))

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [8]:
def extract_landmarks(results):
    """Flatten the landmarks in ``results`` into one 1-D feature vector.

    Each group contributes the values of its landmarks in order, or a block of
    zeros when the group is missing (falsy):

    * pose       - x, y, z, visibility per landmark (132 zeros when missing)
    * face       - x, y, z per landmark (1404 zeros when missing)
    * left hand  - x, y, z per landmark (63 zeros when missing)
    * right hand - x, y, z per landmark (63 zeros when missing)

    Returns:
        np.ndarray: the concatenation ``[pose, face, left hand, right hand]``.
    """

    def flattened(landmark_list, fields, fallback_size):
        # A missing group is replaced by zeros so the vector keeps its layout.
        if not landmark_list:
            return np.zeros(fallback_size)
        rows = [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ('x', 'y', 'z')
    face = flattened(results.face_landmarks, xyz, 1404)
    pose = flattened(results.pose_landmarks, xyz + ('visibility',), 132)
    rh = flattened(results.right_hand_landmarks, xyz, 63)
    lh = flattened(results.left_hand_landmarks, xyz, 63)

    return np.concatenate([pose, face, lh, rh])

In [76]:
# Sanity check: the combined vector should have 1662 entries.
extract_landmarks(results).shape

(1662,)

In [78]:
# ###################################
        # Creating folders
####################################


In [9]:
# Root folder for the saved keypoint files (data_dir/<action>/<video>/<frame>).
data_dir = "data"

In [10]:
# Action labels; one sub-folder of data_dir is used per label.
actions = np.array([
    'hello',
    'thanks',
    'iloveyou',
    'please',
    'blank',
])

In [11]:
# Size of the dataset recorded per action.
no_of_videos = 30  # videos per action (will be n for isl)
no_of_frames = 30  # frames per video (will be 30 only for isl)

In [86]:
# create dirs (one folder per action and video: data_dir/<action>/<video index>)
# for action in actions:
#     for i in range(no_of_videos):
#         try:
#             os.makedirs(os.path.join(data_dir,action,str(i)))
#         except FileExistsError:
#             print('error')

In [88]:
# np.save('1',extract_landmarks(results))

In [12]:
# collect data
# `model` is an alias of the Holistic instance; the collection loop passes it to mediapipe_process.
model=holistic

In [13]:
# Record `no_of_videos` clips of `no_of_frames` frames for every action and save the
# keypoint vector of each frame to data_dir/<action>/<video>/<frame>.npy.
# Press ESC to stop early.
cap = cv.VideoCapture(0)
stop_requested = False  # set when ESC is pressed so that all three loops unwind
try:
    if not cap.isOpened():
        raise RuntimeError('could not open the webcam (device 0)')

    for action in actions:  # for each action
        if stop_requested:
            break
        print(action)
        for video in range(no_of_videos):  # for n videos
            if stop_requested:
                break
            # np.save does not create missing folders, so make sure the target exists.
            video_dir = os.path.join(data_dir, action, str(video))
            os.makedirs(video_dir, exist_ok=True)

            for frame in range(no_of_frames):
                # read the frame from the camera/webcam
                ok, framee = cap.read()
                if not ok:
                    # Fail loudly instead of passing frame=None on to cv.cvtColor.
                    raise RuntimeError('could not read a frame from the webcam')

                image, results = mediapipe_process(framee, model)
                draw_landmarks(image, results)
                keypoints = extract_landmarks(results)

                if frame == 0:  # first frame of a clip: announce it and pause
                    cv.putText(image, 'starting collection', (100, 200), cv.FONT_HERSHEY_COMPLEX_SMALL, 3, (0, 0, 255), 1, cv.LINE_AA)
                    cv.putText(image, f'For {action}  Video {video} Frame {frame}', (120, 32), cv.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv.LINE_AA)
                    cv.imshow('image', image)
                    key = cv.waitKey(4000)  # also lets ESC be caught during the pause
                else:
                    cv.putText(image, f'For Video {action} {video} Frame {frame}', (120, 32), cv.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1, cv.LINE_AA)
                    cv.imshow('image', image)
                    key = -1

                # np.save appends the .npy extension to the file name.
                np.save(os.path.join(video_dir, str(frame)), keypoints)

                if key == 27 or cv.waitKey(5) == 27:  # 27 == ESC
                    stop_requested = True
                    break
finally:
    # Runs on normal completion, on ESC and on errors, so the camera is never left open.
    cap.release()
    cv.destroyAllWindows()

hello
thanks
iloveyou
please
blank


In [13]:
# Release the webcam handle held by `cap`.
cap.release()