# Subsystem 1

# Model to get the landmark points and predict the gesture type using subsystem 2

In [1]:
from skimage.io import imread, imshow
from skimage.transform import resize
from skimage.feature import hog
from skimage import exposure
from sklearn import svm
from skimage import color
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
import matplotlib.pyplot as plt
import os
import numpy as np 
import joblib

%matplotlib inline

# This snippet allows to list all the frames saved to be used for training SVM classifier to differentiate palm and dorsal 

In [4]:
# Read the images dataset and prepare labels for Dorsal and Palm classifier
fnames = os.listdir('Images_data')
labels = []
for j in range(len(fnames)):
    val = ''
    temp = fnames[j].split('_')
    if 'palm' in temp:
        labels.append('palm')
    else:
        labels.append('dorsal')
print(len(fnames))
print(len(labels))

2400
2400


In [5]:
# Read the Images to extract the HOG features 

hog_features = []
img_length = 80
for i in fnames:
    file_name = 'Images_data' + '//' + i
    img = imread(file_name)
    #creating hog features  
    re_img = resize(img, (128,64))
    fd, hog_image = hog(re_img, orientations=9, pixels_per_cell=(8, 8), 
                    cells_per_block=(2, 2), visualize=True, multichannel=True)
    hog_features.append(fd)

In [8]:
# Reshape the vectors and shuffle the data

new_labels =  np.array(labels).reshape(len(labels),1)
new_hog_features = np.array(hog_features)
data_frame = np.hstack((new_hog_features,new_labels))
np.random.shuffle(data_frame)

In [9]:
# SPlit the dataset into train and test set

percentage = 80
partition = int(len(hog_features)*percentage/100)
x_train, x_test = data_frame[:partition,:-1],  data_frame[partition:,:-1]
y_train, y_test = data_frame[:partition,-1:].ravel() , data_frame[partition:,-1:].ravel()

In [10]:
# Set the SVM classifier model and fit the data to train

clf = svm.SVC()
clf.fit(x_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
# Save the SVM model

joblib.dump(clf, "Palmdorsal_class_model/Palmdorsal_Model") 

['Palmdorsal_class_model/Palmdorsal_Model']

In [11]:
# Evaluate the test set

y_pred = clf.predict(x_test)
print("Accuracy: "+str(accuracy_score(y_test, y_pred)))
print('\n')
print(classification_report(y_test, y_pred))

Accuracy: 1.0


              precision    recall  f1-score   support

      dorsal       1.00      1.00      1.00       246
        palm       1.00      1.00      1.00       234

    accuracy                           1.00       480
   macro avg       1.00      1.00      1.00       480
weighted avg       1.00      1.00      1.00       480



# This snippet lists all video files for which the gestures are to be predicted.

In [165]:
#Save training images for HOG features

import os
folder_loc = 'C:/Uppsala/Sem 2/Intelligent IS/Project/videos/'
arr = os.listdir(r'C:/Uppsala/Sem 2/Intelligent IS/Project/videos/.')
new_arr = []
for i in range(len(arr)):
    folder = folder_loc + arr[i] + '/.'
    fnames = os.listdir(folder)
    for j in range(len(fnames)):
        temp = folder_loc + arr[i] + '/' + fnames[j]
        new_arr.append(temp)
k=0
for i in range(len(new_arr)):
    cap = cv2.VideoCapture(new_arr[i])
    for j in range(40):
        ret,image = cap.read() 
        lst = new_arr[i].split('/')
        fname = lst[-1].split('.')[0] + '_' + str(k)
        cv2.imwrite('Images_data/%s.png' % fname,image)
        k+=1

In [318]:
# Detect palm or dorsal type 

input_source = r"C:\Uppsala\Sem 2\Intelligent IS\Project\videos\564\three_fingers_palm.webm"
cap = cv2.VideoCapture(input_source)
hasFrame, frame = cap.read()
re_img = resize(frame, (128,64))
fd, hog_image = hog(re_img, orientations=9, pixels_per_cell=(8, 8), 
                    cells_per_block=(2, 2), visualize=True, multichannel=True)
feature_pred = fd.reshape(1,len(fd))
res = clf.predict(feature_pred)
print(res)

['palm']


# This snippet is used to find the landmark points and draw them on the frames. Finally predict the gesture type using the saved model from susbsystem 2. 

In [2]:
# Landmarks extraction and drawing the keypoints for the gestures

import cv2
import time
import numpy as np
import joblib
import os,pickle
from keras.models import load_model

protoFile = "pose_deploy.prototxt"
weightsFile = "pose_iter_102000.caffemodel"
nPoints = 21
POSE_PAIRS = [ [0,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ]

threshold = 0.2

lfolder = r"C:\Uppsala\Sem 2\Intelligent IS\Project"
scaler_filename = "Subsystem2_model/minmaxscaler.save"
subsys2_model = "Subsystem2_model/GestureRecogModel.tfl"
palm_dorsal_model = "Palmdorsal_class_model/Palmdorsal_Model"

gesture_types = ['Fist dorsal','Fist palm','Open dorsal','Open palm','Three fingers dorsal','Three fingers palm']

folder_loc = 'C:/Uppsala/Sem 2/Intelligent IS/Project/videos/'
ved_dirs = os.listdir(r'C:/Uppsala/Sem 2/Intelligent IS/Project/videos/.')
ved_file_names = []
for i in range(len(ved_dirs)):
    folder = folder_loc + ved_dirs[i] + '/.'
    fnames = os.listdir(folder)
    for j in range(len(fnames)):
        temp = folder_loc + ved_dirs[i] + '/' + fnames[j]
        ved_file_names.append(temp)
        
net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)
backend = cv2.CAP_ANY
fourcc_code = cv2.VideoWriter_fourcc(*'MP4V')
fps = 24
num_frames = 0
for l in range(len(ved_file_names)):
    print(ved_file_names[l])
    if ved_file_names[l] != r'C:/Uppsala/Sem 2/Intelligent IS/Project/videos/102/fist_palm.webm':
        continue
    print(num_frames)
    num_frames = 0
    annot_file_name = ved_file_names[l].split('/')[-1].split('.')[0] + '_' + str(l) + '.mp4'  
    annot_file_name = 'Annotated_2/' + annot_file_name
    print(annot_file_name)
    cap = cv2.VideoCapture(ved_file_names[l])

    frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_size = (int(frameWidth),int(frameHeight))

    aspect_ratio = frameWidth/frameHeight

    inHeight = 368
    inWidth = int(((aspect_ratio*inHeight)*8)//8)
    vid_writer = cv2.VideoWriter(annot_file_name, backend, fourcc_code, int(fps), frame_size)
    hasFrame, frame = cap.read()
    print(ved_file_names[l])
    flag = 1
    while 1:
        t = time.time()  
        hasFrame, frame = cap.read()      
        if not hasFrame:
            break        
        frameCopy = np.copy(frame)
        num_frames += 1    
        # Use the above trained model for classifying the frame as "Palm" or "Dorsal"
        
        if flag:
            re_img = resize(frame, (128,64))
            fd, hog_image = hog(re_img, orientations=9, pixels_per_cell=(8, 8), 
                            cells_per_block=(2, 2), visualize=True, multichannel=True)
            feature_pred = fd.reshape(1,len(fd))
            palm_dorsal_clas = joblib.load(palm_dorsal_model)
            region_type = palm_dorsal_clas.predict(feature_pred)
            flag = 0
            #print(region_type)    

        inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight),
                              (0, 0, 0), swapRB=False, crop=False)
    
        net.setInput(inpBlob)
        output = net.forward()

        # Empty list to store the detected keypoints
        points = []
        temp = []
        for i in range(nPoints):
            # confidence map of corresponding body's part.
            probMap = output[0, i, :, :]
            probMap = cv2.resize(probMap, (frameWidth, frameHeight))

            # Find global maxima of the probMap.
            minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)

            if prob > threshold :
                cv2.circle(frameCopy, (int(point[0]), int(point[1])), 6, (0, 255, 255), thickness=-1, lineType=cv2.FILLED)
                cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, .8, (0, 0, 255), 2, lineType=cv2.LINE_AA)

                # Add the point to the list if the probability is greater than the threshold
                points.append((int(point[0]), int(point[1])))
                temp.append(int(point[0]))
                temp.append(int(point[1]))
            else :
                points.append(None)
                temp.append(0)
                temp.append(0)
            
            # Format the landmark data to be passed to subsystem 2 for predicting the gesture type        
        del temp[2:4]
        if region_type == 'palm':
            lst = [0] * 40       #If palm type - then insert zeros for the dorsal keypoints
            temp.extend(lst)
        else:
            lst = [0] * 40       #If dorsal type - then insert zeros for the palm keypoints
            lst.extend(temp)     
            temp = lst
    
        # Draw Skeleton
        for pair in POSE_PAIRS:
            partA = pair[0]
            partB = pair[1]
            if points[partA] and points[partB]:
                cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2, lineType=cv2.LINE_AA)
                cv2.circle(frame, points[partA], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
                cv2.circle(frame, points[partB], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)

        
    
        scaler = joblib.load(scaler_filename)
        temp = np.array(temp).reshape(1,-1)
        ges_pred = scaler.transform(temp)
        
        model_loaded = load_model(subsys2_model)
        gesture = model_loaded.predict_classes(ges_pred)
        
        cv2.putText(frame,gesture_types[gesture[0]], (50, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 2, lineType=cv2.LINE_AA)
        cv2.imshow('Output-Skeleton', frame)
        key = cv2.waitKey(1)
        if key == 27:
            break

        print("Time Taken for frame = {}".format(time.time() - t))
        print(num_frames)
        vid_writer.write(frame)

vid_writer.release()

Using TensorFlow backend.


C:/Uppsala/Sem 2/Intelligent IS/Project/videos/102/fist_dorsal.webm
C:/Uppsala/Sem 2/Intelligent IS/Project/videos/102/fist_palm.webm
0
Annotated_2/fist_palm_1.mp4
C:/Uppsala/Sem 2/Intelligent IS/Project/videos/102/fist_palm.webm
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Time Taken for frame = 5.595217704772949
1
Time Taken for frame = 2.323603868484497
2
Time Taken for frame = 2.3979580402374268
3
Time Taken for frame = 2.5100085735321045
4
Time Taken for frame = 2.6271724700927734
5
Time Taken for frame = 5.997422933578491
6
Time Taken for frame = 6.4571373462677
7
Time Taken for frame = 6.818955421447754
8
Time Taken for frame = 6.493856191635132
9
Time Taken for frame = 6.948557376861572
10
Time Taken for frame = 7.110571622848511
11
Time Taken for frame = 7.452145338058472
12
Time Taken for frame = 7.593806505203247
13
Time Taken for frame = 7.77611