## Imports

`load_model`: Used to load in the models we saved  
`datetime`: Used to help name screenshots  
`mediapipe`: What will be drawing our hands for easier recognition by models  
`numpy`: Needed for a dtype conversion and for finding which action the model chose  
`pyautogui`: What will actually perform the specified actions  
`time`: Will be used as a sleep timer as well as naming screenshots  
`cv2`: OpenCv is what's used to access the webcam

In [1]:
from tensorflow.keras.models import load_model
from datetime import datetime
import mediapipe as mp
import numpy as np
import pyautogui
import time
import cv2

### Make sure `actions` is in the same order that `ImageDataGenerator` read the classes in, since it may have reordered them (alphabetically) compared with when they were first made
---
**Original order was this**
<img src = './notebook_imgs/original_indicies.png'>
---
**`ImageDataGenerator` changed the order to this**
<img src = './notebook_imgs/class_indicies.png'>

In [2]:
# Gesture labels; position i in this list is the name of the model's output class i
# (the prediction loop looks the label up with the argmax index).
# The entries are listed alphabetically.
actions = [
    'forward',
    'play_pause',
    'rewind',
    'screenshot',
    'volume_down',
    'volume_up',
]

### Load in the model

In [5]:
# Load the saved model from the relative path '../models/mobilenet.h5'.
# `mn` is used later via `mn.predict(...)` on a single 224 x 224 x 3 frame.
mn = load_model('../models/mobilenet.h5')

### Set up the MediaPipe hands model

In [4]:
# Shortcuts to the MediaPipe drawing helpers and the hands solution module.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# One hands detector shared by every frame: at most one hand is tracked, and both
# the detection and tracking confidence thresholds are set to 0.7.
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.7, min_tracking_confidence=0.7)

## Running everything together

In [7]:
# Webcam gesture-control loop.
#
# For every frame: detect a hand with MediaPipe, draw the landmarks on the frame,
# classify the annotated frame with the loaded model, and once the same confident
# prediction has been seen STREAK_LENGTH times in a row, trigger the matching
# pyautogui action. Press 'q' in the preview window to quit.

# a prediction is only counted when the model's top score reaches this value
MIN_CONFIDENCE = 0.90

# how many identical predictions in a row are needed before an action is performed
STREAK_LENGTH = 3

# actions that are carried out by pressing a single key
ACTION_KEYS = {
    'volume_up': 'volumeup',
    'volume_down': 'volumedown',
    'play_pause': 'space',
    'forward': 'right',
    'rewind': 'left',
}

# most recent confident predictions (trimmed to the last STREAK_LENGTH entries)
action_seq = []

# instantiate OpenCv
cap = cv2.VideoCapture(0)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # no frame was delivered; converting an empty frame below would raise
            break

        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # convert image from BGR to RGB to work with mediapipe
        image = cv2.flip(image, 1)                      # flip on horizontal
        image.flags.writeable = False                   # set flag to False
        results = hands.process(image)                  # actually makes the detections
        image.flags.writeable = True                    # set flag back to True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # set color back to BGR

        if results.multi_hand_landmarks is not None:
            # draw the mediapipe hands on there
            for hand in results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    image, hand, mp_hands.HAND_CONNECTIONS,
                    mp_drawing.DrawingSpec(color=(51, 51, 255), thickness=2, circle_radius=2),
                    mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2, circle_radius=2))

            # Our model is expecting an image of size 224 x 224
            small_image = cv2.resize(image, (224, 224), interpolation=cv2.INTER_AREA)

            # passing image to the model (scaled to [0, 1] as float32, batch of one)
            y_pred = mn.predict((small_image.reshape(1, 224, 224, 3) / 255).astype(np.float32))

            # index and score of the most likely class
            i_pred = int(np.argmax(y_pred))
            conf = y_pred[0][i_pred]

            # Show '?' until the model has settled on one action
            this_action = '?'

            # Low-confidence predictions are ignored, but the frame is still displayed
            # below so the preview keeps updating and 'q' keeps working.
            if conf >= MIN_CONFIDENCE:
                action = actions[i_pred]
                action_seq.append(action)

                # only the last STREAK_LENGTH predictions matter; drop older ones so
                # the list does not grow for as long as the loop runs
                del action_seq[:-STREAK_LENGTH]

                # Need the same prediction STREAK_LENGTH times in a row before acting
                if len(action_seq) == STREAK_LENGTH and len(set(action_seq)) == 1:
                    this_action = action

                    if this_action == 'screenshot':
                        # timestamped file name so screenshots do not overwrite each other
                        now = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
                        pyautogui.screenshot(f'../screenshots/{now}.png')

                    elif this_action in ACTION_KEYS:
                        pyautogui.press(ACTION_KEYS[this_action])
                        if this_action == 'play_pause':
                            time.sleep(0.05)

            # print out the action that is being done ('?' if none was chosen)
            cv2.putText(image, f'{this_action.upper()}', org=(10, 50), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=2, color=(52, 79, 235), thickness=2)

            # short pause after each prediction
            time.sleep(.1)

        cv2.imshow('Gesture Control', image)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # always free the camera and close the preview window, even after an error
    cap.release()
    cv2.destroyAllWindows()