# Hand Gesture Decoder

## Table of Contents
0. [Import Dependencies](#install)
1. [Hands Detection using MediaPipe](#detection) 
2. [Feature Extraction](#feature-extraction) 
    1. [Write Columns Head in CSV File](#csv-header)
    2. [Extract Features of Assigned Class](#save-coordinates)
3. [Train Neural Network Using Tensorflow](#model)
    1. [Load and Preprocess Input Data](#load-input)
    2. [Train A MLP Model](#training)
    3. [Evaluate and Serialize Model](#evaluate)
4. [Real-time Detections with Model](#real-time-detection)

## 0. Import Dependencies <a id="install"></a>

In [None]:
%load_ext autotime

In [None]:
from mediapipe import solutions as mp
import cv2
import time
import os
import csv
import os
import numpy as np
import pandas as pd
import keyboard  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser

## 1. Hands Detection using MediaPipe <a id="detection"></a>

To connect your phone camera:
- Install `DroidCam` App on your phone and `DroidCam Client` on your laptop. 
- Put the IP address in the `cv2.VideoCapture()` below.
- Alternatively, use any other app that exposes your phone camera as an IP camera.

In [None]:
# Live preview: track one hand with MediaPipe and draw its landmarks on the
# camera feed until "q" is pressed.
# Using phone camera ip with DroidCam
# Webcam is 0
#cap = cv2.VideoCapture("http://192.168.100.6:4747/mjpegfeed?640x480")
# cap = cv2.VideoCapture("http://192.168.0.160:4747/video?640x480")
cap = cv2.VideoCapture(0)

# Timestamp of the previous frame, used for the FPS read-out
pTime = 0

# represents the top left corner of rectangle
start_point = (300, 100)

# represents the bottom right corner of rectangle
end_point = (600, 400)

# Blue color in BGR
color = (255, 0, 0)

# Line thickness of 2 px
thickness = 2

try:
    with mp.hands.Hands(max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:
        while True:
            success, frame = cap.read()

            # Stop when the camera delivers no frame
            if not success:
                break

            # Recolor feed (OpenCV delivers BGR, MediaPipe is fed RGB here)
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make detections
            results = hands.process(image)

            if results.multi_hand_landmarks:
                for hand_landmark in results.multi_hand_landmarks:
                    # Draw hand landmarks
                    mp.drawing_utils.draw_landmarks(image, hand_landmark, mp.hands.HAND_CONNECTIONS)

            # Display framerate (the 0.01 keeps the division safe when two
            # frames carry the same timestamp)
            cTime = time.time()
            fps = 1/(cTime-pTime+0.01)
            pTime = cTime
            cv2.putText(image, f"FPS: {int(fps)}", (500, 50), cv2.FONT_HERSHEY_PLAIN,
                        2, (255, 0, 0), 2)

            # Recolor for rendering
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # Draw a rectangle with blue line borders of thickness of 2 px
            image = cv2.rectangle(image, start_point, end_point, color, thickness)

            cv2.imshow("Hand detection", image)

            # Press "q" to exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

## 2. Feature Extraction <a id="feature-extraction"></a>

In [None]:
# Number of landmarks MediaPipe reports per hand. It is taken from the
# HandLandmark enum rather than from the last processed frame, because
# results.multi_hand_landmarks is None whenever that frame showed no hand.
num_coords = len(mp.hands.HandLandmark)
num_coords


### Write Columns Head in CSV File <a id="csv-header"></a>

In [None]:
# CSV header: the label column followed by x/y/z/v for every landmark,
# with landmarks numbered from 1 to num_coords.
landmarks = ['class'] + [
    f'{axis}{idx}'
    for idx in range(1, num_coords + 1)
    for axis in ('x', 'y', 'z', 'v')
]

In [None]:
# Make sure the data folder exists and the gesture CSV starts with the header
# row. The header goes into the same file the capture cell appends to and the
# training cell reads; without it pandas would use the first sample as column
# names and there would be no "class" column.
csv_path = "data/3_hand_gesture_coords.csv"

if not os.path.exists("data"):
    os.mkdir("data")

# Write the header only when the file is missing or empty, so samples that
# were collected earlier are never truncated.
if not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0:
    with open(csv_path, mode="w", newline="") as f:
        csv_writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(landmarks)

### Extract Features of Assigned Class <a id="save-coordinates"></a>

In [None]:
# Record training samples for one gesture: every frame in which a hand is
# detected appends one row (label + landmark values) to the CSV.
# Key in the class_name and start training :)
class_name = "Bad"

cap = cv2.VideoCapture(0)
# cap = cv2.VideoCapture("http://192.168.0.160:4747/video?640x480")

# Timestamp of the previous frame, used for the FPS read-out
pTime = 0

# represents the top left corner of rectangle
start_point = (300, 100)

# represents the bottom right corner of rectangle
end_point = (600, 400)

# Blue color in BGR
color = (255, 0, 0)

# Line thickness of 2 px
thickness = 2

try:
    # The CSV is opened once for the whole session instead of once per frame
    with open("data/3_hand_gesture_coords.csv", mode="a", newline="") as f, \
            mp.hands.Hands(max_num_hands=1, min_detection_confidence=0.5,
                           min_tracking_confidence=0.5) as hands:
        csv_writer = csv.writer(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)

        while True:
            success, frame = cap.read()

            # Stop when the camera delivers no frame
            if not success:
                break

            # Recolor feed
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make detections
            results = hands.process(image)

            if results.multi_hand_landmarks:
                for hand_landmark in results.multi_hand_landmarks:

                    # Draw hand landmarks
                    mp.drawing_utils.draw_landmarks(image, hand_landmark, mp.hands.HAND_CONNECTIONS)

                    # Flatten x, y, z, visibility of every landmark into one row
                    hand = hand_landmark.landmark
                    row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility]
                                         for landmark in hand]).flatten())

                    # Put the class name in the first column
                    row.insert(0, class_name)

                    # Export to CSV
                    csv_writer.writerow(row)

            # Display framerate on every frame (as the preview loop does), so
            # pTime never goes stale while no hand is visible
            cTime = time.time()
            fps = 1/(cTime-pTime+0.01)
            pTime = cTime
            cv2.putText(image, f"FPS: {int(fps)}", (500, 50), cv2.FONT_HERSHEY_PLAIN,
                        2, (255, 0, 0), 2)

            # Recolor for rendering
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            # Draw a rectangle with blue line borders of thickness of 2 px
            image = cv2.rectangle(image, start_point, end_point, color, thickness)

            cv2.imshow("Hand detection", image)

            # Press "q" to exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even if the loop raised
    cap.release()
    cv2.destroyAllWindows()

## 3. Train Neural Network Using Tensorflow <a id="model"></a>

### Load and Preprocess Input Data <a id="load-input"></a>

In [None]:
# Load the landmark samples written by the capture cell (one row per detected hand)
df = pd.read_csv("data/3_hand_gesture_coords.csv")
df

In [None]:
# Number of recorded samples per gesture label
df['class'].value_counts()


In [None]:
# Labels are the "class" column; every other column is a feature,
# cast to float32 for the network.
y = df["class"]
X = df.drop(columns="class").astype('float32')

# Map the string labels to integer ids and store the id -> name table
# so the inference cell can decode predictions.
le = LabelEncoder()
y = le.fit_transform(y)
np.save("data/hand_geture_classes.npy", le.classes_)

In [None]:
from collections import Counter

# Samples per encoded class before any resampling
print(f'Original dataset shape {Counter(y)}')

In [None]:
# Balance the classes by over-sampling the minority classes with SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# n_jobs is deliberately not passed: imbalanced-learn deprecated that argument
# on SMOTE (since 0.10) and it does not influence the generated samples.
over = SMOTE(random_state=42, k_neighbors=4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# NOTE: only the over-sampler is applied; `pipeline` (over + under) is built
# above but not used for the resampling.
X_res, y_res = over.fit_resample(X, y)


In [None]:
# Samples per encoded class after over-sampling
print(f'Resampled dataset shape {Counter(y_res)}')

In [None]:
# Hold out 20% of the resampled data for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, random_state=47, test_size=0.2
)

# Width of the network input = number of feature columns
n_features = X_train.shape[-1]

# Width of the network output = number of distinct labels in the training split
n_class = np.unique(y_train).size

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

### Train A MLP Model <a id="training"></a>

In [None]:
# Multi-layer perceptron built with the Keras Sequential API
def create_model():
    """Return the uncompiled gesture classifier.

    The network has two hidden ReLU layers of 10 units each (He-normal
    initialisation) followed by a softmax layer. The input width is read
    from the notebook global ``n_features`` and the number of output units
    from ``n_class``.
    """
    hidden_kwargs = dict(activation='relu', kernel_initializer='he_normal')
    return Sequential([
        Dense(10, input_shape=(n_features,), **hidden_kwargs),
        Dense(10, **hidden_kwargs),
        Dense(n_class, activation='softmax'),
    ])


model = create_model()



In [None]:
# Compile the model (integer labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Fit the model; 20% of the training split is held back for validation.
# use_multiprocessing is not passed: it only applied to generator /
# keras.utils.Sequence input and Keras 3 no longer accepts it in fit().
history = model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0,
                    validation_split=0.2, shuffle=True)

# Plot learning curves (training vs. validation loss per epoch)
plt.title('Learning Curves')
plt.xlabel('Epoch')
plt.ylabel('Cross Entropy')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='val')
plt.legend()
plt.show()

In [None]:
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import StratifiedKFold


### Evaluate and Serialize Model <a id="evaluate"></a>

In [None]:
# Score the trained model on the held-out split
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %.3f' % acc)

# Smoke-test the prediction path with one random feature vector
row = np.random.randn(1, n_features)
yhat = model.predict([row])
top_class = np.argmax(yhat)
top_label = list(le.inverse_transform([top_class]))[0]
print('Predicted: %s (class=%d | %s)' % (yhat, top_class, top_label))

In [None]:
# Create the output folder on first use
model_dir = "generated_model"
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

# Serialize the trained network in HDF5 format
model.save("generated_model/3.1a_hand_gesture_model.h5")

## 4. Real-time Detections with Model <a id="real-time-detection"></a>

In [None]:
# Load the model from file
model_inference = load_model("generated_model/3.1a_hand_gesture_model.h5")

# Load the class names
# The encoder is not re-fitted; it only gets the saved id -> name table so
# inverse_transform can decode predicted class ids.
le = LabelEncoder()
# NOTE(review): allow_pickle=True unpickles the file content; only load a
# classes file that this notebook produced itself.
le.classes_ = np.load("data/hand_geture_classes.npy", allow_pickle=True)

# Define a text list to store result
text = []

In [None]:
def SignDetection(text):
    """Read signs from the webcam and append the confirmed sentence to ``text``.

    Runs the MediaPipe hand tracker on camera 0, classifies each detected
    hand with the notebook-level ``model_inference`` and ``le`` and shows
    the predicted class, its probability and the text collected so far.

    Keys:
        Enter: append the current class if its probability is at least 0.8.
        Space: append a blank.
        q: end the session.

    Args:
        text: list that receives the joined result string when at least one
            entry was collected. It is modified in place.

    Returns:
        None.
    """
    cap = cv2.VideoCapture(0)

    # Timestamp of the previous frame, used for the FPS read-out
    pTime = 0

    # represents the top left corner of rectangle
    start_point = (300, 60)

    # represents the bottom right corner of rectangle
    end_point = (600, 380)

    # Blue color in BGR
    color = (255, 0, 0)

    # Line thickness of 2 px
    thickness = 2

    result = []

    try:
        with mp.hands.Hands(max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5) as hands:

            while True:
                success, frame = cap.read()

                # Stop when the camera delivers no frame
                if not success:
                    break

                # Recolor feed
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Make detections
                results = hands.process(image)

                if results.multi_hand_landmarks:
                    for hand_landmark in results.multi_hand_landmarks:

                        # Draw hand landmarks
                        mp.drawing_utils.draw_landmarks(image, hand_landmark, mp.hands.HAND_CONNECTIONS)

                        # Same feature layout as the training rows:
                        # x, y, z, visibility per landmark, flattened
                        hand = hand_landmark.landmark
                        row = np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility]
                                        for landmark in hand]).flatten()

                        # Predict using inference model (verbose=0 avoids a
                        # progress line being printed for every frame)
                        pred = model_inference.predict(row.reshape(1, -1), verbose=0)
                        class_name = list(le.inverse_transform([np.argmax(pred)]))[0]
                        prob = np.max(pred).round(2)
                        output = ''.join(result)  # Define Output Text

                        # Display result
                        cv2.rectangle(image, (0,0), (250,60), (245, 117, 16), -1)
                        cv2.putText(image, "CLASS", (95,12), cv2.FONT_HERSHEY_SIMPLEX,
                                    0.5, (0,0,0), 1, cv2.LINE_AA)
                        cv2.putText(image, class_name, (90,40), cv2.FONT_HERSHEY_SIMPLEX,
                                    1, (255, 255, 255), 2, cv2.LINE_AA)
                        cv2.putText(image, "PROB", (15,12), cv2.FONT_HERSHEY_SIMPLEX,
                                    0.5, (0,0,0), 1, cv2.LINE_AA)
                        cv2.putText(image, str(prob), (10,40), cv2.FONT_HERSHEY_SIMPLEX,
                                    1, (255, 255, 255), 2, cv2.LINE_AA)

                        # Display Output Text
                        cv2.rectangle(image, (0,400), (650,500), (245, 117, 16), -1)
                        cv2.putText(image, "RESULT", (15,420), cv2.FONT_HERSHEY_SIMPLEX,
                                    0.5, (0,0,0), 1, cv2.LINE_AA)
                        cv2.putText(image, output, (10,460), cv2.FONT_HERSHEY_SIMPLEX,
                                    1, (255, 255, 255), 2, cv2.LINE_AA)

                        # Enter confirms the current sign when the model is confident
                        if prob >= 0.8 and keyboard.is_pressed("enter"):
                            result.append(class_name)
                            # Pause so one key press is not counted on several frames
                            time.sleep(1)

                        # Space inserts a blank between words
                        if keyboard.is_pressed("space"):
                            result.append(' ')
                            time.sleep(1)

                # Display framerate on every frame; the 0.01 keeps the division
                # safe when two frames carry the same timestamp
                cTime = time.time()
                fps = 1/(cTime-pTime+0.01)
                pTime = cTime
                cv2.putText(image, f"FPS: {int(fps)}", (500, 50), cv2.FONT_HERSHEY_PLAIN,
                            2, (255, 0, 0), 2)

                # Recolor for rendering
                image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
                # Draw a rectangle with blue line borders of thickness of 2 px
                image = cv2.rectangle(image, start_point, end_point, color, thickness)

                cv2.imshow("Hand detection", image)

                # Press "q" to exit
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # Always free the camera and close the window, even if the loop raised
        cap.release()
        cv2.destroyAllWindows()

    # Store into text list (only when something was collected)
    if result:
        text.append(''.join(result))


https://www.kaggle.com/datamunge/sign-language-mnist

# UI Console

In [None]:
import pandas as pd

# Shop menu as (name, category, price) records
menu = [
    ('MOCHA', 'COFFEE', 8.99),
    ('LATTE', 'COFFEE', 9.99),
    ('CHOCO', 'COFFEE', 10.99),
    ('APPLE', 'JUICE', 7.99),
    ('OREN', 'JUICE', 6.99),
    ('MANGO', 'JUICE', 5.99),
    ('PANDAN', 'CAKE', 12.59),
    ('CARAMEL', 'CAKE', 10.10),
    ('CARROT', 'CAKE', 15.20),
]

# Turn the records into column lists and build the lookup table from them
names, categories, prices = zip(*menu)
data = {'Name': list(names), 'Category': list(categories), 'Price': list(prices)}
df = pd.DataFrame(data)
df

In [None]:
def Menu(text):
    """Print the name and price of every menu item in category ``text[0]``.

    The items are looked up in the notebook-level ``df`` table; prices are
    printed with two decimals.
    """
    in_category = df[df['Category'] == text[0]]

    print("\nPlease Pick One From Below: ")
    print("Name        Price")
    print("-----------------")
    for name, price in zip(in_category['Name'], in_category['Price']):
        print('%s       %.2f' % (name, price))
    print("\n")


In [None]:
def Price(text):
    """Append the price of the item named ``text[1]`` to ``text`` in place.

    One value is appended per row of the notebook-level ``df`` whose
    ``Name`` equals ``text[1]``; nothing is appended when no row matches.
    """
    matching_prices = df.loc[df['Name'] == text[1], 'Price']
    text.extend(matching_prices.to_numpy())


In [None]:
def Main(text):
    """Run the two-step ordering dialogue driven by sign detection.

    Step 1 asks the user to sign a category and prints that category's
    items; step 2 asks the user to sign an item and prints the order
    summary with its price. The entries are collected in ``text`` as
    ``[category, item, price]``.

    Args:
        text: list the detected entries are appended to (expected to be
            empty on entry, because the entries are read back by index).

    Returns:
        The same ``text`` list. It is returned early, after a message, when
        no sign was confirmed or the signed item is not on the menu.

    Raises:
        Exception: if the user types a number other than 1 at a prompt.
        ValueError: if the typed input is not an integer (raised by ``int``).
    """
    a = input("ZhenLi Coffee Shop \n"
              "Please Pick One Category to Proceed \n"
              " - Coffee \n - Juice \n - Cake \n"
              "1. Start Sign Detection \n"
              "Your input is: ")
    if int(a) != 1:
        raise Exception("Please Enter Correct Input")

    print("\nOpening Camera....")
    print("Tips: 'Enter' to confirm sign || 'q' to end program")
    SignDetection(text)  # appends the signed category to text
    if len(text) < 1:
        # The camera was closed without any confirmed sign
        print("\nNo sign was confirmed. Order cancelled.")
        return text
    print("\nYou have picked : " + text[0])
    Menu(text)  # Print all menu items of the category

    b = input("1. Start Sign Detection \n"
              "Your input is: ")
    if int(b) != 1:
        raise Exception("Please Enter Correct Input")

    print("\nOpening Camera....")
    print("Tips: 'Enter' to confirm sign || 'q' to end program")
    SignDetection(text)  # appends the signed item name to text
    if len(text) < 2:
        print("\nNo sign was confirmed. Order cancelled.")
        return text
    print("\nYou have picked : " + text[1])

    Price(text)  # appends the price when the item is on the menu
    if len(text) < 3:
        print("\nSorry, %s is not on the menu." % text[1])
        return text

    print("\n\n--------------")
    print("Your Orders:")
    print("--------------")
    print("You have ordered a %s %s" % (text[1], text[0]))
    print("Total Price: RM %.2f" % text[2])

    return text

In [None]:
# Start from an empty order and run the ordering dialogue;
# Main returns the list it filled.
text=[]
text = Main(text)

# POS Tagging

In [None]:
# Create DataFrame with one row per entry of `text`:
# the entry itself and its POS-tagged tokens
tagged_rows = []

for x in text:
    print(x)
    # str() keeps word_tokenize from failing on non-string entries
    # (Price() appends the numeric price to `text`)
    tag_text = word_tokenize(str(x))
    pos_text = nltk.pos_tag(tag_text)
    tagged_rows.append([x, pos_text])

# Build the frame in one go instead of growing it row by row
df = pd.DataFrame(tagged_rows, columns=['text', 'tagged_text'])

df

# Parsing (Parse Tree)

In [None]:
# Extract all parts of speech from any text
# The PP rule refers to the P chunk defined two rules earlier, so its tag
# must be upper-case <P>; a lower-case <p> never matches.
chunker = RegexpParser("""
                       NP: {<DT>?<JJ>*<NN>}    #To extract Noun Phrases
                       P: {<IN>}               #To extract Prepositions
                       V: {<V.*>}              #To extract Verbs
                       PP: {<P> <NP>}          #To extract Prepositional Phrases
                       VP: {<V> <NP|PP>*}      #To extract Verb Phrases
                       """)

# Chunk every tagged entry and show it before and after parsing
for sentence, tagged in zip(df['text'], df['tagged_text']):
    print("Before Extracting\n", sentence)
    output = chunker.parse(tagged)
    print("After Extracting\n", output)

    # To draw the parse tree
    output.draw()

# Grammar Designing

In [None]:
# Show the ten supported signs as a 2 x 5 grid of captioned pictures

#import libraries
import cv2
from matplotlib import pyplot as plt

# create figure
fig = plt.figure(figsize=(10, 7))

# setting values to rows and column variables
rows = 2
columns = 5

# (file, caption) of every sign, in display order
sign_pictures = [
    ('sign_pic/hello.png', "Hello"),
    ('sign_pic/thank you.png', "Thank you"),
    ('sign_pic/love.png', "Love"),
    ('sign_pic/call me.png', "Call me"),
    ('sign_pic/yes.png', "Yes"),
    ('sign_pic/no.png', "No"),
    ('sign_pic/bye.png', "Bye"),
    ('sign_pic/ok.png', "Ok"),
    ('sign_pic/nice.png', "Nice"),
    ('sign_pic/bad.png', "Bad"),
]

for position, (path, caption) in enumerate(sign_pictures, start=1):
    picture = cv2.imread(path)
    if picture is None:
        # cv2.imread returns None instead of raising when a file cannot be read
        raise FileNotFoundError(f"Could not read image: {path}")

    # Adds a subplot at the current grid position (1-based, row by row)
    fig.add_subplot(rows, columns, position)

    # OpenCV loads images in BGR order while matplotlib expects RGB
    plt.imshow(cv2.cvtColor(picture, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title(caption)

In [None]:
import pandas as pd
# Empty placeholder frame; a later cell rebuilds df1 with the tagged text
df1 = pd.DataFrame()

In [None]:
text1 = []

def Test(text1):
    """Run one sign-detection session and return the collected sentence list.

    Args:
        text1: list the detected sentence is appended to (in place) by
            ``SignDetection``.

    Returns:
        The same ``text1`` list.

    Raises:
        Exception: if the user types a number other than 1.
        ValueError: if the typed input is not an integer (raised by ``int``).
    """
    a = input("1. Start Sign Detection \n"
              "Your input is: ")
    userinput = int(a)

    if userinput != 1:
        raise Exception("Please Enter Correct Input")

    print("\nOpening Camera....")
    print("Tips: 'Enter' to confirm sign || 'q' to end program")
    SignDetection(text1)  # appends the detected sentence to text1
    print("\nYour Sentence is: " + ' '.join(map(str, text1)))

    # Return the list that was passed in, not the notebook-level `text`
    return text1

In [None]:
# Run one detection session; the returned list replaces text1
text1 = Test(text1)

In [None]:
# Create DataFrame with one row per entry of `text1`:
# the entry itself and its POS-tagged tokens
tagged_rows1 = []

for x in text1:
    print(x)
    # str() keeps word_tokenize from failing on non-string entries
    # (e.g. a numeric price)
    tag_text = word_tokenize(str(x))
    pos_text = nltk.pos_tag(tag_text)
    tagged_rows1.append([x, pos_text])

# Build the frame in one go instead of growing it row by row
df1 = pd.DataFrame(tagged_rows1, columns=['text', 'tagged_text'])

df1

In [None]:
# Extract all parts of speech from any text
# The PP rule refers to the P chunk defined two rules earlier, so its tag
# must be upper-case <P>; a lower-case <p> never matches.
chunker = RegexpParser("""
                       NP: {<DT>?<JJ>*<NN>}    #To extract Noun Phrases
                       P: {<IN>}               #To extract Prepositions
                       V: {<V.*>}              #To extract Verbs
                       PP: {<P> <NP>}          #To extract Prepositional Phrases
                       VP: {<V> <NP|PP>*}      #To extract Verb Phrases
                       """)

# Chunk every tagged entry and show it before and after parsing
for sentence, tagged in zip(df1['text'], df1['tagged_text']):
    print("Before Extracting\n", sentence)
    output = chunker.parse(tagged)
    print("After Extracting\n", output)

    # To draw the parse tree
    output.draw()

# Natural Language Generation