# Imports

In [None]:
import os
import cv2
import numpy as np
import mediapipe as mp
import threading
import imageio

from functools import partial
from matplotlib import pyplot as plt
import matplotlib.animation as animation

from sklearn.model_selection import train_test_split

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

from scipy import stats
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

import feedforward

In [None]:
# Root folders for the per-frame pose arrays (and the generated GIFs) and for
# the recorded videos. Path components are joined separately so the platform's
# separator is used instead of a hard-coded '/'.
NUMPY_ARRAYS_PATH = os.path.join('dist', 'numpy_arrays')
VIDEOS_PATH = os.path.join('dist', 'videos')
# Threads that are playing a suggestion GIF, keyed by the GIF path.
GIF_THREADS = {}
# Gesture labels; also used as sub-folder names under both root folders.
ACTIONS = np.array(['1', '2', '3'])
NUM_VIDEOS_PER_ACTION = 20
NUM_FRAMES_PER_VIDEO = 45

## Create the Video Dataset, Extract NumPy Arrays and Create GIFs

In [None]:
# Make sure both dataset roots exist, then one sub-folder per action in each.
for root in (NUMPY_ARRAYS_PATH, VIDEOS_PATH):
    feedforward.create_folder_if_not_exists(root)

for action in ACTIONS:
    for root in (NUMPY_ARRAYS_PATH, VIDEOS_PATH):
        feedforward.create_folder_if_not_exists(os.path.join(root, action))

In [None]:
# No prefix is prepended to the video file names.
video_prefix = ''

# For every action: register its videos, then convert them to pose arrays.
# NOTE(review): the exact behaviour of both helpers lives in `feedforward`
# and is not visible here.
for action in ACTIONS:
    subdir_videos = os.path.join(VIDEOS_PATH, action)
    subdir_numpy_arrays = os.path.join(NUMPY_ARRAYS_PATH, action)

    feedforward.register_videos(
        subdir_videos,
        video_prefix,
        NUM_VIDEOS_PER_ACTION,
        NUM_FRAMES_PER_VIDEO,
        action,
    )
    feedforward.videos_2_numpy_pose_arrays(subdir_videos, subdir_numpy_arrays)

In [None]:
def display_gif(gif_path: str, window_name: str, display_time_ms: int = 100):
    """Play a GIF once in its own OpenCV window, then close the window.

    Args:
        gif_path: Path of the GIF file to play.
        window_name: Name of the OpenCV window used for playback.
        display_time_ms: How long each frame is passed to ``cv2.waitKey``,
            in milliseconds.

    The reader is always closed and the window always destroyed, even when a
    frame fails to decode or display.
    """
    gif = imageio.get_reader(gif_path)
    try:
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        try:
            for gif_frame in gif:
                frame = np.asarray(gif_frame)
                # imageio yields RGB(A) frames while cv2.imshow interprets
                # colour images as BGR, so swap the channels before showing.
                if frame.ndim == 3 and frame.shape[2] == 3:
                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                elif frame.ndim == 3 and frame.shape[2] == 4:
                    frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2BGR)
                cv2.imshow(window_name, frame)
                cv2.waitKey(display_time_ms)
        finally:
            cv2.destroyWindow(window_name)
    finally:
        gif.close()

def terminate_all_threads():
    """Wait for every tracked GIF thread to finish, then forget them all.

    Threads are joined, not interrupted, so this call blocks until each one
    has finished.
    """
    global GIF_THREADS
    for worker in GIF_THREADS.values():
        worker.join()
    # Rebind to a fresh dict so finished threads are no longer referenced.
    GIF_THREADS = {}

def open_suggested_gif(gif_path):
    """Start playing ``gif_path`` in a background thread unless it is already playing.

    The GIF path doubles as the window name and as the key under which the
    thread is tracked in ``GIF_THREADS``.
    """
    running = GIF_THREADS.get(gif_path)
    if running is not None and running.is_alive():
        # This GIF is still being shown; do not open a second player for it.
        return

    worker = threading.Thread(target=display_gif, args=(gif_path, gif_path))
    worker.start()
    GIF_THREADS[gif_path] = worker

def save_gif(landmarks_list: list, output_path: str):
    """Render a sequence of pose landmarks as an animated GIF.

    Args:
        landmarks_list: One entry per frame; each frame is drawn by
            ``feedforward.draw_frame``.
        output_path: Destination path of the GIF file.

    The figure created for rendering is closed once the file is written (or
    saving fails), so repeated calls do not accumulate open figures.
    """
    fig = plt.figure()
    try:
        ani = animation.FuncAnimation(
            fig,
            partial(feedforward.draw_frame, landmarks_list=landmarks_list),
            frames=len(landmarks_list),
            interval=100,
        )
        # TODO: figure out whether NUM_FRAMES_PER_VIDEO can be used for the frame rate.
        ani.save(output_path, writer='imagemagick', fps=10)
    finally:
        plt.close(fig)

In [None]:
# Build one reference GIF per action from that action's '2.mp4' recording.
# The GIF is stored next to the action's pose arrays as '<action>.gif'.
for action in ACTIONS:
    reference_video = os.path.join(VIDEOS_PATH, action, '2.mp4')
    gif_target = os.path.join(NUMPY_ARRAYS_PATH, action, f'{action}.gif')
    save_gif(feedforward.load_pose_landmarks_from_video(reference_video), gif_target)

## Build and Train LSTM Neural Network

In [None]:
# Map each action label to its integer class index (its position in ACTIONS).
label_map = dict(zip(ACTIONS, range(len(ACTIONS))))
sequences, labels = [], []

# Each video contributes one sample: its per-frame arrays, loaded in frame
# order from <NUMPY_ARRAYS_PATH>/<action>/<video index>/<frame index>.npy.
for action in ACTIONS:
    action_dir = os.path.join(NUMPY_ARRAYS_PATH, action)
    for idx_video in range(NUM_VIDEOS_PER_ACTION):
        video_dir = os.path.join(action_dir, str(idx_video))
        window = [
            np.load(os.path.join(video_dir, f"{frame_num}.npy"))
            for frame_num in range(NUM_FRAMES_PER_VIDEO)
        ]
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
# Stack the loaded sequences into one array (one entry per video).
X = np.array(sequences)
# One-hot encode the integer class labels.
y = to_categorical(labels).astype(int)
# Hold out 5% of the samples for evaluation. No random_state is passed, so the
# split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

In [None]:
# Three stacked LSTM layers followed by a dense head; the softmax output has
# one unit per action.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(NUM_FRAMES_PER_VIDEO, feedforward.POSE_DESCRIPTOR_LEN)),
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(ACTIONS.shape[0], activation='softmax'),
])

# Training logs for TensorBoard are written to the 'Logs' folder.
log_dir = 'Logs'
tb_callback = TensorBoard(log_dir=log_dir)

model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
model.fit(X_train, y_train, epochs=50, callbacks=[tb_callback])

# Evaluation using Confusion Matrix and Accuracy



In [None]:
# Collapse one-hot targets and predicted probabilities to class indices.
ytrue = y_test.argmax(axis=1).tolist()
yhat = model.predict(X_test).argmax(axis=1).tolist()

In [None]:
# Per-class confusion matrices for the held-out samples; as the last
# expression of the cell, the result is displayed by the notebook.
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
def precision_score(ytrue, yhat):
    """Return the macro-averaged precision of ``yhat`` against ``ytrue``.

    For each class, precision is the fraction of samples predicted as that
    class that really belong to it. A class that is never predicted
    contributes 0. Returns 0.0 for empty input.
    """
    ytrue, yhat = np.asarray(ytrue), np.asarray(yhat)
    per_class = []
    for label in np.union1d(ytrue, yhat):
        predicted = yhat == label
        per_class.append(np.mean(ytrue[predicted] == label) if predicted.any() else 0.0)
    return float(np.mean(per_class)) if per_class else 0.0


def recall_score(ytrue, yhat):
    """Return the macro-averaged recall of ``yhat`` against ``ytrue``.

    For each class, recall is the fraction of samples that belong to that
    class that were predicted as it. A class with no true samples
    contributes 0. Returns 0.0 for empty input.
    """
    ytrue, yhat = np.asarray(ytrue), np.asarray(yhat)
    per_class = []
    for label in np.union1d(ytrue, yhat):
        actual = ytrue == label
        per_class.append(np.mean(yhat[actual] == label) if actual.any() else 0.0)
    return float(np.mean(per_class)) if per_class else 0.0


accuracy = accuracy_score(ytrue, yhat)
precision = precision_score(ytrue, yhat)
recall = recall_score(ytrue, yhat)
# The harmonic mean is only meaningful for strictly positive values; report 0
# when either component is 0.
f_score = stats.hmean([precision, recall]) if min(precision, recall) > 0 else 0.0

# Last expression of the cell: show every metric instead of only the recall.
{'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f_score': f_score}

# Test in Real Time v2
Track and predict several gesture combinations at the same time.

In [None]:
# Rolling state of the real-time loop: buffered frame descriptors, recognised
# gestures and per-frame predicted class indices.
sequence, sentence, predictions = [], [], []

NUM_FRAMES_FOR_STABILITY = 9
THRESHOLD = 0.75

# Known gesture combinations, keyed by the concatenation of their gestures.
known_combinations = {
    ''.join(gestures): gestures
    for gestures in (
        ['1', '2', '3'],
        ['2', '1', '2'],
        ['3', '1', '2'],
        # Currently disabled combinations:
        # ['3', '2', '1', '2'],
        # ['2', '1', '3'],
        # ['2', '3', '1', '2'],
    )
}

actual_combination = ""
suggested_next_gesture = ""

# Progress per tracked combination: combination key -> index of the next
# expected gesture.
next_gestures = {}

DONE = "DONE!"

In [None]:
def process_stable_prediction(res):
    """Update the gesture sentence and combination progress from a prediction.

    Args:
        res: Per-class scores, indexed like ``ACTIONS``.

    Nothing happens unless the best score exceeds ``THRESHOLD``. Otherwise the
    predicted gesture is appended to ``sentence`` (unless it repeats the last
    entry) and ``next_gestures`` is updated in place:

    * When no combination is tracked, every known combination whose first
      gesture equals the latest gesture starts being tracked at index 1. The
      gesture sequence is compared, not the dictionary key, so keys do not
      have to be spelled as the concatenation of their gestures.
    * Otherwise each tracked combination is advanced, kept, or dropped.
    """
    best_idx = int(np.argmax(res))
    if res[best_idx] <= THRESHOLD:
        return

    gesture = ACTIONS[best_idx]
    if not sentence or gesture != sentence[-1]:
        sentence.append(gesture)
    last_gesture = sentence[-1]

    # Case 1: nothing is tracked yet, so start every combination that begins
    # with the latest gesture.
    if not next_gestures:
        for combination, gestures in known_combinations.items():
            if gestures and gestures[0] == last_gesture:
                next_gestures[combination] = 1
        return

    # Iterate over a snapshot of the keys because entries are removed below.
    for combo in list(next_gestures):
        idx = next_gestures[combo]
        gestures = known_combinations[combo]

        if idx >= len(gestures):
            # The combination was already completed: stop tracking it.
            del next_gestures[combo]
        elif last_gesture == gestures[idx]:
            # The expected next gesture was performed: advance.
            next_gestures[combo] = idx + 1
        elif last_gesture != gestures[idx - 1]:
            # Neither the expected gesture nor a repeat of the previous one:
            # the combination is broken.
            del next_gestures[combo]
        # Otherwise the previous gesture is still being held: keep the progress.

In [None]:
def display_results(image):
    """Draw the current prediction and combination hints on ``image`` and show it.

    Args:
        image: Frame to annotate in place and display in the 'OpenCV Feed'
            window.

    When at least one gesture has been recognised, the latest gesture, one
    hint line per tracked combination and a banner with the recent gestures
    are drawn. For a combination that is not finished, the GIF of its next
    gesture is opened; for a finished one, the GIF threads are joined.
    Finished combinations are not removed here: ``process_stable_prediction``
    drops them on the next stable prediction.

    The frame is always shown, even before the first gesture is recognised,
    so the window exists from the start and ``cv2.waitKey`` can receive keys.
    """
    left_text_position = (120, 400)
    right_text_position = (120, 300)

    if sentence:
        cv2.putText(image, f'Current Prediction: {sentence[-1]}', left_text_position, cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

        offset = 0

        for combo in next_gestures:
            gestures = known_combinations[combo]
            progress = next_gestures[combo]
            is_done = progress >= len(gestures)
            suggested_next_gesture = DONE if is_done else gestures[progress]

            cv2.putText(image, f'{gestures} Next Gesture: {suggested_next_gesture}', right_text_position, cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            if is_done:
                terminate_all_threads()
            else:
                index_gif = int(np.where(ACTIONS == suggested_next_gesture)[0][0])
                gif_path = os.path.join(NUMPY_ARRAYS_PATH, ACTIONS[index_gif], f'{ACTIONS[index_gif]}.gif')
                open_suggested_gif(gif_path)

            # Later hint lines are stacked 30 px apart, starting at y = 230.
            offset += 30
            right_text_position = (120, 200 + offset)

        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

    cv2.imshow('OpenCV Feed', image)

In [None]:
# Real-time loop: grab frames, keep a window of the latest pose descriptors,
# classify the window and display the result. Press 'q' to quit.
cap = cv2.VideoCapture(feedforward.CAMERA_INDEX)

try:
    with mp.solutions.holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No frame could be read (camera unplugged or stream ended).
                break

            image, results = feedforward.mediapipe_detection(frame, holistic)

            feedforward.draw_pose_landmarks(image, results)

            keypoints = feedforward.extract_pose_descriptor(results)
            sequence.append(keypoints)
            sequence = sequence[-NUM_FRAMES_PER_VIDEO:]

            if len(sequence) == NUM_FRAMES_PER_VIDEO:
                res = model.predict(np.expand_dims(sequence, axis=0))[0]
                predictions.append(int(np.argmax(res)))
                # Only the stability window is needed; trimming keeps the
                # list from growing for the whole session.
                predictions = predictions[-NUM_FRAMES_FOR_STABILITY:]

                # Stable means every prediction in the window is the same
                # class. The window may still be shorter than
                # NUM_FRAMES_FOR_STABILITY right after start-up.
                is_stable_prediction = len(set(predictions)) == 1
                if is_stable_prediction:
                    process_stable_prediction(res)

                if len(sentence) > 5:
                    sentence = sentence[-5:]
                image = feedforward.prob_viz(res, ACTIONS, image)

            display_results(image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                terminate_all_threads()
                break
finally:
    # Release the camera and close the windows even if the loop raised.
    cap.release()
    cv2.destroyAllWindows()