1. Import Dependencies

In [None]:
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import mediapipe as mp

2. Keypoints using MP Holistic

In [None]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [None]:
def mediapipe_detection(image, model):
    """Run ``model.process`` on a BGR frame and hand back the frame plus results.

    The frame is converted to RGB before inference and back to BGR afterwards.
    While the model runs, the RGB array is flagged read-only.

    Returns:
        A ``(bgr_image, results)`` tuple, where ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    rgb_frame.flags.writeable = False  # locked for the duration of inference
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True  # unlocked again before converting back
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

In [None]:
def draw_landmarks(image, results):
    """Draw the face, pose and both hand landmark sets on ``image`` with default styling."""
    # (landmark list, connection set) pairs, drawn in this order.
    # The notebook previously also tried FACEMESH_TESSELATION for the face.
    layers = (
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmark_list, connections in layers:
        mp_drawing.draw_landmarks(image, landmark_list, connections)

In [None]:
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks on ``image`` with custom colours.

    Each layer passes two ``DrawingSpec`` objects to
    ``mp_drawing.draw_landmarks``: the first styles the landmark points, the
    second styles the connections between them.

    Args:
        image: Frame to draw on.
        results: Holistic output exposing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    spec = mp_drawing.DrawingSpec
    layers = (
        # Face (connection colour was (80, 256, 121); 256 is not a valid
        # 8-bit channel value, so it is now 255).
        (results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=2),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=2),
         spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=2),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )
    for landmark_list, connections, point_spec, line_spec in layers:
        mp_drawing.draw_landmarks(image, landmark_list, connections, point_spec, line_spec)

In [None]:
mp_holistic.POSE_CONNECTIONS

In [None]:
# Live preview: run Holistic on webcam frames until 'q' is pressed.
cap = cv2.VideoCapture(0)
try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            # Read into a temporary so that `frame` keeps the last good image
            # (later cells reuse `frame` and `results`).
            ret, grabbed = cap.read()
            if not ret:
                # Camera unplugged / stream ended: stop instead of passing
                # None to cv2.cvtColor.
                break
            frame = grabbed

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_landmarks(image, results)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)

            # Break gracefully
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an exception.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
len(results.left_hand_landmarks.landmark)

In [None]:
results

In [None]:
draw_landmarks(frame, results)

In [None]:
plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

3. Extract Keypoint Values

In [None]:
len(results.left_hand_landmarks.landmark)

In [None]:
pose = []
for res in results.pose_landmarks.landmark:
    test = np.array([res.x, res.y, res.z, res.visibility])
    pose.append(test)

In [None]:
pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)

In [None]:
def _flatten_landmarks(landmark_list, fields, count):
    """Flatten one landmark list to 1-D, or return zeros when it is missing."""
    if not landmark_list:
        return np.zeros(count * len(fields))
    return np.array(
        [[getattr(point, name) for name in fields] for point in landmark_list.landmark]
    ).flatten()


def extract_keypoints(results):
    """Build the flat feature vector for one frame.

    The vector is the concatenation of pose (33 landmarks x 4 values:
    x, y, z, visibility), left hand (21 x 3) and right hand (21 x 3), i.e.
    258 values in total. Any landmark set that is missing from ``results`` is
    replaced by zeros of the same length, so the output size is constant.

    Face landmarks are not part of the vector; the previous version built a
    face array on every call and then discarded it.

    Args:
        results: Object exposing ``pose_landmarks``, ``left_hand_landmarks``
            and ``right_hand_landmarks``; each is either falsy or has a
            ``.landmark`` sequence.

    Returns:
        1-D ``numpy.ndarray`` ordered as pose, left hand, right hand.
    """
    pose = _flatten_landmarks(results.pose_landmarks, ('x', 'y', 'z', 'visibility'), 33)
    lh = _flatten_landmarks(results.left_hand_landmarks, ('x', 'y', 'z'), 21)
    rh = _flatten_landmarks(results.right_hand_landmarks, ('x', 'y', 'z'), 21)
    return np.concatenate([pose, lh, rh])

In [None]:
result_test = extract_keypoints(results)

In [None]:
result_test

In [None]:
np.save('0', result_test)

In [None]:
np.load('0.npy')

4. Setup Folders for Collection

In [None]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('Train_15word')

# Actions that we try to detect
# Array of sign-language gesture labels: 'wait' plus 15 Thai phrases (16 labels in total)
actions = np.array(['wait','ขอโทษที่มาสาย','คุณชื่ออะไร','คุณสบายดีไหม','ฉันสบายดี','แล้วพบกันใหม่',
'คุณกินข้าวหรือยัง','คุณอายุเท่าไหร่','ฉันกำลังจะกลับบ้าน','คุณจะกลับบ้านกี่โมง','คุณเหนื่อยไหม',
'ฉันไม่สบาย','พรุ่งนี้คุณจะไปไหน','ห้องน้ำไปทางไหน','ฉันกำลังทานข้าว','ไปกินข้าวด้วยกันไหม'])

# Number of sequences (videos) recorded per action
# One folder is created per sequence
no_sequences = 30

# Number of frames in each sequence
# One saved keypoint file per frame
sequence_length = 30

# Folder start
# NOTE(review): start_folder is not referenced anywhere else in the visible notebook
start_folder = 30

In [None]:
# Create one folder per (action, sequence) pair under DATA_PATH.
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True makes re-running this cell harmless while still
        # surfacing real failures (permissions, invalid path) that the old
        # bare `except: pass` silently swallowed.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

In [None]:
# Category 1 >> 'Hello','Thank you','Sorry','Fine','Sick','Good luck','No problem','Yes','No','See you again'
## 'สวัสดี','ขอบคุณ','ขอโทษ','สบายดี','ไม่สบาย(ป่วย)','โชคดี','ไม่เป็นไร','ใช่','ไม่','พบกันใหม่'
# Category 4 >> 'Clear soup','Noodles','Fried rice','Salad','Omelet','Papaya salad','Bread','Water','Coffee','Tea'
# Category 7 >> '7-11','Big C','Shop','House','Hospital','School','Police station','Train station','Airport','Zoo'
# Category 8 >> 'Daytime','Nighttime','Morning','Afternoon','Today','Tomorrow','Yesterday','Day','Month','Year'
# Category 10 >> 'Pencil','Eraser','Notebook','Book','T-shirt','Shoes','Skirt','Hairbrush','Earrings','Glasses'

# 'ขอโทษที่มาสาย','คุณชื่ออะไร','คุณสบายดีไหม','ฉันสบายดี','แล้วพบกันใหม่'

5. Collect Keypoint Values for Training and Testing

In [None]:
# Training data collection. *No need to re-run once the data has been recorded.*
# For every action, records `no_sequences` videos of `sequence_length` frames
# and saves one keypoint array per frame under DATA_PATH/action/sequence/.
cap = cv2.VideoCapture(0)

# Flattening the three nested loops into one iterator lets a single `break`
# (the 'q' key) stop the whole collection run.
capture_plan = (
    (action, sequence, frame_num)
    for action in actions
    for sequence in range(no_sequences)
    for frame_num in range(sequence_length)
)

try:
    # Set mediapipe model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        for action, sequence, frame_num in capture_plan:
            # Read feed
            ret, frame = cap.read()
            if not ret:
                # Fail loudly: continuing would leave a silently incomplete dataset.
                raise RuntimeError('Camera frame could not be read; data collection aborted')

            # Make detections
            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            draw_landmarks(image, results)

            # On-screen status. NOTE(review): cv2.putText uses Hershey fonts,
            # which presumably cannot render the Thai action names - confirm.
            first_frame = frame_num == 0
            if first_frame:
                cv2.putText(image, 'STARTING COLLECTION', (120,200),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
            cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)

            # Show to screen
            cv2.imshow('OpenCV Feed', image)
            if first_frame:
                # Two-second pause at the start of each video so the signer can get ready.
                cv2.waitKey(2000)

            # Export keypoints
            keypoints = extract_keypoints(results)
            npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
            np.save(npy_path, keypoints)

            # Poll the keyboard on every frame: this is what lets the preview
            # window repaint, and it makes 'q' stop the run immediately.
            if cv2.waitKey(15) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even after an exception.
    cap.release()
    cv2.destroyAllWindows()

In [None]:
### One file = one video frame. With extract_keypoints as defined in this
### notebook, a frame is a flat array of 258 values
### (33 pose landmarks * 4 + 2 hands * 21 landmarks * 3).
### A 30-frame video is therefore 30 files, i.e. a 30 x 258 array once stacked.
# NOTE(review): this reads from 'Train_Data', not DATA_PATH ('Train_15word') -
# confirm that this is the dataset you mean to inspect.
npy = np.load('Train_Data/ขอโทษที่มาสาย/0/1.npy')

# The shape used to sit outside the print() call, so it was never shown.
print('รูปร่างของ array คือ', npy.shape)          # array shape
print('เป็น array กี่มิติ:', npy.ndim, 'มิติ')      # number of dimensions
print('จำนวนสมาชิกทั้งหมด คือ', npy.size)         # total number of elements

In [None]:
# array ขนาด 1*258 มิติ
np.load('Train_Data4/ขอโทษที่มาสาย/0/1.npy')

6. Preprocess Data and Create Labels and Features

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [None]:
label_map = {label:num for num, label in enumerate(actions)}

In [None]:
label_map

In [None]:
# Read every saved frame back from disk: one window (list of frame arrays)
# per recorded video, plus the numeric label of its action.
sequences = []
labels = []
for action in actions:
    class_id = label_map[action]
    action_dir = os.path.join(DATA_PATH, action)
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(action_dir, str(sequence))
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(sequence_dir, "{}.npy".format(frame_num)))
            window.append(res)
        sequences.append(window)
        labels.append(class_id)

In [None]:
np.array(sequences).shape

In [None]:
# จำนวนสมาชิกทั้งหมดใน array 
# 16 ท่าทาง 
print(np.array(sequences).size)

In [None]:
np.array(sequences).ndim

In [None]:
np.array(labels).shape

In [None]:
X = np.array(sequences)

In [None]:
X.shape

In [None]:
y = to_categorical(labels).astype(int)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

# แบ่งข้อมูล train 70% : test 30%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

7. Build and Train LSTM Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [None]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# Sequential API: three stacked LSTM layers followed by a dense classifier head.
model = Sequential()

# Input is (frames per sequence, values per frame). The frame count comes from
# `sequence_length` so the model cannot drift from the collected data;
# 258 = 33 pose landmarks * 4 + 2 hands * 21 landmarks * 3 (see extract_keypoints).
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, 258)))
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(LSTM(64, return_sequences=False, activation='relu'))

model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# One softmax output per action label.
model.add(Dense(actions.shape[0], activation='softmax'))

# 64, 128, 64 (LSTM) and 64, 32 (Dense) are the unit counts of each layer.


In [None]:
#จำนวนคำ/ประโยคที่เราเทรนเข้าไป
actions.shape[0]

In [None]:
res = [0.7, 0.2,0.1]

In [None]:
# Look the winning index up in the label array `actions`. The old code indexed
# `action`, the string left over from the last loop, and so returned a single
# character of that label instead of a label.
actions[np.argmax(res)]
# 'Hello','Thank you','Sorry','Fine','Sick','Good luck','No problem','Yes','No','See you again'
## 'สวัสดี','ขอบคุณ','ขอโทษ','สบายดี','ไม่สบาย (ป่วย)','โชคดี','ไม่เป็นไร','ใช่','ไม่','พบกันใหม่'

In [None]:
# optimizer='Adam' คืออัลกอริทึมที่มาช่วยเพิ่มประสิทธิในการทำงานของโมเดล
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
# Train model
# epochs=800 is the number of training passes
# Training aims to bring the loss down (towards 0) and the accuracy up (towards 1)
# The TensorBoard callback is configured to log into the 'Logs' folder
# There is no need to retrain on every run
model.fit(X_train, y_train, epochs=800, callbacks=[tb_callback])

In [None]:
model.summary()

8. Make Predictions

In [None]:
res = model.predict(X_test)

In [None]:
actions[np.argmax(res[0])]

In [None]:
actions[np.argmax(y_test[0])]

9. Save Weights

In [None]:
model.save('action.h5')

In [None]:
model.load_weights('action.h5')

10. Evaluation using Confusion Matrix and Accuracy

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score, confusion_matrix, classification_report

In [None]:
yhat = model.predict(X_test)

In [None]:
ytrue = np.argmax(y_test, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [None]:
multilabel_confusion_matrix(ytrue, yhat)

In [None]:
confusion_matrix(ytrue, yhat)

In [None]:
print(classification_report(ytrue, yhat, digits=4))

In [None]:
#ความแม่นยำของข้อมูลที่ test ไป
accuracy_score(ytrue, yhat)

11. Test in Real Time

In [None]:
#from scipy import stats
#from PIL import ImageFont
from PIL import Image, ImageFont, ImageDraw
from flask import Flask, render_template, Response

In [None]:
colors = [(245,117,16), (117,245,16), (16,117,245), (0,0,0),(138,43,226),
            (0,100,0),(255,0,0), (240,128,128), (139,69,19),(105,105,105),
            (255,0,0), (240,128,128), (139,69,19),(105,105,105)]
def prob_viz(res, actions, input_frame, colors):
    """Overlay one probability bar and one text label per action on a frame copy.

    Args:
        res: Sequence of scores, one per action; each bar is ``score * 100``
            pixels wide.
        actions: Labels, indexed in the same order as ``res``.
        input_frame: Image array; it is copied, never modified.
        colors: Bar colours. They are reused cyclically when there are fewer
            colours than scores (previously this raised ``IndexError``).

    Returns:
        A new image array with the bars and labels drawn on it.
    """
    output_frame = input_frame.copy()
    scores = list(res)
    if not scores:
        return output_frame

    # Coloured bar behind each label.
    for num, prob in enumerate(scores):
        cv2.rectangle(output_frame, (0, 60+num*40), (int(prob*100), 90+num*40),
                      colors[num % len(colors)], -1)

    # Labels are drawn with PIL and a TrueType font so Thai text can be shown.
    # The font is loaded once and the image converted once, instead of once
    # per bar. All bars are drawn before any label, so labels sit on top.
    fontpath = "./BaiJamjuree-Regular.ttf"
    font = ImageFont.truetype(fontpath, 30)
    output_pil = Image.fromarray(output_frame)
    draw = ImageDraw.Draw(output_pil)
    for num in range(len(scores)):
        draw.text((3, 52+num*40), actions[num], font=font, fill=(255,255,255))

    return np.array(output_pil)

In [None]:
plt.figure(figsize=(7,7))
# `res` may still be the whole batch returned by model.predict(X_test).
# prob_viz needs a single probability vector, so take the first row;
# a 1-D `res` passes through unchanged.
plt.imshow(prob_viz(np.atleast_2d(res)[0], actions, image, colors))

In [None]:
app = Flask(__name__)

## Demo Test ##

def generate():
    """Stream webcam frames annotated with the recognised sign phrases.

    Each frame is passed through MediaPipe Holistic and its keypoints are
    appended to a sliding window of the 30 most recent frames. Once the
    window is full, the model classifies it and confident predictions are
    added to a short on-screen sentence (at most the last three phrases).

    Yields:
        ``bytes`` chunks, each holding one JPEG-encoded frame wrapped for a
        ``multipart/x-mixed-replace`` response with boundary ``frame``.
    """
    # 1. Detection state
    sequence = []
    sentence = []
    predictions = []
    threshold = 0.7
    window_size = 30  # must match the number of frames the model was trained on

    # Load the Thai-capable font once rather than on every frame.
    fontpath = "./BaiJamjuree-Regular.ttf"
    font = ImageFont.truetype(fontpath, 30)  # truetype(font file, font size)

    # Open the camera with OpenCV
    cap = cv2.VideoCapture(0)
    try:
        # Set mediapipe model
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while True:
                # Read feed
                ret, frame = cap.read()
                if not ret:
                    # No frame available: end the stream instead of passing
                    # None to cv2.cvtColor.
                    break

                # Make detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks
                draw_styled_landmarks(image, results)

                # 2. Prediction logic
                # Keep the window in chronological order (oldest first), the
                # same order the training windows were built in. The previous
                # insert(0, ...) gave the model time-reversed sequences.
                keypoints = extract_keypoints(results)
                sequence.append(keypoints)
                sequence = sequence[-window_size:]

                if len(sequence) == window_size:
                    res = model.predict(np.expand_dims(sequence, axis=0))[0]
                    best = np.argmax(res)
                    print(actions[best])
                    predictions.append(best)
                    # Only the most recent predictions are ever consulted;
                    # trim so the list cannot grow without bound.
                    predictions = predictions[-window_size:]

                    # 3. Viz logic
                    # NOTE(review): np.unique(...)[0] is the smallest class id
                    # in the window, not proof that all recent predictions
                    # agree - confirm this is the intended stability check.
                    if np.unique(predictions)[0] == best and res[best] > threshold:
                        # Do not repeat the phrase that is already last.
                        if not sentence or actions[best] != sentence[-1]:
                            sentence.append(actions[best])

                    if len(sentence) > 3:
                        sentence = sentence[-3:]

                # Output text banner
                cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)

                # Draw the sentence with PIL so Thai text can be shown.
                # (3,0) is the x,y offset of the text; fill is the text colour.
                img_pil = Image.fromarray(image)
                draw = ImageDraw.Draw(img_pil)
                draw.text((3,0), ' '.join(sentence), font=font, fill=(255,255,255))
                image = np.array(img_pil)

                frame2 = cv2.imencode('.jpg', image)[1].tobytes()
                yield (b'--frame\r\n'b'Content-Type: image/jpeg\r\n\r\n' + frame2 + b'\r\n')

                # Break gracefully on Esc
                key = cv2.waitKey(20)
                if key == 27:
                    break
    finally:
        # Runs on normal exit, on errors, and when the consumer closes the
        # generator, so the camera is always released.
        cap.release()
        cv2.destroyAllWindows()

@app.route('/')
def index():
    """Handle '/' by rendering the 'index.html' template."""
    return render_template('index.html')

@app.route('/video_feed')
def video_feed():
    """Handle '/video_feed' by streaming the output of generate() as a multipart response."""
    return Response(generate(),
                    mimetype='multipart/x-mixed-replace; boundary=frame')

@app.route('/about')
def about():
    """Handle '/about' by rendering the 'about.html' template."""
    return render_template('about.html')

if __name__=="__main__":
    app.run(debug=False)
    # app.run(host="0.0.0.0", port=5000)

In [None]:
sequence = []
sentence = []
predictions = []
threshold = 0.7

In [None]:
cap.release()
cv2.destroyAllWindows()

In [None]:
res[np.argmax(res)] > threshold

In [None]:
(sequences,30,1662)

In [None]:
model.predict(np.expand_dims(X_test[0],axis=0))

In [None]:
#ความแม่นยำของชุดข้อมูลการทดสอบ
(ls,acc)=model.evaluate(x=X_test,y=y_test)
print('MODEL ACCURACY = {}%'.format(acc*100))

In [None]:
#ความแม่นยำของชุดข้อมูลการฝึก
(ls,acc)=model.evaluate(x=X_train,y=y_train)
print('MODEL ACCURACY = {}%'.format(acc*100))