In [40]:
import cv2
from ultralytics import YOLO
import traceback
import numpy as np

# Detecting Poses with Ultralytics

In [10]:
# Live webcam preview: run YOLOv8 pose estimation on every captured frame and
# show the result until 'q' is pressed or the camera stops delivering frames.

# Load the YOLOv8 model
model = YOLO('yolov8n-pose.pt')

# Start video capture
cap = cv2.VideoCapture(0)

try:
    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video
        success, frame = cap.read()

        if not success:
            # Break the loop if no frame could be read (end of stream)
            break

        # Run YOLOv8 inference on the frame
        results = model(source=frame, conf=.3)

        # Get keypoints from results. Check the detection dimension before
        # indexing so a result with zero detections is treated as "no pose"
        # instead of failing on `[0]`.
        all_keypoints = results[0].keypoints.xy
        has_pose = len(all_keypoints) > 0 and len(all_keypoints[0]) > 0

        # Show the annotated frame when a pose was found, the raw frame
        # otherwise. Both cases draw into the same window so no second,
        # stale window is left behind.
        display_frame = results[0].plot() if has_pose else frame
        cv2.imshow("YOLOv8 Inference", display_frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
except Exception:
    # Report the failure; the cleanup below still runs
    traceback.print_exc()
finally:
    # Release the video capture object and close the display window
    cap.release()
    cv2.destroyAllWindows()




0: 480x640 1 person, 96.3ms
Speed: 1.0ms preprocess, 96.3ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 96.1ms
Speed: 3.6ms preprocess, 96.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 94.9ms
Speed: 2.0ms preprocess, 94.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 87.9ms
Speed: 2.0ms preprocess, 87.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 78.3ms
Speed: 1.0ms preprocess, 78.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 81.5ms
Speed: 1.0ms preprocess, 81.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 81.6ms
Speed: 1.0ms preprocess, 81.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 79.4ms
Speed: 1.0ms preprocess, 79.4ms inference, 2.0ms postprocess per image at shape (1, 3, 48

# Real-Time Labeling: Capture Landmarks and Export to CSV

In [11]:
import csv
import os
import numpy as np
from matplotlib import pyplot as plt

In [68]:
# Create header for csv file
filename = 'pushup-form-data.csv'

# Number of keypoints per detection; each contributes three columns
NUM_KEYPOINTS = 17

# Header row: the label column followed by x<i>, y<i>, v<i> for keypoints
# numbered from 1 (v is the third value stored per keypoint, presumably its
# visibility/confidence -- confirm against the pose model output).
landmarks = ['class']
for val in range(1, NUM_KEYPOINTS + 1):
    landmarks += [axis + str(val) for axis in 'xyv']

# NOTE: mode='w' truncates the file, so re-running this cell discards any
# rows that were already collected. Use the single `filename` definition so
# this path cannot drift from the one later cells append to and read from.
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(landmarks)

In [64]:
def export_landmarks(results, classification, writer):
    """Append one labelled keypoint row to a CSV writer.

    The row has the form ``[classification, a1, b1, c1, a2, b2, c2, ...]``
    where ``a, b, c`` are the first three values of every keypoint of the
    first detection in ``results[0].keypoints.data``.

    Args:
        results: Sequence whose first element exposes
            ``keypoints.data.tolist()`` (presumably the list returned by an
            Ultralytics pose model -- confirm at the call site).
        classification: Label stored in the first column of the row.
        writer: Object providing ``writerow``, e.g. a ``csv.writer``.

    Returns:
        True when a row was written, False when the sample was skipped.
        Failures are reported on stdout instead of being raised, so a bad
        frame does not abort the surrounding capture loop.
    """
    try:
        detections = results[0].keypoints.data.tolist()

        # Nothing detected: skip rather than write a row that only holds the
        # label, which would not match the CSV header.
        if not detections or not detections[0]:
            print(f"export_landmarks: no keypoints detected, skipped '{classification}' sample")
            return False

        row = [classification]
        for keypoint in detections[0]:
            row.extend((keypoint[0], keypoint[1], keypoint[2]))

        writer.writerow(row)
        return True
    except Exception as exc:
        # Best effort by design, but make the dropped sample visible.
        print(f"export_landmarks: skipped '{classification}' sample ({exc!r})")
        return False

In [116]:
# Interactive labelling session: play the push-up video and, whenever a label
# key is pressed, append the current frame's keypoints to the CSV file.
#   '1' -> 'correct'    '2' -> 'too high'    '3' -> 'too low'    'q' -> quit
with open(filename, mode='a', newline='') as file:
    writer = csv.writer(file)

    # Key code -> class label written to the first CSV column
    label_keys = {
        ord('1'): 'correct',
        ord('2'): 'too high',
        ord('3'): 'too low',
    }

    # Load the YOLOv8 model
    model = YOLO('yolov8m-pose.pt')

    # Start video capture
    cap = cv2.VideoCapture('push-ups.mp4')
    # cap = cv2.VideoCapture('./temp/test.avi')

    if not cap.isOpened():
        # Without this the loop below is skipped without any feedback
        print("Could not open video source 'push-ups.mp4'")

    try:
        # Loop through the video frames
        while cap.isOpened():
            # Read a frame from the video
            success, frame = cap.read()

            if not success:
                # Break the loop if the end of the video is reached
                break

            # Run YOLOv8 inference on the frame
            results = model(source=frame, conf=.75)

            # Decide from the keypoint data itself whether a pose was found;
            # the detection dimension is checked before indexing into it.
            pose_data = results[0].keypoints.data
            has_pose = len(pose_data) > 0 and len(pose_data[0]) > 0

            # Display the annotated frame when a pose was found, else the raw frame
            display_frame = results[0].plot() if has_pose else frame
            cv2.imshow("YOLOv8 Inference", display_frame)

            # Classify pushup form: take one key, then drain the rest of the
            # queue (the original author recalls paint events only being
            # handled once the queue is empty -- unverified).
            key = cv2.pollKey()
            while cv2.pollKey() != -1:
                pass

            label = label_keys.get(key)
            if label is not None:
                if has_pose:
                    export_landmarks(results, label, writer)
                else:
                    print("No pose detected in this frame; '%s' label ignored" % label)

            # Break the loop if 'q' is pressed
            if key == ord("q"):
                break
    except Exception:
        # Report the failure; the cleanup below still runs
        traceback.print_exc()
    finally:
        # Release the video capture object and close the display window
        cap.release()
        cv2.destroyAllWindows()




0: 384x640 1 person, 272.8ms
Speed: 2.0ms preprocess, 272.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 300.6ms
Speed: 2.0ms preprocess, 300.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 262.6ms
Speed: 2.0ms preprocess, 262.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 268.0ms
Speed: 1.0ms preprocess, 268.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 263.0ms
Speed: 1.0ms preprocess, 263.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 256.0ms
Speed: 2.0ms preprocess, 256.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 269.0ms
Speed: 1.0ms preprocess, 269.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 276.6ms
Speed: 2.0ms preprocess, 276.6ms inference, 1.0ms postprocess per image at

1

# Train Custom Model Using Scikit Learn

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [117]:
# Read the labelled keypoint rows back from the CSV and preview the table
df = pd.read_csv(filepath_or_buffer=filename)
df

Unnamed: 0,class,x1,y1,v1,x2,y2,v2,x3,y3,v3,...,v14,x15,y15,v15,x16,y16,v16,x17,y17,v17
0,correct,557.680725,402.782043,0.843193,547.774170,368.216614,0.769735,561.873962,388.712036,0.402847,...,0.964942,1477.866699,565.942810,0.918546,1858.402588,662.524475,0.768147,1827.979248,660.539673,0.701648
1,correct,550.207031,409.370728,0.841245,536.113281,375.112061,0.747550,550.971252,400.338135,0.404314,...,0.967348,1478.100830,584.953552,0.927581,1856.940796,652.873657,0.772320,1814.738159,646.117371,0.712545
2,correct,543.502502,417.739105,0.823207,531.944031,384.270813,0.730645,544.310059,409.302521,0.362560,...,0.969769,1483.036621,588.829285,0.929982,1860.761475,644.958069,0.787588,1822.817871,636.720764,0.726591
3,correct,541.117676,432.953156,0.809318,526.257751,399.619995,0.683199,540.391968,425.048218,0.373734,...,0.968444,1496.131470,589.201355,0.936340,1861.970337,650.980591,0.786793,1831.553101,638.317261,0.743336
4,correct,531.121948,436.844452,0.780115,516.553101,405.449615,0.665909,531.813293,427.074371,0.320338,...,0.967676,1515.775757,585.712036,0.932823,1858.523438,655.281921,0.792572,1839.483398,637.016541,0.749128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1088,too high,373.026428,351.177551,0.816772,352.192230,331.182617,0.702155,375.940216,345.292816,0.454974,...,0.992358,794.599487,358.053894,0.983230,1031.069336,418.840393,0.942158,1007.058533,443.331024,0.918388
1089,too high,375.306122,349.986847,0.839098,354.926544,328.376190,0.710381,375.194031,345.171692,0.522154,...,0.992332,781.127014,366.317108,0.986832,1026.469238,410.052246,0.942898,995.506714,442.640900,0.929104
1090,too high,375.388184,343.427734,0.840926,354.979095,320.888916,0.709647,374.035950,340.871094,0.516825,...,0.993111,757.150635,373.170166,0.989289,1023.884827,416.854858,0.946256,959.197266,446.786865,0.937319
1091,too high,375.256195,348.839264,0.825309,357.753754,328.200562,0.731741,379.140564,342.703705,0.380596,...,0.996039,732.467285,375.923584,0.989290,1021.871338,429.621033,0.957321,932.848389,450.636108,0.932217


In [118]:
# Separate the target column from the keypoint columns
y = df['class']  # classifications
X = df.drop(columns=['class'])  # features

# Hold out a quarter of the rows for evaluation; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [75]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [119]:
# Candidate classifiers, each preceded by feature standardisation.
# The two ensemble models draw random numbers while fitting, so they are
# seeded (with the same value as the train/test split) to make the reported
# scores reproducible from one run to the next.
pipelines = {
    'lr': make_pipeline(StandardScaler(), LogisticRegression()),
    'rc': make_pipeline(StandardScaler(), RidgeClassifier()),
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=10)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=10)),
}

In [120]:
# Fit every candidate pipeline on the training split, keyed by its short name.
# A comprehension keeps the loop variables local, so the notebook-level name
# `model` (used for other objects in other cells) is not rebound here.
fit_models = {
    name: pipeline.fit(X_train, y_train)
    for name, pipeline in pipelines.items()
}

# Evaluate and Save

In [121]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pickle

In [122]:
# Print the hold-out accuracy of every fitted pipeline, one line per model
for algo, model in fit_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test.values, y_pred)
    print(algo, accuracy)

lr 1.0
rc 0.9963503649635036
rf 1.0
gb 0.9963503649635036


In [125]:
# Persist the fitted gradient-boosting pipeline to disk
chosen_pipeline = fit_models['gb']
with open('pushup_butt_height.pkl', mode='wb') as model_file:
    pickle.dump(chosen_pipeline, model_file)

# Run Model

In [126]:
# Restore the saved classifier pipeline.
# NOTE: unpickling can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open('pushup_butt_height.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

In [127]:
# Live classification: detect the pose on every webcam frame, classify the
# push-up form with the loaded classifier (`model`) and draw the predicted
# label on the annotated frame. Press 'q' to stop.

# Load the YOLOv8 model
pose_model = YOLO('yolov8m-pose.pt')

# Start video capture
cap = cv2.VideoCapture(0)

try:
    # Loop through the video frames
    while cap.isOpened():
        # Read a frame from the video
        success, frame = cap.read()

        if not success:
            # Break the loop if no frame could be read (end of stream)
            break

        # Run YOLOv8 inference on the frame
        results = pose_model(source=frame, conf=.3)

        # Check the detection dimension before indexing so a result with zero
        # detections is treated as "no pose" instead of failing on `[0]`.
        pose_data = results[0].keypoints.data
        has_pose = len(pose_data) > 0 and len(pose_data[0]) > 0

        if has_pose:
            # Visualize the pose results on the frame
            annotated_frame = results[0].plot()

            # Flatten the first detection into a single feature row, using
            # the same column names as the training CSV (header minus 'class').
            first_person = pose_data[0].tolist()
            keypoints = [value for point in first_person for value in point[:3]]
            features = pd.DataFrame([keypoints], columns=landmarks[1:])
            class_ = model.predict(features)[0]

            # Draw the complete label: the classes 'too high' and 'too low'
            # share their first word, so showing only that word is ambiguous.
            cv2.putText(annotated_frame, str(class_), (95, 40), cv2.FONT_HERSHEY_SIMPLEX, .5, (255,255,255), 2, cv2.LINE_AA)

            # Display the annotated frame
            cv2.imshow("YOLOv8 Inference", annotated_frame)
        else:
            cv2.imshow("YOLOv8 Inference", frame)

        # Break the loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
except Exception:
    # Report the failure; the cleanup below still runs
    traceback.print_exc()
finally:
    # Release the video capture object and close the display window
    cap.release()
    cv2.destroyAllWindows()


0: 480x640 1 person, 331.0ms
Speed: 3.0ms preprocess, 331.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 364.0ms
Speed: 1.0ms preprocess, 364.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 330.6ms
Speed: 1.0ms preprocess, 330.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 335.4ms
Speed: 1.0ms preprocess, 335.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 322.6ms
Speed: 2.0ms preprocess, 322.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 319.9ms
Speed: 1.0ms preprocess, 319.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 317.7ms
Speed: 1.0ms preprocess, 317.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 330.1ms
Speed: 2.0ms preprocess, 330.1ms inference, 1.0ms postprocess per image at