In [6]:
import torch
import torch.nn as nn

class NeuralNet(nn.Module):
    """Small fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features in each input vector (default 24).
        hidden_size: width of the single hidden layer (default 256).
        num_classes: number of output scores (default 3).
    """

    def __init__(self, input_size=24, hidden_size=256, num_classes=3):
        super().__init__()
        # The attribute names below become the state_dict keys ("l1.*",
        # "l2.*"), so they must stay stable for saved checkpoints to load.
        self.l1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw output of the second linear layer for ``x``.

        No softmax (or other normalisation) is applied here.
        """
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

class KeypointClassification:
    """Map a keypoint feature vector to one of 'run', 'stand' or 'walk'.

    Wraps a ``NeuralNet`` whose weights are read from ``path_model`` and
    runs it in inference mode on ``self.device``.
    """

    def __init__(self, path_model):
        """Load the classifier weights.

        Args:
            path_model: path to a file holding the ``NeuralNet`` state_dict.
        """
        self.path_model = path_model
        # Index i of the network output corresponds to self.classes[i].
        self.classes = ['run', 'stand', 'walk']
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Build the network, load its weights and prepare it for inference."""
        self.model = NeuralNet()
        # NOTE(security): torch.load unpickles the file, so only load
        # checkpoints from a trusted source (consider weights_only=True on
        # PyTorch versions that support it).
        self.model.load_state_dict(
            torch.load(self.path_model, map_location=self.device)
        )
        # Keep the model on the same device the inputs are moved to, and
        # switch to eval mode since this wrapper is inference-only.
        self.model.to(self.device)
        self.model.eval()

    def __call__(self, input_keypoint):
        """Predict the action label(s) for ``input_keypoint``.

        Args:
            input_keypoint: a ``torch.Tensor`` or anything ``torch.tensor``
                accepts (e.g. a list of floats). The last dimension must
                match the network's input size.

        Returns:
            A single label (``str``) when the prediction holds exactly one
            element (e.g. a 1-D feature vector or a batch of one), otherwise
            a list of labels, one per predicted element in flattened order.
        """
        if isinstance(input_keypoint, torch.Tensor):
            features = input_keypoint.to(device=self.device, dtype=torch.float32)
        else:
            features = torch.tensor(
                input_keypoint, dtype=torch.float32, device=self.device
            )

        # Gradients are never needed for prediction.
        with torch.no_grad():
            scores = self.model(features)
        predicted = scores.argmax(dim=-1)

        if predicted.numel() == 1:
            return self.classes[predicted.item()]
        return [self.classes[index] for index in predicted.flatten().tolist()]

if __name__ == '__main__':
    # Build the classifier once; this reads the weights file from disk.
    keypoint_classification = KeypointClassification(path_model='action_classification.pt')

# YOLOv8 pose model for keypoint extraction

In [7]:
import sys
import cv2
import numpy as np
from pydantic import BaseModel
import ultralytics

# Define keypoint mappings: body-part name -> row index in a keypoint array.
class GetKeypoint(BaseModel):
    """Named integer indices (0-16) for 17 body keypoints.

    Each field's default is used elsewhere in this file as the row index of
    that body part in a per-person keypoint array. The declaration order
    matters: code that iterates the model's fields relies on it.

    NOTE(review): the ordering presumably follows the layout produced by the
    YOLOv8 pose model - confirm against the model's documentation.
    """
    NOSE:           int = 0
    LEFT_EYE:       int = 1
    RIGHT_EYE:      int = 2
    LEFT_EAR:       int = 3
    RIGHT_EAR:      int = 4
    LEFT_SHOULDER:  int = 5
    RIGHT_SHOULDER: int = 6
    LEFT_ELBOW:     int = 7
    RIGHT_ELBOW:    int = 8
    LEFT_WRIST:     int = 9
    RIGHT_WRIST:    int = 10
    LEFT_HIP:       int = 11
    RIGHT_HIP:      int = 12
    LEFT_KNEE:      int = 13
    RIGHT_KNEE:     int = 14
    LEFT_ANKLE:     int = 15
    RIGHT_ANKLE:    int = 16

class DetectKeypoint:
    """Run a YOLOv8 pose model and flatten its keypoints to ``[x, y, ...]``."""

    # Attribute names looked up on ``self.get_keypoint``, in the order their
    # (x, y) pairs appear in the list returned by ``extract_keypoint``.
    _KEYPOINT_ORDER = (
        'NOSE', 'LEFT_EYE', 'RIGHT_EYE', 'LEFT_EAR', 'RIGHT_EAR',
        'LEFT_SHOULDER', 'RIGHT_SHOULDER', 'LEFT_ELBOW', 'RIGHT_ELBOW',
        'LEFT_WRIST', 'RIGHT_WRIST', 'LEFT_HIP', 'RIGHT_HIP',
        'LEFT_KNEE', 'RIGHT_KNEE', 'LEFT_ANKLE', 'RIGHT_ANKLE',
    )

    def __init__(self, yolov8_model='yolov8m-pose'):
        """Load the pose model.

        Args:
            yolov8_model: model name or weights path handed to
                ``ultralytics.YOLO``. Its name (with or without a file
                extension) must end in ``-pose``.
        """
        self.yolov8_model = yolov8_model
        self.get_keypoint = GetKeypoint()
        self.__load_model()

    @staticmethod
    def _is_pose_model(model_name) -> bool:
        """Return True if ``model_name`` ends in '-pose', ignoring a file extension."""
        from pathlib import Path

        name = str(model_name)
        # The bare-name check keeps every previously accepted value valid;
        # the stem check additionally accepts e.g. 'yolov8m-pose.pt'.
        return name.endswith('-pose') or Path(name).stem.endswith('-pose')

    def __load_model(self):
        """Validate the model name and instantiate the YOLO model.

        Exits via ``sys.exit`` (SystemExit) when the name is not a pose model.
        """
        if not self._is_pose_model(self.yolov8_model):
            sys.exit('Model not a YOLOv8 pose model')
        self.model = ultralytics.YOLO(self.yolov8_model)

    def extract_keypoint(self, keypoint: np.ndarray) -> list:
        """Flatten one person's keypoints into ``[x0, y0, x1, y1, ...]``.

        Args:
            keypoint: array indexable as ``keypoint[row][col]`` with one row
                per body part; only columns 0 (x) and 1 (y) are read.

        Returns:
            A list of 34 values: the (x, y) pair of each of the 17 keypoints,
            in ``_KEYPOINT_ORDER``.
        """
        flattened = []
        for part_name in self._KEYPOINT_ORDER:
            row = keypoint[getattr(self.get_keypoint, part_name)]
            flattened.extend((row[0], row[1]))
        return flattened

    def get_xy_keypoint(self, results) -> list:
        """Return the flattened (x, y) keypoints of the first detection.

        Reads ``results.keypoints.xyn[0]``; per the Ultralytics docs ``xyn``
        holds coordinates normalised by image size. Indexing ``[0]`` assumes
        at least one detection, so callers should check for an empty result
        before calling this.
        """
        result_keypoint = results.keypoints.xyn[0].cpu().numpy()  # Access keypoints
        return self.extract_keypoint(result_keypoint)

    def __call__(self, image: np.ndarray):
        """Run the pose model on ``image`` and return the first result object."""
        results = self.model(image, save=False)[0]  # Predict without saving
        return results


In [8]:
def extract_keypoint(keypoint):
    """Flatten per-keypoint coordinates into ``[x0, y0, x1, y1, ...]``.

    Keypoints are visited in the order the fields are declared on
    ``GetKeypoint``; each field's default value is used as the row index
    into ``keypoint``.

    Args:
        keypoint: indexable of rows, one per body part. Each row must provide
            x at position 0 and y at position 1; any further columns (such as
            a confidence score) are ignored.

    Returns:
        A flat list holding the (x, y) pair of every field of ``GetKeypoint``.
    """
    # pydantic v2 exposes field metadata as ``model_fields`` and deprecates
    # ``__fields__``; pydantic v1 only provides ``__fields__``.
    fields = getattr(GetKeypoint, 'model_fields', None)
    if fields is None:
        fields = GetKeypoint.__fields__

    # List to hold the x, y coordinates of each keypoint
    keypoint_coordinates = []
    for field in fields.values():
        row = keypoint[field.default]
        # Index instead of unpacking so rows with extra columns still work.
        keypoint_coordinates.extend([row[0], row[1]])

    return keypoint_coordinates

In [14]:
import torch
import cv2
import numpy as np
from PIL import Image
import ultralytics

# Initialize the keypoint detection and classification models
detection_keypoint = DetectKeypoint()
classification_keypoint = KeypointClassification('action_classification.pt')

# Keypoints and skeleton lines are drawn only above this confidence.
KEYPOINT_CONF_THRESHOLD = 0.5

# The flattened keypoint list starts with the (x, y) pairs of the five face
# keypoints (nose, eyes, ears). Those 10 values are dropped so the remaining
# values are what the classifier receives.
FACE_FEATURE_COUNT = 10

# Define keypoint connections (specific indices may vary depending on your keypoint model)
connections = [
    (5, 6), (5, 11), (6, 12), (11, 13), (13, 15), (12, 14), (14, 16)
]

# Start capturing video from the webcam
cap = cv2.VideoCapture(0)

# try/finally guarantees the camera and windows are released even when the
# loop is interrupted (e.g. kernel interrupt) or a frame raises an error.
try:
    while True:
        # Read a frame from the webcam
        ret, frame = cap.read()
        if not ret:
            break

        # Perform keypoint detection
        results = detection_keypoint(frame)

        # Check if keypoints or bounding box are detected
        if results.keypoints is None or len(results.boxes) == 0:
            # Display message if no person or keypoints are detected
            cv2.putText(frame, "No person detected", (50, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        else:
            # Extract keypoints for classification (first detected person only)
            results_keypoint = detection_keypoint.get_xy_keypoint(results)
            input_classification = results_keypoint[FACE_FEATURE_COUNT:]
            results_classification = classification_keypoint(input_classification)

            # Get bounding box coordinates
            x_min, y_min, x_max, y_max = results.boxes.xyxy[0].cpu().numpy()

            # Draw bounding box and classification result on the frame
            frame = cv2.rectangle(frame, (int(x_min), int(y_min)), (int(x_max), int(y_max)), (0, 0, 255), 2)
            cv2.putText(frame, results_classification.upper(), (int(x_min), int(y_min) - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

            # Keypoints of the first detection; each row is unpacked as (x, y, confidence)
            keypoints = results.keypoints.data[0].cpu().numpy()

            # Draw only keypoints above the confidence threshold
            for keypoint in keypoints:
                x, y, confidence = keypoint
                if confidence > KEYPOINT_CONF_THRESHOLD:
                    cv2.circle(frame, (int(x), int(y)), 5, (0, 255, 0), -1)  # Draw green keypoints

            for (start, end) in connections:
                if start < len(keypoints) and end < len(keypoints):  # Ensure indices are within bounds
                    x_start, y_start, conf_start = keypoints[start]
                    x_end, y_end, conf_end = keypoints[end]
                    # Draw line if both points have high confidence
                    if conf_start > KEYPOINT_CONF_THRESHOLD and conf_end > KEYPOINT_CONF_THRESHOLD:
                        cv2.line(frame, (int(x_start), int(y_start)), (int(x_end), int(y_end)), (255, 0, 0), 2)

        # Display the frame (annotated, or with the "no person" message)
        cv2.imshow('Action Detection with Keypoints', frame)

        # Break loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the video capture object and close windows
    cap.release()
    cv2.destroyAllWindows()



0: 480x640 (no detections), 986.1ms
Speed: 44.7ms preprocess, 986.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1031.0ms
Speed: 12.6ms preprocess, 1031.0ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 952.3ms
Speed: 6.1ms preprocess, 952.3ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1076.6ms
Speed: 28.5ms preprocess, 1076.6ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1055.5ms
Speed: 7.0ms preprocess, 1055.5ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 959.2ms
Speed: 6.6ms preprocess, 959.2ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 936.1ms
Speed: 6.3ms preprocess, 936.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 932.6ms
Speed: 4.1ms preprocess, 932.6ms inference, 8.1ms postproc