In [1]:
import os
import cv2
import mediapipe as mp
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [3]:
DATA_DIR = './data'
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

number_of_classes = 5
dataset_size = 1000

cap = cv2.VideoCapture(0)
for j in range(number_of_classes):
    if not os.path.exists(os.path.join(DATA_DIR, str(j))):
        os.makedirs(os.path.join(DATA_DIR, str(j)))

    print('Collecting data for class {}'.format(j))
    
    while True:
        ret, frame = cap.read()
        cv2.putText(frame, 'Ready? Press "Q" ! :)', (100, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 255, 0), 3,cv2.LINE_AA)
        cv2.imshow('frame', frame)
        if cv2.waitKey(25) == ord('q'):
            break

    counter = 0
    while counter < dataset_size:
        ret, frame = cap.read()
        cv2.imshow('frame', frame)
        cv2.waitKey(25)
        cv2.imwrite(os.path.join(DATA_DIR, str(j), '{}.jpg'.format(counter)), frame)

        counter += 1

cap.release()
cv2.destroyAllWindows()


Collecting data for class 0


2024-11-06 10:52:37.528 python[1321:17834] +[IMKClient subclass]: chose IMKClient_Legacy
2024-11-06 10:52:37.528 python[1321:17834] +[IMKInputSession subclass]: chose IMKInputSession_Legacy


Collecting data for class 1
Collecting data for class 2
Collecting data for class 3
Collecting data for class 4


In [6]:
## Creating the marking for the hand
# Initialize Mediapipe Hands and Drawing Utilities
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Set up the Hands model in static image mode
hands = mp_hands.Hands(static_image_mode=True, min_detection_confidence=0.3)

# Directory containing the image data
DATA_DIR = './data'

# Prepare lists to hold data and labels
data = []
labels = []

# Loop through each directory and image file in DATA_DIR
for dir_ in os.listdir(DATA_DIR):
    for img_path in os.listdir(os.path.join(DATA_DIR, dir_)):
        data_aux = []
        x_ = []
        y_ = []

        # Construct full path to the image
        img_path_full = os.path.join(DATA_DIR, dir_, img_path)
        
        # Read the image
        img = cv2.imread(img_path_full)

        # Check if image is successfully loaded
        if img is None:
            print(f"Failed to load image at path: {img_path_full}")
            continue

        # Convert the image to RGB for Mediapipe processing
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Process the image to detect hands and landmarks
        results = hands.process(img_rgb)
        if results.multi_hand_landmarks:
            # Loop through each detected hand
            for hand_landmarks in results.multi_hand_landmarks:
                # Extract landmarks and store x, y coordinates
                for i in range(len(hand_landmarks.landmark)):
                    x = hand_landmarks.landmark[i].x
                    y = hand_landmarks.landmark[i].y
                    x_.append(x)
                    y_.append(y)

                # Normalize landmarks by subtracting minimum x, y values
                for i in range(len(hand_landmarks.landmark)):
                    x = hand_landmarks.landmark[i].x
                    y = hand_landmarks.landmark[i].y
                    data_aux.append(x - min(x_))
                    data_aux.append(y - min(y_))

            # Append processed data and labels
            data.append(data_aux)
            labels.append(dir_)

# Save processed data and labels to a pickle file
with open('data.pickle', 'wb') as f:
    pickle.dump({'data': data, 'labels': labels}, f)

# Release resources
hands.close()
print("Data processing complete and saved to 'data.pickle'")


I0000 00:00:1730871233.115874   17834 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M3 Pro


Failed to load image at path: ./data/3/.ipynb_checkpoints
Data processing complete and saved to 'data.pickle'


In [4]:
##Model train
# Load data
data_dict = pickle.load(open('./data.pickle', 'rb'))
data = data_dict['data']
labels = data_dict['labels']

# Determine the fixed length for each sample (e.g., 42 coordinates if 21 landmarks with x and y each)
fixed_length = 42  # Adjust this based on your landmarks

# Pad or truncate data to fixed length
data_padded = []
for sample in data:
    if len(sample) < fixed_length:
        sample += [0] * (fixed_length - len(sample))
    else:
        sample = sample[:fixed_length]
    data_padded.append(sample)

# Convert to numpy array
data_padded = np.array(data_padded)
labels = np.asarray(labels)

# Encode labels to numeric values
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split data
x_train, x_test, y_train, y_test = train_test_split(data_padded, labels_encoded, test_size=0.2, shuffle=True, stratify=labels_encoded)

# Convert data to PyTorch tensors
x_train_tensor = torch.tensor(x_train, dtype=torch.float32)
x_test_tensor = torch.tensor(x_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define model parameters
input_size = fixed_length
hidden_size = 64
output_size = len(np.unique(labels_encoded))

# Define your PyTorch model
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Load model if exists, otherwise train
model_path = 'model.pth'
model = SimpleNN(input_size, hidden_size, output_size)

if os.path.exists(model_path):
    print("Loading model from file...")
    model.load_state_dict(torch.load(model_path))
else:
    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop
    num_epochs = 1000
    for epoch in range(num_epochs):
        outputs = model(x_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Save the model's state dictionary
    torch.save(model.state_dict(), model_path)
    print("Model trained and saved to file.")

# Evaluation
model.eval()
with torch.no_grad():
    outputs = model(x_test_tensor)
    _, predicted = torch.max(outputs, 1)
    accuracy = accuracy_score(predicted.numpy(), y_test)

print('{}% of samples were classified correctly!'.format(accuracy * 100))


Loading model from file...
99.47423764458465% of samples were classified correctly!


  model.load_state_dict(torch.load(model_path))


In [4]:
## Model testing
# Define the SimpleNN class to match the saved model because pytorch only saves the the weights 
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Model parameters 
input_size = 42  # This should match the fixed length used in training
hidden_size = 64
output_size = 5  

# Instantiate and load the model
model = SimpleNN(input_size, hidden_size, output_size)
model.load_state_dict(torch.load('./model.pth'))
model.eval()  # Set the model to evaluation mode

# Initialize video capture
cap = cv2.VideoCapture(0)

# Initialize MediaPipe hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# Configure MediaPipe Hands
hands = mp_hands.Hands(static_image_mode=False, min_detection_confidence=0.3)

# Label dictionary
labels_dict = {0: "Hello", 1: "Please", 2: "Thank you", 3: "OK", 4: "Thumbs up"}

while True:
    data_aux = []
    x_ = []
    y_ = []

    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        break

    H, W, _ = frame.shape
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Process the image and detect hands
    results = hands.process(frame_rgb)
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(
                frame,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style()
            )

            # Collect x, y coordinates of each landmark
            for landmark in hand_landmarks.landmark:
                x = landmark.x
                y = landmark.y
                x_.append(x)
                y_.append(y)

            # Normalize landmarks and construct data_aux with 42 elements
            for landmark in hand_landmarks.landmark:
                data_aux.append(landmark.x - min(x_))
                data_aux.append(landmark.y - min(y_))

            # Ensure data_aux matches the expected input shape of the model
            data_aux = data_aux[:42]  # Keep only 42 values if more were added

            # Bounding box coordinates for hand
            x1 = int(min(x_) * W) - 10
            y1 = int(min(y_) * H) - 10
            x2 = int(max(x_) * W) - 10
            y2 = int(max(y_) * H) - 10

            # Inference
            with torch.no_grad():
                inputs = torch.tensor(data_aux, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
                outputs = model(inputs)  # Pass the input through the model
                predicted_index = torch.argmax(outputs, dim=1).item()  # Get predicted class index
                predicted_character = labels_dict.get(predicted_index, "Unknown")  # Get label or default to "Unknown"

            # Display bounding box and prediction on frame
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), 4)
            cv2.putText(frame, predicted_character, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1.3, (0, 0, 0), 3,cv2.LINE_AA)

    # Display the resulting frame
    cv2.imshow('frame', frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):  # Exit if 'q' key is pressed
        break

# Release the capture and close windows
cap.release()
cv2.destroyAllWindows()


  model.load_state_dict(torch.load('./model.pth'))
I0000 00:00:1732078843.182878   10673 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M3 Pro
