In [1]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be
# read from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!mkdir data/

# connect drive

!cp /content/drive/MyDrive/landmark_train.csv data
!gdown  1gzWOtABiVmJ38usCSDe5F9gR2tECt3zu -O data/
# !gdown  15lwipssmC_K82ukRfb0uVCiDH1TZ3QCf -O data/
!gdown  1nIo1_wBmkovz-u_BCsV5c1Kbz6ZqoKwq -O data/
# Download file hand_gesture.yaml
!gdown  1ZteHYSgbuZu_GcUJHW8ZzoZv1DE8-oLw -O data/

mkdir: cannot create directory ‘data/’: File exists
Downloading...
From: https://drive.google.com/uc?id=1gzWOtABiVmJ38usCSDe5F9gR2tECt3zu
To: /content/data/landmark_val.csv
100% 369k/369k [00:00<00:00, 91.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1nIo1_wBmkovz-u_BCsV5c1Kbz6ZqoKwq
To: /content/data/landmark_test.csv
100% 320k/320k [00:00<00:00, 61.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZteHYSgbuZu_GcUJHW8ZzoZv1DE8-oLw
To: /content/data/hand_gesture.yaml
100% 120/120 [00:00<00:00, 679kB/s]


In [3]:
# Install the third-party packages imported by the notebook.
# mediapipe is pinned to an exact release.
!pip install mediapipe==0.10.18
# NOTE(review): torchmetrics is not pinned, so a fresh run may install a
# different release than the one these results were produced with.
!pip install torchmetrics



In [4]:
import os
import cv2
import yaml
import torch
import numpy as np
import pandas as pd
from torch import nn
import mediapipe as mp
from torch import optim
from datetime import datetime
from torchmetrics import Accuracy
from torch.utils.data import Dataset, DataLoader

In [5]:
class CustomImageDataset(Dataset):
    """Dataset backed by a CSV file of labelled feature rows.

    The CSV is read with ``pandas.read_csv`` (so its first line is treated as
    a header). Column 0 holds the class label; every remaining column is a
    numeric feature.

    Attributes:
        data: the raw DataFrame as loaded from ``filepath``.
        labels: 1-D tensor built from column 0.
        features: 2-D float32 tensor built from columns 1..end.
    """

    def __init__(self, filepath):
        """Load ``filepath`` and convert labels and features to tensors once."""
        self.data = pd.read_csv(filepath)
        self.labels = torch.from_numpy(self.data.iloc[:, 0].to_numpy())
        # Convert all feature columns up front so that __getitem__ is a plain
        # tensor index instead of a per-sample pandas lookup + array conversion.
        self.features = torch.from_numpy(
            self.data.iloc[:, 1:].to_numpy(dtype=np.float32)
        )

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx``.

        ``features`` is float32; ``label`` is the raw value from column 0
        (a class index, not a one-hot vector).
        """
        return self.features[idx], self.labels[idx]

In [6]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier over flattened feature vectors.

    The number of output classes is the number of entries returned by
    ``label_dict_from_config_file("./data/hand_gesture.yaml")``, so that file
    must exist when the model is constructed.

    Args:
        input_dim: size of each flattened input sample (default 63;
            presumably 21 hand landmarks x (x, y, z) — TODO confirm).
        hidden_dim: width of every hidden layer.
    """

    def __init__(self, input_dim=63, hidden_dim=128):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        list_label = label_dict_from_config_file("./data/hand_gesture.yaml")
        self.output_dim = len(list_label)

        # Layer order and attribute names determine the state_dict keys, so
        # they must stay as-is for previously saved checkpoints to load.
        self.model = nn.Sequential(
            # Layer 1
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            # Layer 2
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            # Layer 3
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.4),
            # Layer 4
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.6),
            # Output layer
            nn.Linear(hidden_dim, self.output_dim)
        )

    def forward(self, x):
        """Flatten ``x`` to (batch, features) and return the raw logits."""
        x = self.flatten(x)
        return self.model(x)

    def predict(self, x, threshold=0.8):
        """Return the predicted class per sample, or -1 when not confident.

        A sample is assigned its arg-max class only if that class's softmax
        probability is strictly greater than ``threshold``; otherwise -1.

        Returns:
            1-D integer tensor with one entry per sample in the batch.
        """
        logits = self(x)
        softmax_prob = nn.Softmax(dim=1)(logits)
        chosen_ind = torch.argmax(softmax_prob, dim=1)
        # Each sample must be compared against its OWN top probability.
        # Indexing row 0 (softmax_prob[0, chosen_ind]) is only correct for a
        # batch of one and silently mis-thresholds every other row.
        top_prob = torch.amax(softmax_prob, dim=1)
        return torch.where(top_prob > threshold, chosen_ind, -1)

    def predict_with_known_class(self, x):
        """Return the arg-max class per sample with no confidence threshold."""
        logits = self(x)
        softmax_prob = nn.Softmax(dim=1)(logits)
        return torch.argmax(softmax_prob, dim=1)

    def score(self, logits):
        """Return the negated maximum logit of each row of ``logits``."""
        return -torch.amax(logits, dim=1)


def label_dict_from_config_file(relative_path):
    """Read a YAML config file and return the value stored under "gestures".

    Args:
        relative_path: path to the YAML file to open.

    Returns:
        Whatever object the YAML document holds under its top-level
        "gestures" key.
    """
    # NOTE(review): this file is downloaded rather than authored locally;
    # yaml.safe_load would be the more restrictive loader if the document
    # only contains plain YAML — confirm before switching.
    with open(relative_path, "r") as config_file:
        config = yaml.full_load(config_file)
    return config["gestures"]

In [7]:
class EarlyStopper:
    """Signal when a monitored value (lower is better) has stopped improving.

    Attributes:
        patience: number of "worse" observations tolerated before stopping.
        min_delta: margin above the best value that still does not count
            as worse.
        counter: "worse" observations seen since the last new best.
        watched_metrics: lowest value observed so far (starts at +inf).
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.watched_metrics = np.inf

    def early_stop(self, current_value):
        """Record ``current_value`` and return True once patience is exhausted.

        A strictly lower value becomes the new best and resets the counter.
        A value above ``best + min_delta`` increments the counter. Anything
        in between leaves the state untouched.
        """
        best_so_far = self.watched_metrics

        # New best: remember it and start counting from scratch.
        if current_value < best_so_far:
            self.watched_metrics, self.counter = current_value, 0
            return False

        # Inside the tolerated band (or not comparable, e.g. NaN): no change.
        if not current_value > (best_so_far + self.min_delta):
            return False

        self.counter += 1
        return bool(self.counter >= self.patience)

In [8]:
class HandLandmarksDetector():
    """Thin wrapper around the MediaPipe Hands solution.

    The detector is configured for streaming input
    (``static_image_mode=False``), at most one hand, and a minimum detection
    confidence of 0.5.
    """

    def __init__(self) -> None:
        self.mp_drawing = mp.solutions.drawing_utils
        self.mp_drawing_styles = mp.solutions.drawing_styles
        self.mp_hands = mp.solutions.hands
        self.detector = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.5
        )

    def detectHand(self, frame):
        """
        Detects the hand landmarks in the frame and returns
        the landmarks along with an annotated image.

        The frame is flipped horizontally before detection and is converted
        from BGR to RGB for the detector, so it is expected to be a BGR image.

        Returns:
            hands: list with one entry per detected hand; each entry is a
                flat list ``[x0, y0, z0, x1, y1, z1, ...]`` of the hand's
                landmark coordinates. Empty when no hand is detected.
            annotated_image: copy of the flipped frame with the landmarks and
                connections of every detected hand drawn on it.
        """
        hands = []
        frame = cv2.flip(frame, 1)
        annotated_image = frame.copy()
        results = self.detector.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if results.multi_hand_landmarks is not None:
            for hand_landmarks in results.multi_hand_landmarks:
                hand = []
                self.mp_drawing.draw_landmarks(
                    annotated_image,
                    hand_landmarks,
                    self.mp_hands.HAND_CONNECTIONS,
                    self.mp_drawing_styles.get_default_hand_landmarks_style(),
                    self.mp_drawing_styles.get_default_hand_connections_style())
                for landmark in hand_landmarks.landmark:
                    x, y, z = landmark.x, landmark.y, landmark.z
                    hand.extend([x, y, z])
                # Append inside the per-hand loop; appending after the loop
                # would keep only the last detected hand.
                hands.append(hand)
        return hands, annotated_image

In [9]:
def train(train_loader, val_loader, model, criterion, early_stopper, optimizer, epochs=300):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Reads two module-level names that must be defined before calling:
    ``LIST_LABEL`` (its length is the number of classes for the accuracy
    metric) and ``save_path`` (directory the checkpoints are written to).

    Args:
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        model: module whose forward pass returns class logits.
        criterion: loss called as ``criterion(logits, labels)``.
        early_stopper: object with ``early_stop(value) -> bool`` and a
            ``watched_metrics`` attribute; fed the mean validation loss.
        optimizer: optimizer over ``model``'s parameters.
        epochs: maximum number of epochs.

    Returns:
        ``(model, best_model_path)``. ``best_model_path`` is the checkpoint
        with the lowest mean validation loss; if no epoch ever improved on
        the initial bound it falls back to the "last" checkpoint path.
    """
    best_vloss = 1_000_000
    # Defined up front so the return/print below cannot hit an unbound name
    # when no epoch runs or the validation loss never improves (e.g. NaN).
    best_model_path = None
    acc_val = None
    # NOTE(review): the timestamp puts a space and ':' into the checkpoint
    # file names, which some filesystems reject.
    timestamp = datetime.now().strftime('%d-%m %H:%M')
    num_classes = len(LIST_LABEL)

    for epoch in range(epochs):
        # training
        model.train(True)
        running_loss = 0.0
        acc_train = Accuracy(num_classes=num_classes, task='MULTICLASS')
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            preds = model(inputs)
            loss = criterion(preds, labels)
            loss.backward()
            optimizer.step()

            # Reuse the logits already computed for the loss. A second forward
            # pass in train mode would re-sample dropout and update the
            # BatchNorm running statistics a second time for the same batch.
            acc_train.update(preds.detach().argmax(dim=1), labels)
            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)

        # validation
        model.train(False)
        running_vloss = 0.0
        acc_val = Accuracy(num_classes=num_classes, task='MULTICLASS')
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for vinputs, vlabels in val_loader:
                vpreds = model(vinputs)
                running_vloss += criterion(vpreds, vlabels).item()
                acc_val.update(vpreds.argmax(dim=1), vlabels)

        # Log the running loss averaged per batch for both training and validation
        print(f"Epoch [{epoch+1}/{epochs}]: ")
        print(f"\ttrain_acc: {acc_train.compute().item()}, val_acc: {acc_val.compute().item()}")
        avg_vloss = running_vloss / len(val_loader)
        print(f'\ttrain_loss: {avg_loss}, val_loss: {avg_vloss}')

        # Track best performance, and save the model's state
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            best_model_path = f'./{save_path}/model_{timestamp}_{model.__class__.__name__}_best'
            torch.save(model.state_dict(), best_model_path)

        if early_stopper.early_stop(avg_vloss):
            print(f'Stopping at epoch {epoch+1}, minimum: {early_stopper.watched_metrics}')
            break

    model_path = f'./{save_path}/model_{timestamp}_{model.__class__.__name__}_last'
    torch.save(model.state_dict(), model_path)

    # Always hand the caller a path that exists on disk.
    if best_model_path is None:
        best_model_path = model_path

    if acc_val is not None:
        print(acc_val.compute())
    return model, best_model_path

In [10]:
# --- Paths and label configuration -----------------------------------------
DATA_FOLDER_PATH = "./data/"
LIST_LABEL = label_dict_from_config_file("./data/hand_gesture.yaml")

train_path, val_path, test_path = (
    os.path.join(DATA_FOLDER_PATH, csv_name)
    for csv_name in ("landmark_train.csv", "landmark_val.csv", "landmark_test.csv")
)

# Checkpoints are written here; create the directory if it is missing.
save_path = './models'
os.makedirs(save_path, exist_ok=True)

# --- Datasets and loaders (only the training split is shuffled) ------------
train_set = CustomImageDataset(train_path)
train_loader = DataLoader(dataset=train_set, batch_size=40, shuffle=True)

val_set = CustomImageDataset(val_path)
val_loader = DataLoader(dataset=val_set, batch_size=50, shuffle=False)

test_set = CustomImageDataset(test_path)
test_loader = DataLoader(dataset=test_set, batch_size=20, shuffle=False)

# --- Model, loss, early stopping and optimizer ------------------------------
model = NeuralNetwork()
criterion = nn.CrossEntropyLoss()
early_stopper = EarlyStopper(min_delta=0.01, patience=30)
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [11]:
# Run training with the default epoch limit; keeps the trained model and the
# path of the checkpoint returned by train() for later evaluation.
model, best_model_path = train(train_loader, val_loader, model, criterion, early_stopper, optimizer)

Epoch [1/300]: 
	train_acc: 0.246257483959198, val_acc: 0.34812286496162415
	train_loss: 1.6001512701831646, val_loss: 1.5891388654708862
Epoch [2/300]: 
	train_acc: 0.397455096244812, val_acc: 0.35494881868362427
	train_loss: 1.552045699375779, val_loss: 1.5426491300264995
Epoch [3/300]: 
	train_acc: 0.5071107745170593, val_acc: 0.41296929121017456
	train_loss: 1.4377595559874576, val_loss: 1.4003636240959167
Epoch [4/300]: 
	train_acc: 0.6253742575645447, val_acc: 0.552901029586792
	train_loss: 1.1840336509604952, val_loss: 1.0669372876485188
Epoch [5/300]: 
	train_acc: 0.7529940009117126, val_acc: 0.788395881652832
	train_loss: 0.8167673731917766, val_loss: 0.647696519891421
Epoch [6/300]: 
	train_acc: 0.8259730339050293, val_acc: 0.8976109027862549
	train_loss: 0.550378443589851, val_loss: 0.39768968025843304
Epoch [7/300]: 
	train_acc: 0.908682644367218, val_acc: 0.9897611141204834
	train_loss: 0.3653398723299824, val_loss: 0.19972985113660494
Epoch [8/300]: 
	train_acc: 0.9472305

In [12]:
# Evaluate the checkpoint at best_model_path on the test split.
network = NeuralNetwork()
# The checkpoint is a plain state_dict (tensors only), so the restricted
# weights-only loader is sufficient and avoids unpickling arbitrary objects.
network.load_state_dict(torch.load(best_model_path, weights_only=True))

network.eval()
acc_test = Accuracy(num_classes=len(LIST_LABEL), task='MULTICLASS')
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for test_input, test_label in test_loader:
        preds = network(test_input)
        acc_test.update(preds, test_label)

print(network.__class__.__name__)
print(f"Accuracy of model: {acc_test.compute().item()}")
print("========================================================================")

NeuralNetwork
Accuracy of model: 0.9724409580230713
