In [1]:
import os
import json
from PIL import Image
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder, ImageNet
from torchvision import transforms, models
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
import random, shutil

In [2]:
if torch.cuda.is_available():
    print("CUDA is available. PyTorch can access the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")


CUDA is available. PyTorch can access the GPU.
Number of GPUs available: 1
Current GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

dataset = ImageFolder(
    root = "archive",
    transform = transform
)
#'dataset' is used to identify the folder in which we have our dataset

In [4]:
class_names = dataset.classes
num_classes = len(class_names)
print(class_names)
print(f"Number of classes: {num_classes}") 

['Test', 'Train']
Number of classes: 2


In [5]:
SOURCE_DIR = "archive"
OUTPUT_DIR = "archive_output"
TRAIN_RATIO = 0.8

random.seed(42)

for split in ["train", "val"]:
    os.makedirs(os.path.join(OUTPUT_DIR, split), exist_ok=True)

for class_name in os.listdir(SOURCE_DIR):
    class_path = os.path.join(SOURCE_DIR, class_name)

    if not os.path.isdir(class_path):
        continue

    images = [
        img for img in os.listdir(class_path)
        if img.lower().endswith((".jpg", ".png", ".jpeg"))
    ]

    random.shuffle(images)

    n_train = int(len(images) * TRAIN_RATIO)
    train_imgs = images[:n_train]
    val_imgs = images[n_train:]

    for split, split_imgs in [("train", train_imgs), ("val", val_imgs)]:
        split_class_dir = os.path.join(OUTPUT_DIR, split, class_name)
        os.makedirs(split_class_dir, exist_ok=True)

        for img in split_imgs:
            shutil.copy(
                os.path.join(class_path, img),
                os.path.join(split_class_dir, img)
            )

    print(f"{class_name}: {len(train_imgs)} train, {len(val_imgs)} val")

print("\nTrain / Validation split completed.")

Test: 0 train, 0 val
Train: 0 train, 0 val

Train / Validation split completed.


In [6]:
train_dataset = ImageFolder(
    root="archive_output/train",
    transform=transform
)

val_dataset = ImageFolder(
    root="archive_output/val",
    transform=transform
)

FileNotFoundError: Found no valid file for the classes Test, Train. Supported extensions are: .jpg, .jpeg, .png, .ppm, .bmp, .pgm, .tif, .tiff, .webp

In [None]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=0
)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights)

for param in model.features.parameters():
    param.requires_grad = False
#freezing the feature extractor

model.classifier[1] = nn.Linear(
    model.last_channel,
    num_classes 
    #Linear(1280 → 36)
)

model = model.to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.classifier.parameters(),
    lr=1e-4
)

In [None]:
epochs = 15

for epoch in range(epochs):
    #train
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    train_acc = 100 * correct / total

    #val
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * val_correct / val_total

    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"- Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}% "
        f"- Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%"
    )


In [48]:
torch.save(model.state_dict(), "mobilenet_gesture.pth")
print("Model saved.")

Model saved.


In [49]:
import json

with open("class_names.json", "w") as f:
    json.dump(class_names, f)

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=None)  # no pretrained at inference

model.classifier[1] = torch.nn.Linear(
    model.last_channel,
    len(class_names)
)

model.load_state_dict(torch.load("mobilenet_gesture.pth", map_location=device))
model.to(device)
model.eval()

transform = weights.transforms()

In [51]:
def preprocess(img):
    if img is None or img.size == 0:
        return None

    img = cv2.resize(img, (255, 255))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype("float32") / 255.0
    img = np.transpose(img, (2, 0, 1))  # HWC → CHW
    img = torch.from_numpy(img).unsqueeze(0)
    return img

In [52]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.6
)

In [53]:
cap = cv2.VideoCapture(0)
mp_drawing = mp.solutions.drawing_utils

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)

    h, w, _ = frame.shape

    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)

    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]

        # Draw skeleton (visual only)
        mp_drawing.draw_landmarks(
            frame,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS
        )

        # Landmark bounding box
        x_coords = [lm.x for lm in hand_landmarks.landmark]
        y_coords = [lm.y for lm in hand_landmarks.landmark]

        xmin = int(min(x_coords) * w)
        ymin = int(min(y_coords) * h)
        xmax = int(max(x_coords) * w)
        ymax = int(max(y_coords) * h)

        hand_crop = frame[ymin:ymax, xmin:xmax]
        
        # Make square & scale
        box_size = max(xmax - xmin, ymax - ymin)
        cx = (xmin + xmax) // 2
        cy = (ymin + ymax) // 2

        scale = 1.3
        half = int(box_size * scale / 2)

        xmin = max(0, cx - half)
        ymin = max(0, cy - half)
        xmax = min(w, cx + half)
        ymax = min(h, cy + half)

        hand_crop = frame[ymin:ymax, xmin:xmax]

        if hand_crop.size != 0:
            hand_rgb = cv2.cvtColor(hand_crop, cv2.COLOR_BGR2RGB)
            hand_pil = Image.fromarray(hand_rgb)

            input_tensor = transform(hand_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                output = model(input_tensor)
                probs = torch.softmax(output, dim=1)
                conf, pred = torch.max(probs, dim=1)

            label = class_names[pred.item()]
            confidence = conf.item()

            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            cv2.putText(
                frame,
                f"{label} ({confidence:.2f})",
                (xmin, ymin - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.8,
                (0, 255, 0),
                2
            )

            cv2.imshow("HAND_CROP_DEBUG", hand_crop)

    cv2.imshow("Hand Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [41]:
print(class_names)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [42]:
model.classifier[1].out_features == len(class_names)

True

In [43]:
print(dataset.class_to_idx)
print(dataset.classes)

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9}
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
