In [1]:
import os
import json
from PIL import Image
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder, ImageNet
from torchvision import transforms, models
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
import random, shutil

In [2]:
if torch.cuda.is_available():
    print("CUDA is available. PyTorch can access the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")


CUDA is available. PyTorch can access the GPU.
Number of GPUs available: 1
Current GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

train_dataset = ImageFolder(
    root="archive/train",
    transform=transform
)

val_dataset = ImageFolder(
    root="archive/test",
    transform=transform
)

In [4]:
class_names = train_dataset.classes
num_classes = len(class_names)
print(class_names)
print(f"Number of classes: {num_classes}") 

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y']
Number of classes: 24


In [5]:
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    pin_memory = True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4
)

train_dataset.classes == val_dataset.classes

print(len(train_dataset), len(val_dataset))

27455 7172


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights)

for param in model.features.parameters():
    param.requires_grad = False
#freezing the feature extractor

model.classifier[1] = nn.Linear(
    model.last_channel,
    num_classes 
    #Linear(1280 → 36)
)

model = model.to(device)

In [7]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.classifier.parameters(),
    lr=1e-4
)

scaler = torch.amp.GradScaler('cuda')

In [8]:
epochs = 15

for epoch in range(epochs):
    # TRAIN
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast('cuda'):
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    train_acc = 100 * correct / total

    # VALIDATION
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with torch.amp.autocast('cuda'):
                outputs = model(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * val_correct / val_total

    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"- Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}% "
        f"- Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%"
    )


Epoch [1/15] - Train Loss: 2.9462, Train Acc: 26.77% - Val Loss: 2.7473, Val Acc: 40.37%
Epoch [2/15] - Train Loss: 2.5282, Train Acc: 54.56% - Val Loss: 2.4014, Val Acc: 51.88%
Epoch [3/15] - Train Loss: 2.2047, Train Acc: 64.07% - Val Loss: 2.1340, Val Acc: 58.44%
Epoch [4/15] - Train Loss: 1.9510, Train Acc: 68.95% - Val Loss: 1.9293, Val Acc: 62.37%
Epoch [5/15] - Train Loss: 1.7535, Train Acc: 71.81% - Val Loss: 1.7627, Val Acc: 63.97%
Epoch [6/15] - Train Loss: 1.5920, Train Acc: 74.02% - Val Loss: 1.6308, Val Acc: 66.23%
Epoch [7/15] - Train Loss: 1.4641, Train Acc: 75.66% - Val Loss: 1.5325, Val Acc: 67.18%
Epoch [8/15] - Train Loss: 1.3553, Train Acc: 76.99% - Val Loss: 1.4368, Val Acc: 69.00%
Epoch [9/15] - Train Loss: 1.2638, Train Acc: 78.15% - Val Loss: 1.3704, Val Acc: 69.81%
Epoch [10/15] - Train Loss: 1.1846, Train Acc: 79.33% - Val Loss: 1.2956, Val Acc: 70.93%
Epoch [11/15] - Train Loss: 1.1152, Train Acc: 80.35% - Val Loss: 1.2365, Val Acc: 71.82%
Epoch [12/15] - Tra

In [9]:
torch.save(model.state_dict(), "mobilent_new.pth")
print("Model saved.")

Model saved.


In [10]:
import json

with open("class_names.json", "w") as f:
    json.dump(class_names, f)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=None)  # no pretrained at inference

model.classifier[1] = torch.nn.Linear(
    model.last_channel,
    len(class_names)
)

model.load_state_dict(torch.load("mobilent_new.pth", map_location=device))
model.to(device)
model.eval()

transform = weights.transforms()

In [12]:
def preprocess(img):
    if img is None or img.size == 0:
        return None

    img = cv2.resize(img, (255, 255))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype("float32") / 255.0
    img = np.transpose(img, (2, 0, 1))  # HWC → CHW
    img = torch.from_numpy(img).unsqueeze(0)
    return img

In [13]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.6,
    min_tracking_confidence=0.6
)

In [14]:
cap = cv2.VideoCapture(0)
mp_drawing = mp.solutions.drawing_utils

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame = cv2.flip(frame, 1)

    h, w, _ = frame.shape

    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)

    if results.multi_hand_landmarks:
        hand_landmarks = results.multi_hand_landmarks[0]

        # Draw skeleton (visual only)
        mp_drawing.draw_landmarks(
            frame,
            hand_landmarks,
            mp_hands.HAND_CONNECTIONS
        )

        # Landmark bounding box
        x_coords = [lm.x for lm in hand_landmarks.landmark]
        y_coords = [lm.y for lm in hand_landmarks.landmark]

        xmin = int(min(x_coords) * w)
        ymin = int(min(y_coords) * h)
        xmax = int(max(x_coords) * w)
        ymax = int(max(y_coords) * h)

        hand_crop = frame[ymin:ymax, xmin:xmax]
        
        # Make square & scale
        box_size = max(xmax - xmin, ymax - ymin)
        cx = (xmin + xmax) // 2
        cy = (ymin + ymax) // 2

        scale = 1.3
        half = int(box_size * scale / 2)

        xmin = max(0, cx - half)
        ymin = max(0, cy - half)
        xmax = min(w, cx + half)
        ymax = min(h, cy + half)

        hand_crop = frame[ymin:ymax, xmin:xmax]

        if hand_crop.size != 0:
            hand_rgb = cv2.cvtColor(hand_crop, cv2.COLOR_BGR2RGB)
            hand_pil = Image.fromarray(hand_rgb)

            input_tensor = transform(hand_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                output = model(input_tensor)
                probs = torch.softmax(output, dim=1)
                conf, pred = torch.max(probs, dim=1)

            label = class_names[pred.item()]
            confidence = conf.item()

            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            cv2.putText(
                frame,
                f"{label} ({confidence:.2f})",
                (xmin, ymin - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.8,
                (0, 255, 0),
                2
            )

            cv2.imshow("HAND_CROP_DEBUG", hand_crop)

    cv2.imshow("Hand Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


In [21]:
print(class_names)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y']


In [22]:
model.classifier[1].out_features == len(class_names)

True

In [24]:
train_dataset.classes == val_dataset.classes

True