In [1]:
import os
import json
from PIL import Image
import cv2
import mediapipe as mp
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import ImageFolder, ImageNet
from torchvision import transforms, models
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights
import random, shutil

In [2]:
if torch.cuda.is_available():
    print("CUDA is available. PyTorch can access the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")


CUDA is available. PyTorch can access the GPU.
Number of GPUs available: 1
Current GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

dataset = ImageFolder(
    root = "newdata",
    transform = transform

)
#'dataset' is used to identify the folder in which we have our dataset

In [4]:
class_names = dataset.classes
num_classes = len(class_names)
print(class_names)
print(f"Number of classes: {num_classes}") 

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
Number of classes: 36


In [5]:
SOURCE_DIR = "newdata"
OUTPUT_DIR = "data"
TRAIN_RATIO = 0.8

random.seed(42)

for split in ["train", "val"]:
    os.makedirs(os.path.join(OUTPUT_DIR, split), exist_ok=True)

for class_name in os.listdir(SOURCE_DIR):
    class_path = os.path.join(SOURCE_DIR, class_name)

    if not os.path.isdir(class_path):
        continue

    images = [
        img for img in os.listdir(class_path)
        if img.lower().endswith((".jpg", ".png", ".jpeg"))
    ]

    random.shuffle(images)

    n_train = int(len(images) * TRAIN_RATIO)
    train_imgs = images[:n_train]
    val_imgs = images[n_train:]

    for split, split_imgs in [("train", train_imgs), ("val", val_imgs)]:
        split_class_dir = os.path.join(OUTPUT_DIR, split, class_name)
        os.makedirs(split_class_dir, exist_ok=True)

        for img in split_imgs:
            shutil.copy(
                os.path.join(class_path, img),
                os.path.join(split_class_dir, img)
            )

    print(f"{class_name}: {len(train_imgs)} train, {len(val_imgs)} val")

print("\nTrain / Validation split completed.")

0: 20 train, 5 val
1: 20 train, 5 val
2: 20 train, 5 val
3: 20 train, 5 val
4: 20 train, 5 val
5: 20 train, 5 val
6: 20 train, 5 val
7: 20 train, 5 val
8: 20 train, 5 val
9: 20 train, 5 val
A: 20 train, 5 val
B: 20 train, 5 val
C: 20 train, 5 val
D: 20 train, 5 val
E: 20 train, 5 val
F: 20 train, 5 val
G: 20 train, 5 val
H: 20 train, 5 val
I: 20 train, 5 val
J: 20 train, 5 val
K: 20 train, 5 val
L: 20 train, 5 val
M: 20 train, 5 val
N: 20 train, 5 val
O: 20 train, 5 val
P: 20 train, 5 val
Q: 20 train, 5 val
R: 20 train, 5 val
S: 20 train, 5 val
T: 20 train, 5 val
U: 20 train, 5 val
V: 20 train, 5 val
W: 20 train, 5 val
X: 20 train, 5 val
Y: 20 train, 5 val
Z: 20 train, 5 val

Train / Validation split completed.


In [6]:
train_dataset = ImageFolder(
    root="data/train",
    transform=transform
)

val_dataset = ImageFolder(
    root="data/val",
    transform=transform
)

In [7]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    num_workers=0
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=0
)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights)

for param in model.features.parameters():
    param.requires_grad = False
#freezing the feature extractor

model.classifier[1] = nn.Linear(
    model.last_channel,
    num_classes 
    #Linear(1280 → 36)
)

model = model.to(device)

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.classifier.parameters(),
    lr=1e-4
)

In [14]:
epochs = 20

for epoch in range(epochs):
    #train
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = running_loss / len(train_loader)
    train_acc = 100 * correct / total

    #val
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = 100 * val_correct / val_total

    print(
        f"Epoch [{epoch+1}/{epochs}] "
        f"- Train Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}% "
        f"- Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%"
    )


Epoch [1/20] - Train Loss: 1.7949, Train Acc: 88.19% - Val Loss: 1.9430, Val Acc: 77.22%
Epoch [2/20] - Train Loss: 1.7375, Train Acc: 88.75% - Val Loss: 1.9248, Val Acc: 77.78%
Epoch [3/20] - Train Loss: 1.6916, Train Acc: 87.92% - Val Loss: 1.8784, Val Acc: 75.56%
Epoch [4/20] - Train Loss: 1.6604, Train Acc: 89.58% - Val Loss: 1.8313, Val Acc: 78.33%
Epoch [5/20] - Train Loss: 1.6275, Train Acc: 89.17% - Val Loss: 1.8020, Val Acc: 78.33%
Epoch [6/20] - Train Loss: 1.5937, Train Acc: 88.61% - Val Loss: 1.7710, Val Acc: 76.67%
Epoch [7/20] - Train Loss: 1.5746, Train Acc: 89.44% - Val Loss: 1.7438, Val Acc: 78.89%
Epoch [8/20] - Train Loss: 1.5026, Train Acc: 90.14% - Val Loss: 1.7134, Val Acc: 77.78%
Epoch [9/20] - Train Loss: 1.4727, Train Acc: 90.83% - Val Loss: 1.6850, Val Acc: 78.89%
Epoch [10/20] - Train Loss: 1.4333, Train Acc: 91.11% - Val Loss: 1.6677, Val Acc: 76.67%
Epoch [11/20] - Train Loss: 1.3885, Train Acc: 90.83% - Val Loss: 1.6072, Val Acc: 80.56%
Epoch [12/20] - Tra

In [15]:
torch.save(model.state_dict(), "mobilenet_gesture.pth")
print("Model saved.")

Model saved.


In [16]:
import json

with open("class_names.json", "w") as f:
    json.dump(class_names, f)

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=None)  # no pretrained at inference

model.classifier[1] = torch.nn.Linear(
    model.last_channel,
    len(class_names)
)

model.load_state_dict(torch.load("mobilenet_gesture.pth", map_location=device))
model.to(device)
model.eval()

transform = weights.transforms()

In [18]:
def preprocess(img):
    if img is None or img.size == 0:
        return None

    img = cv2.resize(img, (255, 255))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype("float32") / 255.0
    img = np.transpose(img, (2, 0, 1))  # HWC → CHW
    img = torch.from_numpy(img).unsqueeze(0)
    return img

In [22]:
import cv2
import torch

cap = cv2.VideoCapture(0)
cv2.namedWindow("Gesture Detection", cv2.WINDOW_NORMAL)

model.eval()

with torch.no_grad():
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        hand = frame

        if hand is None:
            cv2.putText(
                frame, "No hand detected",
                (30, 40),
                cv2.FONT_HERSHEY_SIMPLEX,
                1, (0, 0, 255), 2
            )
            cv2.imshow("Gesture Detection", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
            continue

        x = preprocess(hand).to(device)

        outputs = model(x)
        probs = torch.softmax(outputs, dim=1)
        conf, pred = torch.max(probs, dim=1)

        if conf.item() < 0.3:
            label = "Unknown"
        else:
            label = class_names[pred.item()]

        cv2.putText(
            frame,
            f"{label} ({conf.item():.2f})",
            (30, 40),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
            (0, 255, 0),
            2
        )

        cv2.imshow("Gesture Detection", frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()

KeyboardInterrupt: 

In [20]:
print(class_names)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [21]:
model.classifier[1].out_features == len(class_names)

True