In [1]:
import os
import json
from PIL import Image
import cv2
import mediapipe as mp
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights

In [2]:

if torch.cuda.is_available():
    print("CUDA is available. PyTorch can access the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"Current GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. PyTorch is using the CPU.")


CUDA is available. PyTorch can access the GPU.
Number of GPUs available: 1
Current GPU Name: NVIDIA GeForce RTX 4050 Laptop GPU


In [2]:
#class_names = ['Hello','Live long','Thank you','Thumbs down','Thumbs up']
class_names = [
    'hello',
    'thumbs_up',
    'thumbs_down',
    'yo'
]
num_classes = len(class_names)
print(f"Number of classes: {num_classes}") 
print(class_names)

Number of classes: 4
['hello', 'thumbs_up', 'thumbs_down', 'yo']


In [4]:
class COCODataset(Dataset):
    def __init__(self, images_dir, annotation_file, transform=None):
        self.images_dir = images_dir
        self.transform = transform

        with open(annotation_file, 'r') as f:
            coco = json.load(f)

        self.image_id_to_filename = {
            img['id']: img['file_name']
            for img in coco['images']
        }

        self.cat_id_to_name = {
            cat['id']: cat['name'].lower().replace(" ", "_")
            for cat in coco['categories']
        }

        self.class_to_idx = {
            name: idx for idx, name in enumerate(class_names)
        }

        self.samples = []
        for ann in coco['annotations']:
            image_id = ann['image_id']
            cat_name = self.cat_id_to_name[ann['category_id']]
            label = self.class_to_idx[cat_name]

            filename = self.image_id_to_filename[image_id]
            self.samples.append((filename, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        filename, label = self.samples[idx]
        base_name = os.path.basename(filename)
        img_path = os.path.join(self.images_dir, base_name)

        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, label

In [5]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [3]:
dataset = COCODataset(
    images_dir="hand_gestures/images",
    annotation_file="hand_gestures/result.json",
    transform=transform
)

loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    num_workers=0
)

NameError: name 'COCODataset' is not defined

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=weights)

model.classifier[1] = nn.Linear(
    model.last_channel,
    num_classes
)

model = model.to(device)

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [9]:
epochs = 15

for epoch in range(epochs):
    model.train()
    running_loss = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(loader)
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {avg_loss:.4f}")

Epoch [1/15] - Loss: 1.3644
Epoch [2/15] - Loss: 1.0890
Epoch [3/15] - Loss: 0.9565
Epoch [4/15] - Loss: 0.7874
Epoch [5/15] - Loss: 0.6536
Epoch [6/15] - Loss: 0.5554
Epoch [7/15] - Loss: 0.4696
Epoch [8/15] - Loss: 0.3558
Epoch [9/15] - Loss: 0.3313
Epoch [10/15] - Loss: 0.2742
Epoch [11/15] - Loss: 0.2013
Epoch [12/15] - Loss: 0.1538
Epoch [13/15] - Loss: 0.1434
Epoch [14/15] - Loss: 0.1872
Epoch [15/15] - Loss: 0.2406


In [10]:
torch.save(model.state_dict(), "mobilenet_gesture.pth")
print("Model saved.")

Model saved.


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights = MobileNet_V2_Weights.DEFAULT
model = mobilenet_v2(weights=None)  # no pretrained at inference

model.classifier[1] = torch.nn.Linear(
    model.last_channel,
    len(class_names)
)

model.load_state_dict(torch.load("mobilenet_gesture.pth", map_location=device))
model.to(device)
model.eval()

transform = weights.transforms()

In [11]:
cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    h, w, _ = frame.shape
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:

            # ---- Get bounding box from landmarks ----
            x_list = [lm.x for lm in hand_landmarks.landmark]
            y_list = [lm.y for lm in hand_landmarks.landmark]

            x_min = int(min(x_list) * w)
            y_min = int(min(y_list) * h)
            x_max = int(max(x_list) * w)
            y_max = int(max(y_list) * h)

            # Add padding
            pad = 20
            x_min = max(0, x_min - pad)
            y_min = max(0, y_min - pad)
            x_max = min(w, x_max + pad)
            y_max = min(h, y_max + pad)

            hand_crop = frame[y_min:y_max, x_min:x_max]

            if hand_crop.size == 0:
                continue

            # ---- Preprocess for MobileNet ----
            hand_pil = Image.fromarray(cv2.cvtColor(hand_crop, cv2.COLOR_BGR2RGB))
            input_tensor = transform(hand_pil).unsqueeze(0).to(device)

            # ---- Prediction ----
            with torch.no_grad():
                outputs = model(input_tensor)
                probs = torch.softmax(outputs, dim=1)
                conf, pred = torch.max(probs, dim=1)

            label = class_names[pred.item()]
            confidence = conf.item()
            print(outputs.softmax(dim=1))
            # ---- Draw ----
            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            cv2.putText(
                frame,
                f"{label} ({confidence:.2f})",
                (x_min, y_min - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.8,
                (0, 255, 0),
                2
            )

            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    cv2.imshow("Hand Gesture Recognition", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

tensor([[0.2025, 0.4428, 0.2115, 0.1432]], device='cuda:0')
tensor([[0.1919, 0.4393, 0.2120, 0.1569]], device='cuda:0')
tensor([[0.2048, 0.3514, 0.2855, 0.1583]], device='cuda:0')
tensor([[0.1999, 0.3944, 0.2510, 0.1547]], device='cuda:0')
tensor([[0.2181, 0.3868, 0.2545, 0.1406]], device='cuda:0')
tensor([[0.2064, 0.2941, 0.2980, 0.2015]], device='cuda:0')
tensor([[0.2381, 0.4012, 0.2036, 0.1570]], device='cuda:0')
tensor([[0.2285, 0.3450, 0.2414, 0.1852]], device='cuda:0')
tensor([[0.2630, 0.2842, 0.2503, 0.2025]], device='cuda:0')
tensor([[0.2342, 0.3081, 0.2606, 0.1971]], device='cuda:0')
tensor([[0.2342, 0.3081, 0.2606, 0.1971]], device='cuda:0')
tensor([[0.2397, 0.2819, 0.2720, 0.2064]], device='cuda:0')
tensor([[0.2498, 0.3237, 0.2924, 0.1341]], device='cuda:0')
tensor([[0.2508, 0.3305, 0.3000, 0.1187]], device='cuda:0')
tensor([[0.2196, 0.4070, 0.2569, 0.1164]], device='cuda:0')
tensor([[0.2147, 0.4161, 0.2412, 0.1281]], device='cuda:0')
tensor([[0.2363, 0.4147, 0.2165, 0.1325]

KeyboardInterrupt: 

In [7]:
import mediapipe as mp
import cv2

mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils

hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.55,
    min_tracking_confidence=0.6
)


In [9]:
from collections import Counter
print(Counter(labels))

NameError: name 'labels' is not defined