In [1]:
import cv2
import torch
from torchvision import transforms, models
from PIL import Image, ImageEnhance
import numpy as np
from collections import deque

In [2]:
# Mac GPU support (MPS).
# Use Apple's Metal backend when this PyTorch build and machine expose it;
# otherwise fall back to the CPU so the later `.to(device)` calls still work
# instead of failing on hardware without MPS.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


In [3]:
# Rebuild the ResNet18 architecture with a 29-way classification head (the
# original note says this mirrors what was done in ASL_model_training).
# `weights=None` creates the network without downloading ImageNet weights; it
# replaces the `pretrained=False` flag, which torchvision deprecated in 0.13.
# The trained parameters are loaded from the checkpoint in the next cell.
model = models.resnet18(weights=None)
model.fc = torch.nn.Linear(model.fc.in_features, 29)  # 29 ASL classes



In [4]:
# Restore the best checkpoint written by the model-training script.
# NOTE: this is an absolute, machine-specific path; adjust it when running
# the notebook anywhere else.
checkpoint_path = '/Users/suparnac/dev_envs/CV_Projects/Amarican_Sign_Language/best_model_asl.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Move the network to the inference device and switch it to evaluation mode.
model.to(device).eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [5]:
# Label -> index lookup. The labels are the letters A-Z followed by the three
# special classes, numbered consecutively from 0 in that order (the original
# note states this is exactly the mapping used during training).
class_to_idx = {
    label: index
    for index, label in enumerate(
        [chr(code) for code in range(ord('A'), ord('Z') + 1)]
        + ['del', 'nothing', 'space']
    )
}

# Index -> label lookup. The keys of class_to_idx are stored in index order,
# so enumerating them reproduces the inverse mapping.
idx_to_class = dict(enumerate(class_to_idx))



In [6]:
# Input pipeline applied to every ROI image before it reaches the network
# (the original note says it must match the training preprocessing exactly):
#   1. resize to 128x128,
#   2. convert the PIL image to a tensor,
#   3. normalise each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(128, 128)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)


In [7]:
# Initialize webcam capture with AVFoundation backend for Mac
cap = cv2.VideoCapture(0, cv2.CAP_AVFOUNDATION)

# Fail fast with an actionable message when the camera cannot be opened,
# instead of only discovering it later as "Failed to grab frame" in the loop.
if not cap.isOpened():
    cap.release()
    raise RuntimeError(
        "Could not open webcam (device 0). Check that a camera is connected, "
        "not in use by another application, and that camera access is permitted."
    )

# Request a 640x480 stream; the ROI coordinates defined below assume a frame
# of at least this size.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)


True

In [8]:
# Pixel bounds of the region of interest where the hand should be placed:
# columns 200..500 and rows 100..400 of the frame.
roi_left, roi_top = 200, 100
roi_right, roi_bottom = 500, 400

# Sliding window of the five most recent per-frame labels, used to smooth
# the displayed prediction; older entries are dropped automatically.
prediction_window = deque([], 5)



In [9]:
def preprocess_frame(frame):
    """Turn a BGR OpenCV image into a lightly enhanced RGB PIL image.

    The frame is converted from BGR to RGB, wrapped in a PIL image, and then
    passed through a contrast boost (factor 1.2) followed by a brightness
    boost (factor 1.1). The enhanced PIL image is returned.
    """
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Apply the enhancers in order: contrast first, then brightness.
    adjustments = (
        (ImageEnhance.Contrast, 1.2),
        (ImageEnhance.Brightness, 1.1),
    )
    for enhancer_cls, factor in adjustments:
        image = enhancer_cls(image).enhance(factor)

    return image


In [10]:
def get_smoothed_prediction(predictions_queue):
    """Return the label that occurs most often in ``predictions_queue``.

    An empty queue yields ``"nothing"``. When several labels share the
    highest count, the one that appeared first in the queue wins.
    """
    if not predictions_queue:
        return "nothing"

    # Tally votes; dict insertion order records which label was seen first.
    tally = {}
    for label in predictions_queue:
        tally[label] = tally[label] + 1 if label in tally else 1

    # max() keeps the first item among equals, i.e. the earliest-seen label.
    winner, _ = max(tally.items(), key=lambda item: item[1])
    return winner

# Usage instructions shown before the capture loop starts. Both keys handled
# by the loop are listed: 'q' quits and 'r' clears the smoothing window.
print("Starting ASL detection with improved performance...")
print("Position your hand in the green ROI box for best results.")
print("Press 'q' to quit.")
print("Press 'r' to reset the prediction window.")



Starting ASL detection with improved performance...
Position your hand in the green ROI box for best results.
Press 'q' to quit.


In [11]:
# Real-time detection loop.
# Each iteration grabs a frame, classifies the ROI, smooths the label over the
# recent window and draws the result. 'q' quits, 'r' resets the window.

# Minimum softmax probability for a frame's label to count as a vote; frames
# below it are recorded as "nothing".
CONFIDENCE_THRESHOLD = 0.5

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame")
            break

        # Flip frame horizontally for mirror effect (more natural for user)
        frame = cv2.flip(frame, 1)

        # Copy the ROI BEFORE drawing any overlay: a plain slice is a view of
        # `frame`, so the green rectangle would otherwise end up inside the
        # pixels that are fed to the model.
        roi = frame[roi_top:roi_bottom, roi_left:roi_right].copy()

        # Draw ROI rectangle on frame (place your hand in that region for better accuracy)
        cv2.rectangle(frame, (roi_left, roi_top), (roi_right, roi_bottom), (0, 255, 0), 2)
        cv2.putText(frame, "Position hand here", (roi_left, roi_top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 1)

        # An empty ROI means the frame is smaller than the ROI bounds; skip
        # inference for this frame but keep showing the video.
        if roi.size > 0:
            try:
                processed_image = preprocess_frame(roi)

                # Add the batch dimension and move the tensor to the model's device.
                input_tensor = transform(processed_image).unsqueeze(0).to(device)

                with torch.no_grad():
                    outputs = model(input_tensor)
                    probabilities = torch.softmax(outputs, dim=1)
                    confidence, predicted_idx = torch.max(probabilities, 1)

                predicted_class = idx_to_class[int(predicted_idx)]
                confidence_score = float(confidence)

                if confidence_score > CONFIDENCE_THRESHOLD:
                    prediction_window.append(predicted_class)
                else:
                    prediction_window.append("nothing")  # Low confidence

                smoothed_prediction = get_smoothed_prediction(prediction_window)

                # The label shown is the smoothed one; the confidence shown
                # belongs to the current frame's raw prediction.
                display_text = f'Prediction: {smoothed_prediction.upper()}'
                confidence_text = f'Confidence: {confidence_score:.2f}'

                color = (0, 0, 255)  # Red color (BGR)

                cv2.putText(frame, display_text, (10, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
                cv2.putText(frame, confidence_text, (10, 80),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 2)

            except Exception as e:
                # Best effort: report the problem on the console and on the
                # frame, then carry on with the next frame.
                print(f"Error during inference: {e}")
                cv2.putText(frame, "Processing Error", (10, 40),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

        # Show frame
        cv2.imshow('Improved ASL Real-time Detection', frame)

        # Exit if 'q' pressed
        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('r'):  # Reset prediction window
            prediction_window.clear()
            print("Prediction window reset")
finally:
    # Always free the camera and close the window, including when the loop is
    # stopped by an unexpected error or a kernel interrupt.
    cap.release()
    cv2.destroyAllWindows()
