In [7]:
import numpy as np
import tensorflow as tf
import mediapipe as mp
import pyautogui
import cv2

print("NumPy:", np.__version__)
print("TensorFlow:", tf.__version__)
print("Mediapipe:", mp.__version__)
print("PyAutoGUI: installed")
print("OpenCV:", cv2.__version__)

NumPy: 1.24.3
TensorFlow: 2.13.0
Mediapipe: 0.10.21
PyAutoGUI: installed
OpenCV: 4.8.0


# Imports & Setup

In [10]:
import cv2
import os
import mediapipe as mp
import numpy as np
import pyautogui
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Initialize Mediapipe

In [13]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# FIST (mute)

In [72]:
import cv2
import os
import mediapipe as mp
import numpy as np

# Gesture name and folder
gesture_name = "Fist"
save_path = f"gesture_data/{gesture_name}"
os.makedirs(save_path, exist_ok=True)

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)
frame_count = 0
max_frames = 300  # Number of frames to collect

print(f"Collecting {max_frames} frames for {gesture_name}. Press 'q' to stop early.")

while True:
    ret, frame = cap.read()
    if not ret:
        continue
    
    # Mirror image
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb_frame)
    
    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Flatten landmarks: 21 points × 3 (x,y,z) = 63 features
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])
            
            # Save as numpy array
            np.save(os.path.join(save_path, f"{gesture_name}_{frame_count}.npy"), np.array(landmarks))
            frame_count += 1
    
    # Display count on frame
    cv2.putText(frame, f"{gesture_name} frames: {frame_count}/{max_frames}", (10,30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
    
    cv2.imshow(f"{gesture_name} Capture", frame)
    if cv2.waitKey(1) & 0xFF == ord('q') or frame_count >= max_frames:
        break

cap.release()
cv2.destroyAllWindows()
print(f"Collected {frame_count} frames for {gesture_name} gesture.")


Collecting 300 frames for Fist. Press 'q' to stop early.
Collected 300 frames for Fist gesture.


# Gesture 2: OPEN PALM (mouse movement)

In [68]:
# Create folder to save Open Palm gesture data
gesture_name = "OpenPalm"
if not os.path.exists(f"gesture_data/{gesture_name}"):
    os.makedirs(f"gesture_data/{gesture_name}")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Start webcamimport cv2
import os
import mediapipe as mp
import numpy as np

# Gesture name and folder
gesture_name = "OpenPalm"
save_path = f"gesture_data/{gesture_name}"
os.makedirs(save_path, exist_ok=True)

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)
frame_count = 0
max_frames = 300  # Number of frames to collect

print(f"Collecting {max_frames} frames for {gesture_name}. Press 'q' to stop early.")

while True:
    ret, frame = cap.read()
    if not ret:
        continue
    
    # Mirror image
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb_frame)
    
    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Flatten landmarks: 21 points × 3 (x,y,z) = 63 features
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])
            
            # Save as numpy array
            np.save(os.path.join(save_path, f"{gesture_name}_{frame_count}.npy"), np.array(landmarks))
            frame_count += 1
    
    # Display count on frame
    cv2.putText(frame, f"{gesture_name} frames: {frame_count}/{max_frames}", (10,30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
    
    cv2.imshow("Open Palm Capture", frame)
    if cv2.waitKey(1) & 0xFF == ord('q') or frame_count >= max_frames:
        break

cap.release()
cv2.destroyAllWindows()
print(f"Collected {frame_count} frames for {gesture_name} gesture.")

cap = cv2.VideoCapture(0)
frame_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    # Flip the frame for mirror view
    frame = cv2.flip(frame, 1)
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb_frame)
    
    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Save landmarks as numpy array
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append([lm.x, lm.y, lm.z])
            landmarks = np.array(landmarks)
            np.save(f"gesture_data/{gesture_name}/{gesture_name}_{frame_count}.npy", landmarks)
            frame_count += 1
    
    cv2.imshow("Open Palm Capture", frame)
    if cv2.waitKey(1) & 0xFF == ord('q') or frame_count >= 300:
        break

cap.release()
cv2.destroyAllWindows()
print(f"Collected {frame_count} frames for {gesture_name} gesture.")

Collecting 300 frames for OpenPalm. Press 'q' to stop early.
Collected 300 frames for OpenPalm gesture.
Collected 19 frames for OpenPalm gesture.


# working

In [60]:
import cv2
import mediapipe as mp
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pyautogui

# Initialize Mediapipe
mp_hands = mp.solutions.hands
mp_draw = mp.solutions.drawing_utils
hands = mp_hands.Hands(max_num_hands=1)

# Example gesture data (simplified for demo)
X = [
    [0]*42,  # Fist placeholder
    [1]*42   # OpenPalm placeholder
]
y = ["Fist", "OpenPalm"]

# Train KNN
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X, y)

# Start webcam
cap = cv2.VideoCapture(0)

screen_width, screen_height = pyautogui.size()

while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, handLms, mp_hands.HAND_CONNECTIONS)

            # Flatten landmarks
            landmarks = []
            for lm in handLms.landmark:
                landmarks.extend([lm.x, lm.y])

            # Predict gesture
            gesture = model.predict([landmarks])[0]
            cv2.putText(frame, gesture, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)

            # Perform actions
            if gesture == "Fist":
                pyautogui.click()  # Left click
            elif gesture == "OpenPalm":
                # Move mouse based on index finger tip
                x = int(handLms.landmark[8].x * screen_width)
                y = int(handLms.landmark[8].y * screen_height)
                pyautogui.moveTo(x, y)

    cv2.imshow("Gesture Control", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

# Peace sign (scroll down)

In [38]:
import cv2
import mediapipe as mp
import numpy as np
import pickle

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

gesture_data = []  # List to store [landmarks, label]
gesture_name = "Peace"  # Label for this gesture
num_samples = 100  # Number of frames to collect

cap = cv2.VideoCapture(0)
collected = 0

while collected < num_samples:
    ret, frame = cap.read()
    if not ret:
        continue
    
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)
    
    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            # Draw landmarks
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            # Extract landmarks
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append(lm.x)
                landmarks.append(lm.y)
            gesture_data.append([landmarks, gesture_name])
            collected += 1
    
    cv2.putText(frame, f"Collected: {collected}/{num_samples}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
    cv2.imshow("Peace Sign Data Collection", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

# Save data
with open("gesture_data.pkl", "ab") as f:  # append mode to combine with previous gestures
    pickle.dump(gesture_data, f)

print("Peace Sign data collected successfully!")

Peace Sign data collected successfully!


# train with KNN

In [41]:
import pickle
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import cv2
import mediapipe as mp
import pyautogui

# Load all gesture data
all_data = []
try:
    with open("gesture_data.pkl", "rb") as f:
        while True:
            try:
                data = pickle.load(f)
                all_data.extend(data)
            except EOFError:
                break
except FileNotFoundError:
    print("No gesture data found. Collect gestures first.")
    exit()

# Prepare features and labels
X = [item[0] for item in all_data]  # landmarks
y = [item[1] for item in all_data]  # gesture labels

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
print("KNN model trained successfully!")

# Initialize Mediapipe
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append(lm.x)
                landmarks.append(lm.y)
            
            prediction = knn.predict([landmarks])[0]  # Predict gesture
            cv2.putText(frame, prediction, (10,50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
            
            # Perform actions
            if prediction == "Peace":
                pyautogui.scroll(-300)  # Scroll down
            elif prediction == "Fist":
                pyautogui.press("space")  # Example: Play/Pause video
            elif prediction == "OpenPalm":
                pyautogui.scroll(300)  # Scroll up

    cv2.imshow("Gesture Control", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

KNN model trained successfully!


# Thumbs up (volume up)

In [44]:
import cv2
import mediapipe as mp
import pickle

gesture_name = "ThumbsUp"
data = []

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

cap = cv2.VideoCapture(0)

print(f"Collecting data for {gesture_name}. Press 'q' to stop.")

while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append(lm.x)
                landmarks.append(lm.y)
            data.append([landmarks, gesture_name])  # Save landmarks with gesture label

    cv2.imshow("Collect Thumbs Up", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

# Save gesture data
try:
    with open("gesture_data.pkl", "rb") as f:
        all_data = pickle.load(f)
except:
    all_data = []

all_data.extend(data)

with open("gesture_data.pkl", "wb") as f:
    pickle.dump(all_data, f)

print(f"Data collection for {gesture_name} completed. Total samples: {len(data)}")

Collecting data for ThumbsUp. Press 'q' to stop.
Data collection for ThumbsUp completed. Total samples: 2129


# Train with KNN

In [47]:
import cv2
import mediapipe as mp
import numpy as np
import pickle
from sklearn.neighbors import KNeighborsClassifier
import pyautogui

# Load all gesture data
with open("gesture_data.pkl", "rb") as f:
    all_data = pickle.load(f)

# Prepare features and labels
X = []
y = []
for landmarks, label in all_data:
    X.append(landmarks)
    y.append(label)

X = np.array(X)
y = np.array(y)

# Train KNN
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
print("KNN retrained with new gesture!")

# Start webcam for gesture recognition
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(frame_rgb)

    if result.multi_hand_landmarks:
        for hand_landmarks in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            landmarks = []
            for lm in hand_landmarks.landmark:
                landmarks.append(lm.x)
                landmarks.append(lm.y)
            
            prediction = knn.predict([landmarks])[0]
            cv2.putText(frame, f"Gesture: {prediction}", (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Map gestures to actions
            if prediction == "ThumbsUp":
                pyautogui.press('volumeup')

    cv2.imshow("Gesture Control", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

KNN retrained with new gesture!


# Thumbs down (volume down)

In [52]:
import cv2
import mediapipe as mp
import numpy as np
import pickle

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

gesture_name = "ThumbsDown"
data = []

cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        continue
    
    frame = cv2.flip(frame, 1)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    result = hands.process(rgb)
    
    if result.multi_hand_landmarks:
        for handLms in result.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, handLms, mp_hands.HAND_CONNECTIONS)
            # Flatten landmarks
            landmarks = []
            for lm in handLms.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])
            data.append(landmarks)
    
    cv2.putText(frame, f"Collecting: {gesture_name}", (10, 30), 
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)
    cv2.imshow("Collecting Thumbs Down", frame)
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

# Save data
with open(f"{gesture_name}.pkl", "wb") as f:
    pickle.dump(np.array(data), f)

print(f"{gesture_name} data saved successfully!")


ThumbsDown data saved successfully!


# All 4 gestures

In [80]:
import cv2
import mediapipe as mp
import numpy as np
import pickle
import os
from sklearn.neighbors import KNeighborsClassifier
import pyautogui

# Load all gesture data
all_data = []

# Load gesture_data.pkl (ThumbsUp, Fist if saved)
try:
    with open("gesture_data.pkl", "rb") as f:
        data = pickle.load(f)
        all_data.extend(data)
except FileNotFoundError:
    print("gesture_data.pkl not found, skipping.")

# Load ThumbsDown.pkl
try:
    with open("ThumbsDown.pkl", "rb") as f:
        data = pickle.load(f)
        # convert to [landmarks, label]
        data = [[item.tolist(), "ThumbsDown"] for item in data]
        all_data.extend(data)
except FileNotFoundError:
    print("ThumbsDown.pkl not found, skipping.")

# Load OpenPalm .npy files
openpalm_folder = "gesture_data/OpenPalm"
if os.path.exists(openpalm_folder):
    for file in os.listdir(openpalm_folder):
        if file.endswith(".npy"):
            landmarks = np.load(os.path.join(openpalm_folder, file)).flatten()
            all_data.append([landmarks.tolist(), "OpenPalm"])
else:
    print("OpenPalm folder not found, skipping.")

# Prepare features and labels
X = []
y = []

for landmarks, label in all_data:
    # Make sure each sample has 63 features (21 points × 3)
    if len(landmarks) == 42:  # Only x,y
        landmarks_3d = []
        for i in range(21):
            x = landmarks[i*2]
            y_coord = landmarks[i*2+1]
            z = 0
            landmarks_3d.extend([x, y_coord, z])
        landmarks = landmarks_3d
    X.append(landmarks)
    y.append(label)

X = np.array(X)
y = np.array(y)
print("All landmarks shape:", X.shape)

# Train KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
print("KNN trained for all gestures!")

# Initialize Mediapipe
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1)
mp_draw = mp.solutions.drawing_utils

# Start webcam
cap = cv2.VideoCapture(0)
screen_width, screen_height = pyautogui.size()

while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame = cv2.flip(frame, 1)  # mirror view
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, handLms, mp_hands.HAND_CONNECTIONS)

            # Flatten landmarks
            landmarks = []
            for lm in handLms.landmark:
                landmarks.extend([lm.x, lm.y, lm.z])

            # Predict gesture
            gesture = knn.predict([landmarks])[0]
            cv2.putText(frame, f"{gesture}", (10, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            # Map gestures to actions
            if gesture == "OpenPalm":
                # Move mouse using index finger tip
                x = int(handLms.landmark[8].x * screen_width)
                y = int(handLms.landmark[8].y * screen_height)
                pyautogui.moveTo(x, y)
            elif gesture == "Fist":
                pyautogui.press("volumemute")
            elif gesture == "ThumbsUp":
                pyautogui.press("volumeup")
            elif gesture == "ThumbsDown":
                pyautogui.press("volumedown")

    cv2.imshow("Gesture Control", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

All landmarks shape: (3024, 63)
KNN trained for all gestures!
