In [6]:
import tensorflow as tf
import mediapipe as mp
import numpy as np
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Initialize MediaPipe for hand landmarks
mp_hands = mp.solutions.hands
hands = mp_hands.Hands()

# Define function to extract MediaPipe features (e.g., hand landmarks)
def extract_mediapipe_features(image):
    # Convert image from float32 (range [0, 1]) to uint8 (range [0, 255])
    image_uint8 = np.uint8(image * 255)
    results = hands.process(image_uint8)
    if results.multi_hand_landmarks:
        features = []
        for hand_landmarks in results.multi_hand_landmarks:
            for landmark in hand_landmarks.landmark:
                features.extend([landmark.x, landmark.y, landmark.z])
        return np.array(features)
    else:
        return np.zeros(63)  # Adjust length based on the number of landmarks


# Load VGG16 without the fully connected layers (include_top=False)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze all layers of VGG16
for layer in base_model.layers:
    layer.trainable = False

# Custom classification head
vgg16_output = base_model.output
vgg16_output = Flatten()(vgg16_output)

I0000 00:00:1740743931.473843 4677709 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.3), renderer: Apple M1
W0000 00:00:1740743931.552938 4690288 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1740743931.578433 4690288 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [7]:
# Define input layer for image
image_input = Input(shape=(224, 224, 3))
vgg16_features = base_model(image_input)
vgg16_features = Flatten()(vgg16_features)

# Define input layer for MediaPipe features (hand landmarks)
mediapipe_input = Input(shape=(63,))  # 63 features for hand landmarks (21 landmarks * 3 coordinates each)
x = layers.Concatenate()([vgg16_features, mediapipe_input])

# Add fully connected layers
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)
x = Dense(1, activation='sigmoid')(x)  # Binary classification

# Define model
model = Model(inputs=[image_input, mediapipe_input], outputs=x)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()

In [8]:
# Image data generators for training and validation
train_datagen = ImageDataGenerator(rescale=1./255, rotation_range=20, horizontal_flip=True)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    "./dataset/train",
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)

val_generator = val_datagen.flow_from_directory(
    "./dataset/valid",
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)

Found 607 images belonging to 2 classes.
Found 57 images belonging to 2 classes.


In [9]:

# Step 1: Create lists to store the MediaPipe features for train and validation sets
train_mediapipe_features = []
val_mediapipe_features = []

# Step 2: Extract MediaPipe features for training and validation datasets
for image_batch, _ in train_generator:
    for img in image_batch:
        features = extract_mediapipe_features(img)
        train_mediapipe_features.append(features)
        
for image_batch, _ in val_generator:
    for img in image_batch:
        features = extract_mediapipe_features(img)
        val_mediapipe_features.append(features)

# Convert lists to NumPy arrays
train_mediapipe_features = np.array(train_mediapipe_features)
val_mediapipe_features = np.array(val_mediapipe_features)

W0000 00:00:1740743934.912395 4690293 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.


KeyboardInterrupt: 

In [None]:
# Step 3: Train the model using the image and MediaPipe features
history = model.fit(
    [train_generator, train_mediapipe_features],
    validation_data=([val_generator, val_mediapipe_features]),
    epochs=10
)