In [1]:
import torch
import cv2
import numpy as np
from torchvision.transforms import Compose, Resize, ToTensor, InterpolationMode
from tensorflow.keras.models import load_model  # For loading your height detection model
from PIL import Image

In [3]:
# Load YOLOv5 model (pretrained)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # YOLOv5 small model for object detection

# Load MiDaS model for depth estimation
midas = torch.hub.load("intel-isl/MiDaS", "MiDaS_small")
midas.to('cpu').eval()

# Load your pre-trained height detection model
height_model = load_model('"E:/Anaconda/height_prediction_model.h5"')  # Adjust the path to your model

Using cache found in C:\Users\dell/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2024-10-9 Python-3.11.2 torch-2.4.1+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Using cache found in C:\Users\dell/.cache\torch\hub\intel-isl_MiDaS_master


Loading weights:  None


Using cache found in C:\Users\dell/.cache\torch\hub\rwightman_gen-efficientnet-pytorch_master


ValueError: File format not supported: filepath="E:/Anaconda/height_prediction_model.h5". Keras 3 only supports V3 `.keras` files and legacy H5 format files (`.h5` extension). Note that the legacy SavedModel format is not supported by `load_model()` in Keras 3. In order to reload a TensorFlow SavedModel as an inference-only layer in Keras 3, use `keras.layers.TFSMLayer("E:/Anaconda/height_prediction_model.h5", call_endpoint='serving_default')` (note that your `call_endpoint` might have a different name).

In [None]:
# Transform for MiDaS input image
transform = Compose([
    Resize((384, 384), interpolation=InterpolationMode.BILINEAR),  # Resize to 384x384 as required by MiDaS
    ToTensor()
])

# Function to estimate depth using MiDaS
def estimate_depth(image):
    # Convert OpenCV image (numpy array) to PIL image
    image_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))  # Convert BGR to RGB for PIL
    
    # Apply MiDaS depth estimation model
    input_batch = transform(image_pil).unsqueeze(0)  # Prepare the image for the model
    with torch.no_grad():
        prediction = midas(input_batch)
        prediction = torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size=image.shape[:2],  # Resize depth map to match the original image size
            mode="bicubic",
            align_corners=False
        ).squeeze()
    
    depth_map = prediction.cpu().numpy()
    return depth_map

# Function to predict height using the pre-trained height detection model
def predict_height(image):
    resized_image = cv2.resize(image, (150, 150))  # Resize to the input size of the height model
    image_array = resized_image / 255.0  # Normalize
    image_array = np.expand_dims(image_array, axis=0)  # Add batch dimension
    predicted_height = height_model.predict(image_array)
    return predicted_height[0][0]  # Return the predicted height

# Access the laptop camera (0 is usually the built-in webcam)
cap = cv2.VideoCapture(0)

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame")
        break

    # Use YOLOv5 to detect objects in the frame
    results = yolo_model(frame)
    
    # Get bounding boxes of detected objects (person detection in this case)
    detected_objects = results.pandas().xyxy[0]  # Get bounding box dataframe
    
    # Apply MiDaS depth estimation on the current frame
    depth_map = estimate_depth(frame)
    
    for i, row in detected_objects.iterrows():
        # Only focus on 'person' class
        if row['name'] == 'person':
            # Get the bounding box coordinates
            x1, y1, x2, y2 = int(row['xmin']), int(row['ymin']), int(row['xmax']), int(row['ymax'])
            
            # Crop the person from the frame using the bounding box
            person_image = frame[y1:y2, x1:x2]
            
            # Predict height using your height detection model
            predicted_height = predict_height(person_image)
            
            # Focus on the lower-most pixel (feet area) of the bounding box
            feet_depth = depth_map[y2, (x1 + x2) // 2]  # Take the center of the bounding box's bottom
            
            # Draw bounding box
            cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 0, 0), 2)
            
            # Display the predicted height and depth
            cv2.putText(frame, f"Predicted Height: {predicted_height:.2f} inches", (x1, y1 - 30), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
            cv2.putText(frame, f"Depth: {feet_depth:.2f} meters", (x1, y1 - 10), 
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

    # Normalize depth map for visualization
    depth_map_normalized = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX)
    depth_map_normalized = np.uint8(depth_map_normalized)
    
    # Show the frame with YOLOv5 bounding boxes, predicted height, and depth map
    cv2.imshow('YOLOv5 + MiDaS + Height Detection', frame)
    cv2.imshow('Depth Map', depth_map_normalized)
    
    # Press 'q' to quit the video stream
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the camera and close all OpenCV windows
cap.release()
cv2.destroyAllWindows()