In [4]:
# ONLY RUN THIS ONCE. Uncomment right command for your OS before running.
# !source .venv/bin/activate (MacOS / Linux)
# !.venv\Scripts\activate (Windows)
!pip3 install torch torchvision
!pip install numpy
!pip install pyserial
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


# The Actual Code Functionality

**ADVICE:** Follow the comments, not the code. ML libraries are highly abstracted, so I doubt you'd intuitively grasp the logic by reading the code.

In [1]:
# Check if your camera is working (ignore deprecation warning)
!python3 -c "import cv2; cap = cv2.VideoCapture(1); print(cap.isOpened()); cap.release()"

# If True, proceed. If not, try:
# 1. run !system_profiler SPCameraDataType to list existing cameras
# 2. Go to system preferences -> privacy & security -> camera -> check if your IDE has access to camera
# 3. Try play around with the indexes e.g., cv2.VideoCapture(0)
# FYI: cv.VideoCapture(1) doesn't need a proxy like Photobooth. It takes images straight from your camera.

True


In [None]:
# Only use this code if you want to check which camera index works
import cv2
import time
import platform

def print_camera_info():
    # List all available cameras on macOS
    import subprocess
    result = subprocess.run(['system_profiler', 'SPCameraDataType'], capture_output=True, text=True)
    print("Available cameras:\n", result.stdout)

try:
    print(f"OpenCV version: {cv2.__version__}")
    print(f"OS: {platform.system()} {platform.release()}")
    print_camera_info()
    
    # Try both built-in and external camera indices
    for camera_index in [0, 1, -1]:
        print(f"\nTrying camera index: {camera_index}")
        cap = cv2.VideoCapture(camera_index)
        
        if cap.isOpened():
            # Set camera properties before reading
            cap.set(cv2.CAP_PROP_CONVERT_RGB, 1.0)
            cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M','J','P','G'))
            
            time.sleep(2)  # Warm up
            
            ret, frame = cap.read()
            if ret and frame is not None:
                print(f"Success with camera {camera_index}")
                print(f"Frame stats - min: {frame.min()}, max: {frame.max()}, mean: {frame.mean()}")
                break
        cap.release()
    
finally:
    if 'cap' in locals():
        cap.release()


OpenCV version: 4.11.0
OS: Darwin 23.6.0
Available cameras:
 Camera:

    FaceTime HD Camera:

      Model ID: FaceTime HD Camera
      Unique ID: 3F45E80A-0176-46F7-B185-BB9E2C0E82E3

    iPhone (2) Camera:

      Model ID: iPhone15,4
      Unique ID: 38183445-CB3B-41DE-B949-417000000001



Trying camera index: 0

Trying camera index: 1
Success with camera 1
Frame stats - min: 0, max: 255, mean: 125.90424012988683


In [16]:
import cv2
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import numpy as np

# Load pre-trained model
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
model.eval()  # Set to evaluation mode

# Init preprocess method
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Use logging to classify errors and info messages
import logging
import time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Before recording, close pre-existing camera connections (Fixed frame capture bug)
pre_cap = cv2.VideoCapture(1)
pre_cap.release()
del pre_cap

time.sleep(1)

try:
    # Open camera and capture image
    cap = cv2.VideoCapture(1) # 0 is your computer's default camera
    # Configure auto exposure, brightness and contrast when you come to it.
    logging.info(f"Camera is opened: {cap.isOpened()}")
    # Print camera properties to debug
    logging.info(f"Frame Width: {cap.get(cv2.CAP_PROP_FRAME_WIDTH)}")
    logging.info(f"Frame Height: {cap.get(cv2.CAP_PROP_FRAME_HEIGHT)}")
    logging.info(f"FPS: {cap.get(cv2.CAP_PROP_FPS)}")
    time.sleep(2)
    if not cap.isOpened():
         raise RuntimeError("Cannot open camera - check if camera is connected")

    # Have multiple capture attempts
    for i in range(10):
        ret, frame = cap.read() # ret: bool, frame: np.ndarray (matrix of pixels)
        time.sleep(1)
        if ret:
            logging.info(f"Frame captured: {ret} on attempt {i+1}")
            logging.info(f"Frame shape: {frame.shape if frame is not None else 'No frame'}")
            # indicators whether frame is black
            logging.info(f"Frame min value: {frame.min()}")  # Should not be 0
            logging.info(f"Frame max value: {frame.max()}")  # Should not be 0
            logging.info(f"Frame mean value: {frame.mean()}")  # Should not be 0
            break
        else:
            logging.info(f"Failed to capture frame on attempt {i+1}")

    # Look for 'test_capture.jpg' in the notebooks folder
    cv2.imwrite("test_capture.jpg", frame)
    logging.info("Frame captured and saved as test_capture.jpg")

    # Pass frame through reprocess and then classify frame
    input_tensor = preprocess(frame).unsqueeze(0)  # Add batch dimension e.g., [height, width, channels] -> [batch_size, h, w, c] [1, ...] means process 1 image at a time.
    
except Exception as e:
    logging.error(f"Camera error: {e}")
    raise

finally:
    cap.release() # release camera resources after use

# MobileNetV2 uses ImageNet labels get via HTTP request
from urllib.request import urlopen
LABELS_URL = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
labels = urlopen(LABELS_URL).read().decode('utf-8').splitlines()

with torch.no_grad():
        output = model(input_tensor)
        class_id = torch.argmax(output).item()  # Get predicted class ID

        # Softmax with numerator = e**(p-max(p)) for each p
        # softmax: list = [ numerator / sum(numerators) for each p ]
        # sum(softmax) = 1.0 so we can express confidence score as a % (0-100)
        probabilities = torch.nn.functional.softmax(output[0], dim=0) # normalises into array of positive probability values
        top3_prob, top3_catid = torch.topk(probabilities, 3)
        for i in range(3):
            # Interestingly detected objects on my face (glasses) instead of my face (with low confidence)
            logging.info(f"prediction {i+1}: {labels[top3_catid[i]]} ({top3_catid[i]}) -> {top3_prob[i].item():.3f}")

        # Calculate prediction accuracy (%)
        true_label_id = 728 # Rigt now plastic bag but change to what it actually is

        # single prediction accuracy
        single_accuracy = 100 if true_label_id == top3_catid[0] else 0
        logging.info(f"Single prediction accuracy: {single_accuracy:.2f}%")

        # top-3 prediction accuracy (not binary, has ranked score)
        top3_accuracy = 0
        # I guess ranking logic right now is p * (1/2)**(i-1)
        if true_label_id == top3_catid[0]:
            top3_accuracy = 100
        elif true_label_id == top3_catid[1]:
            top3_accuracy = 50
        elif true_label_id == top3_catid[2]:
            top3_accuracy = 25
        logging.info(f"Top-3 prediction accuracy: {top3_accuracy:.2f}%")

        # confidence score
        accuracy = top3_prob[true_label_id].item() * 100 if true_label_id in top3_catid else 0
        logging.info(f"Confidence score: {accuracy:.2f}%")


2025-01-30 09:14:25,749 - INFO - Camera is opened: True
2025-01-30 09:14:25,750 - INFO - Frame Width: 1920.0
2025-01-30 09:14:25,751 - INFO - Frame Height: 1080.0
2025-01-30 09:14:25,751 - INFO - FPS: 15.0
2025-01-30 09:14:28,784 - INFO - Frame captured: True on attempt 1
2025-01-30 09:14:28,785 - INFO - Frame shape: (1080, 1920, 3)
2025-01-30 09:14:28,788 - INFO - Frame min value: 0
2025-01-30 09:14:28,789 - INFO - Frame max value: 255
2025-01-30 09:14:28,796 - INFO - Frame mean value: 115.30855725951646
2025-01-30 09:14:28,808 - INFO - Frame captured and saved as test_capture.jpg
2025-01-30 09:14:29,057 - INFO - prediction 1: plastic bag (728) -> 0.391
2025-01-30 09:14:29,058 - INFO - prediction 2: motor scooter (670) -> 0.021
2025-01-30 09:14:29,058 - INFO - prediction 3: snowmobile (802) -> 0.015


### Draft Code for Serial Communication with Arduino

```python
import serial

# Initialize serial communication
arduino = serial.Serial('COM3', 9600)  # Adjust port and baud rate

# Capture image
cap = cv2.VideoCapture(0)
ret, frame = cap.read()

# Preprocess and classify
input_tensor = preprocess(frame).unsqueeze(0)  # Add batch dimension
with torch.no_grad():
    output = model(input_tensor)
class_id = torch.argmax(output).item()  # Get predicted class ID

# Send signal to Arduino
arduino.write(str(class_id).encode())

cap.release()
```