In [1]:
import cv2
import numpy as np
from PIL import Image, ImageDraw
import tensorflow as tf
import os

# Video locations: the clip to analyse and the annotated result
video_input_path = 'video1.mp4'
video_output_path = 'output_video2.mp4'

# Build the TFLite interpreter; tensors must be allocated before they are used
interpreter = tf.lite.Interpreter(model_path='retained-16.tflite')
interpreter.allocate_tensors()

# Cache tensor metadata once so the per-frame code does not re-query it
input_details, output_details = interpreter.get_input_details(), interpreter.get_output_details()
# First input tensor: shape is [1, height, width, 3]
input_shape, input_dtype = (input_details[0][field] for field in ('shape', 'dtype'))

# Helper: preprocess frame
def preprocess_frame(frame):
    """Convert a BGR video frame into the tensor the interpreter expects.

    The frame is resized to the model's spatial input size, converted from
    BGR to RGB channel order, scaled to [0, 1] and given a leading batch axis.

    For floating-point input tensors the scaled values are cast directly.
    For integer (quantized) input tensors a plain cast would truncate every
    [0, 1] value to 0 or 1, so the values are first mapped into the integer
    domain with the scale / zero-point reported in ``input_details``.

    Args:
        frame: BGR image array as produced by ``cap.read()``.

    Returns:
        Array with a leading batch axis of size 1 and dtype ``input_dtype``.
    """
    # cv2.resize takes (width, height); input_shape is [1, height, width, 3]
    target_size = (int(input_shape[2]), int(input_shape[1]))
    img = cv2.resize(frame, target_size)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.array(img, dtype=np.float32) / 255.0
    img = np.expand_dims(img, axis=0)

    if np.issubdtype(input_dtype, np.integer):
        scale, zero_point = input_details[0].get('quantization', (0.0, 0))
        if scale:
            # real_value = (quantized - zero_point) * scale, inverted here
            img = np.round(img / scale + zero_point)
            limits = np.iinfo(input_dtype)
            img = np.clip(img, limits.min, limits.max)
        # NOTE(review): with an integer dtype but no quantization parameters
        # the original plain cast is kept - confirm what such a model expects.

    return img.astype(input_dtype)

# Helper: draw boxes
def draw_boxes_cv2(frame, boxes, scores, classes, threshold=0.4):
    for i in range(len(scores)):
        if scores[i] >= threshold:
            ymin, xmin, ymax, xmax = boxes[i]
            start_point = (int(xmin), int(ymin))
            end_point = (int(xmax), int(ymax))
            color = (0, 0, 255)
            cv2.rectangle(frame, start_point, end_point, color, 2)
            label = f'ID {int(classes[i])} | {scores[i]:.2f}'
            cv2.putText(frame, label, (int(xmin), int(ymin)-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return frame

# Open the input video and fail fast if it cannot be read
cap = cv2.VideoCapture(video_input_path)
if not cap.isOpened():
    raise IOError(f"Could not open input video: {video_input_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The capture reported no usable frame rate (0 or NaN)
    fps = 30.0  # NOTE(review): arbitrary fallback - confirm the intended rate
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(video_output_path, fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open output video for writing: {video_output_path}")

# Single source of truth for the confidence cut-off used for filtering and drawing
CONF_THRESHOLD = 0.4

frame_count = 0

# try/finally guarantees the capture and the writer are released even when
# the cell is interrupted (e.g. KeyboardInterrupt) or an error escapes the loop.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        print(f"Processing frame {frame_count}")

        input_data = preprocess_frame(frame)

        interpreter.set_tensor(input_details[0]['index'], input_data)
        interpreter.invoke()
        output_data = interpreter.get_tensor(output_details[0]['index'])

        try:
            predictions = output_data[0]  # (N, 6) = x, y, w, h, conf, class
            boxes, scores, classes = [], [], []

            # NOTE(review): no non-maximum suppression is applied, so every
            # overlapping prediction above the threshold is kept and drawn.
            print(f"Detected details for frame {frame_count}:")
            for pred in predictions:
                x_center, y_center, width_box, height_box, conf, class_id = pred
                if conf < CONF_THRESHOLD:
                    continue
                # Centre/size values are multiplied by the frame size to get pixel corners
                xmin = (x_center - width_box / 2) * width
                ymin = (y_center - height_box / 2) * height
                xmax = (x_center + width_box / 2) * width
                ymax = (y_center + height_box / 2) * height
                boxes.append([ymin, xmin, ymax, xmax])
                scores.append(conf)
                classes.append(class_id)
                print(f"  Box: [{xmin:.1f}, {ymin:.1f}, {xmax:.1f}, {ymax:.1f}], Confidence: {conf:.2f}, Class: {int(class_id)}")

            frame = draw_boxes_cv2(frame, boxes, scores, classes, threshold=CONF_THRESHOLD)
        except Exception as e:
            # Best effort: a frame that fails post-processing is still written
            print(f"Post-processing failed: {e}")

        out.write(frame)
finally:
    cap.release()
    out.release()

# Only reached when the loop ended normally
print(f"✅ Video saved at: {video_output_path}")

    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


Processing frame 1
Detected details for frame 1:
  Box: [335.7, 346.2, 489.7, 403.6], Confidence: 0.79, Class: 0
  Box: [338.1, 347.1, 486.4, 402.9], Confidence: 0.62, Class: 0
  Box: [769.8, 192.0, 942.2, 222.4], Confidence: 0.45, Class: 0
  Box: [769.7, 192.9, 944.0, 222.3], Confidence: 0.46, Class: 0
  Box: [180.1, 257.0, 520.6, 304.3], Confidence: 0.61, Class: 0
  Box: [180.8, 257.2, 519.7, 304.3], Confidence: 0.53, Class: 0
  Box: [176.3, 255.0, 551.1, 305.0], Confidence: 0.75, Class: 0
  Box: [177.3, 255.1, 548.9, 305.1], Confidence: 0.66, Class: 0
  Box: [335.6, 345.4, 489.4, 403.0], Confidence: 0.85, Class: 0
  Box: [336.2, 344.9, 488.1, 403.5], Confidence: 0.87, Class: 0
  Box: [336.8, 345.0, 488.9, 403.0], Confidence: 0.83, Class: 0
  Box: [337.1, 345.5, 488.6, 402.8], Confidence: 0.85, Class: 0
  Box: [338.8, 345.1, 487.4, 402.4], Confidence: 0.85, Class: 0
  Box: [337.8, 345.0, 487.8, 402.6], Confidence: 0.82, Class: 0
  Box: [335.5, 345.9, 488.2, 403.7], Confidence: 0.74, 

KeyboardInterrupt: 

In [2]:
import cv2
import numpy as np
from PIL import Image, ImageDraw
import tensorflow as tf
import os

# Video locations: the clip to analyse and the annotated result
video_input_path = 'video1.mp4'
video_output_path = 'output_video2.mp4'

# Build the TFLite interpreter; tensors must be allocated before they are used
interpreter = tf.lite.Interpreter(model_path='retained-16.tflite')
interpreter.allocate_tensors()

# Cache tensor metadata once so the per-frame code does not re-query it
input_details, output_details = interpreter.get_input_details(), interpreter.get_output_details()
# First input tensor: shape is [1, height, width, 3]
input_shape, input_dtype = (input_details[0][field] for field in ('shape', 'dtype'))

# Running state: cumulative detection count and the pause toggle
total_detections, paused = 0, False

# Helper: preprocess frame
def preprocess_frame(frame):
    """Convert a BGR video frame into the tensor the interpreter expects.

    The frame is resized to the model's spatial input size, converted from
    BGR to RGB channel order, scaled to [0, 1] and given a leading batch axis.

    For floating-point input tensors the scaled values are cast directly.
    For integer (quantized) input tensors a plain cast would truncate every
    [0, 1] value to 0 or 1, so the values are first mapped into the integer
    domain with the scale / zero-point reported in ``input_details``.

    Args:
        frame: BGR image array as produced by ``cap.read()``.

    Returns:
        Array with a leading batch axis of size 1 and dtype ``input_dtype``.
    """
    # cv2.resize takes (width, height); input_shape is [1, height, width, 3]
    target_size = (int(input_shape[2]), int(input_shape[1]))
    img = cv2.resize(frame, target_size)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = np.array(img, dtype=np.float32) / 255.0
    img = np.expand_dims(img, axis=0)

    if np.issubdtype(input_dtype, np.integer):
        scale, zero_point = input_details[0].get('quantization', (0.0, 0))
        if scale:
            # real_value = (quantized - zero_point) * scale, inverted here
            img = np.round(img / scale + zero_point)
            limits = np.iinfo(input_dtype)
            img = np.clip(img, limits.min, limits.max)
        # NOTE(review): with an integer dtype but no quantization parameters
        # the original plain cast is kept - confirm what such a model expects.

    return img.astype(input_dtype)

# Helper: draw boxes and counter
def draw_boxes_cv2(frame, boxes, scores, classes, frame_count, frame_detections, total_detections, threshold=0.4):
    """Annotate ``frame`` with detection boxes and a statistics panel.

    Args:
        frame: Image to annotate; it is drawn on in place.
        boxes: Per-detection ``[ymin, xmin, ymax, xmax]`` pixel coordinates.
        scores: Per-detection confidence; its length bounds the iteration.
        classes: Per-detection class id, shown in the label.
        frame_count: Value shown on the "Frame" line of the panel.
        frame_detections: Value shown on the "Current Detections" line.
        total_detections: Value shown on the "Total Detections" line.
        threshold: Minimum score a detection needs in order to be drawn.

    Returns:
        The same ``frame`` object that was passed in.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    box_color = (0, 0, 255)  # Red boxes

    # Detection boxes with an "ID | score" caption 5 px above each box
    for idx, score in enumerate(scores):
        if not score >= threshold:
            continue
        ymin, xmin, ymax, xmax = boxes[idx]
        left, top = int(xmin), int(ymin)
        right, bottom = int(xmax), int(ymax)
        cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)
        caption = f'ID {int(classes[idx])} | {score:.2f}'
        cv2.putText(frame, caption, (left, top - 5), font, 0.5, box_color, 2)

    # Semi-transparent panel: fill a black rectangle on a copy, then blend
    # the copy (70%) with the frame (30%) and store the result in the frame
    panel = frame.copy()
    cv2.rectangle(panel, (10, 10), (500, 140), (0, 0, 0), -1)
    cv2.addWeighted(panel, 0.7, frame, 0.3, 0, frame)

    # Panel text: (text, origin, font scale, colour, thickness) per line
    hud_lines = (
        (f'Frame: {frame_count}', (20, 35), 0.7, (0, 255, 0), 2),
        (f'Current Detections: {frame_detections}', (20, 65), 0.7, (0, 255, 255), 2),
        (f'Total Detections: {total_detections}', (20, 95), 0.7, (255, 255, 0), 2),
        ('Controls: Q=Quit, P=Pause, SPACE=Step', (20, 125), 0.5, (255, 255, 255), 1),
    )
    for text, origin, scale, color, thickness in hud_lines:
        cv2.putText(frame, text, origin, font, scale, color, thickness)

    return frame

# Open the input video and fail fast if it cannot be read
cap = cv2.VideoCapture(video_input_path)
if not cap.isOpened():
    raise IOError(f"Could not open input video: {video_input_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The capture reported no usable frame rate (0 or NaN)
    fps = 30.0  # NOTE(review): arbitrary fallback - confirm the intended rate
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(video_output_path, fourcc, fps, (width, height))
if not out.isOpened():
    cap.release()
    raise IOError(f"Could not open output video for writing: {video_output_path}")

# Single source of truth for the confidence cut-off used for filtering and drawing
CONF_THRESHOLD = 0.4
WINDOW_NAME = 'Object Detection - Live Feed'

frame_count = 0
# Set by SPACE while paused: process exactly one more frame, then stay paused
step_requested = False

# try/finally guarantees the capture, the writer and the window are released
# even when the cell is interrupted or an error escapes the loop.
try:
    # Create display window
    cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
    cv2.resizeWindow(WINDOW_NAME, 1200, 800)

    print("🎥 Starting video processing with live display...")
    print("Controls: Press 'Q' to quit, 'P' to pause/resume, 'SPACE' to step through frames")

    while cap.isOpened():
        if not paused or step_requested:
            step_requested = False
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            print(f"Processing frame {frame_count}")

            input_data = preprocess_frame(frame)

            interpreter.set_tensor(input_details[0]['index'], input_data)
            interpreter.invoke()
            output_data = interpreter.get_tensor(output_details[0]['index'])

            try:
                predictions = output_data[0]  # (N, 6) = x, y, w, h, conf, class
                boxes, scores, classes = [], [], []
                frame_detections = 0

                # NOTE(review): no non-maximum suppression is applied, so every
                # overlapping prediction above the threshold is kept and counted.
                print(f"Detected details for frame {frame_count}:")
                for pred in predictions:
                    x_center, y_center, width_box, height_box, conf, class_id = pred
                    if conf < CONF_THRESHOLD:
                        continue
                    # Centre/size values are multiplied by the frame size to get pixel corners
                    xmin = (x_center - width_box / 2) * width
                    ymin = (y_center - height_box / 2) * height
                    xmax = (x_center + width_box / 2) * width
                    ymax = (y_center + height_box / 2) * height
                    boxes.append([ymin, xmin, ymax, xmax])
                    scores.append(conf)
                    classes.append(class_id)
                    frame_detections += 1
                    total_detections += 1
                    print(f"  Box: [{xmin:.1f}, {ymin:.1f}, {xmax:.1f}, {ymax:.1f}], Confidence: {conf:.2f}, Class: {int(class_id)}")

                # Draw boxes and counter on frame
                frame = draw_boxes_cv2(frame, boxes, scores, classes, frame_count,
                                       frame_detections, total_detections,
                                       threshold=CONF_THRESHOLD)

                # Save frame to output video
                out.write(frame)

                # Update console with frame stats
                print(f"  → Frame {frame_count}: {frame_detections} detections | Total: {total_detections}")

            except Exception as e:
                # Best effort: still overlay the counters and write the frame
                print(f"Post-processing failed: {e}")
                frame = draw_boxes_cv2(frame, [], [], [], frame_count, 0, total_detections)
                out.write(frame)

        # Display the frame (the last processed one while paused)
        cv2.imshow(WINDOW_NAME, frame)

        # Handle key presses; wait longer while paused so the loop does not spin
        key = cv2.waitKey(30 if paused else 1) & 0xFF
        if key == ord('q') or key == ord('Q'):
            print("🛑 Stopping video processing...")
            break
        elif key == ord('p') or key == ord('P'):
            paused = not paused
            status = "PAUSED" if paused else "RESUMED"
            print(f"⏸️  Video {status}")
        elif key == ord(' '):  # Spacebar for step-by-step
            if paused:
                # Toggling `paused` off and on within this branch never let the
                # loop read a frame; a one-shot flag advances exactly one frame.
                step_requested = True
finally:
    # Cleanup
    cap.release()
    out.release()
    cv2.destroyAllWindows()

# Final statistics (only reached when the loop ended normally)
print("\n" + "="*50)
print("🎯 FINAL DETECTION STATISTICS")
print("="*50)
print(f"📊 Total Frames Processed: {frame_count}")
print(f"🔍 Total Detections Found: {total_detections}")
if frame_count > 0:
    print(f"📈 Average Detections per Frame: {total_detections/frame_count:.2f}")
print(f"💾 Output Video Saved: {video_output_path}")
print("="*50)
print("✅ Processing Complete!")

    TF 2.20. Please use the LiteRT interpreter from the ai_edge_litert package.
    See the [migration guide](https://ai.google.dev/edge/litert/migration)
    for details.
    


🎥 Starting video processing with live display...
Controls: Press 'Q' to quit, 'P' to pause/resume, 'SPACE' to step through frames
Processing frame 1
Detected details for frame 1:
  Box: [335.7, 346.2, 489.7, 403.6], Confidence: 0.79, Class: 0
  Box: [338.1, 347.1, 486.4, 402.9], Confidence: 0.62, Class: 0
  Box: [769.8, 192.0, 942.2, 222.4], Confidence: 0.45, Class: 0
  Box: [769.7, 192.9, 944.0, 222.3], Confidence: 0.46, Class: 0
  Box: [180.1, 257.0, 520.6, 304.3], Confidence: 0.61, Class: 0
  Box: [180.8, 257.2, 519.7, 304.3], Confidence: 0.53, Class: 0
  Box: [176.3, 255.0, 551.1, 305.0], Confidence: 0.75, Class: 0
  Box: [177.3, 255.1, 548.9, 305.1], Confidence: 0.66, Class: 0
  Box: [335.6, 345.4, 489.4, 403.0], Confidence: 0.85, Class: 0
  Box: [336.2, 344.9, 488.1, 403.5], Confidence: 0.87, Class: 0
  Box: [336.8, 345.0, 488.9, 403.0], Confidence: 0.83, Class: 0
  Box: [337.1, 345.5, 488.6, 402.8], Confidence: 0.85, Class: 0
  Box: [338.8, 345.1, 487.4, 402.4], Confidence: 0.85