In [None]:
%pip install torchvision --index-url https://download.pytorch.org/whl/cu118
%pip install opencv-python
%pip install opencv-contrib-python
%pip install opencv-python torch
%pip install opencv-python ultralytics
%pip install git+https://github.com/openai/CLIP.git
%pip install openai
%pip install pyttsx3
%pip install gTTS
%pip install moviepy
%pip install ffmpeg
%pip install ultralytics gTTS ffmpeg-python

In [3]:
%pip install ultralytics openai gtts torch torchvision opencv-python

Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import time
import cv2
import torch
import numpy as np
from ultralytics import YOLOWorld
from gtts import gTTS
from openai import OpenAI
import re

# ─── CONFIG ──────────────────────────────────────────────────────────
VIDEO_IN    = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk.mp4"
VIDEO_OUT   = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_annotated.mp4"
AUDIO_DIR   = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\audio_clips"
FINAL_AUDIO = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.mp3"
TRANSCRIPT  = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.txt"

MODEL_WEIGHTS = "yolov8x-worldv2.pt"
FRAME_SKIP    = 10        # process every Nth frame
CONF_THRESH   = 0.6      # YOLO confidence threshold
OPENAI_KEY    = "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" # your key
MAX_HAZARDS   = 150      # safeguard for CLIP text prompt length
# ──────────────────────────────────────────────────────────────────────

client = OpenAI(api_key=OPENAI_KEY)
spoken = {}

def speak_and_save(text: str, cooldown_s: float = 10.0):
    now = time.time()
    if now - spoken.get(text, 0) < cooldown_s:
        return None
    spoken[text] = now
    os.makedirs(AUDIO_DIR, exist_ok=True)
    out = os.path.join(AUDIO_DIR, f"{int(now)}.mp3")
    gTTS(text).save(out)
    return out

def fetch_hazard_list():
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role":"system","content":"List 100 common hazards/obstacles on a city sidewalk, comma-separated."},
            {"role":"user","content":"Generate 100 common hazards/obstacles for a pedestrian walking in a city."}
        ],
        temperature=0.7,
        max_tokens=300
    )
    txt = resp.choices[0].message.content
    items = re.split(r"[\n,]", txt)
    hazards = []
    for itm in items:
        h = re.sub(r"^\d+\.\s*", "", itm.strip())
        if h:
            hazards.append(h)
    return hazards[:MAX_HAZARDS]

def main():
    hazards = fetch_hazard_list()

    model = YOLOWorld(MODEL_WEIGHTS)
    model.conf = CONF_THRESH
    model.set_classes(hazards)

    cap = cv2.VideoCapture(VIDEO_IN)
    w, h = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out_vid = cv2.VideoWriter(VIDEO_OUT, fourcc, round(fps/FRAME_SKIP), (w, h))

    left_x, right_x, mid_y = w*0.25, w*0.75, h*0.5
    audio_paths, alert_texts = [], []
    frame_id = 0

    while True:
        ret, frame = cap.read()
        if not ret: break
        frame_id += 1
        if frame_id % FRAME_SKIP != 0: continue

        device = "cuda" if torch.cuda.is_available() else "cpu"
        res = model.predict(frame, conf=CONF_THRESH, device=device)[0]
        boxes = res.boxes.xyxy.cpu().numpy()
        confs = res.boxes.conf.cpu().numpy()
        cids  = res.boxes.cls.cpu().numpy()

        close_hazards = []
        light_states  = []

        # 1) Process each detection
        for (x1,y1,x2,y2), conf, cid in zip(boxes, confs, cids):
            name = res.names[int(cid)]
            cx, cy = (x1+x2)/2, (y1+y2)/2
            area_ratio = ((x2-x1)*(y2-y1)) / (w*h)

            # traffic signal logic
            if name == "traffic signal":
                crop = frame[int(y1):int(y2), int(x1):int(x2)]
                hsv  = cv2.cvtColor(crop, cv2.COLOR_BGR2HSV)

                # red mask
                lower_r1 = np.array([0,100,100]); upper_r1 = np.array([10,255,255])
                lower_r2 = np.array([160,100,100]); upper_r2 = np.array([180,255,255])
                red_mask = cv2.bitwise_or(
                    cv2.inRange(hsv, lower_r1, upper_r1),
                    cv2.inRange(hsv, lower_r2, upper_r2)
                )
                green_mask = cv2.inRange(hsv, np.array([35,100,100]), np.array([85,255,255]))
                state = "RED light" if cv2.countNonZero(red_mask) > cv2.countNonZero(green_mask) else "GREEN light"
                light_states.append(state)
                label = state
            else:
                label = name

            # “close” condition
            is_close = (cy > 0.7*h) or (area_ratio > 0.05)
            if is_close and name != "traffic signal":
                close_hazards.append(name)

            # draw
            color = (0,0,255) if is_close else (0,255,0)
            cv2.rectangle(frame, (int(x1),int(y1)), (int(x2),int(y2)), color, 2)
            cv2.putText(frame, f"{label} {conf:.2f}",
                        (int(x1), int(y1)-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

        # 2) Generate alerts
        # Hazards
        if close_hazards:
            txt = "Alert: close hazards – " + ", ".join(dict.fromkeys(close_hazards))
            alert_texts.append(txt)
            p = speak_and_save(txt)
            if p: audio_paths.append(p)

        # Traffic lights
        if light_states:
            txt = "Traffic light state – " + ", ".join(dict.fromkeys(light_states))
            alert_texts.append(txt)
            p = speak_and_save(txt)
            if p: audio_paths.append(p)

        # 3) Draw H-split lines (blue)
        for (x0,y0,x1,y1) in [
            (int(left_x),0,int(left_x),h),
            (int(right_x),0,int(right_x),h),
            (int(left_x),int(mid_y),int(right_x),int(mid_y))
        ]:
            cv2.line(frame,(x0,y0),(x1,y1),(255,0,0),2)

        out_vid.write(frame)

    cap.release()
    out_vid.release()

    # 4) Merge all TTS mp3s
    if audio_paths:
        with open(FINAL_AUDIO, "wb") as fw:
            for mp3 in sorted(audio_paths):
                with open(mp3, "rb") as fr:
                    fw.write(fr.read())
    else:
        print("⚠️  No audio clips were generated, skipping final audio merge.")

    # 5) Write transcript
    with open(TRANSCRIPT, "w", encoding="utf-8") as tf:
        tf.write("\n".join(alert_texts))

    print("✅ Video:",      VIDEO_OUT)
    print("✅ Audio:",      FINAL_AUDIO)
    print("✅ Transcript:", TRANSCRIPT)

if __name__ == "__main__":
    main()


0: 384x640 (no detections), 36.5ms
Speed: 1.9ms preprocess, 36.5ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.3ms
Speed: 2.9ms preprocess, 33.3ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.8ms
Speed: 1.9ms preprocess, 32.8ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.0ms
Speed: 1.9ms preprocess, 33.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.7ms
Speed: 1.8ms preprocess, 32.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.3ms
Speed: 3.3ms preprocess, 32.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.3ms
Speed: 1.8ms preprocess, 33.3ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.8ms
Speed: 2.5ms preprocess, 32.8ms i

In [13]:
import os
import time
import cv2
import torch
import numpy as np
from ultralytics import YOLOWorld
from gtts import gTTS
from openai import OpenAI
import re

# ─── CONFIG ──────────────────────────────────────────────────────────
VIDEO_IN    = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk.mp4"
VIDEO_OUT   = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_annotated.mp4"
AUDIO_DIR   = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\audio_clips"
FINAL_AUDIO = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.mp3"
TRANSCRIPT  = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.txt"

MODEL_WEIGHTS = "yolov8x-worldv2.pt"
FRAME_SKIP    = 5        # process every Nth frame
CONF_THRESH   = 0.6      # YOLO confidence threshold
OPENAI_KEY    = "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"  # your OpenAI key
# ──────────────────────────────────────────────────────────────────────

client = OpenAI(api_key=OPENAI_KEY)

# simple rate-limiter so we don't re-speak the same alert too often
spoken = {}
def speak_and_save(text: str, cooldown_s: float = 30.0):
    now = time.time()
    last = spoken.get(text, 0)
    if now - last < cooldown_s:
        return None
    spoken[text] = now
    os.makedirs(AUDIO_DIR, exist_ok=True)
    out_path = os.path.join(AUDIO_DIR, f"{int(now)}.mp3")
    gTTS(text).save(out_path)
    return out_path

def fetch_hazard_list():
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role":"system","content":"List 100 common hazards/obstacles on a city sidewalk, comma-separated."},
            {"role":"user","content":"Generate 100 common hazards/obstacles for a pedestrian walking in a city."}
        ],
        temperature=0.7,
        max_tokens=300
    )
    txt = resp.choices[0].message.content
    # split on commas or newlines, strip numbering
    items = re.split(r"[,\n]", txt)
    hazards = []
    for itm in items:
        h = itm.strip()
        if not h:
            continue
        h = re.sub(r"^\d+\.\s*", "", h)
        hazards.append(h)
    return hazards

def main():
    # 1) one-time fetch
    hazards = fetch_hazard_list()

    # 2) load model
    model = YOLOWorld(MODEL_WEIGHTS)
    model.conf = CONF_THRESH
    model.set_classes(hazards)

    # 3) setup I/O
    cap = cv2.VideoCapture(VIDEO_IN)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(VIDEO_OUT, fourcc, fps/FRAME_SKIP, (w, h))

    left_x, right_x, middle_y = w*0.25, w*0.75, h*0.5
    audio_paths, alert_texts = [], []
    frame_id = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_id += 1
        if frame_id % FRAME_SKIP != 0:
            continue

        device = "cuda" if torch.cuda.is_available() else "cpu"
        res = model.predict(frame, conf=CONF_THRESH, device=device)[0]
        preds = res.boxes.data.cpu().numpy()

        regions = {"ground": [], "left": [], "right": [], "front": []}

        # draw detections & classify
        for x1,y1,x2,y2,conf,cid in preds:
            name = res.names[int(cid)]
            cx, cy = (x1+x2)/2, (y1+y2)/2
            if cx < left_x:
                regions["left"].append(name)
            elif cx > right_x:
                regions["right"].append(name)
            elif cy < middle_y:
                regions["front"].append(name)
            else:
                regions["ground"].append(name)

            # red if on “ground” (close), green otherwise
            color = (0,0,255) if cy >= middle_y else (0,255,0)
            cv2.rectangle(frame, (int(x1),int(y1)), (int(x2),int(y2)), color, 2)
            cv2.putText(frame, f"{name} {conf:.2f}", (int(x1),int(y1)-6),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)

        # if any close (“ground”) obstacles, speak & record
        if regions["ground"]:
            txt = "Alert: ground obstacle – " + ", ".join(dict.fromkeys(regions["ground"]))
            alert_texts.append(txt)
            mp3 = speak_and_save(txt)
            if mp3:
                audio_paths.append(mp3)

        # draw H-split in blue
        for (x0,y0,x1,y1) in [
            (int(left_x),0,int(left_x),h),
            (int(right_x),0,int(right_x),h),
            (int(left_x),int(middle_y),int(right_x),int(middle_y))
        ]:
            cv2.line(frame, (x0,y0), (x1,y1), (255,0,0), 2)

        out.write(frame)

    cap.release()
    out.release()

    # ─── Merge all .mp3 clips by byte-concatenation ──────────────────────
    if audio_paths:
        with open(FINAL_AUDIO, "wb") as fw:
            for mp3 in sorted(audio_paths):
                with open(mp3, "rb") as fr:
                    fw.write(fr.read())
    else:
        print("⚠️  No audio clips generated; skipped audio merge.")

    # ─── Write full transcript ──────────────────────────────────────────
    with open(TRANSCRIPT, "w", encoding="utf-8") as tf:
        tf.write("\n".join(alert_texts) or "No alerts detected.")

    print("✅ Annotated video:", VIDEO_OUT)
    print("✅ Merged audio  :", FINAL_AUDIO)
    print("✅ Transcript   :", TRANSCRIPT)

if __name__ == "__main__":
    main()



0: 384x640 (no detections), 36.7ms
Speed: 3.0ms preprocess, 36.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.2ms
Speed: 1.7ms preprocess, 33.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.4ms
Speed: 1.7ms preprocess, 33.4ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.9ms
Speed: 1.8ms preprocess, 32.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.6ms
Speed: 2.0ms preprocess, 32.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 32.9ms
Speed: 1.7ms preprocess, 32.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.2ms
Speed: 1.7ms preprocess, 33.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 33.2ms
Speed: 1.7ms preprocess, 33.2ms i

In [5]:
print("CUDA Available:", torch.cuda.is_available())
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA Available: True
Device Name: NVIDIA GeForce GTX 1080


In [3]:
client = OpenAI(api_key="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")
tts_engine = pyttsx3.init()
tts_engine.setProperty('rate', 150)  # Adjust speaking speed
spoken_messages = {}
message_timeout = 30 # seconds
fallback_counter = 0
fallback_threshold = 3 # After x amount, delcare uncertain
saved_audio_paths = []

In [14]:
def speak_and_save(text, audio_output_dir="audio_clips"):
    now = time.time()
    if text:
        last = spoken_messages.get(text, 0)
        if now - last > message_timeout:
            spoken_messages[text] = now
            os.makedirs(audio_output_dir, exist_ok=True)
            path = os.path.join(audio_output_dir, f"{int(now)}.mp3")
            gTTS(text).save(path)
            return path
    return None

In [15]:
def initialize_model_and_video(video_file, output_video_file, weight_file, walking_hazards, frame_skip):
    model = YOLOWorld(weight_file)
    model.set_classes(walking_hazards)

    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        raise IOError("Cannot open video")

    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_file, fourcc, fps // frame_skip, (width, height))

    return model, cap, out

In [16]:
def detect_objects_in_frame(model, frame, confidence_threshold=0.6):
    """Run YOLO-World, filter by threshold, then mark low-confidence anomalies."""
    results = model.predict(frame)
    preds   = results[0].boxes.data.cpu().numpy()
    objs    = []

    # collect detections above threshold
    for x1,y1,x2,y2,score,cid in preds:
        if score >= confidence_threshold:
            objs.append({
                'name': model.names[int(cid)],
                'box': (int(x1),int(y1),int(x2),int(y2)),
                'score': float(score),
                'anomaly': False
            })

    # statistical anomaly: score < (mean - std)
    if len(objs) > 1:
        scores = np.array([o['score'] for o in objs])
        m, s    = scores.mean(), scores.std()
        threshold = m - s
        for o in objs:
            if o['score'] < threshold:
                o['anomaly'] = True
    return objs

In [17]:
def summarize_with_llm(objects, mode="description"):
    """Summarize detected objects using OpenAI's LLM."""
    if not objects:
        return None

    objects_summary = {}
    for obj in objects:
        name = obj['name']
        objects_summary[name] = objects_summary.get(name, 0) + 1


    text_summary = ", ".join([f"{count} {obj}(s)" for obj, count in objects_summary.items()])

    if mode == "description":
        prompt = f"Describe the following environment in natural language: {text_summary}"
    elif mode == "warning":
        prompt = f"Based on the following objects, generate a short warning message if there is any obstacle: {text_summary}"

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an assistant that helps with real-time navigation for visually impaired users."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        max_tokens=100
    )

    return response.choices[0].message.content.strip()

In [19]:
def process_video(model, cap, out, frame_skip, confidence_threshold, describe_interval):
    global fallback_counter
    frame_id = 0
    last_desc = time.time()
    seen_set  = set()
    warn_last = ""

    # initial description
    ret, frame = cap.read()
    if ret:
        objs = detect_objects_in_frame(model, frame, confidence_threshold)
        if objs:
            desc = summarize_with_llm(objs,'description')
            print(f"📄 {desc}")
            p = speak_and_save(desc)
            if p: saved_audio_paths.append(p)
        last_desc = time.time()
    cap.set(cv2.CAP_PROP_POS_FRAMES,0)

    while True:
        ret, frame = cap.read()
        if not ret: break
        frame_id += 1
        if frame_id % frame_skip: continue

        objs = detect_objects_in_frame(model, frame, confidence_threshold)
        if not objs:
            fallback_counter +=1
            if fallback_counter >= fallback_threshold:
                p = speak_and_save("Low confidence, proceed with caution.")
                if p: saved_audio_paths.append(p)
                fallback_counter=0
            continue
        fallback_counter=0

        names = set([o['name'] for o in objs])
        # draw each box
        for o in objs:
            x1,y1,x2,y2 = o['box']
            color = (0,0,255) if o['anomaly'] else (0,255,0)
            cv2.rectangle(frame,(x1,y1),(x2,y2),color,2)
            label = f"{o['name']} {o['score']:.2f}"
            if o['anomaly']: label += " (!)"
            cv2.putText(frame,label,(x1,y1-10),cv2.FONT_HERSHEY_SIMPLEX,0.6,color,2)

        # periodic description
        if time.time() - last_desc > describe_interval:
            desc = summarize_with_llm(objs,'description')
            print(f"📄 {desc}")
            p = speak_and_save(desc)
            if p: saved_audio_paths.append(p)
            last_desc = time.time()

        # warnings on set change
        if names!=seen_set:
            warn = summarize_with_llm(objs,'warning')
            if warn and warn!=warn_last:
                print(f"⚠️ {warn}")
                p = speak_and_save(warn)
                if p: saved_audio_paths.append(p)
                warn_last = warn
            seen_set = names

        out.write(frame)
    cap.release()
    out.release()
    print("✅ Done processing video.")

In [None]:
if __name__ == "__main__":
    # File paths
    video_file = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk.mp4"
    output_video_file = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_annotated.mp4"
    audio_file = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.mp3"
    weight_file = 'yolov8x-worldv2.pt'
    frame_skip = 5
    confidence_threshold = 0.6
    describe_interval = 60  # seconds between descriptions

    walking_hazards = [
        'person', 'bicycle', 'car', 'motorcycle', 'bus', 'truck', 'traffic light', 'stop sign', 'red light', 'green light'
    ]

    model, cap, out = initialize_model_and_video(video_file, output_video_file, weight_file, walking_hazards, frame_skip)
    process_video(model, cap, out, frame_skip, confidence_threshold, describe_interval)
    add_audio_to_video(output_video_file, "audio_clips", audio_file)

    print("Video and audio saved.")
    

In [12]:
import os
import cv2
import torch
import numpy as np
from gtts import gTTS
from ultralytics import YOLOWorld

def main():
    # ─── CONFIG ────────────────────────────────────────────────────────────────
    VIDEO_IN      = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk.mp4"
    VIDEO_OUT     = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_annotated.mp4"
    FINAL_AUDIO   = r"C:\Users\tyler\OneDrive\Desktop\Computer Vision\CV Project\Videos\walk_audio.mp3"
    MODEL_WEIGHTS = "yolov8x-worldv2.pt"
    FPS           = 10               # desired write‐out FPS
    CONF_THRESH   = 0.5              # detection confidence threshold
    MEMORY_TTL    = 30               # frames between repeated warnings per object
    # ────────────────────────────────────────────────────────────────────────────

    # load model
    model = YOLOWorld(MODEL_WEIGHTS)
    model.conf = CONF_THRESH

    # open I/O
    cap = cv2.VideoCapture(VIDEO_IN)
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out    = cv2.VideoWriter(VIDEO_OUT, fourcc, FPS, (w, h))

    # memory so we only re-warn each object every MEMORY_TTL frames
    last_warn = {}   # tracker_id -> last_frame_warned
    warnings   = []  # collect all warning texts

    frame_id = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_id += 1

        # run detection
        results = model(frame)[0]
        boxes   = results.boxes.xyxy.cpu().numpy()
        scores  = results.boxes.conf.cpu().numpy()
        tids    = getattr(results.boxes, "ids",
                           np.arange(len(boxes)))
        classes = results.boxes.cls.cpu().numpy().astype(int)

        # annotate & warnings
        for (x1, y1, x2, y2), conf, tid, cls in zip(boxes, scores, tids, classes):
            if conf < CONF_THRESH:
                continue
            # draw bbox
            cv2.rectangle(frame,
                          (int(x1), int(y1)),
                          (int(x2), int(y2)),
                          (0, 255, 0), 2)
            label = f"{model.names[cls]} {conf:0.2f}"
            cv2.putText(frame, label,
                        (int(x1), int(y1) - 5),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5, (0, 255, 0), 1)

            # ground region warning
            cy = (y1 + y2) / 2
            if cy > h * 0.5:
                last = last_warn.get(tid, -MEMORY_TTL)
                if frame_id - last >= MEMORY_TTL:
                    txt = f"Warning: {model.names[cls]} ahead."
                    warnings.append(txt)
                    last_warn[tid] = frame_id

        out.write(frame)

    cap.release()
    out.release()
    print("✅ Annotated video saved to", VIDEO_OUT)

    # ─── single TTS pass over all warnings ────────────────────────────────────
    if warnings:
        full_text = " ".join(warnings)
        tts = gTTS(full_text)
        tts.save(FINAL_AUDIO)
        print("✅ Final audio saved to", FINAL_AUDIO)
    else:
        print("⚠️  No warnings were generated, so no audio file created.")

if __name__ == "__main__":
    main()

 



0: 384x640 2 persons, 2 traffic lights, 46.2ms
Speed: 2.8ms preprocess, 46.2ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 traffic lights, 36.7ms
Speed: 3.0ms preprocess, 36.7ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 traffic lights, 35.9ms
Speed: 3.2ms preprocess, 35.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 traffic lights, 33.8ms
Speed: 2.4ms preprocess, 33.8ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 traffic lights, 33.3ms
Speed: 2.4ms preprocess, 33.3ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 3 traffic lights, 32.9ms
Speed: 2.2ms preprocess, 32.9ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 2 traffic lights, 1 stop sign, 33.1ms
Speed: 1.8ms preprocess, 33.1ms inference, 2.1ms postprocess per imag