<a href="https://colab.research.google.com/github/tomasndlate/thesis/blob/main/ThesisResearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up

Install required dependencies:

In [None]:
!pip install -U typing num2words opencv-python decord transformers av accelerate git+https://github.com/huggingface/transformers@v4.49.0-SmolVLM-2

Log in to the Hugging Face API (using the HF token stored in Colab secrets):

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Fetch the value stored under the "HF_TOKEN" key via Colab's `userdata`
# helper and hand it to `login` to authenticate this session.
token = userdata.get("HF_TOKEN")
login(token=token)

Load the SmolVLM2-256M model and its processor:

In [None]:
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

# Checkpoint identifier passed to both `from_pretrained` calls below.
model_id = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Processor for the same checkpoint; later cells use it to build the
# chat-template inputs and to decode the generated token ids.
processor = AutoProcessor.from_pretrained(model_id)

# Load the model weights in bfloat16 with the "eager" attention
# implementation; device_map="auto" leaves device placement to the library.
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
    device_map="auto"
)

print("Model loaded successfully")

Sliding Window Strategy - Streaming Inference Logic:

Define the needed data:

In [None]:
# Still image used by the image-prediction experiment cells.
# NOTE(review): both paths live under /content/drive, presumably a mounted
# Google Drive; no mount call is visible in this notebook - confirm the drive
# is mounted before running.
image_path = "/content/drive/MyDrive/ThesisResearch/dmd/test.png"
# Video scanned by the frame-change cell; the trailing comment keeps an
# alternative clip path for quick switching.
video_path = "/content/drive/MyDrive/ThesisResearch/dataset/videos/gA_1_s1_2019-03-08T09;31;15+01;00_rgb_body.mp4" #"/content/drive/MyDrive/ThesisResearch/dmd/gA/3/s1/gA_3_s1_2019-03-08T10;27;38+01;00_ir_body.mp4"

Custom predict function:

In [None]:
from typing import Literal

def predict_with_model(
    media_type: Literal["video", "image"],
    media_path: str,
    prompt: str,
    model,
    processor,
    max_new_tokens: int = 150,
) -> str:
    """Run a single-turn prompt about one image or video through the model.

    Builds a one-message chat (media item followed by the text prompt),
    tokenizes it with the processor's chat template, generates a
    continuation and decodes it.

    Args:
        media_type: Kind of media referenced by ``media_path``; must be
            ``"video"`` or ``"image"``.
        media_path: Path of the media file placed in the user message.
        prompt: Text question/instruction appended after the media item.
        model: Model exposing ``generate``, ``device`` and ``dtype``.
        processor: Processor exposing ``apply_chat_template`` and
            ``batch_decode``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the first sequence returned by ``generate``
        (special tokens skipped). NOTE(review): for this model family the
        returned sequence presumably still contains the prompt text -
        confirm before parsing the answer.

    Raises:
        ValueError: If ``media_type`` is neither ``"video"`` nor ``"image"``.
    """
    if media_type not in ("video", "image"):
        raise ValueError(
            f"media_type must be 'video' or 'image', got {media_type!r}"
        )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": media_type, "path": media_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Follow the model's own placement and precision instead of hard-coding
    # "cuda"/bfloat16: the model may have been loaded with device_map="auto"
    # or on a machine without a GPU.
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    response = processor.batch_decode(output_ids, skip_special_tokens=True)
    return response[0]

In [None]:
import cv2
import numpy as np

def frame_changed(prev, curr, threshold=5):
    """Tell whether two frames differ by more than ``threshold`` on average.

    The score is the mean of ``cv2.absdiff(prev, curr)``; the result is
    true when that mean is strictly greater than ``threshold``.
    """
    mean_abs_diff = np.mean(cv2.absdiff(prev, curr))
    return mean_abs_diff > threshold


In [None]:
import cv2
import numpy as np
from collections import deque

# Video opened by main2(); taken from the `video_path` variable.
VIDEO_PATH = video_path
# main2() reports a change when the mean absolute grayscale difference
# between consecutive sampled frames is greater than this value.
FRAME_DIFF_THRESHOLD = 3
# NOTE(review): not referenced by any code visible in this notebook;
# presumably the intended cap on buffered frames - confirm.
MAX_BUFFER_FRAMES = 8

def frame_changed(prev_gray, curr_gray, threshold):
    """Print the mean absolute frame difference and compare it to ``threshold``.

    Returns true when the mean of ``cv2.absdiff(prev_gray, curr_gray)`` is
    strictly greater than ``threshold``.
    """
    mean_abs_diff = np.mean(cv2.absdiff(prev_gray, curr_gray))
    print(f"frame change value {mean_abs_diff}")
    return mean_abs_diff > threshold

def main2():
    """Scan ``VIDEO_PATH`` at about 5 fps and print the frames that changed.

    Every sampled frame is converted to grayscale, resized to 160x90 and
    compared with the previous sampled frame. When the mean absolute
    difference is greater than ``FRAME_DIFF_THRESHOLD`` a ``[CHANGE]`` line
    with the frame index and the difference is printed.

    Raises:
        RuntimeError: If the video cannot be opened.
    """
    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        raise RuntimeError(f"Cannot open video: {VIDEO_PATH}")

    # Release the capture even if frame processing raises.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        fps = fps if fps > 0 else 30  # fall back when no usable FPS is reported
        target_fps = 5
        stride = max(1, int(round(fps / target_fps)))

        print(f"Video FPS={fps:.1f}, checking every {stride} frames")

        frame_idx = 0
        prev_gray = None

        while True:
            if frame_idx % stride != 0:
                # Frame is not sampled: grab() advances the stream without
                # retrieving (decoding) the image that read() would return.
                if not cap.grab():
                    break
                frame_idx += 1
                continue

            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray = cv2.resize(gray, (160, 90))

            if prev_gray is not None:
                diff = np.mean(cv2.absdiff(prev_gray, gray))
                if diff > FRAME_DIFF_THRESHOLD:
                    print(f"[CHANGE] frame={frame_idx}, diff={diff:.2f}")
                    # TODO: append the changed frame to the pending chunk.

            prev_gray = gray
            frame_idx += 1
            # TODO: once the pending chunk is large enough, run the
            # prediction here (the original note says "> 10 frames" while
            # MAX_BUFFER_FRAMES is 8 - settle on one limit).

        # TODO: if the pending chunk still holds frames, run a final
        # prediction here.
    finally:
        cap.release()

    print("Done.")

# Run the frame-change scan on VIDEO_PATH.
main2()



#for frame in frames_chunk:
#    messages[0]["content"].append({"type": "image", "image": frame})

#messages[0]["content"].append({"type": "text", "text": prompt})

## process and generate
#inputs = processor.apply_chat_template(...)



# Experiment 1: Zero-Shot Prompting (Gaze Direction)

### Image prediction

In [None]:
# Zero-shot: a single question about the image, with no example answer
# included in the prompt.
experiment_prompt = "Where is the person looking to in this image?"
predicted_response = predict_with_model("image", image_path, experiment_prompt, model, processor)
print(predicted_response)

### Video prediction

# Experiment 2: One-Shot Prompting (Distraction Detection)

### Image prediction

In [None]:
# Image shown as the worked example and image to be classified. Both point
# at the same file for now.
example_image = image_path
inference_image = image_path

# NOTE(review): the f-string below inserts the *path text* of both images
# into the prompt; only the single image passed to predict_with_model is
# actually given to the model as media. A true one-shot setup would need the
# example image supplied as media in its own turn - confirm the intent.
experiment_prompt = f"""
  -User: You are a driver monitoring system that is responsible for assuring
   the driver is driving safely and alert when they are distracted. What is
   the state of this driver? {example_image}

  -Assistant: This driver is distracted because he is having a phonecall while driving

  -User: And how about this driver? {inference_image}
"""

# Pass the image under test explicitly so the call keeps working when
# `inference_image` is changed to a different file than `image_path`.
predicted_response = predict_with_model("image", inference_image, experiment_prompt, model, processor)
print(predicted_response)

### Video prediction

# Experiment 3: Structured Output (Code-Format)

### Image prediction

One-shot prompt with formatted output:

In [None]:
# Image shown as the worked example and image to be classified. Both point
# at the same file for now.
example_image = image_path
inference_image = image_path

# NOTE(review): the f-string below inserts the *path text* of both images
# into the prompt; only the single image passed to predict_with_model is
# actually given to the model as media - confirm the intent.
# The example answer uses True/False for every variable so that it matches
# the format the prompt asks for (it previously answered "No").
experiment_prompt = f"""
  -User: You are a driver monitoring system that is responsible for
   assuring the driver is driving safely and alert when they are distracted.
   You need to communicate with the HMI to alert the driver, please provide
  the following variables with True or False: Distracted, Talking, Using
phone. What is the state of this driver? {example_image}

  -Assistant: Distracted = True, Talking = False, Using phone = False

  -User: And how about this driver? {inference_image}
"""

# Pass the image under test explicitly so the call keeps working when
# `inference_image` is changed to a different file than `image_path`.
predicted_response = predict_with_model("image", inference_image, experiment_prompt, model, processor)
print(predicted_response)

### Video prediction

# Evaluation and Challenges