<a href="https://colab.research.google.com/github/wissemkarouss/CAN-secure_dream_drive/blob/main/X-CLIP/Video_text_matching_with_X_CLIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set-up environment

We'll first install 🤗 Transformers (from GitHub, as X-CLIP support is not yet included in a release) and decord, which we'll use to decode the video.

In [None]:
# Install Transformers straight from its GitHub repository (-q keeps pip's output quiet).
!pip install -q git+https://github.com/huggingface/transformers.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [None]:
# Install decord, which provides the VideoReader used to decode video frames.
!pip install -q decord

## Load video

Here we load a video of people eating spaghetti.

In [None]:
from huggingface_hub import hf_hub_download
from ipywidgets import Video

# Download the demo clip from the "nielsr/video-demo" dataset repo on the Hugging Face Hub.
file_path = hf_hub_download(
    repo_id="nielsr/video-demo",
    filename="eating_spaghetti.mp4",
    repo_type="dataset",
)
# Keep the widget as the last expression so the notebook renders the player.
Video.from_file(file_path, width=500)

Video(value=b'\x00\x00\x00 ftypisom\x00\x00\x02\x00isomiso2avc1mp41\x00\x00\x00\x08free\x00\x0fI\xb7mdat\x00\x…

We'll sample 16 frames from the video.

In [19]:
from decord import VideoReader, cpu
import numpy as np

# Seed NumPy's global RNG so the random window position drawn by np.random.randint below is repeatable.
np.random.seed(0)
# NOTE(review): this reassigns file_path, replacing the path returned by hf_hub_download in the
# earlier cell. The hard-coded /content/ file must already exist in the runtime — confirm it is uploaded.
file_path="/content/n area where abandoned items are being processed for recycling and recovery.mp4"

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame indices from a randomly placed window of a video.

    A window spanning ``int(clip_len * frame_sample_rate)`` frames is placed at
    a random position inside the video (using NumPy's global RNG, so seed it
    with ``np.random.seed`` for repeatable results) and ``clip_len`` indices
    are spread evenly across that window.

    Args:
        clip_len: Number of frame indices to return.
        frame_sample_rate: Stride between sampled frames; the window spans
            ``int(clip_len * frame_sample_rate)`` frames.
        seg_len: Total number of frames available in the video.

    Returns:
        A 1-D ``np.int64`` array of ``clip_len`` non-decreasing indices that
        lie inside the chosen window (and therefore inside the video).

    Raises:
        ValueError: If the video has fewer frames than the window requires.
    """
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len < converted_len:
        raise ValueError(
            f"Video has only {seg_len} frames, but sampling {clip_len} frames "
            f"at rate {frame_sample_rate} needs at least {converted_len}."
        )
    if seg_len == converted_len:
        # The window covers the whole video, so there is nothing to randomize.
        # np.random.randint(low, high) would reject low == high here.
        end_idx = seg_len
    else:
        end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Open the video at file_path with decord, using a single thread and the CPU context.
vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))

# sample 16 frames
# presumably rewinds the reader to the first frame before batch access — TODO confirm against decord docs
vr.seek(0)
indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=len(vr))
# Fetch the selected frames in one batch and convert them to a NumPy array.
video = vr.get_batch(indices).asnumpy()
print(video.shape)

(16, 1080, 1920, 3)


## Run inference

Finally, we forward the video + 3 possible texts through the X-CLIP model. The model will tell us how much each text matches with the given video.

In [27]:
from transformers import XCLIPProcessor, XCLIPModel
import torch

# Load the processor and the model from the same pretrained checkpoint.
model_name = "microsoft/xclip-base-patch32-16-frames"
processor = XCLIPProcessor.from_pretrained(model_name)
model = XCLIPModel.from_pretrained(model_name)

# Candidate captions to score against the sampled frames.
candidate_texts = [
    "playing sports",
    "n area where abandoned items are being processed for recycling and recovery",
    "go shopping",
]
inputs = processor(
    text=candidate_texts,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)

# Inference only, so run the forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Softmax over the caption axis: one probability per candidate text for the video.
probs = torch.softmax(outputs.logits_per_video, dim=1)
probs

tensor([[3.5401e-04, 9.9871e-01, 9.3128e-04]])

In [37]:
import torch
from transformers import XCLIPProcessor, XCLIPModel
from decord import VideoReader, cpu
import numpy as np
from PIL import Image
import cv2

# Checkpoint used for both the processor and the model
model_name = "microsoft/xclip-base-patch16-zero-shot"
processor = XCLIPProcessor.from_pretrained(model_name)

# Use CUDA when torch reports it as available; otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = XCLIPModel.from_pretrained(model_name).to(device)

# Function to load and sample video frames
def load_video(video_path, num_frames=32, target_size=(224, 224)):
    """Decode a video and return evenly spaced frames resized to ``target_size``.

    Args:
        video_path: Path of the video file handed to decord's ``VideoReader``.
        num_frames: How many frames to pick, spread uniformly from the first
            frame to the last.
        target_size: Size passed to PIL's ``Image.resize`` for every frame.

    Returns:
        A ``uint8`` NumPy array stacking the ``num_frames`` resized frames, or
        ``None`` when anything goes wrong (the video is too short, decoding
        fails, ...). In that case the error is printed rather than raised.
    """
    try:
        reader = VideoReader(video_path, ctx=cpu(0))
        total_frames = len(reader)
        if total_frames < num_frames:
            raise ValueError(f"Video has only {total_frames} frames, but {num_frames} are required.")

        # Uniformly spaced positions covering the whole video, endpoints included.
        picked = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        raw_frames = reader.get_batch(picked).asnumpy()

        # Resize each frame through PIL, then convert it back to an array.
        resized = [
            np.array(Image.fromarray(raw).resize(target_size, Image.Resampling.LANCZOS))
            for raw in raw_frames
        ]
        return np.array(resized, dtype=np.uint8)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading video: {e}")
        return None

# Score one "positive" and one "negative" text prompt against a video and report whether
# the positive prompt wins. Each stage (load, preprocess, inference) reports its own
# failure message instead of raising.
# Load video (replace with your video path)
video_path = "/content/cooking.mp4"  # Replace with your actual video file path
video_frames = load_video(video_path)  # None when loading failed

if video_frames is not None:
    print(f"Loaded frames shape: {video_frames.shape}")  # Should be (32, 224, 224, 3)

    # Text descriptions: positive and negative prompts
    positive_description = "cat walking"  # Replace with your description
    negative_description = "irrelavant"  # Negative prompt; NOTE(review): literal is misspelled ("irrelavant") — confirm whether intended
    text_descriptions = [positive_description, negative_description]

    # Process video and text
    try:
        inputs = processor(
            text=text_descriptions,
            videos=list(video_frames),  # Pass the loaded frames as a list
            return_tensors="pt",
            padding=True
        )
    except Exception as e:
        print(f"Error processing inputs: {e}")
        inputs = None # Set inputs to None to indicate failure

    if inputs is not None: # only continue when preprocessing succeeded
        # Move inputs to the same device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run inference
        try:
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits_per_video  # Shape: (1, 2) for two descriptions
                probs = logits.softmax(dim=1)  # Convert logits to probabilities
                positive_prob = probs[0, 0].item()  # Probability for positive description
                negative_prob = probs[0, 1].item()  # Probability for negative description
        except Exception as e:
            print(f"Error during inference: {e}")
            # None marks the failure for the check just below
            positive_prob = negative_prob = None

        if positive_prob is not None:
            # Interpret relevance
            # The softmax runs over exactly two prompts, so the two probabilities sum to 1 and
            # exceeding 0.5 means the positive prompt scored higher than the negative one.
            threshold = 0.5  # Probability threshold (adjust as needed)
            is_relevant = positive_prob > threshold
            print(f"Positive Description: '{positive_description}' | Probability Score: {positive_prob:.4f}")
            print(f"Negative Description: '{negative_description}' | Probability Score: {negative_prob:.4f}")
            print(f"Is the positive description relevant? {'Yes' if is_relevant else 'No'}")
        else:
            print("Inference failed.")
    else:
        print("Input processing failed.")
else:
    print("Video loading failed, skipping inference.")

Loaded frames shape: (32, 224, 224, 3)
Positive Description: 'cat walking' | Probability Score: 0.4141
Negative Description: 'irrelavant' | Probability Score: 0.5859
Is the positive description relevant? No
