In [8]:
!pip install moviepy opencv-python torch torchvision pillow sentence-transformers transformers accelerate



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Video descriptions


In [25]:
import json
import os
from datetime import datetime, timezone

import cv2
import torch
from moviepy import VideoFileClip
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# ---- Setup ----
# Inference is pinned to the CPU. To use a GPU when one is present, switch to:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"

# The processor and the captioning model are loaded from the same BLIP checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model = model.to(device)

# Directory that receives the JPEG frames sampled from each video.
os.makedirs("extracted_frames", exist_ok=True)

# ---- Frame Extraction ----
def extract_key_frames(video_path, num_frames=5):
    """Save evenly spaced frames of a video as JPEGs and return their paths.

    The video is divided into ``num_frames`` equal segments and the first
    frame of each segment is written to
    ``extracted_frames/<stem>_frame<i>.jpg``, where ``<stem>`` is the video's
    file name without its final extension.

    Args:
        video_path: Path of the video file to sample.
        num_frames: Number of frames to request. Values below 1 yield no
            frames.

    Returns:
        Paths of the frames that were both read and written successfully, in
        sampling order. May hold fewer than ``num_frames`` entries.

    Raises:
        ValueError: If OpenCV cannot open ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise ValueError(f"Could not open {video_path}")
        if num_frames < 1:
            # Nothing to sample; also avoids dividing by zero below.
            return []

        # Do not rely on another cell having created the output directory.
        os.makedirs("extracted_frames", exist_ok=True)

        # splitext keeps dots inside the name ("a.b.mp4" -> "a.b"), so frames
        # from differently named videos do not overwrite each other.
        base = os.path.splitext(os.path.basename(video_path))[0]
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        frame_paths = []
        for idx in range(num_frames):
            frame_num = int(idx * total_frames / num_frames)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            path = f"extracted_frames/{base}_frame{idx}.jpg"
            # imwrite reports failure through its return value; only hand back
            # paths that were actually written.
            if cv2.imwrite(path, frame):
                frame_paths.append(path)
        return frame_paths
    finally:
        # Release the capture on every exit path, including errors.
        cap.release()

# ---- Caption Generation ----
def generate_caption(image_path):
    """Return a text caption for the image stored at ``image_path``.

    The image is converted to RGB, preprocessed with the module-level
    ``processor``, and captioned by the module-level BLIP ``model`` on
    ``device`` with gradient tracking disabled.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    model_inputs = processor(images=rgb_image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(**model_inputs)

    # Only the first generated sequence is decoded.
    first_sequence = generated_ids[0]
    caption = processor.decode(first_sequence, skip_special_tokens=True)
    return caption

# ---- Utility: Extract File Info ----
def get_file_info(video_path):
    """Return the size and creation timestamp of a file.

    Args:
        video_path: Path of the file to inspect.

    Returns:
        A ``(size, created)`` tuple: ``size`` is the file size in bytes and
        ``created`` is a string formatted as ``YYYY-MM-DD HH:MM:SS UTC``.

    Raises:
        OSError: If ``os.stat`` cannot read ``video_path``.
    """
    stat = os.stat(video_path)
    size = stat.st_size
    # On Unix st_ctime is the last metadata change, not the creation time, so
    # prefer st_birthtime on platforms that expose it.
    created_ts = getattr(stat, "st_birthtime", stat.st_ctime)
    # datetime.utcfromtimestamp() is deprecated; build a timezone-aware UTC
    # datetime instead. The formatted string is unchanged.
    created = datetime.fromtimestamp(created_ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
    return size, created

# ---- Generate Video Description Object ----
def _unique_in_order(items):
    """Return the distinct entries of ``items`` in first-seen order."""
    return list(dict.fromkeys(items))


def describe_video(video_path, num_frames=5):
    """Build a JSON-serialisable description of a video file.

    Container metadata is read with MoviePy, file metadata with
    ``get_file_info``, and content fields are derived from captions of the
    frames returned by ``extract_key_frames``.

    Args:
        video_path: Path of the video file to describe.
        num_frames: Number of frames to sample and caption.

    Returns:
        A dict with top-level keys ``type``, ``metadata``, ``duration``,
        ``resolution``, ``frameRate``, ``hasAudio``, ``videoFormat`` and
        ``contentAnalysis``. Tags, confidences, scenes and mood are
        placeholder heuristics rather than model outputs.

    Raises:
        RuntimeError: If the clip cannot be opened or its metadata read; the
            underlying exception is chained as ``__cause__``.
    """
    try:
        # The context manager closes the clip's reader once the metadata has
        # been copied out, so processing many videos does not leak handles.
        with VideoFileClip(video_path) as clip:
            duration = clip.duration
            width, height = clip.size
            fps = clip.fps
            has_audio = clip.audio is not None
    except Exception as e:
        raise RuntimeError(f"Video metadata extraction failed: {e}") from e

    # splitext yields "" for a path with no extension instead of echoing the
    # whole path back.
    video_format = os.path.splitext(video_path)[1].lstrip(".")

    # Metadata
    file_name = os.path.basename(video_path)
    file_path = os.path.abspath(video_path)
    file_size, created_at = get_file_info(video_path)

    # Captions
    frame_paths = extract_key_frames(video_path, num_frames=num_frames)
    frame_captions = [generate_caption(fp) for fp in frame_paths]

    # Dummy tagging (improvement idea: use keyword extraction or CLIP).
    # De-duplication keeps first-seen order so repeated runs give the same
    # output; a plain set() orders strings differently from run to run.
    tags = _unique_in_order(
        word
        for cap in frame_captions
        for word in cap.lower().split()
        if word.isalpha()
    )[:5]  # limit

    scenes = _unique_in_order(
        "indoor" if "room" in c or "bed" in c else "outdoor"
        for c in frame_captions
    )

    # Length in seconds of the slice of the video each sampled frame stands for.
    segment = duration / num_frames if num_frames else 0.0

    # Build structured JSON
    json_obj = {
        "type": "video",
        "metadata": {
            "fileName": file_name,
            "filePath": file_path,
            "fileSize": file_size,
            "createdAt": created_at,
            "description": " ".join(_unique_in_order(frame_captions[:2])),
            "tags": tags
        },
        "duration": duration,
        "resolution": {"width": width, "height": height},
        "frameRate": fps,
        "hasAudio": has_audio,
        "videoFormat": video_format,
        "contentAnalysis": {
            "contentOverview": " ".join(_unique_in_order(frame_captions)),
            # No frame could be extracted -> no caption to introduce with.
            "actionIntroduction": frame_captions[0] if frame_captions else "",
            "timeBoundDetails": [
                {
                    "detailStartTime": round(i * segment, 2),
                    "detailEndTime": round((i + 1) * segment, 2),
                    "detailDescription": caption,
                    "detailConfidence": round(0.8 + 0.02 * (num_frames - i) / num_frames, 2)  # fake confidence
                }
                for i, caption in enumerate(frame_captions)
            ],
            "detectedObjects": list(tags),
            "detectedScenes": scenes,
            "estimatedMood": "neutral"
        }
    }

    return json_obj


In [27]:
# Videos to describe, given relative to the notebook's working directory.
video_paths = [
    "Chocolate2_low.mp4",
    "Chocolate3_low.mp4",
    "Chocolate4_low.mp4",
    "Chocolate5_low.mp4",
    "Business_center_low.mp4",
]

# Describe each video in turn, keeping every result in memory and also
# writing it next to the notebook as "<video stem>_description.json".
all_descriptions = []
for path in video_paths:
    print(f"Processing: {path}")
    desc = describe_video(path)
    all_descriptions.append(desc)

    stem = os.path.splitext(os.path.basename(path))[0]
    with open(f"{stem}_description.json", "w") as f:
        json.dump(desc, f, indent=2)


Processing: Chocolate2_low.mp4


  created = datetime.utcfromtimestamp(stat.st_ctime).strftime('%Y-%m-%d %H:%M:%S UTC')


Processing: Chocolate3_low.mp4
Processing: Chocolate4_low.mp4
Processing: Chocolate5_low.mp4
Processing: Business_center_low.mp4
