In [29]:
from moviepy.editor import VideoFileClip
from PIL import Image
import os

def extract_frames_with_moviepy(video_path, output_dir, gap=10):
    """
    Extracts every 'gap'-th frame from a video using MoviePy and saves them as JPEG images.

    Each saved file is named ``frame_<i>.jpg`` where ``<i>`` is the index of the
    frame in the iteration over the clip (0, gap, 2*gap, ...), so the filename
    always matches the frame it came from, whatever value of 'gap' is used.

    Parameters:
    - video_path: Path to the video file.
    - output_dir: Directory to save the extracted frames (created if missing).
    - gap: Save every 'gap' frame (e.g., frame 0, frame 10, frame 20, etc.).
      Must be at least 1.

    Returns:
    - image_paths: List of file paths for the saved frames, in frame order.

    Raises:
    - ValueError: If 'gap' is smaller than 1.
    """
    # Validate before touching the filesystem or opening the video, so a bad
    # argument has no side effects.
    if gap < 1:
        raise ValueError(f"gap must be a positive integer, got {gap!r}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    clip = VideoFileClip(video_path)
    try:
        fps = clip.fps
        # Estimate only: derived from fps * duration, not from counting frames.
        total_frames = int(fps * clip.duration)
        print(f"Total frames in video (calculated): {total_frames}")
        print(f"Saving every {gap}th frame.")

        processed_count = 0
        saved_count = 0
        image_paths = []

        for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")):
            processed_count += 1

            if i % gap != 0:
                continue

            # Name the file after the real frame index instead of assuming gap == 10.
            image_path = os.path.join(output_dir, f"frame_{i}.jpg")
            Image.fromarray(frame).save(image_path)
            image_paths.append(image_path)
            saved_count += 1
            print(f"Saved frame {i} to {image_path}")

        print(f"Total frames processed: {processed_count}")
        print(f"Total frames saved: {saved_count}")
        return image_paths
    finally:
        # Release the clip's reader resources even if saving a frame fails.
        clip.close()

# Example run: sample the demo video and keep the list of written image paths.
video_path = "videoplayback.mp4"
output_dir = "extracted_images_good"
gap = 10

extracted_images = extract_frames_with_moviepy(
    video_path=video_path,
    output_dir=output_dir,
    gap=gap,
)
print("Extracted image paths:", extracted_images)

Total frames in video (calculated): 1433
Saving every 10th frame.
Saved frame 0 to extracted_images_good/frame_0.jpg
Saved frame 10 to extracted_images_good/frame_10.jpg
Saved frame 20 to extracted_images_good/frame_20.jpg
Saved frame 30 to extracted_images_good/frame_30.jpg
Saved frame 40 to extracted_images_good/frame_40.jpg
Saved frame 50 to extracted_images_good/frame_50.jpg
Saved frame 60 to extracted_images_good/frame_60.jpg
Saved frame 70 to extracted_images_good/frame_70.jpg
Saved frame 80 to extracted_images_good/frame_80.jpg
Saved frame 90 to extracted_images_good/frame_90.jpg
Saved frame 100 to extracted_images_good/frame_100.jpg
Saved frame 110 to extracted_images_good/frame_110.jpg
Saved frame 120 to extracted_images_good/frame_120.jpg
Saved frame 130 to extracted_images_good/frame_130.jpg
Saved frame 140 to extracted_images_good/frame_140.jpg
Saved frame 150 to extracted_images_good/frame_150.jpg
Saved frame 160 to extracted_images_good/frame_160.jpg
Saved frame 170 to ex

In [None]:
# Export the notebook as a script; the converter output below shows it is written as demo1.txt.
!jupyter nbconvert --to script demo1.ipynb
# Rename the export to demo1.py so that a later cell can `import demo1`.
!mv demo1.txt demo1.py

[NbConvertApp] Converting notebook demo1.ipynb to script
[NbConvertApp] Writing 26965 bytes to demo1.txt


In [32]:
!pip install torch torchvision transformers nltk
!pip uninstall deeplake -y
!pip install "deeplake<4"

import torch
import torch.nn as nn

import deeplake
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from torchvision import transforms
from transformers import BertTokenizer, BertModel
import nltk
import random

from torch.utils.data import Dataset
from PIL import Image

import cv2
import os

Found existing installation: deeplake 3.9.29
Uninstalling deeplake-3.9.29:
  Successfully uninstalled deeplake-3.9.29
Collecting deeplake<4
  Using cached deeplake-3.9.29-py3-none-any.whl
Installing collected packages: deeplake
Successfully installed deeplake-3.9.29


In [None]:
# Import the module
import demo1

# Import specific models or functions as needed
from demo1 import CNN, DecoderRNN, extract_images_from_video, process_image, predict

# Pick the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build both networks with their default constructor arguments and move them
# onto the selected device.
cnn_model = CNN().to(device)
rnn_model = DecoderRNN().to(device)

# Settings for a test run of the pipeline.
video_path, output_dir, gap = "videoplayback.mp4", "frames", 10




Opening dataset in read-only mode as you don't have write permissions.


\

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/activeloop/flickr30k



\

hub://activeloop/flickr30k loaded successfully.



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All tests passed!
All tests passed!
All tests passed!
All tests passed!
vocab size 30522
embed dim 768


In [None]:

def process_images_from_folder(folder_path, device, cnn_model, rnn_model):
    """
    Caption every entry of a folder and count how often each caption occurs.

    Entries are visited in sorted (lexicographic) filename order. Each one is
    opened with PIL, converted to RGB, passed through ``process_image`` and
    then ``predict``. Processing is best-effort: any exception raised for an
    entry is printed and that entry is skipped, so one unreadable file does
    not abort the whole run.

    Parameters:
    - folder_path: Path to the folder containing images.
    - device: The device ('cpu' or 'cuda') the input is moved to and that is
      forwarded to ``predict``.
    - cnn_model: CNN model handed to ``predict``.
    - rnn_model: RNN model handed to ``predict``.

    Returns:
    - caption_map: Dictionary mapping each predicted caption to the number of
      images that produced it.
    """
    caption_map = {}
    processed_count = 0

    for image_file in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, image_file)

        try:
            # The context manager closes the source image (and its file
            # handle) as soon as the RGB copy has been made.
            with Image.open(image_path) as source_image:
                image = source_image.convert("RGB")

            # unsqueeze(0) adds a leading (batch) dimension before the move to device.
            processed_image = process_image(image).unsqueeze(0).to(device)

            # Predict the caption
            predicted_caption = predict(cnn_model, rnn_model, processed_image, device=device)

            # Update the caption map
            caption_map[predicted_caption] = caption_map.get(predicted_caption, 0) + 1

            processed_count += 1
            print(f"Processed image {processed_count}: {image_file} -> Caption: {predicted_caption}")
        except Exception as e:
            # Deliberate best-effort: report the failure and continue with the next entry.
            print(f"Error processing image {image_file}: {e}")

    print(f"Total images processed: {processed_count}")
    return caption_map

# Example Run: caption every frame extracted earlier and tally the captions.
folder_path = "extracted_images_good"
caption_map = process_images_from_folder(folder_path, device, cnn_model, rnn_model)

print("Caption Map:", caption_map)