In [1]:
# git clone <repository-url>   (clone the AIC-2024 repository; the URL is not recorded in this notebook)

In [2]:
# cd AIC-2024/ai

In [3]:
# pip install -r requirements.txt

In [4]:
import os

In [5]:
# Dataset identifiers consumed by the processing loop at the end of the
# notebook (one full pipeline pass per entry). 'a' and 'b' are the values
# currently configured.
list_data = ['a', 'b']

In [6]:
# Root of the working/input tree (ROOT_DIR) and of the persistent results
# (OUTPUT_DIR). The defaults are the original machine-specific locations;
# set the AIC_ROOT_DIR / AIC_OUTPUT_DIR environment variables to relocate
# them without editing the notebook.
# NOTE: the processing loop at the end of the notebook calls
# remove_all_in_folder(ROOT_DIR), so ROOT_DIR must be a scratch directory.
ROOT_DIR = os.environ.get("AIC_ROOT_DIR", r"C:\Users\hokha\OneDrive\Desktop\test_download")
OUTPUT_DIR = os.environ.get("AIC_OUTPUT_DIR", r"C:\Users\hokha\OneDrive\Desktop\test")

In [7]:
# ---------------------------------------------------------------------------
# Folders under ROOT_DIR (the tree that the processing loop empties after
# every dataset).
# ---------------------------------------------------------------------------
VIDEO_FOLDER = os.path.join(ROOT_DIR, 'video')
FRAME_FOLDER = os.path.join(ROOT_DIR, 'frames')
KEYFRAME_FOLDER = os.path.join(ROOT_DIR, 'keyframes')
EMBEDDING_FOLDER = os.path.join(ROOT_DIR, 'embeddings')
DE_FOLDER = os.path.join(ROOT_DIR, 'descriptions')
# The next two are not referenced by any cell visible in this notebook.
MODEL_FOLDER = os.path.join(ROOT_DIR, 'models')
OCR_FOLDER = os.path.join(ROOT_DIR, 'ocr')

# ---------------------------------------------------------------------------
# Folders under OUTPUT_DIR.
# ---------------------------------------------------------------------------
CSV_FOLDER = os.path.join(OUTPUT_DIR, 'csv')
DE_EMBEDDING_FOLDER = os.path.join(OUTPUT_DIR, 'de_embeddings')
OBJECT_FOLDER = os.path.join(OUTPUT_DIR, 'objects')
# The next two are not referenced by any cell visible in this notebook.
OCR_EMBEDDING_FOLDER = os.path.join(OUTPUT_DIR, 'ocr_embeddings')
FINAL_EMBEDDING_FOLDER = os.path.join(OUTPUT_DIR, 'final_embeddings')

# ---------------------------------------------------------------------------
# CLIP settings: bundled into the (name, pretrained, tokenizer, device) tuples
# built in keyframe_and_embedding() and setup_data().
# ---------------------------------------------------------------------------
MODELS_CLIP_NAME = 'ViT-B-32'
PRETRAINED_CLIP = 'datacomp_xl_s13b_b90k'
TOKENIZER_CLIP = 'ViT-B-32'
BATCH_CLIP_SIZE = 512  # passed as batch_size to keyframe_and_embedding()

# ---------------------------------------------------------------------------
# Keyframe stage: passed by setup_data() as threshold / width / height /
# num_processes.
# ---------------------------------------------------------------------------
THRESHOLD_KEYFRAME = 0.9
FRAME_WIDTH = 640
FRAME_HEIGHT = 480
KEYFRAME_PROCESS = 2

# ---------------------------------------------------------------------------
# Description stage: model/processor identifiers and the two batch sizes
# handed to description_embedding().
# ---------------------------------------------------------------------------
DE_MODEL = "Salesforce/blip-image-captioning-base"
DE_PROCESSOR = "Salesforce/blip-image-captioning-base"
DE_BATCH_SIZE = 128
DE_EMBEDDING_BATCH_SIZE = 128

# ---------------------------------------------------------------------------
# Object detection stage: handed to object_detection() as models / batch_size
# / model_dir.
# NOTE(review): MODEL_YOLO_ROOT is an absolute POSIX-style path, unlike the
# ROOT_DIR/OUTPUT_DIR-relative folders above — confirm it is intended.
# ---------------------------------------------------------------------------
MODEL_YOLOV8 = 'yolov8m.pt'
BATCH_SIZE_YOLO = 64
MODEL_YOLO_ROOT = '/data/models'

In [8]:
import torch

# Show the installed PyTorch version string so the run environment is
# recorded alongside the notebook output.
print(torch.__version__)

2.4.0+cu124


In [9]:
from scripts.utils import *
from scripts.extract_frame import *
from scripts.image_to_text import *
from scripts.models import *
from scripts.description_embedding import *

import os, shutil
from object_detection import generate_output_json
from timeit import default_timer as timer

def keyframe_and_embedding(frames_folder, video_folder, keyframe_folder, embedding_folder, csv_folder, threshold=1e-3, width=1024, height=1024, batch_size=32, num_processes=8):
    """Create the stage's folders and hand the work to ``extract_video``.

    Args:
        frames_folder: Folder passed to ``extract_video`` as its first
            argument; created if missing.
        video_folder: Folder passed to ``extract_video`` as its second
            argument. It is NOT created here.
        keyframe_folder: Created if missing, then passed to ``extract_video``.
        embedding_folder: Created if missing, then passed to ``extract_video``.
        csv_folder: Created if missing, then passed to ``extract_video``.
        threshold, width, height, batch_size, num_processes: Forwarded
            unchanged to ``extract_video`` as keyword arguments.

    Returns:
        None.
    """
    print("Loading CLIP model...")
    # No model object is constructed in this function: only the settings
    # tuple (name, pretrained tag, tokenizer name, device) is assembled and
    # passed on to extract_video.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedding = (MODELS_CLIP_NAME, PRETRAINED_CLIP, TOKENIZER_CLIP, device)
    print("Model loaded.")

    # exist_ok=True replaces the "if not exists: makedirs" pattern, which can
    # raise FileExistsError when the folder appears between check and create.
    for folder in (keyframe_folder, embedding_folder, csv_folder, frames_folder):
        os.makedirs(folder, exist_ok=True)

    print("Extracting video...")
    extract_video(frames_folder, video_folder, keyframe_folder, embedding_folder, embedding, csv_folder, threshold=threshold, width=width, height=height, batch_size=batch_size, num_processes=num_processes)
    print("Done.")

def object_detection(frame_folder, object_folder, models = 'yolov8m.pt', batch_size = 64, model_dir='data/models_yolo'):
    """Ensure ``object_folder`` exists and run ``generate_output_json``.

    Args:
        frame_folder: Passed to ``generate_output_json`` as its first argument.
        object_folder: Created if missing, then passed as the second argument.
        models: Forwarded as the ``model_path`` keyword.
        batch_size: Forwarded as the ``batch_size`` keyword.
        model_dir: Forwarded as the ``model_dir`` keyword.

    Returns:
        None.
    """
    # exist_ok=True avoids the check-then-create race of
    # "if not os.path.exists(...): os.makedirs(...)".
    os.makedirs(object_folder, exist_ok=True)

    print("Generating output JSON Object Detection...")
    generate_output_json(frame_folder, object_folder, model_path=models, batch_size=batch_size, model_dir=model_dir)
    print("Done.")

def description_embedding(keyframe_folder, description_folder, embedding_folder, description_model, embedding_model, description_batch_size=32, embedding_batch_size=32):
    """Run the description step, then the description-embedding step.

    Args:
        keyframe_folder: Passed to ``extract_description_folder`` as its
            first argument.
        description_folder: Created if missing; second argument of
            ``extract_description_folder`` and first argument of
            ``embedding_description_folder``.
        embedding_folder: Created if missing; second argument of
            ``embedding_description_folder``.
        description_model: Passed through to ``extract_description_folder``
            (``setup_data`` supplies a ``(model, processor, device)`` tuple).
        embedding_model: Passed through to ``embedding_description_folder``
            (``setup_data`` supplies a ``(name, pretrained, tokenizer,
            device)`` tuple).
        description_batch_size: ``batch_size`` for the description step.
        embedding_batch_size: ``batch_size`` for the embedding step.

    Returns:
        None.
    """
    # exist_ok=True avoids the check-then-create race of
    # "if not os.path.exists(...): os.makedirs(...)".
    for folder in (embedding_folder, description_folder):
        os.makedirs(folder, exist_ok=True)

    print("Extracting description...")
    extract_description_folder(keyframe_folder, description_folder, description_model, batch_size=description_batch_size)
    print("Done.")

    print("Embedding description...")
    embedding_description_folder(description_folder, embedding_folder, embedding_model, batch_size=embedding_batch_size)
    print("Done.")


def setup_data():
    """Run the keyframe, description and object-detection stages in order.

    Every setting is read from the module-level configuration constants.
    The elapsed time of each stage and of the whole run is printed.
    """
    pipeline_started = timer()

    # Stage 1: keyframes and their embeddings.
    stage_started = timer()
    keyframe_and_embedding(
        FRAME_FOLDER,
        VIDEO_FOLDER,
        KEYFRAME_FOLDER,
        EMBEDDING_FOLDER,
        CSV_FOLDER,
        threshold=THRESHOLD_KEYFRAME,
        width=FRAME_WIDTH,
        height=FRAME_HEIGHT,
        batch_size=BATCH_CLIP_SIZE,
        num_processes=KEYFRAME_PROCESS,
    )
    print("Time to extract keyframes and embeddings: ", timer()-stage_started)

    # Stage 2: descriptions of the keyframes and the embeddings of those
    # descriptions. Both model tuples share the same device string.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    captioning_spec = (DE_MODEL, DE_PROCESSOR, device)
    text_embedding_spec = (MODELS_CLIP_NAME, PRETRAINED_CLIP, TOKENIZER_CLIP, device)
    stage_started = timer()
    description_embedding(
        KEYFRAME_FOLDER,
        DE_FOLDER,
        DE_EMBEDDING_FOLDER,
        captioning_spec,
        text_embedding_spec,
        description_batch_size=DE_BATCH_SIZE,
        embedding_batch_size=DE_EMBEDDING_BATCH_SIZE,
    )
    print("Time to extract description and embeddings: ", timer()-stage_started)

    # Stage 3: object detection, fed with the keyframe folder.
    stage_started = timer()
    object_detection(
        KEYFRAME_FOLDER,
        OBJECT_FOLDER,
        models=MODEL_YOLOV8,
        batch_size=BATCH_SIZE_YOLO,
        model_dir=MODEL_YOLO_ROOT,
    )
    print("Time to extract objects: ", timer()-stage_started)

    print("Total time: ", timer()-pipeline_started)
    print("Done.")


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
import requests

def download_file(url, filename, output_dir, timeout=60, chunk_size=8192):
    """Stream ``url`` to ``output_dir/filename``.

    Args:
        url: Address to fetch with ``requests.get``.
        filename: Name of the file to create inside ``output_dir``.
        output_dir: Destination directory; created if it does not exist.
        timeout: Timeout handed to ``requests.get``. Without one the request
            can block indefinitely on an unresponsive server.
        chunk_size: Number of bytes requested per ``iter_content`` chunk.

    Returns:
        The path of the downloaded file.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
        Any exception raised while streaming or writing is re-raised after
        the partially written file has been deleted.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, filename)
    # Write to a sibling ".part" file and rename on success, so a failed
    # download never leaves a truncated file at the final path.
    partial_path = path + '.part'
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        try:
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
            os.replace(partial_path, path)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    return path

def unzip_file(file_path, output_dir):
    """Extract every member of the zip at ``file_path`` into ``output_dir``
    and delete the archive afterwards.

    The archive is removed only when extraction finished without raising.
    """
    from zipfile import ZipFile

    archive = ZipFile(file_path, mode='r')
    try:
        archive.extractall(path=output_dir)
    finally:
        # Always release the file handle, also when extraction fails.
        archive.close()

    os.remove(file_path)

def download_and_unzip(url, filename, output_dir):
    """Download ``url`` as ``output_dir/filename``, then unpack that archive
    into ``output_dir`` via ``unzip_file``."""
    download_file(url, filename, output_dir)
    archive_path = os.path.join(output_dir, filename)
    unzip_file(archive_path, output_dir)

In [11]:
def remove_all_in_folder(folder):
    """Delete every entry inside ``folder`` while keeping ``folder`` itself.

    Real sub-directories are removed recursively. Everything else (regular
    files, symbolic links of any kind, including dangling ones) is unlinked,
    so a link's target is never touched.

    Args:
        folder: Directory whose contents are removed. Must exist.

    Raises:
        OSError: If ``folder`` cannot be listed or an entry cannot be removed.
    """
    for name in os.listdir(folder):
        entry_path = os.path.join(folder, name)
        # os.path.isdir follows symlinks, but shutil.rmtree refuses to operate
        # on a symlink; links must therefore be unlinked, not rmtree'd.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

In [12]:
# Driver loop: for every entry of list_data, run the whole pipeline via
# setup_data() and then empty ROOT_DIR.
# NOTE(review): the download call below is commented out, yet the
# "Downloading data"/"downloaded" messages are still printed. Nothing in
# this loop repopulates ROOT_DIR, while remove_all_in_folder(ROOT_DIR)
# deletes every entry in it at the end of each pass — so any iteration after
# the first presumably starts from an emptied ROOT_DIR. Confirm this is
# intended before running on real data.
for data in list_data:
    print(f"Downloading data {data}")
    # download_and_unzip(f"https://storage.googleapis.com/quickgelu/{data}.zip", f"{data}.zip", ROOT_DIR)
    print(f"Data {data} downloaded")

    print("Setting up data...")
    setup_data()
    print("Data setup done")

    # Destructive: removes all files and sub-folders directly under ROOT_DIR.
    print(f"Removing data {data}")
    remove_all_in_folder(ROOT_DIR)
    print(f"Data {data} removed")


Downloading data a
Data a downloaded
Setting up data...
Loading CLIP model...
Model loaded.
Extracting video...
