In [None]:
import os
import sys
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Make the project's `src/` directory (resolved against the notebook's working
# directory) importable. The membership check keeps sys.path free of duplicate
# entries when this cell is re-run.
current_dir = os.getcwd()
src_path = os.path.join(current_dir, 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

from tiktok_feature_extractor import TikTokFeatureExtractor
extractor = TikTokFeatureExtractor()

# Input/output locations used by the extraction cells further down.
video_folder = "data/tiktok_videos"
output_folder = "data/tiktok_frames"
csv_output = "data/video_features_results.csv"

# Raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
# kernel down and throws away all state, whereas an exception simply stops
# this cell (and "Run All") with a clear message.
if not os.path.isdir(video_folder):
    raise FileNotFoundError(f"Video folder '{video_folder}' not found!")
os.makedirs(output_folder, exist_ok=True)

# Sorted so the processing order is reproducible (os.listdir order is
# arbitrary); directories that merely end in ".mp4" are ignored.
video_files = sorted(
    name
    for name in os.listdir(video_folder)
    if name.lower().endswith('.mp4')
    and os.path.isfile(os.path.join(video_folder, name))
)
print(f"Found {len(video_files)} video files")

In [1]:
import sys, asyncio, nest_asyncio
sys.path.append('/app/src')
nest_asyncio.apply()

import src.main_batch as main_batch

sys.argv = [
    'main_batch',
    '--csv', '/app/data/creators.csv',
    '--batch-size', '3',
    '--concurrency', '3',
    '--limit-per-creator', '3',
    '-v'
]

await main_batch.main()

2025-08-21 04:06:21,359 [INFO] main_batch: 🚀 Starting TikTok creator batch analysis
2025-08-21 04:06:21,439 [INFO] main_batch: ✅ Config loaded: config/config.yml
2025-08-21 04:06:21,441 [INFO] progress_manager: 🆕 Initialized new progress store
2025-08-21 04:06:21,442 [INFO] batch_processor: ✅ Batch processor ready: batch_size=5, concurrency=2
2025-08-21 04:06:21,443 [INFO] main_batch: 📊 Processing creators list: /app/data/creators.csv
2025-08-21 04:06:21,446 [INFO] main_batch:    File size: 3.7 KB
2025-08-21 04:06:21,447 [INFO] main_batch: ⚙️ Runtime config:
2025-08-21 04:06:21,448 [INFO] main_batch:    batch_size: 3
2025-08-21 04:06:21,449 [INFO] main_batch:    concurrency: 3
2025-08-21 04:06:21,451 [INFO] main_batch:    delay_between_batches: 10 sec
2025-08-21 04:06:21,452 [INFO] main_batch:    max_videos_per_creator: 3
2025-08-21 04:06:21,464 [INFO] batch_processor: 📊 Loaded 269 creators from CSV (encoding=utf-8)
2025-08-21 04:06:21,487 [INFO] progress_manager: 🆕 Created batch batch


0: 640x384 1 person, 1 bottle, 1 apple, 18.1ms
1: 640x384 1 person, 2 bottles, 1 apple, 18.1ms
2: 640x384 1 person, 2 bottles, 1 apple, 18.1ms
3: 640x384 1 person, 2 bottles, 1 apple, 18.1ms
4: 640x384 2 persons, 2 bottles, 1 cup, 1 apple, 18.1ms
5: 640x384 1 person, 2 bottles, 1 apple, 18.1ms
6: 640x384 1 person, 3 bottles, 1 apple, 18.1ms
7: 640x384 3 bottles, 1 apple, 18.1ms
8: 640x384 1 person, 2 bottles, 1 apple, 18.1ms
9: 640x384 3 bottles, 18.1ms
10: 640x384 3 bottles, 18.1ms
11: 640x384 2 bottles, 18.1ms
12: 640x384 2 bottles, 18.1ms
13: 640x384 2 bottles, 18.1ms
14: 640x384 2 bottles, 18.1ms
15: 640x384 2 bottles, 18.1ms
Speed: 6.0ms preprocess, 18.1ms inference, 6.4ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:07:44,456 [INFO] frame_analyzer: Found 0 product frames, 16 non-product frames
2025-08-21 04:07:44,578 [INFO] frame_analyzer: Selecting representative frames for video: alexandrasfinds1_7540866098422353166
2025-08-21 04:07:45,579 [INFO] frame_analyzer: Selected 3 frames with CLIP.
2025-08-21 04:07:45,583 [INFO] frame_analyzer: Selected 3 representative frames for video: lalalakays_7540575930373246263
2025-08-21 04:07:45,653 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'lalalakays_7540575930373246263.wav', 'non_vocals.wav', 'representative_lalalakays_7540575930373246263_frame_0000.jpg', 'representative_lalalakays_7540575930373246263_frame_0001.jpg', 'representative_lalalakays_7540575930373246263_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:07:45,662 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:07:45,665 [INFO] MultimodalExtractor: Configuration: batch


0: 640x384 1 person, 2 bottles, 49.4ms
1: 640x384 1 person, 2 bottles, 49.4ms
2: 640x384 1 person, 2 bottles, 49.4ms
Speed: 1.8ms preprocess, 49.4ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:07:46,038 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 9
2025-08-21 04:07:46,041 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.



0: 640x384 1 person, 32.0ms
1: 640x384 1 person, 1 couch, 32.0ms
2: 640x384 1 person, 32.0ms
3: 640x384 1 person, 1 toothbrush, 32.0ms
4: 640x384 1 person, 32.0ms
Speed: 1.5ms preprocess, 32.0ms inference, 5.1ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:07:46,791 [INFO] frame_analyzer: Found 1 product frames, 4 non-product frames
2025-08-21 04:07:46,975 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:07:46,978 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:07:46,982 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:07:46,983 [INFO] MultimodalExtractor: Prompt length: 8694 characters
2025-08-21 04:07:46,985 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:07:46,987 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:07:46,989 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON response with the following unified structure:

{
  "video_description": "comprehensive description of the enti


0: 640x384 1 person, 1 toothbrush, 44.5ms
1: 640x384 1 person, 44.5ms
2: 640x384 1 person, 44.5ms
Speed: 4.9ms preprocess, 44.5ms inference, 5.7ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:07:47,972 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 4
2025-08-21 04:07:47,974 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:07:48,759 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:07:48,762 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:07:48,764 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:07:48,764 [INFO] MultimodalExtractor: Prompt length: 9038 characters
2025-08-21 04:07:48,765 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:07:48,766 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:07:48,768 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON response


0: 640x384 1 person, 1 bowl, 1 orange, 1 couch, 1 bed, 17.2ms
1: 640x384 1 person, 1 bowl, 1 teddy bear, 17.2ms
2: 640x384 2 persons, 1 bowl, 1 orange, 1 potted plant, 17.2ms
3: 640x384 1 person, 1 bowl, 1 banana, 1 orange, 1 cake, 1 tv, 1 teddy bear, 17.2ms
4: 640x384 2 persons, 1 bowl, 1 orange, 1 tv, 17.2ms
5: 640x384 1 person, 1 bowl, 1 orange, 1 cake, 1 couch, 1 potted plant, 17.2ms
6: 640x384 1 person, 17.2ms
7: 640x384 1 person, 17.2ms
8: 640x384 1 person, 17.2ms
9: 640x384 1 person, 1 bowl, 1 orange, 1 potted plant, 1 vase, 17.2ms
10: 640x384 1 person, 1 bowl, 17.2ms
11: 640x384 1 person, 1 bowl, 17.2ms
12: 640x384 1 person, 1 bowl, 17.2ms
13: 640x384 1 person, 1 bowl, 1 apple, 1 orange, 1 couch, 17.2ms
14: 640x384 1 person, 1 bowl, 2 oranges, 17.2ms
15: 640x384 1 person, 1 bowl, 1 orange, 1 cake, 17.2ms
Speed: 2.7ms preprocess, 17.2ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 1 bowl, 1 cake, 3.7ms
1: 640x384 1 person, 1 bowl, 2 ora

2025-08-21 04:07:54,365 [INFO] frame_analyzer: Found 2 product frames, 37 non-product frames
2025-08-21 04:07:55,469 [INFO] frame_analyzer: Selected 1 frames with CLIP.
2025-08-21 04:07:55,471 [INFO] frame_analyzer: Selected 3 representative frames for video: boise_brooke_7540861540593880334
2025-08-21 04:07:55,644 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'boise_brooke_7540861540593880334.wav', 'non_vocals.wav', 'representative_boise_brooke_7540861540593880334_frame_0000.jpg', 'representative_boise_brooke_7540861540593880334_frame_0001.jpg', 'representative_boise_brooke_7540861540593880334_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:07:55,652 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:07:55,654 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:07:55,655 [INFO] MultimodalExtractor: Using batch image up


0: 640x384 1 person, 1 banana, 1 cake, 1 tv, 39.1ms
1: 640x384 2 persons, 1 bowl, 39.1ms
2: 640x384 1 person, 1 bowl, 1 cake, 39.1ms
Speed: 1.6ms preprocess, 39.1ms inference, 5.1ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:07:55,988 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 10
2025-08-21 04:07:55,989 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:07:56,484 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:07:56,486 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:07:56,487 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:07:56,488 [INFO] MultimodalExtractor: Prompt length: 8691 characters
2025-08-21 04:07:56,488 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:07:56,489 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:07:56,490 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON respons


0: 640x384 1 person, 1 couch, 14.8ms
1: 640x384 1 person, 1 couch, 14.8ms
2: 640x384 1 person, 1 couch, 1 bed, 14.8ms
3: 640x384 1 person, 1 bed, 2 books, 14.8ms
4: 640x384 1 person, 1 couch, 2 books, 14.8ms
5: 640x384 1 person, 1 couch, 14.8ms
6: 640x384 1 person, 1 couch, 1 bed, 14.8ms
7: 640x384 1 person, 1 couch, 14.8ms
8: 640x384 1 person, 1 couch, 14.8ms
9: 640x384 1 person, 14.8ms
10: 640x384 1 person, 14.8ms
11: 640x384 (no detections), 14.8ms
12: 640x384 2 persons, 14.8ms
13: 640x384 1 person, 14.8ms
Speed: 3.0ms preprocess, 14.8ms inference, 4.4ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:19,316 [INFO] frame_analyzer: Found 2 product frames, 12 non-product frames
2025-08-21 04:08:19,725 [INFO] frame_analyzer: Selected 1 frames with CLIP.
2025-08-21 04:08:19,729 [INFO] frame_analyzer: Selected 3 representative frames for video: alexandrasfinds1_7540784657613294862
2025-08-21 04:08:19,930 [INFO] frame_analyzer: After cleanup, video_dir files: ['alexandrasfinds1_7540784657613294862.wav', 'audio_analysis_results.json', 'non_vocals.wav', 'representative_alexandrasfinds1_7540784657613294862_frame_0000.jpg', 'representative_alexandrasfinds1_7540784657613294862_frame_0001.jpg', 'representative_alexandrasfinds1_7540784657613294862_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:08:19,940 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:08:19,942 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:08:19,944 [INFO] MultimodalExtractor: 


0: 640x384 1 person, 1 couch, 2 books, 22.5ms
1: 640x384 1 person, 1 couch, 1 book, 22.5ms
2: 640x384 1 person, 1 handbag, 22.5ms
Speed: 2.5ms preprocess, 22.5ms inference, 4.8ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:20,259 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 6
2025-08-21 04:08:20,262 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:08:21,420 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:08:21,424 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:08:21,427 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:08:21,429 [INFO] MultimodalExtractor: Prompt length: 9114 characters
2025-08-21 04:08:21,431 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:08:21,433 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:08:21,436 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON response


0: 640x384 2 persons, 1 bottle, 4.9ms
1: 640x384 1 person, 1 cup, 1 book, 4.9ms
2: 640x384 2 persons, 1 bottle, 4.9ms
3: 640x384 2 persons, 1 bottle, 4.9ms
4: 640x384 2 persons, 1 bottle, 4.9ms
5: 640x384 1 person, 1 bottle, 4.9ms
6: 640x384 1 bottle, 4.9ms
7: 640x384 1 person, 1 bottle, 4.9ms
8: 640x384 1 bottle, 4.9ms
9: 640x384 1 person, 1 cup, 4.9ms
10: 640x384 1 person, 1 cup, 4.9ms
11: 640x384 1 person, 1 book, 4.9ms
12: 640x384 1 person, 1 book, 4.9ms
13: 640x384 1 cup, 1 book, 4.9ms
14: 640x384 1 person, 1 book, 4.9ms
15: 640x384 1 person, 1 bottle, 4.9ms
Speed: 2.0ms preprocess, 4.9ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 1 bottle, 1 cup, 30.0ms
1: 640x384 1 person, 30.0ms
2: 640x384 1 person, 1 book, 30.0ms
3: 640x384 1 person, 1 cup, 1 book, 30.0ms
4: 640x384 1 person, 1 cup, 30.0ms
5: 640x384 1 person, 1 cup, 1 book, 30.0ms
Speed: 1.8ms preprocess, 30.0ms inference, 6.0ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:24,621 [INFO] frame_analyzer: Found 8 product frames, 14 non-product frames
2025-08-21 04:08:25,007 [INFO] frame_analyzer: Selected 3 frames with CLIP.
2025-08-21 04:08:25,010 [INFO] frame_analyzer: Selected 3 representative frames for video: lalalakays_7540575509026065677
2025-08-21 04:08:25,105 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'lalalakays_7540575509026065677.wav', 'non_vocals.wav', 'representative_lalalakays_7540575509026065677_frame_0000.jpg', 'representative_lalalakays_7540575509026065677_frame_0001.jpg', 'representative_lalalakays_7540575509026065677_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:08:25,115 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:08:25,117 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:08:25,118 [INFO] MultimodalExtractor: Using batch image upload mode 


0: 640x384 1 person, 1 book, 19.5ms
1: 640x384 1 person, 1 book, 19.5ms
2: 640x384 1 person, 1 book, 19.5ms
Speed: 2.7ms preprocess, 19.5ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:25,452 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 6
2025-08-21 04:08:25,454 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:08:26,309 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:08:26,313 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:08:26,315 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:08:26,317 [INFO] MultimodalExtractor: Prompt length: 8680 characters
2025-08-21 04:08:26,319 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:08:26,321 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:08:26,323 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON response


0: 640x384 2 persons, 1 bottle, 1 potted plant, 49.5ms
1: 640x384 2 persons, 1 bottle, 1 potted plant, 1 cell phone, 49.5ms
2: 640x384 2 persons, 1 bottle, 1 potted plant, 49.5ms
Speed: 5.2ms preprocess, 49.5ms inference, 2.3ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:51,280 [INFO] frame_analyzer: Found 0 product frames, 3 non-product frames
2025-08-21 04:08:51,285 [INFO] frame_analyzer: Selected 3 representative frames for video: alexandrasfinds1_7540744304839445815
2025-08-21 04:08:51,465 [INFO] frame_analyzer: After cleanup, video_dir files: ['alexandrasfinds1_7540744304839445815.wav', 'audio_analysis_results.json', 'non_vocals.wav', 'representative_alexandrasfinds1_7540744304839445815_frame_0000.jpg', 'representative_alexandrasfinds1_7540744304839445815_frame_0001.jpg', 'representative_alexandrasfinds1_7540744304839445815_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:08:51,475 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:08:51,478 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:08:51,480 [INFO] MultimodalExtractor: Using batch image upload mode for GPT4O analysis
2025-08-21 04:08:51,482 [INF


0: 640x384 2 persons, 1 bottle, 1 potted plant, 13.3ms
1: 640x384 2 persons, 1 bottle, 1 potted plant, 13.3ms
2: 640x384 2 persons, 1 bottle, 1 potted plant, 1 cell phone, 13.3ms
Speed: 2.3ms preprocess, 13.3ms inference, 3.6ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:51,721 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 12
2025-08-21 04:08:51,723 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:08:52,538 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:08:52,541 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:08:52,543 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:08:52,544 [INFO] MultimodalExtractor: Prompt length: 9211 characters
2025-08-21 04:08:52,545 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:08:52,546 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:08:52,548 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON respons




2025-08-21 04:08:54,475 [INFO] frame_analyzer: Selecting representative frames for video: lalalakays_7540573285457644813


0: 640x384 1 person, 4.0ms
1: 640x384 1 person, 1 handbag, 4.0ms
2: 640x384 1 person, 4.0ms
3: 640x384 1 person, 1 cake, 4.0ms
4: 640x384 1 person, 4.0ms
5: 640x384 2 persons, 4.0ms
6: 640x384 1 person, 4.0ms
7: 640x384 1 cake, 4.0ms
8: 640x384 1 person, 1 cake, 4.0ms
9: 640x384 1 person, 4.0ms
10: 640x384 (no detections), 4.0ms
11: 640x384 1 person, 1 book, 4.0ms
12: 640x384 1 person, 1 umbrella, 4.0ms
13: 640x384 1 person, 4.0ms
14: 640x384 1 person, 1 couch, 4.0ms
15: 640x384 1 person, 1 bed, 4.0ms
Speed: 2.0ms preprocess, 4.0ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 4.4ms
1: 640x384 1 person, 4.4ms
2: 640x384 1 person, 4.4ms
3: 640x384 1 person, 4.4ms
4: 640x384 1 person, 4.4ms
5: 640x384 1 person, 4.4ms
6: 640x384 1 person, 1 couch, 4.4ms
7: 640x384 1 person, 4.4ms
8: 640x384 1 person, 4.4ms
9: 640x384 1 person, 4.4ms
10: 640x384 1 person, 1 couch, 4.4ms
11: 640x384 1 person, 4.4ms
12: 640x384 1 person, 4.4ms
13: 640x384 1 person, 4.

2025-08-21 04:08:57,053 [INFO] frame_analyzer: Found 11 product frames, 50 non-product frames
2025-08-21 04:08:57,448 [INFO] frame_analyzer: Selected 3 frames with CLIP.
2025-08-21 04:08:57,450 [INFO] frame_analyzer: Selected 3 representative frames for video: boise_brooke_7540833756756725006


0: 640x384 1 bottle, 8.8ms
1: 640x384 1 bottle, 1 toothbrush, 8.8ms
2: 640x384 1 bottle, 1 toothbrush, 8.8ms
3: 640x384 1 bottle, 1 toothbrush, 8.8ms
4: 640x384 2 bottles, 1 apple, 8.8ms
5: 640x384 2 bottles, 1 apple, 8.8ms
6: 640x384 2 bottles, 1 apple, 8.8ms
7: 640x384 1 bottle, 8.8ms
8: 640x384 1 bottle, 1 cell phone, 8.8ms
9: 640x384 1 bottle, 1 cell phone, 1 toothbrush, 8.8ms
10: 640x384 2 bottles, 8.8ms
11: 640x384 2 bottles, 8.8ms
Speed: 1.8ms preprocess, 8.8ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:57,605 [INFO] frame_analyzer: Found 15 product frames, 13 non-product frames
2025-08-21 04:08:57,673 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'boise_brooke_7540833756756725006.wav', 'non_vocals.wav', 'representative_boise_brooke_7540833756756725006_frame_0000.jpg', 'representative_boise_brooke_7540833756756725006_frame_0001.jpg', 'representative_boise_brooke_7540833756756725006_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:08:57,682 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:08:57,683 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:08:57,684 [INFO] MultimodalExtractor: Using batch image upload mode for GPT4O analysis
2025-08-21 04:08:57,685 [INFO] MultimodalExtractor: Compressing images to optimize token usage...
2025-08-21 04:08:57,690 [INFO] image_compressor: Original image: 1080x19


0: 640x384 1 person, 8.5ms
1: 640x384 1 person, 1 couch, 1 book, 8.5ms
2: 640x384 1 person, 8.5ms
Speed: 1.6ms preprocess, 8.5ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:57,897 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 5
2025-08-21 04:08:57,899 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:08:58,131 [INFO] frame_analyzer: Selected 3 frames with CLIP.
2025-08-21 04:08:58,134 [INFO] frame_analyzer: Selected 3 representative frames for video: lalalakays_7540573285457644813
2025-08-21 04:08:58,260 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'lalalakays_7540573285457644813.wav', 'non_vocals.wav', 'representative_lalalakays_7540573285457644813_frame_0000.jpg', 'representative_lalalakays_7540573285457644813_frame_0001.jpg', 'representative_lalalakays_7540573285457644813_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:08:58,271 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:08:58,273 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embed


0: 640x384 2 bottles, 15.2ms
1: 640x384 2 bottles, 15.2ms
2: 640x384 1 person, 1 bottle, 15.2ms
Speed: 1.6ms preprocess, 15.2ms inference, 3.8ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:08:57,845 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 6
2025-08-21 04:08:57,847 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:08:57,234 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:08:57,238 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:08:57,241 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:08:57,243 [INFO] MultimodalExtractor: Prompt length: 9305 characters
2025-08-21 04:08:57,245 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:08:57,247 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:08:57,249 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON response


0: 640x384 2 persons, 1 cell phone, 5.1ms
1: 640x384 2 persons, 1 bottle, 1 potted plant, 1 book, 5.1ms
2: 640x384 2 persons, 1 cup, 5.1ms
3: 640x384 1 person, 1 cup, 1 couch, 1 cell phone, 5.1ms
4: 640x384 1 person, 1 bottle, 2 books, 5.1ms
5: 640x384 1 person, 1 bottle, 1 book, 5.1ms
6: 640x384 1 person, 1 bottle, 1 cell phone, 1 book, 5.1ms
7: 640x384 1 person, 1 bottle, 1 book, 5.1ms
8: 640x384 2 persons, 1 bottle, 1 book, 5.1ms
9: 640x384 1 person, 1 bottle, 1 book, 5.1ms
10: 640x384 1 person, 1 bottle, 1 book, 5.1ms
11: 640x384 1 person, 1 bottle, 1 cup, 2 cell phones, 1 book, 5.1ms
12: 640x384 1 person, 1 bottle, 1 cell phone, 2 books, 5.1ms
13: 640x384 1 person, 1 bottle, 1 cell phone, 2 books, 5.1ms
14: 640x384 2 persons, 2 bottles, 2 cell phones, 3 books, 5.1ms
15: 640x384 2 persons, 1 bottle, 1 cell phone, 2 books, 5.1ms
Speed: 14.2ms preprocess, 5.1ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 1 bottle, 1 cell phone, 2 books, 3.

2025-08-21 04:10:22,344 [INFO] frame_analyzer: Found 20 product frames, 17 non-product frames
2025-08-21 04:10:22,997 [INFO] frame_analyzer: Selected 3 frames with CLIP.
2025-08-21 04:10:22,999 [INFO] frame_analyzer: Selected 3 representative frames for video: boise_brooke_7540799670956690702
2025-08-21 04:10:23,253 [INFO] frame_analyzer: After cleanup, video_dir files: ['audio_analysis_results.json', 'boise_brooke_7540799670956690702.wav', 'non_vocals.wav', 'representative_boise_brooke_7540799670956690702_frame_0000.jpg', 'representative_boise_brooke_7540799670956690702_frame_0001.jpg', 'representative_boise_brooke_7540799670956690702_frame_0002.jpg', 'speech_transcription.txt', 'vocals.wav']
2025-08-21 04:10:23,263 [INFO] TikTokFeatureExtractor: Multimodal feature extraction started (GPT4O enabled)
2025-08-21 04:10:23,264 [INFO] MultimodalExtractor: Configuration: batch_image_upload=True, embedding_analysis=False
2025-08-21 04:10:23,265 [INFO] MultimodalExtractor: Using batch image u


0: 640x384 1 person, 2 bottles, 1 book, 8.0ms
1: 640x384 2 persons, 1 bottle, 1 couch, 1 book, 8.0ms
2: 640x384 2 persons, 1 cup, 8.0ms
Speed: 1.4ms preprocess, 8.0ms inference, 2.1ms postprocess per image at shape (1, 3, 640, 384)


2025-08-21 04:10:23,490 [INFO] MultimodalExtractor: YOLO batch inference finished. Total objects: 9
2025-08-21 04:10:23,491 [INFO] MultimodalExtractor: Running BLIP batch captioning on 3 images.
2025-08-21 04:10:23,357 [INFO] MultimodalExtractor: BLIP batch captioning finished. Total captions: 3
2025-08-21 04:10:23,359 [INFO] MultimodalExtractor: GPT4O BATCH PROMPT MONITORING
2025-08-21 04:10:23,361 [INFO] MultimodalExtractor: Number of frames: 3
2025-08-21 04:10:23,362 [INFO] MultimodalExtractor: Prompt length: 12788 characters
2025-08-21 04:10:23,364 [INFO] MultimodalExtractor: Prompt content:
2025-08-21 04:10:23,365 [INFO] MultimodalExtractor: ----------------------------------------
2025-08-21 04:10:23,367 [INFO] MultimodalExtractor: You are a professional TikTok video content analysis expert with comprehensive knowledge of TikTok's content creator tag system. 
Analyze the provided video content (including frames, audio transcript, and metadata) and return ONLY a valid JSON respons

CancelledError: 

In [None]:
# Single-video run. Paths are derived from the notebook's config variables
# (video_folder / output_folder / csv_output) rather than repeated as string
# literals, so this call cannot drift from the folder-level run.
df = extractor.extract_features_from_single_video(
    video_path=f"{video_folder}/yasmalas.mp4",
    output_folder=output_folder,
    csv_output_path=csv_output
)

In [None]:
# Folder-level run: hand the configured input folder, frame output folder and
# CSV path to the extractor and keep whatever it returns in `df`.
folder_run_kwargs = {
    "video_folder": video_folder,
    "output_folder": output_folder,
    "csv_output_path": csv_output,
}
df = extractor.extract_features_from_folder(**folder_run_kwargs)

In [None]:
import json
import matplotlib.pyplot as plt
from glob import glob

RESULTS_ROOT = 'data/tiktok_frames/'


def load_step_timings(results_root):
    """Collect per-video step timings from saved analysis result files.

    Recursively searches ``results_root`` for ``audio_analysis_results.json``
    files and reads the ``metadata.step_timings`` mapping from each one.

    Args:
        results_root: Directory to search.

    Returns:
        A tuple ``(result_files, step_timings_list, video_names)``:
        ``result_files`` is the sorted list of JSON paths that were found,
        ``step_timings_list`` holds one timings dict per file that had a
        non-empty ``step_timings`` mapping, and ``video_names`` holds the name
        of the directory containing each of those files (same order).

    Files that cannot be read or parsed are reported and skipped; files whose
    metadata or step timings are missing, null or empty are skipped.
    """
    pattern = os.path.join(results_root, '**', 'audio_analysis_results.json')
    # glob order depends on the filesystem; sort for a reproducible row order.
    found_files = sorted(glob(pattern, recursive=True))

    timings_per_video = []
    names = []
    for path in found_files:
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
        except (OSError, ValueError) as err:
            # json.JSONDecodeError is a ValueError; one bad file (e.g. left
            # behind by an interrupted run) should not abort the whole report.
            print(f"Skipping unreadable result file '{path}': {err}")
            continue

        # Tolerate a null or missing 'metadata' / 'step_timings' entry.
        meta = payload.get('metadata') if isinstance(payload, dict) else None
        timings = meta.get('step_timings') if isinstance(meta, dict) else None
        if isinstance(timings, dict) and timings:
            timings_per_video.append(timings)
            names.append(os.path.basename(os.path.dirname(path)))

    return found_files, timings_per_video, names


result_files, step_timings_list, video_names = load_step_timings(RESULTS_ROOT)

if not step_timings_list:
    print('No step_timings found in results.')
else:
    df = pd.DataFrame(step_timings_list, index=video_names)
    print('Per-video step timings:')
    print(df)

    # Compute the averages once and reuse them for the printout and the plot.
    # numeric_only=True keeps a stray non-numeric timing entry from raising.
    average_timings = df.mean(numeric_only=True).sort_values(ascending=False)
    print('\nAverage step timings:')
    print(average_timings)

    # Plot on explicit axes rather than the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=(10, 5))
    average_timings.plot(kind='bar', ax=ax)
    ax.set_ylabel('Average Time (s)')
    ax.set_title('Average Step Timings Across Videos')
    fig.tight_layout()
    plt.show()