<a href="https://colab.research.google.com/github/superseoworld/llm/blob/main/Automated_TikTok_Video_Understanding_for_Social_Media_Strategy_(public).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## An OpenAI API key and an Apify API key are required to run this notebook. It has only been tested on an A100 GPU, which is only available through a Google Colab+ subscription. It may work on less powerful GPUs such as the V100, but will likely error out. To change the runtime hardware, go to Runtime --> Change runtime type and select the A100 GPU with High-RAM.

In [None]:
!pip install torch torchvision
!pip install transformers
!pip install opencv-python-headless
!pip install accelerate
!pip install sentencepiece
!pip install omegaconf
!pip install iopath
!pip install timm
!pip install decord
!pip install gradio
!pip install ftfy
!pip install GPUtil
!pip install pytorchvideo
!pip install einops
!pip install webdataset
!pip install moviepy
!pip install apify-client
!pip install spleeter
!pip install openai
!pip install typer
!pip install click==8.0.4
!pip install pydub
!pip install --upgrade httpx
!pip install apify-client
!pip install --upgrade httpx



In [None]:
!pip install spleeter

# You MUST now restart the runtime (Runtime --> Restart runtime in the menu, or Ctrl+M followed by `.`). This is required for the notebook to work from this point forward.

In [None]:
# Install git-lfs, then clone the MovieChat fork used by the rest of this notebook.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs
!git clone https://github.com/ktynski/MovieChat.git

# assumes the clone above ran with /content as the working directory — TODO confirm
%cd /content/MovieChat/
# Run the repository's apply_delta.py with the base-model and delta URLs below;
# the result is written to ckpt/Vicuna/7B.
!python apply_delta.py \
    --base-model-identifier "https://huggingface.co/dontito/llama-7b-hf-v0" \
    --target-model-path "ckpt/Vicuna/7B" \
    --delta-repo-url "https://huggingface.co/lmsys/vicuna-7b-delta-v0"

# Download the pretrained checkpoints into ckpt/.
!mkdir -p /content/MovieChat/ckpt/pretrained_ckpt/
!wget -O /content/MovieChat/ckpt/pretrained_ckpt/finetune-vicuna7b-v2.pth https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-Series/resolve/main/finetune-vicuna7b-v2.pth
# NOTE(review): the URL below is a Google Drive ".../view" page, not a direct
# download link, so wget will most likely save an HTML page under the .pth name.
# Verify the downloaded file (size/type) before relying on this checkpoint.
!wget -O ckpt/pretrained_minigpt4.pth https://drive.google.com/file/d/1a4zLvaiDBr-36pasffmgpvH5P7CKmpze/view

In [None]:
!pip install --upgrade httpx

In [None]:
%cd /content/MovieChat/

In [None]:
import pandas as pd
import concurrent.futures
import threading
from apify_client import ApifyClient
import time

import os  # local to this cell: its import list does not include os

APIFY_API_URL = 'https://api.apify.com/v2'
ACTOR_NAME = 'mscraper/tiktok-search-autocomplete'
# Prefer the APIFY_API_KEY environment variable so the real key never has to be
# saved inside the notebook; the literal placeholder is kept as a fallback for
# users who paste their key here directly.
APIFY_API_KEY = os.environ.get('APIFY_API_KEY', 'Your apify api key')

client = ApifyClient(APIFY_API_KEY)

# Create a semaphore with a maximum of 12 concurrent threads
semaphore = threading.Semaphore(12)
# Accumulates one row per autocomplete suggestion (filled in by get_autocomplete).
df = pd.DataFrame(columns=["search_term", "suggestion", "lang"])

# Serialises updates to the shared global `df`: recursive_search calls
# get_autocomplete from worker threads, and the read-modify-write below would
# otherwise be able to drop rows.
_df_lock = threading.Lock()


def get_autocomplete(search):
    """Fetch TikTok autocomplete suggestions for one search term.

    Runs the Apify actor named by ACTOR_NAME for ``search``, appends one row per
    returned item to the global ``df`` (columns ``search_term``, ``suggestion``,
    ``lang``) and returns the suggestion strings.

    Args:
        search: The search term to send to the actor.

    Returns:
        list: The ``content`` value of every dataset item. Empty when the run
        did not finish with status ``SUCCEEDED``.
    """
    global df

    run_input = {
        "proxy": {
            "useApifyProxy": True,
            "apifyProxyCountry": "US",
            "apifyProxyGroups": ["RESIDENTIAL"]
        },
        "query": [search]
    }

    # call() starts the actor and blocks until the run has finished, so the run
    # is already in a terminal state here. The previous polling loop only left
    # on SUCCEEDED and therefore spun forever on FAILED / ABORTED / TIMED-OUT.
    run = client.actor(ACTOR_NAME).call(run_input=run_input)
    status = run.get('status') if run else None
    if status != 'SUCCEEDED':
        print(f"Autocomplete run for {search!r} did not succeed (status: {status}); skipping.")
        return []

    rows = [
        {"search_term": search, "suggestion": item['content'], "lang": item['lang']}
        for item in client.dataset(run["defaultDatasetId"]).iterate_items()
    ]

    if rows:
        # DataFrame.append was removed in pandas 2.0; concatenate all rows once.
        with _df_lock:
            df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    return [row["suggestion"] for row in rows]

def recursive_search(search_term, level=1, max_level=3):
    """Expand a search term into autocomplete suggestions, recursively.

    Calls get_autocomplete for ``search_term`` and then repeats the search for
    every returned suggestion on a thread pool, until ``max_level`` is reached.
    Results are collected as a side effect of get_autocomplete.

    Args:
        search_term: Term to look up.
        level: Depth of this call (1 for the seed term).
        max_level: Deepest level that is still queried.

    Returns:
        None.
    """
    if level > max_level:
        return

    # Hold a permit only while the API call is in flight. The previous code
    # acquired and released the semaphore around executor.submit(), which
    # returns immediately, so it never limited anything. Guarding the call
    # itself bounds concurrent actor runs, and no permit is held while waiting
    # for child searches, so nested levels cannot deadlock on the semaphore.
    with semaphore:
        suggestions = get_autocomplete(search_term)

    # Children of the deepest level would return immediately; skip the pool.
    if not suggestions or level >= max_level:
        return

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(recursive_search, suggestion, level + 1, max_level)
            for suggestion in suggestions
        ]

        # result() re-raises any exception raised inside a child search.
        for future in concurrent.futures.as_completed(futures):
            future.result()

# Seed the search. With max_level=1 only the seed term itself is queried: the
# calls made for its suggestions start at level 2 and return immediately.
recursive_search('Camping', max_level=1)
# Keep one row per distinct suggestion.
df = df.drop_duplicates(subset='suggestion')

print(df)
# The DataFrame index is written too (to_csv default).
df.to_csv('/content/scraped_tiktok_searches.csv')

In [None]:
import pandas as pd
import concurrent.futures
import threading
from apify_client import ApifyClient

import os  # local to this cell: its import list does not include os

APIFY_API_URL = 'https://api.apify.com/v2'
ACTOR_NAME = 'clockworks/tiktok-scraper'
# Prefer the APIFY_API_KEY environment variable so the real key never has to be
# saved inside the notebook; the literal placeholder is kept as a fallback.
APIFY_API_KEY = os.environ.get('APIFY_API_KEY', 'your apify api key')

client = ApifyClient(APIFY_API_KEY)

# Create a semaphore with a maximum of 4 concurrent threads
semaphore = threading.Semaphore(4)

def run_actor(hashtag, resultsPerPage):
    """Scrape TikTok videos for one hashtag through the Apify actor.

    Args:
        hashtag: Hashtag to scrape.
        resultsPerPage: Value passed to the actor's ``resultsPerPage`` input.

    Returns:
        list: Every item of the run's default dataset, each with an extra
        ``hashtag`` key recording the hashtag that was used.
    """
    actor_run = client.actor(ACTOR_NAME).call(
        run_input={
            "hashtags": [hashtag],
            "resultsPerPage": resultsPerPage,
            "scrapeEmptyChannelInfo": False,
            "shouldDownloadVideos": True,
            "shouldDownloadCovers": False,
            "videoKvStoreIdOrName": "mytiktokvideos",
            "proxyConfiguration": {"useApifyProxy": True},
        }
    )

    def _tagged(record):
        # Annotate the item in place with the hashtag that produced it.
        record['hashtag'] = hashtag
        return record

    dataset_items = client.dataset(actor_run["defaultDatasetId"]).iterate_items()
    return [_tagged(record) for record in dataset_items]

def concurrent_runs(df, resultsPerPage):
    """Scrape every suggestion in ``df`` as a hashtag, four at a time.

    Args:
        df: DataFrame with a ``suggestion`` column; each value is used as a hashtag.
        resultsPerPage: Forwarded to run_actor for every hashtag.

    Returns:
        pandas.DataFrame: One row per scraped item across all hashtags, in
        completion order. Empty when ``df`` has no rows.

    Raises:
        Exception: Whatever run_actor raised for a hashtag (re-raised by
        ``future.result()``).
    """
    hashtags = df['suggestion'].tolist()
    all_results = []

    # max_workers=4 is what bounds concurrency. The semaphore that used to be
    # acquired and released around submit() was a no-op (submit() returns
    # immediately and only this thread touched it), so it has been removed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(run_actor, hashtag, resultsPerPage)
            for hashtag in hashtags
        ]

        for future in concurrent.futures.as_completed(futures):
            all_results.extend(future.result())

    return pd.DataFrame(all_results)

# Number of videos to request per hashtag, and how many suggestions to scrape
resultsPerPage = 20
final_df_length = 100

# Drop rows with NaN values
df = df.dropna()

# Or replace NaNs with a placeholder value
# (after the dropna above no NaNs remain, so this line changes nothing)
df = df.fillna('')

# Then run the function
df = df[:final_df_length]   # Limit the DataFrame to the desired length
new_df = concurrent_runs(df, resultsPerPage)

# Sort the DataFrame by the 'playCount' column in descending order.
# When the scrape returned nothing the frame has no columns at all, and
# sort_values would raise KeyError, so the sort is skipped in that case.
if 'playCount' in new_df.columns:
    new_df = new_df.sort_values(by='playCount', ascending=False)
else:
    print("No 'playCount' column in the scrape results; skipping the sort.")

new_df.to_csv('/content/tiktokscrape.csv')
print(new_df)

In [None]:
%cd /content/MovieChat/

In [None]:
import concurrent.futures
import openai
import requests
import pandas as pd
import os
import shutil
import spleeter
from spleeter.separator import Separator
from pydub import AudioSegment
import ast
import threading

# Reload the scrape results from disk so this cell can run without the scraper cell.
new_df = pd.read_csv('/content/tiktokscrape.csv')
df = new_df  # same object as new_df, not a copy

# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be saved inside the notebook; the literal placeholder is kept as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your openai api key")

# Create a separator with 2 stems (vocals and accompaniment)
separator = Separator('spleeter:2stems')
# download_and_transcribe holds this lock around every separator call.
separator_lock = threading.Lock()

# Create a directory for the videos if it doesn't exist
os.makedirs('TikToks', exist_ok=True)

def download_and_transcribe(row):
    """Download one scraped video, isolate its vocals and transcribe them.

    Steps: download the first URL in ``row['mediaUrls']`` to
    ``TikToks/<id>.mp4``, extract the audio with ffmpeg, split vocals from
    accompaniment with the shared spleeter ``separator``, convert the vocals to
    mp3 and send them to ``openai.Audio.translate`` with the ``whisper-1`` model.

    Args:
        row: A row of the scrape DataFrame; must provide ``id`` and
            ``mediaUrls`` (a string holding a Python list literal of URLs).

    Returns:
        tuple: ``(row['id'], transcript text)``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import subprocess  # local import: this cell's import list does not include it

    video_id = row['id']
    video_url = ast.literal_eval(row['mediaUrls'])[0]
    filename = f"TikToks/{video_id}.mp4"  # Include the folder path in the filename

    # Download the video. The timeout keeps a stalled connection from blocking
    # a worker thread indefinitely.
    with requests.get(video_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # Convert mp4 to mp3 for audio separation.
    # Argument list instead of a shell string, -y so a leftover mp3 from an
    # earlier run is overwritten instead of prompting, and check=True so an
    # ffmpeg failure surfaces here rather than as a missing file later.
    audio_filename = f"TikToks/{video_id}.mp3"  # Include the folder path in the audio filename
    subprocess.run(
        ['ffmpeg', '-y', '-i', filename, '-vn', '-ar', '44100', '-ac', '2',
         '-b:a', '192k', audio_filename],
        check=True,
    )

    # Separate the vocal from music (one thread at a time uses the separator)
    with separator_lock:
        separator.separate_to_file(audio_filename, f'TikToks/{video_id}')  # Include the folder path in the output path
    os.remove(audio_filename)

    # Convert wav to mp3
    vocal_filename = f"TikToks/{video_id}/vocals.mp3"  # Include the folder path in the filename
    audio = AudioSegment.from_wav(f"TikToks/{video_id}/{video_id}/vocals.wav")
    audio.export(vocal_filename, format="mp3")

    # Transcribe the vocal
    with open(vocal_filename, "rb") as vocal_file:
        transcript = openai.Audio.translate("whisper-1", vocal_file)

    # The downloaded video and the separated audio are intentionally kept on
    # disk; the video file is read again by the video-understanding cell.

    return (video_id, transcript["text"])


# Transcribe every scraped video on a pool of four worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each submitted task back to the index label of its DataFrame row.
    future_to_row = {
        executor.submit(download_and_transcribe, row): row_label
        for row_label, row in df.iterrows()
    }

    # Collect the results as they become available
    for future in concurrent.futures.as_completed(future_to_row):
        row_label = future_to_row[future]
        try:
            # The returned video id is not needed here. It is bound to a
            # private name so the builtin `id` is no longer shadowed at
            # notebook scope.
            _video_id, transcription = future.result()
            df.loc[row_label, 'transcription'] = transcription
        except Exception as exc:
            # Best effort: a failed video is reported and left untranscribed.
            print(f'Row {row_label} generated an exception: {exc}')

df.to_csv('/content/transcribed.csv')

In [None]:
%cd /content/MovieChat/

# The next cell does the heavy lifting of video understanding. It takes 2-3 minutes per video to run. I have tested it with up to 20 videos at a time; it may be able to handle more, so feel free to try by changing the limit (20) that is applied to the DataFrame right after `/content/transcribed.csv` is loaded near the end of the cell. That limit is the maximum number of scraped and transcribed videos to evaluate.

In [None]:
import gc
import argparse
import os
import random
import gc
import pandas as pd
import numpy as np
import torch
import torch.backends.cudnn as cudnn
from MovieChat.common.config import Config
from MovieChat.common.dist_utils import get_rank
from MovieChat.common.registry import registry
from MovieChat.conversation.conversation_video import Chat, Conversation, default_conversation,SeparatorStyle
import decord
import cv2
import time
import subprocess
from moviepy.editor import VideoFileClip
from decord import VideoReader
decord.bridge.set_bridge('torch')

#%%
# imports modules for registration
from MovieChat.datasets.builders import *
from MovieChat.models import *
from MovieChat.processors import *
from MovieChat.runners import *
from MovieChat.tasks import *
from moviepy.editor import*
import os
import random as rnd
from transformers import StoppingCriteria, StoppingCriteriaList
from PIL import Image
import GPUtil
import gradio as gr

# Default number of frames load_video samples from a video fragment.
MAX_INT = 8
# Number of fragments a video is divided into (see Chat.cal_frame and
# parse_video_fragment).
N_SAMPLES = 32
# Divides the per-fragment duration into per-frame steps in Chat.cal_frame.
SHORT_MEMORY_Length = 10
#%%
def parse_args():
    """Parse the command-line arguments of the MovieChat demo.

    Returns:
        argparse.Namespace: With attributes ``cfg_path``, ``gpu_id``,
        ``num_beams``, ``temperature``, ``text_query``, ``video_path``,
        ``fragment_video_path``, ``cur_sec``, ``cur_min``, ``middle_video`` and
        ``options``.
    """
    def _str2bool(value):
        # argparse's type=bool calls bool() on the raw string, so "False" (any
        # non-empty string) used to parse as True.
        lowered = str(value).strip().lower()
        if lowered in ("1", "true", "t", "yes", "y"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser(description="Demo")
    parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
    parser.add_argument("--gpu-id", type=int, default=0, help="specify the gpu to load the model.")
    parser.add_argument("--num-beams", type=int, default=1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--text-query", required=True, help="question the video")
    parser.add_argument("--video-path", required=True, help="path to video file.")
    parser.add_argument("--fragment-video-path", required=True, help="path to video fragment file.")
    # NOTE(review): the defaults (2 seconds / 15 minutes) look swapped compared
    # with the inline values cur_min = 2, cur_sec = 15 used later in this
    # notebook; left unchanged so existing invocations behave the same.
    parser.add_argument("--cur-sec", type=int, default=2, help="current second")
    parser.add_argument("--cur-min", type=int, default=15, help="current minute")
    parser.add_argument("--middle-video", type=_str2bool, default=False,
                        help="enable breakpoint (middle-of-video) mode")
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )
    args = parser.parse_args()
    return args


def setup_seeds(config_seed):
    """Seed Python, NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        config_seed: Base seed; the value returned by ``get_rank()`` is added to it.
    """
    seed = config_seed + get_rank()

    # Apply the same seed to each random number generator in turn.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn.benchmark, cudnn.deterministic = False, True

class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation when the output ends with one of the given token sequences."""

    def __init__(self, stops=None, encounters=1):
        """
        Args:
            stops: Sequence of 1-D token-id tensors; generation stops when the
                generated ids end with any of them. Defaults to no stop sequences.
            encounters: Accepted for compatibility; not used.
        """
        super().__init__()
        # A fresh list per instance instead of a shared mutable default argument.
        self.stops = stops if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when the first sequence in ``input_ids`` ends with a stop sequence.

        ``**kwargs`` is accepted so the criterion can be called with the extra
        keyword arguments that the ``StoppingCriteria.__call__`` signature allows.
        """
        generated = input_ids[0]
        for stop in self.stops:
            # Only compare when enough tokens exist; otherwise the tail slice
            # is shorter than the stop sequence and the shapes do not match.
            if len(stop) > len(generated):
                continue
            if torch.all((stop == generated[-len(stop):])).item():
                return True

        return False


def video_duration(filename):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filename: Path of the media file.

    Returns:
        float: The container duration in seconds.

    Raises:
        ValueError: If ffprobe exits with an error or prints no parsable duration.
    """
    # stderr is captured separately: merged into stdout, any ffprobe message
    # ended up inside the text handed to float() and hid the real cause.
    result = subprocess.run(["ffprobe", "-v", "error", "-show_entries",
                             "format=duration", "-of",
                             "default=noprint_wrappers=1:nokey=1", filename],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    if result.returncode != 0:
        detail = result.stderr.decode(errors="replace").strip()
        raise ValueError(f"ffprobe failed for {filename!r}: {detail}")
    return float(result.stdout)



def capture_video(video_path, fragment_video_path, per_video_length, n_stage):
    """Write stage ``n_stage`` of a video to its own file.

    The stage covers the seconds from ``n_stage * per_video_length`` to
    ``(n_stage + 1) * per_video_length``. Problems are reported with print()
    and the function then returns without writing anything.

    Args:
        video_path: Source video file.
        fragment_video_path: Destination file for the fragment.
        per_video_length: Length of one stage in seconds.
        n_stage: Zero-based stage index.

    Returns:
        None.
    """
    # Check if the video file exists
    if not os.path.exists(video_path):
        print(f"Video file not found: {video_path}")
        return

    start_time = n_stage * per_video_length
    end_time = (n_stage+1) * per_video_length

    video_clip = VideoFileClip(video_path)
    # The clip used to be left open on every path; always close it now.
    try:
        # Check the duration of the video
        if video_clip.duration < end_time:
            print(f"Video duration is less than the end time: {video_clip.duration} < {end_time}")
            return

        print(f"Start time: {start_time}, End time: {end_time}")  # Debug print

        # Handle exceptions during video processing
        try:
            video = CompositeVideoClip([video_clip.subclip(start_time,end_time)])
            print(f"Writing video fragment to: {fragment_video_path}")  # Debug print
            video.write_videofile(fragment_video_path)
            print("Video fragment saved.")  # Debug print
        except Exception as e:
            print(f"Error processing video: {e}")
    finally:
        video_clip.close()


def load_video(video_path, n_frms=MAX_INT, height=-1, width=-1, sampling="uniform", return_msg=False):
    """Sample frames from a video with decord and return them as a float tensor.

    Args:
        video_path: Path of the video file.
        n_frms: Maximum number of frames to sample (capped at the video length).
        height: Passed to ``VideoReader``.
        width: Passed to ``VideoReader``.
        sampling: ``"uniform"`` for evenly spaced frames, or ``"headtail"`` for
            ``n_frms // 2`` random frames from each half of the video.
        return_msg: When True, also return a sentence describing the sampled
            timestamps.

    Returns:
        The frames permuted to (C, T, H, W) as float, or ``(frames, msg)`` when
        ``return_msg`` is True.

    Raises:
        ValueError: If there are no frames to sample.
        NotImplementedError: For an unknown ``sampling`` value.
    """
    decord.bridge.set_bridge("torch")
    vr = VideoReader(uri=video_path, height=height, width=width)

    vlen = len(vr)
    start, end = 0, vlen

    n_frms = min(n_frms, vlen)
    if n_frms <= 0:
        # Previously an empty video surfaced as a ZeroDivisionError below.
        raise ValueError(f"No frames to sample from video: {video_path}")

    if sampling == "uniform":
        # Same positions as arange(start, end, vlen / n_frms), but linspace
        # fixes the count at n_frms, so float rounding of the step can never
        # produce an extra index equal to vlen.
        indices = np.linspace(start, end, num=n_frms, endpoint=False).astype(int).tolist()
    elif sampling == "headtail":
        indices_h = sorted(rnd.sample(range(vlen // 2), n_frms // 2))
        indices_t = sorted(rnd.sample(range(vlen // 2, vlen), n_frms // 2))
        indices = indices_h + indices_t
    else:
        raise NotImplementedError

    # get_batch -> T, H, W, C
    temp_frms = vr.get_batch(indices)
    tensor_frms = torch.from_numpy(temp_frms) if type(temp_frms) is not torch.Tensor else temp_frms
    frms = tensor_frms.permute(3, 0, 1, 2).float()  # (C, T, H, W)

    if not return_msg:
        return frms

    fps = float(vr.get_avg_fps())
    sec = ", ".join([str(round(f / fps, 1)) for f in indices])
    # " " should be added in the start and end
    msg = f"The video contains {len(indices)} frames sampled at {sec} seconds. "
    return frms, msg




def parse_video_fragment(video_path, video_length, n_stage, n_samples):
    """Cut fragment ``n_stage`` (of ``n_samples`` equal parts) out of a video.

    Args:
        video_path: Source video file.
        video_length: Duration of the source video in seconds.
        n_stage: Zero-based index of the fragment.
        n_samples: Number of equal fragments the video is divided into.

    Returns:
        str: Path of the written fragment,
        ``src/video_fragment/output_<n_stage>.mp4`` (relative to the current
        working directory), written without audio.
    """
    start_time = n_stage * video_length / n_samples
    end_time = (n_stage + 1) * video_length / n_samples
    # Ensure end_time does not exceed video_length
    end_time = min(end_time, video_length)
    print(f'Start time: {start_time}, End time: {end_time}')
    output_path = f'src/video_fragment/output_{n_stage}.mp4'
    # Make sure the destination folder exists before writing into it.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    source_clip = VideoFileClip(video_path)
    # This runs once per fragment per video; close the source clip each time
    # instead of leaving it open.
    try:
        source_clip.subclip(start_time, end_time).write_videofile(output_path, audio=False)
    finally:
        source_clip.close()
    return output_path

class Chat:
    """Question answering over a video with a MovieChat model.

    Holds the model, the video frame processor and an image processor, feeds
    video fragments into the model's memory and generates text answers.
    Defining this class replaces the ``Chat`` name imported at the top of the
    cell.
    """

    def __init__(self, model, vis_processor, device='cuda:0'):
        """
        Args:
            model: Model object providing the ``llama_tokenizer``,
                ``llama_model`` and ``encode_*`` members used below.
            vis_processor: Processor whose ``transform`` is applied to video frames.
            device: Torch device string for inputs.
        """
        self.device = device
        self.output_text = " "
        self.model = model
        self.vis_processor = vis_processor
        self.image_vis_processor = Blip2ImageEvalProcessor()
        stop_words_ids = [torch.tensor([835]).to(self.device),
                          torch.tensor([2277, 29937]).to(self.device)]  # '###' can be encoded in two different ways.
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

    def clear_long_memory(self):
        """Reset the model's long-memory buffer (call between videos)."""
        self.model.long_memory_buffer = []

    def get_context_emb(self, input_text, msg, img_list):
        """Build the input embeddings for one question.

        The prompt is split at ``<ImageHere>``, each text segment is embedded,
        and the embeddings in ``img_list`` are interleaved between them.

        Args:
            input_text: The question.
            msg: Not used.
            img_list: Video embeddings; its length must match the number of
                ``<ImageHere>`` placeholders in the prompt (one).

        Returns:
            The concatenated embeddings (joined along dim 1).
        """
        prompt_1 = "You are able to understand the visual content that the user provides.Follow the instructions carefully and explain your answers in detail.###Human: <Video><ImageHere></Video>"
        prompt_2 = input_text
        prompt_3 = "###Assistant:"

        prompt = prompt_1 + " " + prompt_2 + prompt_3

        prompt_segs = prompt.split('<ImageHere>')
        assert len(prompt_segs) == len(img_list) + 1, "Unmatched numbers of image placeholders and images."
        seg_tokens = [
            self.model.llama_tokenizer(
                seg, return_tensors="pt", add_special_tokens=i == 0).to(self.device).input_ids
            # only add bos to the first seg
            for i, seg in enumerate(prompt_segs)
        ]
        seg_embs = [self.model.llama_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]

        mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
        mixed_embs = torch.cat(mixed_embs, dim=1)
        return mixed_embs

    def gradio_answer(self, chatbot, chat_state):
        """Return a gradio update carrying the most recent answer text."""
        print(chat_state)
        # A leftover pdb.set_trace() used to sit here; in a notebook it blocks
        # the kernel waiting for debugger input, so it has been removed.
        return gr.update(value=self.output_text, interactive=False), None

    def answer(self, img_list, input_text, msg, max_new_tokens=300, num_beams=1, min_length=1, top_p=0.9,
            repetition_penalty=1.0, length_penalty=1, temperature=1.0, max_length=2000):
        """Generate an answer to ``input_text`` about the encoded video.

        Args:
            img_list: Video embeddings produced by upload_video_without_audio.
            input_text: The question.
            msg: Passed through to get_context_emb.
            max_new_tokens, num_beams, min_length, top_p, repetition_penalty,
            length_penalty, temperature: Forwarded to ``generate``.
            max_length: Context budget; older embeddings are cut off when the
                prompt plus ``max_new_tokens`` exceeds it.

        Returns:
            tuple: ``(answer text, generated token ids as a NumPy array)``.
        """
        embs = self.get_context_emb(input_text, msg, img_list)

        current_max_len = embs.shape[1] + max_new_tokens
        if current_max_len - max_length > 0:
            print('Warning: The number of tokens in current conversation exceeds the max length. '
                  'The model will not see the contexts outside the range.')
        begin_idx = max(0, current_max_len - max_length)

        embs = embs[:, begin_idx:]

        outputs = self.model.llama_model.generate(
            inputs_embeds=embs,
            max_new_tokens=max_new_tokens,
            stopping_criteria=self.stopping_criteria,
            num_beams=num_beams,
            do_sample=True,
            min_length=min_length,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            temperature=temperature,
        )

        output_token = outputs[0]
        # The length checks keep an output that is empty (or becomes empty
        # after stripping) from raising IndexError on output_token[0].
        if len(output_token) > 0 and output_token[0] == 0:  # the model might output a unknow token <unk> at the beginning. remove it
            output_token = output_token[1:]
        if len(output_token) > 0 and output_token[0] == 1:  # some users find that there is a start token <s> at the beginning. remove it
            output_token = output_token[1:]
        output_text = self.model.llama_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('###')[0]  # remove the stop sign '###'
        output_text = output_text.split('Assistant:')[-1].strip()
        return output_text, output_token.cpu().numpy()

    def cal_frame(self, video_length, cur_min, cur_sec, middle_video):
        """Work out how many fragments to encode and the frame offset in the last one.

        Args:
            video_length: Video duration in seconds.
            cur_min: Minute of the question position (used when ``middle_video``).
            cur_sec: Second of the question position (used when ``middle_video``).
            middle_video: Whether only the part up to the question position is used.

        Returns:
            tuple: ``(num_frames, cur_frame)``; ``cur_frame`` is 0 when
            ``middle_video`` is false.
        """
        per_frag_second = video_length / N_SAMPLES
        if middle_video:
            cur_seconds = cur_min * 60 + cur_sec
            num_frames = int(cur_seconds / per_frag_second)
            per_frame_second = per_frag_second/SHORT_MEMORY_Length
            cur_frame = int((cur_seconds-per_frag_second*num_frames)/per_frame_second)
            return num_frames, cur_frame
        else:
            cur_frame = 0
            num_frames = int(video_length / per_frag_second)
            return num_frames, cur_frame

    def _load_fragment(self, video_path, video_length, n_stage):
        """Cut fragment ``n_stage``, sample its frames and return ``(batched frames, msg)``."""
        fragment_path = parse_video_fragment(video_path=video_path, video_length=video_length,
                                             n_stage=n_stage, n_samples=N_SAMPLES)
        frames, msg = load_video(
            video_path=fragment_path,  # Use the actual video fragment path
            n_frms=MAX_INT,
            height=224,
            width=224,
            sampling="uniform", return_msg=True
        )
        frames = self.vis_processor.transform(frames)
        return frames.unsqueeze(0).to(self.device), msg

    def upload_video_without_audio(self, video_path, fragment_video_path, cur_min, cur_sec, cur_image, img_list, middle_video):
        """Encode a video into the model's memory and append its embedding to ``img_list``.

        Args:
            video_path: Path of the video file (must be a string).
            fragment_video_path: Not used; fragments are written by parse_video_fragment.
            cur_min: Minute of the question position.
            cur_sec: Second of the question position.
            cur_image: Encoded frame passed to ``encode_long_video``.
            img_list: List that receives the video embedding.
            middle_video: Whether breakpoint mode is active.

        Returns:
            str: The message returned by load_video for the last fragment.

        Raises:
            NotImplementedError: If ``video_path`` is not a string.
        """
        msg = ""
        if isinstance(video_path, str):  # is a video path
            print(video_path)
            video_length = video_duration(video_path)
            num_frames, cur_frame = self.cal_frame(video_length, cur_min, cur_sec, middle_video)
            if num_frames == 0:
                video_fragment, msg = self._load_fragment(video_path, video_length, 0)
                self.model.encode_short_memory_frame(video_fragment, cur_frame)
            else:
                for i in range(num_frames):
                    print(i)
                    video_fragment, msg = self._load_fragment(video_path, video_length, i)

                    if middle_video:
                        self.model.encode_short_memory_frame(video_fragment, cur_frame)
                    else:
                        self.model.encode_short_memory_frame(video_fragment)
        else:
            raise NotImplementedError
        video_emb, _ = self.model.encode_long_video(cur_image, middle_video)
        img_list.append(video_emb)
        return msg

    def gener_infer(self, video_path, text_inputs, num_beams, temperature, libraries, minute, second):
        """Answer every question in ``text_inputs`` about one video.

        Each answer is printed and stored in ``self.output_text`` (so the last
        answer remains there). Nothing happens when ``libraries`` is None.

        Args:
            video_path: Path of the video file.
            text_inputs: Iterable of questions.
            num_beams: Forwarded to answer.
            temperature: Forwarded to answer.
            libraries: Mode selection; ``libraries[0] == "Breakpoint mode"``
                enables breakpoint mode at ``minute``/``second``.
            minute: Minute of the question position, or None for 0.
            second: Second of the question position, or None for 0.

        Raises:
            RuntimeError: If no frame can be read from the video.
        """
        fragment_video_path = "src/video_fragment/output.mp4"
        cur_min = minute if minute is not None else int(0)
        cur_sec = second if second is not None else int(0)

        if libraries is not None:
            cap = cv2.VideoCapture(video_path)
            # The capture used to be left open; release it once the frame is read.
            try:
                if libraries[0] == "Breakpoint mode":
                    fps_video = cap.get(cv2.CAP_PROP_FPS)
                    self.model.middle_video = True
                    self.model.question_minute = minute
                    self.model.question_second = second
                    # Use the None-safe cur_min / cur_sec: the raw minute /
                    # second may be None, which made this product a TypeError.
                    cur_fps = fps_video * (60*cur_min + cur_sec)
                else:
                    cur_fps = 0
                    self.model.middle_video = False

                cap.set(cv2.CAP_PROP_POS_FRAMES, cur_fps)
                ret, frame = cap.read()
            finally:
                cap.release()
            if not ret:
                raise RuntimeError(f"Could not read a frame from {video_path}")

            temp_frame_path = 'src/output_frame/snapshot.jpg'
            os.makedirs(os.path.dirname(temp_frame_path), exist_ok=True)
            cv2.imwrite(temp_frame_path, frame)
            raw_image = Image.open(temp_frame_path).convert('RGB')
            image = self.image_vis_processor(raw_image).unsqueeze(0).unsqueeze(2).to(self.device) # [1,3,1,224,224]
            cur_image = self.model.encode_image(image)

            img_list = []
            msg = self.upload_video_without_audio(
                video_path=video_path,
                fragment_video_path=fragment_video_path,
                cur_min=cur_min,
                cur_sec=cur_sec,
                cur_image = cur_image,
                img_list=img_list,
                middle_video = self.model.middle_video,
                )

            for text_input in text_inputs:
                llm_message = self.answer(img_list=img_list,
                                        input_text=text_input,
                                        msg = msg,
                                        num_beams=num_beams,
                                        temperature=temperature,
                                        max_new_tokens=300,
                                        max_length=2000)[0]

                self.output_text = llm_message
                print(self.output_text)


# Define the variable values inline here
# Of these, the rest of the visible notebook reads only middle_video (passed to
# process_video by the evaluation loop) and num_beams / temperature (read as
# globals inside process_video). video_path, output_directory, cur_min and
# cur_sec are not read again in the visible code; process_video computes its
# own cur_min / cur_sec.
video_path = "/content/MovieChat/src/examples/kt2.mp4"
output_directory = "/content/MovieChat/src/outputs/"
cur_min = 2
cur_sec = 15
middle_video = False
num_beams = 1
temperature = 1.0

# Define your list of questions
# NOTE: the evaluation loop at the end of this cell passes `questions_list`
# (plural, defined further down), not this `question_list`.
question_list = [
    "How old is the woman?",
    "What is she doing?",
    # Add more questions as needed
]

class DummyArgs:
    """Stand-in for the parsed command-line arguments handed to ``Config``.

    Attributes are ``cfg_path`` (path of the configuration file) and
    ``options`` (list of ``key=value`` override strings).
    """

    def __init__(self, cfg_path, options):
        # Store both values unchanged.
        self.cfg_path, self.options = cfg_path, options

# Arguments for Config: the evaluation config file plus one "key=value" override.
args = DummyArgs(cfg_path="/content/MovieChat/eval_configs/MovieChat.yaml", options=["model.frozen_llama_proj=False"])

# Create configuration
config_seed = 42
setup_seeds(config_seed)  # seed the RNGs before the model is built
cfg = Config(args)

# Create model and chat
model_config = cfg.model_cfg
model_config.device_8bit = 0  # Specify the GPU ID
# Look up the model class registered under the configured architecture name and
# build it from the config on the first GPU.
model_cls = registry.get_model_class(model_config.arch)
model = model_cls.from_config(model_config).to('cuda:0')
# The video frame processor is taken from the webvid dataset's "train" processor config.
vis_processor_cfg = cfg.datasets_cfg.webvid.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
# `Chat` here is the class defined earlier in this cell, which replaces the
# imported name of the same spelling.
chat = Chat(model, vis_processor, device='cuda:0')
import gc
import pandas as pd

# Define your list of questions
# Every video is asked all of these, in order; answer N is stored next to
# question N in the "Question N" / "Answer N" columns.
# NOTE: there are 14 questions, and the final evaluation cell iterates
# range(14); keep the two in sync when adding or removing questions.
questions_list = [
    "What is happening in this video? Please be detailed.",
    "What is unique about this video? Please be detailed.",
    "Who are the main characters or subjects in the video, and what is their relevance?",
    "What emotions are being portrayed visually in the video, and how do these contribute to its appeal?",
    "How does the video's pacing, editing, and visual effects contribute to its engagement?",
    "What are the key visual elements (e.g., colors, backgrounds, effects) that make this video stand out?",
    "Is there any specific visual cultural or trending context that makes the video particularly relevant?",
    "What demographic is likely to find this video visually appealing, and why?",
    "Does the video visually encourage viewer interaction (e.g., through text overlays, visual cues) and how does it do so?",
    "How does the video visually connect with or reference other popular media or trends?",
    "What visual marketing or promotional techniques are used in the video to increase its visibility?",
    "Are there any controversial or provocative visual elements in the video, and how might they influence its popularity?",
    "How does the video's length and visual structure contribute to retaining the viewer's attention?",
    "Is there a clear visual call-to-action or message that encourages further engagement or sharing?",
]


def process_video(video_path, question_list, chat, middle_video):
    """Encode one video and answer every question about it.

    The frame at the midpoint of the video is saved as a snapshot and encoded
    as the current image, the video is loaded into the model's memory, and
    each question is then answered in turn. Generation uses the module-level
    ``num_beams`` and ``temperature`` values.

    Args:
        video_path: Path of the video file.
        question_list: Questions to ask, in order.
        chat: A ``Chat`` instance.
        middle_video: Breakpoint mode is enabled when this equals 1 (or True).

    Returns:
        list: One answer per question, in the same order. A question whose
        generation raised IndexError gets an empty string, so that answer i
        always belongs to question i. An empty list is returned when the video
        cannot be opened or read.
    """
    # Load video frame and encode image
    cap = cv2.VideoCapture(video_path)
    # The capture used to be left open; release it on every path.
    try:
        if not cap.isOpened():
            print("Error: Video capture could not be opened.")
            return []
        fps_video = cap.get(cv2.CAP_PROP_FPS)
        total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if fps_video <= 0:
            # Avoids a ZeroDivisionError when the container reports no frame rate.
            print("Error: Video reports a non-positive FPS.")
            return []
        video_duration_sec = total_frames / fps_video
        # Position of the snapshot: the midpoint of the video, in whole seconds.
        cur_min = int(video_duration_sec / 2) // 60
        cur_sec = int(video_duration_sec / 2) % 60
        cur_fps = fps_video * (60*cur_min + cur_sec)  # frame index of that position
        print(f"Video FPS: {fps_video}")
        print(f"Total frames: {total_frames}")
        print(f"Video duration: {video_duration_sec} seconds")
        print(f"Current minute: {cur_min}")
        print(f"Current second: {cur_sec}")
        print(f"Current FPS: {cur_fps}")
        cap.set(cv2.CAP_PROP_POS_FRAMES, cur_fps)
        ret, frame = cap.read()
    finally:
        cap.release()
    print(f"Frame read successfully: {ret}")
    if not ret:
        print("Error: Video frame could not be read.")
        return []
    temp_frame_path = 'src/output_frame/snapshot.jpg'
    # Make sure the snapshot folder exists before writing into it.
    os.makedirs(os.path.dirname(temp_frame_path), exist_ok=True)
    cv2.imwrite(temp_frame_path, frame)
    raw_image = Image.open(temp_frame_path).convert('RGB')
    image = chat.image_vis_processor(raw_image).unsqueeze(0).unsqueeze(2).to(chat.device)
    cur_image = chat.model.encode_image(image)

    # Upload video frames without audio
    img_list = []
    middle_video = True if middle_video == 1 else False
    msg = chat.upload_video_without_audio(
        video_path=video_path,
        fragment_video_path='fragment_video_path',
        cur_min=cur_min,
        cur_sec=cur_sec,
        cur_image=cur_image,
        img_list=img_list,
        middle_video=middle_video
    )

    # Generate inference for each question in the list
    answers = []
    for text_input in question_list:
        try:
            llm_message = chat.answer(
                img_list=img_list,
                input_text=text_input,
                msg=msg,
                num_beams=num_beams,
                temperature=temperature,
                max_new_tokens=300,
                max_length=2000
            )[0]
        except IndexError as e:
            print(f"Warning: Skipping question due to an IndexError: {e}")
            # Keep a placeholder. Dropping the entry shifted every later answer
            # onto the wrong question, because callers pair answers with
            # questions by position.
            llm_message = ""
        answers.append(llm_message)

    return answers



# Maximum number of transcribed videos to evaluate (roughly 2-3 minutes each).
MAX_VIDEOS = 20

df = pd.read_csv('/content/transcribed.csv')
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the frame that was just read.
df = df[:MAX_VIDEOS].copy()

# Iterate over the rows of the DataFrame
for index, row in df.iterrows():
    # Get the video filename for this row
    video_filename = f"/content/MovieChat/TikToks/{row['id']}.mp4"
    print(video_filename)

    # Check if the video file exists and is not corrupt
    if os.path.exists(video_filename):
        cap = cv2.VideoCapture(video_filename)
        readable = cap.isOpened()
        cap.release()  # only used for the check; process_video opens its own capture
        if readable:
            try:
                # Run your script for this video and get the answers to the questions
                answers = process_video(video_filename, questions_list, chat, middle_video)
                print(f"Answers:{answers}")

                for i, answer in enumerate(answers):
                    df.loc[index, f'Question {i+1}'] = questions_list[i]
                    df.loc[index, f'Answer {i+1}'] = answer
            except Exception as exc:
                # Best effort, like the transcription cell: report the failure
                # and continue, so one bad video does not discard the results
                # already collected.
                print(f"Video file {video_filename} could not be processed: {exc}")
            finally:
                # Clear the long memory buffer before processing the next video
                chat.clear_long_memory()
        else:
            print(f"Video file {video_filename} is corrupt.")
    else:
        print(f"Video file {video_filename} does not exist.")

    gc.collect()

# Print the final DataFrame
print(df)
df.to_csv("/content/Evaluated_Videos_Data.csv")

In [None]:
import pandas as pd
import openai
from IPython.display import display, Markdown


import os  # local to this cell: its import list does not include os

# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be saved inside the notebook; the literal placeholder is kept as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Your openai api key")
# Load the per-video questions and answers written by the evaluation cell.
df = pd.read_csv("/content/Evaluated_Videos_Data.csv")
def evaluate_dataframe(csv_file):
    """Ask the chat model for a per-video evaluation and store it in an 'Evaluation' column.

    Parameters
    ----------
    csv_file : pandas.DataFrame or str or path-like
        Either an already-loaded DataFrame (modified in place and returned)
        or the path of a CSV file, which is read with ``pd.read_csv``.
        Each row must provide a 'transcription' value. 'Question N' /
        'Answer N' column pairs, numbered consecutively from 1, are added to
        the prompt when present and not missing.

    Returns
    -------
    pandas.DataFrame
        The frame with an 'Evaluation' column holding the stripped model reply
        for each row, or None for rows whose response contained no choices.
    """
    # Use the argument that was actually passed instead of a module-level
    # global; accept both a loaded frame and a CSV path.
    if isinstance(csv_file, pd.DataFrame):
        frame = csv_file
    else:
        frame = pd.read_csv(csv_file)

    system_prompt = """You are a superhuman expert in viral marketing on TikTok, with unparalleled insights into content creation and audience engagement.
                      Analyze the provided TikTok video transcripts, description, and metadata with precision and depth, focusing on:

                        1. **Introduction**:
                          - Objectives: A deep understanding of the factors that led this video to being successful and popular.
                          - Videos Overview: Summarize the key aspects of the videos.
                          - Scope and Methodology: Detail the approach and limitations.

                        2. **Emotional Landscape**:
                          - Emotional Spectrum: Identify and map the range of emotions.
                          - Emotional Journey: Track the evolution of emotions.
                          - Resonance and Impact: Assess how emotions resonate with the audience.
                          - Questions: What techniques are used to evoke specific emotions?

                        3. **Content and Thematic Depth**:
                          - Themes and Symbols: Uncover underlying meanings.
                          - Genre and Style: Classify and analyze the stylistic choices.
                          - Storytelling Techniques: Explore narrative structures and devices.
                          - Questions: How are central conflicts or tensions expressed?

                        4. **Visual and Auditory Mastery**:
                          - Visual Language: Analyze visual cues, composition, and effects.
                          - Sound Design: Examine the use of music, sound effects, and silence.
                          - Synchronization: Evaluate the harmony between visuals and audio.
                          - Questions: What visual or auditory motifs recur?

                        5. **Engagement Mechanics**:
                          - Engagement Blueprint: Analyze how content promotes interaction.
                          - Community Dynamics: Understand community formation and interaction.
                          - Trend Mapping: Recognize trends and patterns in user engagement.
                          - Questions: How does the content align with TikTok's unique culture?

                        6. **Competitive Analysis** (Optional):
                          - Benchmarking: Compare against similar viral content.
                          - Differentiation: Identify unique selling points.
                          - Questions: What sets this content apart from others in the same niche?

                        7. **Actionable Strategy**:
                          - Tailored Recommendations: Craft strategies for various content types.
                          - Risk and Opportunities: Assess potential pitfalls and gains.
                          - Tactical Roadmap: Provide a step-by-step implementation guide.

                        8. **Future Forecast**:
                          - Trend Prediction: Analyze potential future trends in TikTok content.
                          - Innovation Scouting: Identify opportunities for groundbreaking content.

                        9. **Conclusion and Reflection**:
                          - Synthesis: Bring together the key findings.
                          - Strategic Implications: Connect insights to broader goals.

                      Your analysis must be insightful, innovative, and provide a roadmap for creating content that resonates with TikTok's diverse and dynamic audience."""

    # Function to evaluate a single row
    def evaluate_row(row):
        # Prepare the data for the chat model: the transcription first, then
        # every question/answer pair that is actually present in this row.
        batch_data = [f"Transcription: {row['transcription']}"]

        # Walk the consecutively numbered pairs instead of assuming a fixed
        # count, so frames with fewer question columns do not raise KeyError.
        pair_number = 1
        while f'Question {pair_number}' in row.index and f'Answer {pair_number}' in row.index:
            question = row[f'Question {pair_number}']
            answer = row[f'Answer {pair_number}']
            # Missing cells (e.g. videos that were never processed) would
            # otherwise reach the model as the literal text "nan".
            if not (pd.isna(question) or pd.isna(answer)):
                batch_data.append(
                    f"Question {pair_number}: {question}\nAnswer {pair_number}: {answer}"
                )
            pair_number += 1

        # Call the chat model
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=[
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": "Here are the TikTok transcriptions, questions and answers, and associated data for analysis: \n\n" + "\n\n".join(batch_data)
                }
            ],
            max_tokens=500,
            n=1,
            stop=None,
            temperature=0.7
        )

        # Extract the evaluation text from the first choice, if there is one
        if response is not None and 'choices' in response and len(response['choices']) > 0:
            evaluation = response['choices'][0]['message']['content'].strip()
            print(evaluation)
            return evaluation
        else:
            return None

    # Evaluate each row exactly once; building the list explicitly also keeps
    # the column assignment well-defined for an empty frame.
    frame['Evaluation'] = [evaluate_row(row) for _, row in frame.iterrows()]

    # Return the DataFrame with the evaluations
    return frame

def truncate_text(text, max_length=25000):
    """Return `text` cut down to its first `max_length` items when it is longer than that.

    Input that is already within the limit is returned unchanged.
    """
    return text[:max_length] if len(text) > max_length else text

def generate_final_report(df):
    """Synthesize the per-video evaluations in `df` into one Markdown report.

    Each row contributes its 'id', 'Evaluation', 'shareCount', 'playCount'
    and 'commentCount' values. The joined text is passed through
    `truncate_text` and sent to the chat model. Returns the stripped reply of
    the first choice, or None when the response contains no choices.
    """
    def describe_video(row):
        # One text block per video: id, evaluation and engagement counters
        return f"Video ID: {row['id']}\nEvaluation: {row['Evaluation']}\nShare Count: {row['shareCount']}\nPlay Count: {row['playCount']}\nComment Count: {row['commentCount']}"

    per_video_blocks = df.apply(describe_video, axis=1).tolist()
    # NOTE(review): the truncated evaluations plus the 2500-token reply budget
    # may still exceed the context window of the base "gpt-4" model - confirm.
    evaluations = truncate_text("\n\n".join(per_video_blocks))

    system_prompt = """You are a superhuman expert in viral marketing on TikTok, with unparalleled insights into content creation and audience engagement.
                Your task now is to synthesize all the evaluations into a final report that provides a comprehensive analysis of the TikTok videos,
                 identifies key trends and insights, and offers actionable recommendations for creating viral content on TikTok. You do not provide generic advice, your advice
                 is grounded in the learnings from the data provided, and you include specific references and examples whenever possible.
                 Your report should be in beautiful markdown and structured as follows:

                  1. Executive Summary: Provide a brief overview of the key findings and recommendations.
                  2. Detailed Analysis: Discuss the evaluations in depth, highlighting important trends, insights, and examples.
                  3. Recommendations: Based on the analysis, provide actionable recommendations for creating viral content on TikTok.
                  4. Conclusion: Summarize the report and discuss the potential impact of the recommendations."""

    user_prompt = f"""Please provide an incredibly thorough, highly accurate, extremely useful report in valid markdown style.
                Make use of any/all markdown style elements including titles, headings, subheadings, bullets and lists, lines, bold, italics, blockquotes, etc. you can to make it as readable and beautiful as possible.
                Here are the evaluations for the final report: \n\n{evaluations}"""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Ask the chat model for the report
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=conversation,
        max_tokens=2500,
        n=1,
        stop=None,
        temperature=0.4
    )

    # No usable reply -> signal that to the caller with None
    if response is None or 'choices' not in response or len(response['choices']) == 0:
        return None

    return response['choices'][0]['message']['content'].strip()

# Evaluate the DataFrame (adds the per-video 'Evaluation' column)
evaluated_df = evaluate_dataframe(df)

# Generate the final report
final_report = generate_final_report(evaluated_df)

# generate_final_report returns None when the response contains no choices.
# Stop with a clear message here rather than hitting a TypeError in
# f.write(None) after the output file has already been truncated.
if final_report is None:
    raise RuntimeError("The final report could not be generated: the model response contained no choices.")

# Save the final report as a Markdown file. UTF-8 is set explicitly because
# the default encoding is locale-dependent and the report may contain
# non-ASCII characters.
with open('/content/final_report.md', 'w', encoding='utf-8') as f:
    f.write(final_report)

# Display the final report in the notebook
display(Markdown(final_report))


In [None]:
# Write the report to a relative path as well (resolved against the notebook's
# current working directory). UTF-8 is set explicitly because the default
# encoding is locale-dependent and the report may contain non-ASCII characters.
with open('final_report.md', 'w', encoding='utf-8') as f:
    f.write(final_report)