In [1]:
import os
import dotenv
dotenv.load_dotenv()
import pandas as pd
import yt_dlp
from openai import OpenAI
from pydub import AudioSegment
from tqdm import tqdm
# Register tqdm with pandas; later cells call DataFrame.progress_apply.
tqdm.pandas()

# Lift pandas' display limits (None = no limit) so displayed frames are not
# truncated in either rows or columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Name of the current project; it selects the sub-folder used under downloads/
PROJECT = "market-signals"

# Make sure downloads/<PROJECT> exists (nothing happens when it is already there)
folder_path = os.path.join("downloads", PROJECT)
os.makedirs(name=folder_path, exist_ok=True)

# Download Video

In [None]:
# Load the scraped metadata and derive the file name each video is saved under.
# `id` is read as text on purpose: the ids are long integers, and a column that
# also contains blanks would be parsed as float64, where casting back to int
# silently alters ids above 2**53 (so file names would no longer match).
video_metadata = (
    pd.read_csv(f"results/{PROJECT}/video_metadata.csv", dtype={"id": str})
    .dropna(subset=["id"])
    .assign(
        # Strip whitespace and a trailing ".0" (present when the CSV was written
        # from a float column) so every id is a plain digit string.
        id=lambda df: df["id"].str.strip().str.replace(r"\.0+$", "", regex=True),
        video_filename=lambda df: df["id"] + ".mp4",
    )
)

# Filter out videos that have been downloaded
downloaded_videos = [f for f in os.listdir(f"downloads/{PROJECT}/") if f.endswith('.mp4')]
# 1 when the file is already on disk, 0 otherwise
video_metadata["downloaded"] = video_metadata["video_filename"].isin(downloaded_videos).astype(int)
filtered_video_metadata = video_metadata[video_metadata["downloaded"] == 0].reset_index(drop=True)
filtered_video_metadata.head()

In [None]:
def download_video(row):
    """Download one TikTok video with yt-dlp into the project's downloads folder.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``webVideoUrl`` (the link to download) and ``video_filename``
            (the file name to save under ``downloads/<PROJECT>/``).

    Returns:
        None. Any exception raised while downloading is caught and reported
        with ``print`` instead of being propagated.
    """
    source_url = row["webVideoUrl"]
    destination = f"downloads/{PROJECT}/{row['video_filename']}"

    # "outtmpl" fixes the saved file name; "format" asks for the best quality available.
    options = dict(outtmpl=destination, format="best")

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([source_url])
    except Exception as err:
        print(f"An error occurred downloading {source_url}:", str(err))

# Download each video that is not on disk yet, one row at a time, with a progress bar.
# download_video returns None, so the displayed result carries no information.
filtered_video_metadata.progress_apply(download_video, axis=1)

# Perform Audio Transcription

In [None]:
# Shared OpenAI client used by transcribe_videos. No credentials are passed here;
# presumably they come from the environment populated by dotenv.load_dotenv() — confirm.
openai_client = OpenAI()

def optimize_audio_file(input_file_path, output_file_path):
    """Re-encode a media file's audio as 16 kHz mono WAV.

    Args:
        input_file_path: Path of the media file to read with pydub.
        output_file_path: Path the WAV data is written to.

    Returns:
        None.
    """
    source_audio = AudioSegment.from_file(input_file_path)

    # Lower the sample rate first, then mix down to a single channel.
    resampled = source_audio.set_frame_rate(16000)
    mono = resampled.set_channels(1)

    mono.export(output_file_path, format="wav")

def _transcribe_file(file_path):
    """Send one file to the ``whisper-1`` transcription endpoint and return the text response."""
    with open(file_path, "rb") as audio_file:
        return openai_client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text"
        )


def transcribe_videos(row):
    """Transcribe one downloaded video, retrying with re-encoded audio if it is too large.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``video_filename``, the file name under ``downloads/<PROJECT>/``.

    Returns:
        The transcription returned by the API, or None when the file does not
        exist or transcription fails. Failures other than a missing file are
        reported with ``print``; no exception is propagated, so one bad file
        cannot abort a whole ``progress_apply`` run.
    """
    input_file_path = f"downloads/{PROJECT}/{row['video_filename']}"
    # optimize_audio_file exports WAV data, so name the copy .wav instead of
    # reusing the .mp4 file name.
    stem = os.path.splitext(row["video_filename"])[0]
    optimized_file_path = f"downloads/{PROJECT}/optimized_{stem}.wav"

    try:
        return _transcribe_file(input_file_path)

    except FileNotFoundError:
        # Video was never downloaded; nothing to transcribe.
        return None

    except Exception as e:
        # Not every exception carries a status_code (e.g. connection errors),
        # so read it defensively instead of assuming the attribute exists.
        if getattr(e, "status_code", None) != 413:
            print(f"Error encountered when transcribing {row['video_filename']}: {e}")
            return None

    # HTTP 413: the upload was rejected as too large. Re-encode and retry once.
    print(f"Error: File {row['video_filename']} is too large to process. Optimizing the audio file...")
    try:
        # Re-encoding is inside the try so a decode failure is reported, not raised.
        optimize_audio_file(input_file_path, optimized_file_path)
        return _transcribe_file(optimized_file_path)
    except Exception as e:
        if getattr(e, "status_code", None) == 413:
            print(f"Error: File {optimized_file_path} is still too large after optimisation: {e}")
        else:
            print(f"Error encountered when transcribing optimized file {optimized_file_path}: {e}")
        return None

# Transcribe every row of the metadata (progress bar included) and keep the
# result alongside the other columns.
transcripts = video_metadata.progress_apply(transcribe_videos, axis=1)
video_metadata["video_transcription"] = transcripts

# Persist the enriched metadata next to the input CSV, without the index column.
transcript_csv_path = f"results/{PROJECT}/video_metadata_with_transcript.csv"
video_metadata.to_csv(transcript_csv_path, index=False)
