# Install packages


In [19]:
# Use the %pip magic rather than !pip: %pip installs into the environment of
# the running kernel, whereas !pip runs whichever pip is first on the shell PATH.
# NOTE(review): versions are unpinned (-U always pulls the latest release);
# pin them once a known-good combination is established.
%pip install -q -U yt-dlp
%pip install -q -U google-genai

# Import libraries


In [6]:
import yt_dlp
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import re
from google import genai
from google.genai import types

# Show all columns
pd.set_option('display.max_columns', None)

In [None]:
# Output locations, relative to the notebook's working directory.
AUDIO_FOLDER = "audio"            # downloaded .wav files
TRANSCRIPT_FOLDER = "transcripts"  # one JSON response per video

# Make sure both folders exist. exist_ok=True keeps the cell safe to re-run
# and avoids the race between a separate exists() check and makedirs().
for _folder in (AUDIO_FOLDER, TRANSCRIPT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# Các hàm tiện ích


In [None]:
# Instruction text sent to the model together with each audio clip (it is the
# first element of `contents` in process_audio). It asks for a Vietnamese
# transcript, three takeaways and two boolean flags, returned as one JSON object.
# NOTE(review): the "return None" instruction for audio without speech does not
# say how that fits the JSON schema below — confirm what the model returns.
prompt = """
Generate a transcript of the speech. The speech is in Vietnamese. If there is no speech in the file, return None.
Then generate 3 takeaways from the speech. The takeaways should be concise and informative, written in Vietnamese.
Check if the speech contains calls to action (CTA) sentences.
Check if the speech contains elements of curiosity gap.

Return the results in JSON format with fields:
{
    "transcript": "The transcript of the speech",
    "takeaways": ["Takeaway 1", "Takeaway 2", "Takeaway 3"],
    "has_call_to_action": true/false,
    "has_curiosity_gap": true/false
}
"""

In [None]:
def download_youtube_audio(url: str, video_id: str) -> str:
    """Download the audio of ``url`` as ``AUDIO_FOLDER/<video_id>.wav``.

    The URL is handed to yt-dlp with an FFmpegExtractAudio post-processor
    configured for WAV output. A file that already exists at the target path
    is reused without downloading again.

    Args:
        url: Address of the video page passed to yt-dlp.
        video_id: Identifier used as the file name (without extension).

    Returns:
        The path of the WAV file, or None when the download raises or the
        file cannot be found afterwards.
    """
    wav_path: str = AUDIO_FOLDER + f"/{video_id}.wav"

    # Reuse an earlier download instead of fetching the clip again
    if os.path.exists(wav_path):
        print(f"Audio file already exists: {wav_path}")
        return wav_path

    print(f"Downloading audio from YouTube: {url}")
    extract_wav = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_wav],
        'outtmpl': wav_path,
        'keepvideo': True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            downloader.download([url])
        except Exception as err:
            print(f"Error downloading audio: {err}")
            return None

    # The post-processor may leave the result as "<name>.wav.wav"; if so,
    # move it back to the expected name
    doubled_ext_path = wav_path + ".wav"
    if os.path.exists(doubled_ext_path):
        os.rename(doubled_ext_path, wav_path)

    if not os.path.exists(wav_path):
        print(f"Error: File {wav_path} not found after download.")
        return None

    print(f"Audio download completed. File saved at: {wav_path}")
    print(
        f"File size: {os.path.getsize(wav_path) / 1024 / 1024:.2f} MB")
    return wav_path


def process_audio(wav_file: str, client: genai.Client) -> str:
    """Send one audio file plus the module-level ``prompt`` to the model.

    Args:
        wav_file: Path of the WAV file; its bytes are uploaded inline.
        client: google-genai client used to issue the request.

    Returns:
        The response text with a surrounding Markdown code fence removed, or
        None when the request fails or the response carries no text. The text
        is not parsed here, so it is not guaranteed to be valid JSON.
    """
    # Read the raw audio bytes that are attached to the request
    with open(wav_file, 'rb') as f:
        audio_bytes = f.read()

    try:
        # Call the API to generate content
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                prompt,
                types.Part.from_bytes(
                    data=audio_bytes,
                    mime_type='audio/wav',
                )
            ]
        )

        # A response without text would otherwise surface as an opaque
        # TypeError from re.sub; report it explicitly instead.
        raw_text = response.text
        if not raw_text:
            print(f"Error processing audio file {wav_file}: empty response")
            return None

        # Remove the Markdown code fence around the payload. Tolerates
        # surrounding whitespace, a bare ``` fence without the "json" tag,
        # any letter case of the tag, and CRLF line endings.
        json_text: str = re.sub(
            r'^```(?:json)?\s*\n|\s*```$',
            '',
            raw_text.strip(),
            flags=re.IGNORECASE,
        )

        return json_text

    except Exception as e:
        print(f"Error processing audio file {wav_file}: {e}")
        return None


def save_response(video_id: str, json_text: str) -> bool:
    """Write the model response to ``TRANSCRIPT_FOLDER/<video_id>.json``.

    Args:
        video_id: Identifier used as the file name (without extension).
        json_text: Text to store; it is written as-is, without validation.

    Returns:
        True when the file was written, False when writing failed.
    """
    # Define the file path for the target JSON file
    output_path: str = TRANSCRIPT_FOLDER + f"/{video_id}.json"

    try:
        # Explicit UTF-8: the transcripts are Vietnamese, and the platform
        # default encoding (e.g. cp1252 on Windows) cannot represent them.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json_text)
    except OSError as e:
        # Report the failure through the bool result so that one bad write
        # does not abort a long batch run.
        print(f"Error: could not write file {output_path}: {e}")
        return False

    print(f"Transcript saved to file: {output_path}")
    return True

# Đọc dữ liệu vào dataframe


In [None]:
# Load the video metadata table from the parquet file in the working directory
video_df = pd.read_parquet("top_viewed_videos.parquet")

# Disabled: sort by "statsV2.playCount" in descending order, then reset the
# index so that it starts from 0.
# NOTE(review): the main loop looks rows up with .loc using 0-based integers,
# which assumes the stored index is already 0..n-1 — confirm.
# video_df = video_df.sort_values(
#     by="statsV2.playCount",
#     ascending=False
# ).reset_index(drop=True)

# Print column names, dtypes and non-null counts as a sanity check
video_df.info()

# Lấy danh sách các video đã được xử lý (đã được transcribe)


In [24]:
# Collect the IDs of videos that were already transcribed: one ID per line in
# the text file, or an empty set when the file does not exist yet
ids_file = "transcribed_video_ids.txt"
transcribed_video_ids = set()
if os.path.exists(ids_file):
    with open(ids_file, "r") as fh:
        transcribed_video_ids.update(fh.read().splitlines())

In [25]:
# Report how many video IDs were loaded from the text file
n_transcribed = len(transcribed_video_ids)
print("Number of videos transcribed:", n_transcribed)

Number of videos transcribed: 4449


# Chuẩn bị xử lý dữ liệu


In [40]:
# # Calculate the number of file in transcript folder
# transcript_files = os.listdir(TRANSCRIPT_FOLDER)
# start_index = len(transcript_files)

# # Print the start index
# print(f"Bắt đầu từ index: {start_index}")

In [None]:
# Row window of video_df processed by this run. Change START_ROW / END_ROW to
# pick another slice; the slice is clipped to the number of rows available.
#
# History of the 1,400-row shards used by earlier runs (kept for reference;
# they used to be live assignments that were each immediately overwritten):
#   [0:1400]        vmphat.24      [28000:29400]  vmphat.24
#   [1400:2800]     pvminh         [29400:30800]  pvminh
#   [2800:4200]     ngocquynh      [30800:32200]  ngocquynh
#   [4200:5600]                    [32200:33600]  tdtkiet
#   [5600:7000]                    [33600:35000]  franievo
#   [7000:8400]                    [35000:36400]  denieltran
#   [8400:9800]                    [36400:37800]  tulindao
#   [9800:11200]    vmphat.24      [37800:39200]  martincung
#   [11200:12600]   vmphat.24      [39200:40600]  vmpha21
#                                  [40600:42000]  khdludteam5
START_ROW = 0
END_ROW = 20_000

video_id_range = range(video_df.shape[0])[START_ROW:END_ROW]
video_id_range

# Danh sách các API để chạy luân phiên


In [None]:
# API keys used in rotation by the main loop.
#
# The keys are read from the environment instead of being stored in the
# notebook, so they are not leaked when the notebook is shared or committed.
# Set GEMINI_API_KEYS to a comma-separated list before starting the kernel:
#     export GEMINI_API_KEYS="first-key,second-key"
# NOTE(review): the keys that were previously hard-coded in this cell have
# been exposed and should be revoked and replaced.
api_list = [
    key.strip()
    for key in os.environ.get("GEMINI_API_KEYS", "").split(",")
    if key.strip()
]

# Fail here rather than in the middle of the batch: with no keys the rotation
# below would divide by zero / index an empty list.
if not api_list:
    raise RuntimeError(
        "No API keys found: set the GEMINI_API_KEYS environment variable "
        "to a comma-separated list of keys."
    )

assert len(api_list) == len(set(api_list)), "Duplicate API keys found"
n_apis = len(api_list)
api_request_threshold = 1
api_idx = 0  # position in api_list of the key used for the next request

# Đoạn chương trình chính


In [None]:
# Main batch loop: for every row in the selected window, download the audio,
# ask the model for a transcript and store the response as JSON. A failure in
# any step is reported and the loop moves on to the next row.
for row_id in tqdm(video_id_range):
    # Extract the author and video ID from the DataFrame
    author_id = video_df.loc[row_id, "author.uniqueId"]
    video_id = video_df.loc[row_id, "video.id"]

    # Skip videos listed as already transcribed. The IDs loaded from the text
    # file are strings, so compare as a string in case the column is numeric.
    if str(video_id) in transcribed_video_ids:
        continue

    # Resume support: a non-empty transcript written by an earlier (possibly
    # interrupted) run is kept, so that re-running the cell does not spend API
    # quota on the same video again. Delete the file to force a re-run.
    transcript_path = TRANSCRIPT_FOLDER + f"/{video_id}.json"
    if os.path.exists(transcript_path) and os.path.getsize(transcript_path) > 0:
        continue

    # Construct the URL for the video
    url = f"https://www.tiktok.com/@{author_id}/video/{video_id}"

    # Download the audio from the video; a failed download only skips the row
    wav_file = download_youtube_audio(url, video_id)
    if not wav_file:
        print(f"Error downloading audio for the row: {row_id}")
        continue

    # Process the audio to generate the transcript
    # NOTE(review): an earlier comment said the loop must stop once the API
    # quota is exceeded, but process_audio does not distinguish quota errors
    # and the row is simply skipped — confirm which behaviour is wanted.
    client = genai.Client(api_key=api_list[api_idx])
    json_text = process_audio(wav_file=wav_file, client=client)

    # Rotate to the next key after every request, successful or not
    print(f"Row {row_id} => Change API key")
    api_idx = (api_idx + 1) % n_apis

    if not json_text:
        print(f"Error processing audio for the row: {row_id}")
        continue

    # Save the transcript to a JSON file
    if not save_response(video_id, json_text):
        print(f"Error saving transcript for the row: {row_id}")

In [None]:
# Archive the transcript folder recursively into "<TRANSCRIPT_FOLDER>.zip";
# IPython substitutes the Python variable inside the braces before running zip.
!zip -r {TRANSCRIPT_FOLDER}.zip {TRANSCRIPT_FOLDER}

In [None]:
# List the working directory (presumably to confirm the archive exists — verify)
!ls