# Install package


In [1]:
!pip install yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2025.2.19-py3-none-any.whl.metadata (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m153.6/171.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.9/171.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.2.19-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.2.19


# Import library


In [2]:
import yt_dlp
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import re
from google import genai
from google.genai import types

In [3]:
AUDIO_FOLDER = "data/audio"
TRANSCRIPT_FOLDER = "data/transcripts"

# Create audio folder if not exists
if not os.path.exists(AUDIO_FOLDER):
    os.makedirs(AUDIO_FOLDER)
# Create transcript folder if not exists
if not os.path.exists(TRANSCRIPT_FOLDER):
    os.makedirs(TRANSCRIPT_FOLDER)

# Các hàm tiện ích


In [4]:
client = genai.Client(api_key="AIzaSyBPAzHod49PIRQFewxCHobxSLuC43OFyTI")
prompt = """
Generate a transcript of the speech. The speech is in Vietnamese. If there is no speech in the file, return None.
Then generate 3 takeaways from the speech. The takeaways should be concise and informative, written in Vietnamese.
Check if the speech contains calls to action (CTA) sentences.
Check if the speech contains elements of curiosity gap.

Return the results in JSON format with fields:
{
    "transcript": "The transcript of the speech",
    "takeaways": ["Takeaway 1", "Takeaway 2", "Takeaway 3"],
    "has_call_to_action": true/false,
    "has_curiosity_gap": true/false
}
"""

In [5]:
def download_youtube_audio(url: str, video_id: str) -> str:
    # Define the file path for the target audio file
    output_path: str = AUDIO_FOLDER + f"/{video_id}.wav"

    # Check if the video is already downloaded
    if os.path.exists(output_path):
        print(f"Audio file already exists: {output_path}")
        return output_path

    # Download the audio from the YouTube video
    print(f"Downloading audio from YouTube: {url}")
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': output_path,
        'keepvideo': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([url])
        except Exception as e:
            print(f"Error downloading audio: {e}")
            return None

    # Check if the file was renamed to .wav.wav
    if os.path.exists(output_path + ".wav"):
        os.rename(output_path + ".wav", output_path)

    if os.path.exists(output_path):
        print(f"Audio download completed. File saved at: {output_path}")
        print(
            f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")
    else:
        print(f"Error: File {output_path} not found after download.")
        output_path = None

    return output_path


def process_audio(wav_file: str) -> str:
    # Open the audio file and read the content
    with open(wav_file, 'rb') as f:
        image_bytes = f.read()

    try:
        # Call the API to generate content
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                prompt,
                types.Part.from_bytes(
                    data=image_bytes,
                    mime_type='audio/wav',
                )
            ]
        )

        # Extract JSON content from the markdown-formatted response
        json_text: str = response.text
        # Remove the markdown code block formatting
        json_text: str = re.sub(r'^```json\n|\n```$', '', json_text)

        return json_text

    except Exception as e:
        print(f"Error processing audio file {wav_file}: {e}")
        return None


def save_response(video_id: str, json_text: str) -> bool:
    # Define the file path for the target JSON file
    output_path: str = TRANSCRIPT_FOLDER + f"/{video_id}.json"

    # Save the JSON response to a file
    with open(output_path, 'w') as f:
        f.write(json_text)

    if os.path.exists(output_path):
        print(f"Transcript saved to file: {output_path}")
        return True
    else:
        print(f"Error: File {output_path} not found after saving.")
        return False

# Đọc dữ liệu vào dataframe


In [6]:
# Define data types of some columns
dtypes = {
    "author.uniqueId": np.object_,
    "video.id": np.object_,
}

# Load data from CSV file
video_df = pd.read_csv("top_20_weekly_videos.csv",
                       dtype=dtypes)
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               1400 non-null   int64  
 1   week               1400 non-null   int64  
 2   weekly_score       1400 non-null   float64
 3   weekly_score_rank  1400 non-null   float64
 4   author.uniqueId    1400 non-null   object 
 5   video.id           1400 non-null   object 
 6   desc               1399 non-null   object 
 7   video.duration     1400 non-null   float64
 8   hashtags           1393 non-null   object 
 9   num_hashtags       1400 non-null   int64  
 10  engagement_rate    1400 non-null   float64
 11  video.url          1400 non-null   object 
dtypes: float64(4), int64(3), object(5)
memory usage: 131.4+ KB


# Chuẩn bị xử lý dữ liệu


In [7]:
# Get set of video_id needed to transcript
with open("error_files.txt") as f:
  video_ids = set(f.read().splitlines())
len(video_ids)

29

In [8]:
for video_id in tqdm(video_ids):
    row_id = video_df[video_df["video.id"] == video_id].index[0]

    # Extract the video_id and url from the DataFrame
    video_id = video_df.loc[row_id, "video.id"]
    url = video_df.loc[row_id, "video.url"]

    # Download the audio from the video
    wav_file = download_youtube_audio(url, video_id)
    if not wav_file:
        print(f"Error downloading audio for the row: {row_id}")
        break

    # Process the audio to generate the transcript
    json_text = process_audio(wav_file)
    if not json_text:
        print(f"Error processing audio for the row: {row_id}")
        break

    # Save the transcript to a JSON file
    if not save_response(video_id, json_text):
        print(f"Error saving transcript for the row: {row_id}")
        break

[download] 100% of    7.03MiB in 00:00:11 at 645.65KiB/s 
[ExtractAudio] Destination: data/audio/7391715153219177735.wav.wav
Audio download completed. File saved at: data/audio/7391715153219177735.wav
File size: 10.17 MB


100%|██████████| 29/29 [05:14<00:00, 10.85s/it]

Transcript saved to file: data/transcripts/7391715153219177735.json





In [9]:
!zip -r data.zip data

  adding: data/ (stored 0%)
  adding: data/transcripts/ (stored 0%)
  adding: data/transcripts/7338403909465148680.json (deflated 25%)
  adding: data/transcripts/7376567397316103442.json (deflated 46%)
  adding: data/transcripts/7415180429818449185.json (deflated 39%)
  adding: data/transcripts/7460441203784518919.json (deflated 64%)
  adding: data/transcripts/7406709590051638548.json (deflated 39%)
  adding: data/transcripts/7380318377786543367.json (deflated 49%)
  adding: data/transcripts/7351068138017918210.json (deflated 46%)
  adding: data/transcripts/7376297082832833793.json (deflated 42%)
  adding: data/transcripts/7464831392392826133.json (deflated 12%)
  adding: data/transcripts/7404415194140658977.json (deflated 40%)
  adding: data/transcripts/7363561574721785106.json (deflated 50%)
  adding: data/transcripts/7391715153219177735.json (deflated 27%)
  adding: data/transcripts/7332097613728189698.json (deflated 35%)
  adding: data/transcripts/7330239538108910849.json (deflated

In [10]:
!ls

data  data.zip	error_files.txt  sample_data  top_20_weekly_videos.csv
