# Install package


In [1]:
# !pip install yt-dlp

# Import library


In [2]:
import yt_dlp
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import List
import json
import re
from google import genai
from google.genai import types


# Make sure the output folders exist before anything is written to them.
# exist_ok=True makes each call idempotent and removes the gap between
# "check whether it exists" and "create it" that the manual test had.
# Folder for the downloaded audio files
os.makedirs('../data/audio', exist_ok=True)
# Folder for the generated transcripts
os.makedirs('../data/transcripts', exist_ok=True)

# Các hàm tiện ích


In [3]:
# The API key is read from the environment so that it never ends up in the
# notebook or in version control. A key that was previously stored in this
# cell must be treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY environment variable before running this notebook."
    )
client = genai.Client(api_key=_gemini_api_key)

# Instructions sent together with every audio file. The model is asked for a
# transcript, three takeaways and two boolean flags, returned as JSON.
prompt = """
Generate a transcript of the speech. The speech is in Vietnamese. If there is no speech in the file, return None.
Then generate 3 takeaways from the speech. The takeaways should be concise and informative, written in Vietnamese.
Check if the speech contains calls to action (CTA) sentences.
Check if the speech contains elements of curiosity gap.

Return the results in JSON format with fields: 
{
    "transcript": "The transcript of the speech",
    "takeaways": ["Takeaway 1", "Takeaway 2", "Takeaway 3"],
    "has_call_to_action": true/false,
    "has_curiosity_gap": true/false
}
"""

```python
prompt = """
Generate a transcript of the speech. The speech is in Vietnamese. 
If there is no speech in the file, return None.

Then generate 3 takeaways from the speech. 
The takeaways should be concise and informative, written in Vietnamese.

Check if the speech contains calls to action (CTA) sentences.
Check if the speech contains elements of curiosity gap.

Return the results in JSON format with fields: 
{
    "transcript": "The transcript of the speech",
    "takeaways": ["Takeaway 1", "Takeaway 2", "Takeaway 3"],
    "has_call_to_action": true/false,
    "has_curiosity_gap": true/false
}
"""
```

In [4]:
def download_youtube_audio(url: str, video_id: str) -> str:
    """Download the audio of a video as a WAV file and return its path.

    The file is stored at ``../data/audio/<video_id>.wav``. When a file
    already exists at that path it is reused and nothing is downloaded.

    Args:
        url: Address of the video, handed to yt-dlp.
        video_id: Identifier used as the name of the output file.

    Returns:
        The path of the WAV file, or None when the download raised an error
        or the file cannot be found afterwards.
    """
    target = f"../data/audio/{video_id}.wav"

    # Reuse an earlier download instead of fetching the audio again
    if os.path.exists(target):
        print(f"Audio file already exists: {target}")
        return target

    print(f"Downloading audio from YouTube: {url}")
    extract_wav = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_wav],
        'outtmpl': target,
        'keepvideo': True,
    }
    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            downloader.download([url])
        except Exception as err:
            print(f"Error downloading audio: {err}")
            return None

    # If the result was written with a doubled ".wav.wav" extension,
    # move it to the expected name
    doubled = target + ".wav"
    if os.path.exists(doubled):
        os.rename(doubled, target)

    if not os.path.exists(target):
        print(f"Error: File {target} not found after download.")
        return None

    print(f"Audio download completed. File saved at: {target}")
    size_mb = os.path.getsize(target) / 1024 / 1024
    print(f"File size: {size_mb:.2f} MB")
    return target


def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence.

    Handles an opening fence with or without a language tag and a closing
    fence followed by spaces or line breaks.
    """
    cleaned = text.strip()
    # Opening fence, e.g. "```json\n" or "```\n"
    cleaned = re.sub(r'^```[A-Za-z]*[ \t]*\n?', '', cleaned)
    # Closing fence at the very end of the text
    cleaned = re.sub(r'\n?[ \t]*```$', '', cleaned)
    return cleaned.strip()


def process_audio(wav_file: str) -> str:
    """Send a WAV file to the model and return the JSON text of its answer.

    The audio bytes are sent together with the module-level ``prompt`` through
    the module-level ``client``. Markdown code-fence markers around the answer
    are removed before it is returned.

    Args:
        wav_file: Path of the WAV file to process.

    Returns:
        The answer text without code-fence markers, or None when the request
        fails or the response carries no text.

    Raises:
        OSError: If ``wav_file`` cannot be opened or read (the read happens
            outside the ``try`` block, so this is not swallowed).
    """
    # Read the whole audio file into memory; it is sent inline with the request
    with open(wav_file, 'rb') as f:
        audio_bytes = f.read()

    try:
        # Call the API to generate content
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                prompt,
                types.Part.from_bytes(
                    data=audio_bytes,
                    mime_type='audio/wav',
                )
            ]
        )

        raw_text = response.text
        if raw_text is None:
            # Report an empty answer explicitly instead of failing inside re.sub
            print(f"Error processing audio file {wav_file}: response has no text")
            return None

        # Extract the JSON content from the Markdown-formatted response
        return _strip_code_fence(raw_text)

    except Exception as e:
        # Any API or parsing failure is reported and signalled with None so
        # the caller can decide whether to stop.
        print(f"Error processing audio file {wav_file}: {e}")
        return None


def save_response(video_id: str, json_text: str) -> bool:
    """Write the model answer for one video to its transcript file.

    The text is stored at ``../data/transcripts/<video_id>.json``, encoded as
    UTF-8 so that Vietnamese characters are written the same way on every
    platform.

    Args:
        video_id: Identifier used as the name of the output file.
        json_text: Text to store, written as-is.

    Returns:
        True when the file exists after writing, False otherwise.
    """
    # Define the file path for the target JSON file
    output_path: str = f"../data/transcripts/{video_id}.json"

    # Create the folder on demand so the function also works on a fresh checkout
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save the JSON response to a file with an explicit encoding
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(json_text)

    if os.path.exists(output_path):
        print(f"Transcript saved to file: {output_path}")
        return True
    else:
        print(f"Error: File {output_path} not found after saving.")
        return False

# Đọc dữ liệu vào dataframe


In [5]:
# Read both identifier columns with the object dtype
# (presumably so the long numeric ids stay text - confirm against the CSV)
dtypes = dict.fromkeys(("author.uniqueId", "video.id"), np.object_)

# Load the weekly top-20 video table and print its column summary
video_df = pd.read_csv(
    "../data/interim/top_20_weekly_videos.csv",
    dtype=dtypes,
)
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               1400 non-null   int64  
 1   week               1400 non-null   int64  
 2   weekly_score       1400 non-null   float64
 3   weekly_score_rank  1400 non-null   float64
 4   author.uniqueId    1400 non-null   object 
 5   video.id           1400 non-null   object 
 6   desc               1399 non-null   object 
 7   video.duration     1400 non-null   float64
 8   hashtags           1393 non-null   object 
 9   num_hashtags       1400 non-null   int64  
 10  engagement_rate    1400 non-null   float64
 11  video.url          1400 non-null   object 
dtypes: float64(4), int64(3), object(5)
memory usage: 131.4+ KB


# Chuẩn bị xử lý dữ liệu


In [6]:
# Work out where to resume: the position of the first row of video_df whose
# transcript file does not exist yet. Simply counting the entries of the
# folder is only correct when it holds exactly the transcripts of the first N
# rows; any other file in it would shift the start and skip unprocessed rows.
transcript_files = os.listdir("../data/transcripts")
_done_ids = {
    os.path.splitext(name)[0]
    for name in transcript_files
    if name.endswith(".json")
}
# Positions equal index labels here because video_df keeps its default
# RangeIndex; falls back to len(video_df) when every row is already done.
start_index = next(
    (position for position, video_id in enumerate(video_df["video.id"])
     if video_id not in _done_ids),
    len(video_df),
)

# Print the start index
print(f"Bắt đầu từ index: {start_index}")

Bắt đầu từ index: 1400


In [7]:
# Process the rows from start_index onward: download the audio, generate the
# transcript and save it. The loop stops at the first step that fails.
for row_id in tqdm(range(video_df.shape[0])[start_index:]):
    # Look up the identifier and the address of the current video
    video_id = video_df.loc[row_id, "video.id"]
    url = video_df.loc[row_id, "video.url"]

    failure = None

    # Step 1: audio download
    wav_file = download_youtube_audio(url, video_id)
    if not wav_file:
        failure = f"Error downloading audio for the row: {row_id}"
    else:
        # Step 2: transcript generation
        json_text = process_audio(wav_file)
        if not json_text:
            failure = f"Error processing audio for the row: {row_id}"
        # Step 3: write the transcript to its JSON file
        elif not save_response(video_id, json_text):
            failure = f"Error saving transcript for the row: {row_id}"

    if failure:
        print(failure)
        break

0it [00:00, ?it/s]


# Đọc các file JSON và chuyển thành dataframe

In [9]:
import pandas as pd
import numpy as np
import os
import json
from typing import List
from tqdm import tqdm
import re

## Tìm ra các file JSON trong thư mục

In [10]:
def list_file_types(directory: str, file_extension: str) -> List[str]:
    """Collect the paths of all files below a directory that end with a suffix.

    The directory is walked recursively, so files in sub-folders are included.

    Args:
        directory (str): Root folder of the search.
        file_extension (str): Suffix a file name must end with, e.g. ".json".

    Returns:
        List[str]: Matching file paths in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
        if name.endswith(file_extension)
    ]

In [11]:
# Collect the path of every .json file below the transcripts folder,
# then report the total and preview the first five paths.
json_files = list_file_types("../data/transcripts", ".json")
print(f"Number of JSON files found: {len(json_files)}")
print(json_files[:5])

Number of JSON files found: 13738
['../data/transcripts/7305335962324749576.json', '../data/transcripts/7305337630361718023.json', '../data/transcripts/7305340095152786696.json', '../data/transcripts/7305340906297675026.json', '../data/transcripts/7305341470884515079.json']


## Đọc mỗi file JSON và chuyển thành dataframe

In [12]:
# Read each transcript file, extract its fields and collect one flat record
# per video. The records are used afterwards to build a DataFrame. Files that
# cannot be used are sorted into the buckets below so they can be inspected.

# Valid data
data: List[dict] = []
# Files don't start with "{"
files_not_start_with_curly_brace: List[str] = []
# Files don't end with "}"
files_not_end_with_curly_brace: List[str] = []
# Files with "no speech"
files_no_speech: List[str] = []
# General errors (unreadable file, invalid JSON, unexpected field types)
error_files: List[str] = []


def _takeaway_at(takeaways, position):
    """Return the lowercased, stripped takeaway at ``position``.

    Returns None when the list is missing, empty or too short, so that a
    response with fewer than three takeaways still yields a record.
    """
    if not takeaways or position >= len(takeaways):
        return None
    return takeaways[position].lower().strip()


for json_file in tqdm(json_files):
    try:
        # Transcripts contain Vietnamese text: decode explicitly as UTF-8
        # instead of relying on the platform default encoding.
        with open(json_file, 'r', encoding='utf-8') as f:
            # Collapse line breaks into single spaces and trim both ends
            json_text: str = re.sub(r'\n+', ' ', f.read()).strip()
    except (OSError, UnicodeDecodeError):
        error_files.append(json_file)
        continue

    # Drop any text in front of the first "{".
    # (Named brace_index so the resume index `start_index` is not overwritten.)
    brace_index: int = json_text.find("{")
    if brace_index > 0:
        json_text = json_text[brace_index:]

    # Any text containing the word "speech" is treated as a "no speech" reply.
    # NOTE(review): this also matches transcripts or takeaways that merely
    # mention the word - confirm that this is intended.
    if "speech" in json_text.lower():
        files_no_speech.append(json_file)
        continue

    # The text must look like a single JSON object
    if not json_text.startswith("{"):
        files_not_start_with_curly_brace.append(json_file)
        continue

    if not json_text.endswith("}"):
        files_not_end_with_curly_brace.append(json_file)
        continue

    try:
        # Load the JSON data from the text
        json_data: dict = json.loads(json_text)

        # Extract the fields from the JSON data
        transcript: str = json_data.get("transcript")
        takeaways: List[str] = json_data.get("takeaways")
        call_to_action: bool = json_data.get("has_call_to_action")
        curiosity_gap: bool = json_data.get("has_curiosity_gap")

        # Append the data to the list; all text fields are lowercased.
        # The video id is the file name without the ".json" suffix.
        data.append({
            "video.id": os.path.basename(json_file).replace(".json", "").strip(),
            "transcript": transcript.lower().strip() if transcript else None,
            "takeaway_1": _takeaway_at(takeaways, 0),
            "takeaway_2": _takeaway_at(takeaways, 1),
            "takeaway_3": _takeaway_at(takeaways, 2),
            "transcript_call_to_action": call_to_action,
            "transcript_curiosity_gap": curiosity_gap,
        })
    except Exception:
        # Invalid JSON or fields of an unexpected type: keep the path for review
        error_files.append(json_file)

# Make sure every file ended up in exactly one of the lists
print(f"Number of records loaded: {len(data)}")
assert len(data) == len(json_files) - len(error_files) - \
    len(files_not_start_with_curly_brace) - \
    len(files_not_end_with_curly_brace) - len(files_no_speech)

100%|██████████| 13738/13738 [01:48<00:00, 126.45it/s]

Number of records loaded: 13346





Đảm bảo không có file nào gặp lỗi mà ta không kiểm soát được

In [14]:
# Display the number of files whose parsing raised an exception, followed by
# their paths. The strict check below is left disabled.
# assert len(error_files) == 0
len(error_files), error_files

(22,
 ['../data/transcripts/7317822265704418568.json',
  '../data/transcripts/7327617072853191944.json',
  '../data/transcripts/7328004148551601416.json',
  '../data/transcripts/7328751522936835330.json',
  '../data/transcripts/7339810597799824648.json',
  '../data/transcripts/7342855954473536769.json',
  '../data/transcripts/7346926600711048450.json',
  '../data/transcripts/7351005132278009106.json',
  '../data/transcripts/7355354140643577104.json',
  '../data/transcripts/7367685812185517313.json',
  '../data/transcripts/7380656319851760903.json',
  '../data/transcripts/7383628597300382992.json',
  '../data/transcripts/7388426518373977362.json',
  '../data/transcripts/7394844927202446599.json',
  '../data/transcripts/7421441000935050503.json',
  '../data/transcripts/7428934479126744327.json',
  '../data/transcripts/7430808778091613456.json',
  '../data/transcripts/7442296223660264722.json',
  '../data/transcripts/7454170454400453895.json',
  '../data/transcripts/7455241274547817736.js

Các file không có giọng nói

In [15]:
# Display the number of files classified as "no speech", followed by their paths
len(files_no_speech), files_no_speech

(227,
 ['../data/transcripts/7305786485846854920.json',
  '../data/transcripts/7306011728054095105.json',
  '../data/transcripts/7306526019022802183.json',
  '../data/transcripts/7307239097892949256.json',
  '../data/transcripts/7307592825904958728.json',
  '../data/transcripts/7307620045776030994.json',
  '../data/transcripts/7307931176646266113.json',
  '../data/transcripts/7307990190323223815.json',
  '../data/transcripts/7308213369704615175.json',
  '../data/transcripts/7308687385108188424.json',
  '../data/transcripts/7308963596849974535.json',
  '../data/transcripts/7309385579295247634.json',
  '../data/transcripts/7309737167532690696.json',
  '../data/transcripts/7310068100093742344.json',
  '../data/transcripts/7310982782866525441.json',
  '../data/transcripts/7311682555437255937.json',
  '../data/transcripts/7312073521146940690.json',
  '../data/transcripts/7312300971005168898.json',
  '../data/transcripts/7314262596704603393.json',
  '../data/transcripts/7314325034636463362.j

Các file không bắt đầu bằng "{"

In [16]:
# Display the number of files whose text does not start with "{", followed by their paths
len(files_not_start_with_curly_brace), files_not_start_with_curly_brace

(118,
 ['../data/transcripts/7307181087564893448.json',
  '../data/transcripts/7308395723727408391.json',
  '../data/transcripts/7309133015576268033.json',
  '../data/transcripts/7313138182718016770.json',
  '../data/transcripts/7313205714200071442.json',
  '../data/transcripts/7313864443170000129.json',
  '../data/transcripts/7313884922828164353.json',
  '../data/transcripts/7314968733531901191.json',
  '../data/transcripts/7315421672771702024.json',
  '../data/transcripts/7317120189789670663.json',
  '../data/transcripts/7319124573046312194.json',
  '../data/transcripts/7319504418947271954.json',
  '../data/transcripts/7321656912892660994.json',
  '../data/transcripts/7324291698383703303.json',
  '../data/transcripts/7324540392949484818.json',
  '../data/transcripts/7329091466335178002.json',
  '../data/transcripts/7330051069000699154.json',
  '../data/transcripts/7330185685954989319.json',
  '../data/transcripts/7332446113649167637.json',
  '../data/transcripts/7334710515425381650.j

Các file không kết thúc bằng "}"

In [17]:
# Display the number of files whose text does not end with "}", followed by their paths
len(files_not_end_with_curly_brace), files_not_end_with_curly_brace

(25,
 ['../data/transcripts/7307971671414164743.json',
  '../data/transcripts/7310922101316242695.json',
  '../data/transcripts/7310933483927375111.json',
  '../data/transcripts/7316487523872656658.json',
  '../data/transcripts/7322072706998914312.json',
  '../data/transcripts/7325805421325815047.json',
  '../data/transcripts/7333628092121894152.json',
  '../data/transcripts/7346917017733172487.json',
  '../data/transcripts/7347547821605801234.json',
  '../data/transcripts/7348770062393756935.json',
  '../data/transcripts/7359212290891123986.json',
  '../data/transcripts/7382211270017158418.json',
  '../data/transcripts/7382913132567612689.json',
  '../data/transcripts/7384433811246927112.json',
  '../data/transcripts/7389476742194351380.json',
  '../data/transcripts/7398026563486289159.json',
  '../data/transcripts/7404846479690566930.json',
  '../data/transcripts/7406644503412509960.json',
  '../data/transcripts/7407806659520630034.json',
  '../data/transcripts/7424153956684696852.js

Lưu id của các file gặp lỗi

In [18]:
# Collect the video ids (file name without ".json") of the transcripts that
# were rejected for not starting with "{" or not ending with "}".
# NOTE(review): this rebinds `error_files`, which until here held the paths of
# files that raised while parsing; those paths are not part of the saved list.
# Confirm that leaving them out is intended.
error_files = [
    os.path.basename(file).replace(".json", "")
    for file in files_not_start_with_curly_brace + files_not_end_with_curly_brace
]

# Save the id data to a text file, one id per line. The folder is created
# first because nothing visible in this notebook creates it.
os.makedirs("../data/error", exist_ok=True)
with open("../data/error/error_files.txt", 'w') as f:
    for item in error_files:
        f.write("%s\n" % item)

Chuyển các file thành dataframe

In [19]:
# Convert the list of dictionaries to a DataFrame (one row per parsed
# transcript record) and print its column summary
transcript_df = pd.DataFrame(data)
transcript_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13346 entries, 0 to 13345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   video.id                   13346 non-null  object
 1   transcript                 12958 non-null  object
 2   takeaway_1                 12630 non-null  object
 3   takeaway_2                 12630 non-null  object
 4   takeaway_3                 12630 non-null  object
 5   transcript_call_to_action  13345 non-null  object
 6   transcript_curiosity_gap   13345 non-null  object
dtypes: object(7)
memory usage: 730.0+ KB


In [20]:
# Save the transcript table as a Parquet file in the parent of the working
# directory; the DataFrame index is not written
transcript_df.to_parquet("../transcripts.parquet", index=False)

In [22]:
# Read the Parquet file back and print its summary to check the round trip
new_df = pd.read_parquet("../transcripts.parquet")
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13346 entries, 0 to 13345
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   video.id                   13346 non-null  object
 1   transcript                 12958 non-null  object
 2   takeaway_1                 12630 non-null  object
 3   takeaway_2                 12630 non-null  object
 4   takeaway_3                 12630 non-null  object
 5   transcript_call_to_action  13345 non-null  object
 6   transcript_curiosity_gap   13345 non-null  object
dtypes: object(7)
memory usage: 730.0+ KB


In [23]:
# Load the author ids to keep: the file holds one id per line.
# The name is bound to None first so it exists even if reading fails.
filtered_authors = None
with open("../data/filters/author_unique_ids.txt", 'r') as f:
    filtered_authors = f.read().splitlines()
# Display the loaded ids
filtered_authors

['spicykim9386',
 'haidangrevieww',
 'khaikhampha',
 'putaangi',
 'trangtam2607',
 'huynhanhtuan_dienvien']

In [25]:
# Load the preprocessed video table and print its column summary
full_df = pd.read_parquet("../preprocessed_videos.parquet")
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70996 entries, 0 to 70995
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype                           
---  ------                           --------------  -----                           
 0   CategoryType                     70996 non-null  object                          
 1   author.downloadSetting           70996 non-null  object                          
 2   author.duetSetting               70996 non-null  object                          
 3   author.id                        70996 non-null  object                          
 4   author.nickname                  70996 non-null  object                          
 5   author.openFavorite              70996 non-null  bool                            
 6   author.secUid                    70996 non-null  object                          
 7   author.signature                 70996 non-null  object                          
 8   author.stitchSet

In [27]:
# Keep only the videos whose author.uniqueId is in the filtered author list
# (all columns are kept, not just video.id) and print the summary
filtered_df = full_df[full_df["author.uniqueId"].isin(filtered_authors)]
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1335 entries, 3127 to 63020
Data columns (total 52 columns):
 #   Column                           Non-Null Count  Dtype                           
---  ------                           --------------  -----                           
 0   CategoryType                     1335 non-null   object                          
 1   author.downloadSetting           1335 non-null   object                          
 2   author.duetSetting               1335 non-null   object                          
 3   author.id                        1335 non-null   object                          
 4   author.nickname                  1335 non-null   object                          
 5   author.openFavorite              1335 non-null   bool                            
 6   author.secUid                    1335 non-null   object                          
 7   author.signature                 1335 non-null   object                          
 8   author.stitchSettin

In [30]:
# Attach the transcript fields to the selected authors' videos. The left join
# on "video.id" keeps every video, including those without a transcript.
filtered_author_df = filtered_df.merge(
    transcript_df, how="left", on="video.id")

In [33]:
# Display the column names of the transcript table
list(transcript_df.columns)

['video.id',
 'transcript',
 'takeaway_1',
 'takeaway_2',
 'takeaway_3',
 'transcript_call_to_action',
 'transcript_curiosity_gap']

In [34]:
# Columns to keep: four video metadata columns followed by every transcript column
selected_cols = ["author.uniqueId", "desc", "hashtags", "hashtag_count"] + list(transcript_df.columns)
# Display the resulting list
selected_cols

['author.uniqueId',
 'desc',
 'hashtags',
 'hashtag_count',
 'video.id',
 'transcript',
 'takeaway_1',
 'takeaway_2',
 'takeaway_3',
 'transcript_call_to_action',
 'transcript_curiosity_gap']

In [38]:
# Rebuild the column selection (same expression as in the previous cell),
# reduce the merged table to those columns and print the summary
selected_cols = ["author.uniqueId", "desc", "hashtags", "hashtag_count"] + list(transcript_df.columns)
filtered_videos_df = filtered_author_df[selected_cols]
filtered_videos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335 entries, 0 to 1334
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   author.uniqueId            1335 non-null   object
 1   desc                       1335 non-null   object
 2   hashtags                   1335 non-null   object
 3   hashtag_count              1335 non-null   int64 
 4   video.id                   1335 non-null   object
 5   transcript                 1318 non-null   object
 6   takeaway_1                 1315 non-null   object
 7   takeaway_2                 1315 non-null   object
 8   takeaway_3                 1315 non-null   object
 9   transcript_call_to_action  1327 non-null   object
 10  transcript_curiosity_gap   1327 non-null   object
dtypes: int64(1), object(10)
memory usage: 114.9+ KB


In [None]:
# Save the filtered table to a Parquet file; the DataFrame index is not written
filtered_videos_df.to_parquet(
    "../transcript_video_6_authors.parquet", index=False)

In [46]:
# Read the saved file back and print its summary to check the round trip
pd.read_parquet("../transcript_video_6_authors.parquet").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1335 entries, 0 to 1334
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   author.uniqueId            1335 non-null   object
 1   desc                       1335 non-null   object
 2   hashtags                   1335 non-null   object
 3   hashtag_count              1335 non-null   int64 
 4   video.id                   1335 non-null   object
 5   transcript                 1318 non-null   object
 6   takeaway_1                 1315 non-null   object
 7   takeaway_2                 1315 non-null   object
 8   takeaway_3                 1315 non-null   object
 9   transcript_call_to_action  1327 non-null   object
 10  transcript_curiosity_gap   1327 non-null   object
dtypes: int64(1), object(10)
memory usage: 114.9+ KB


# Merge các dataframe lại với nhau dựa trên cột `video.id`

In [17]:
# Combine the weekly video table with the transcript table.
# The left join on "video.id" keeps every video row, with or without a transcript.
video_transcript_df = video_df.merge(
    transcript_df, how="left", on="video.id")
video_transcript_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       1400 non-null   int64  
 1   week                       1400 non-null   int64  
 2   weekly_score               1400 non-null   float64
 3   weekly_score_rank          1400 non-null   float64
 4   author.uniqueId            1400 non-null   object 
 5   video.id                   1400 non-null   object 
 6   desc                       1399 non-null   object 
 7   video.duration             1400 non-null   float64
 8   hashtags                   1393 non-null   object 
 9   num_hashtags               1400 non-null   int64  
 10  engagement_rate            1400 non-null   float64
 11  video.url                  1400 non-null   object 
 12  transcript                 1332 non-null   object 
 13  takeaway_1                 1297 non-null   objec

In [18]:
# Preview five randomly chosen rows (no seed is passed, so the selection
# differs between runs)
video_transcript_df.sample(5)

Unnamed: 0,year,week,weekly_score,weekly_score_rank,author.uniqueId,video.id,desc,video.duration,hashtags,num_hashtags,engagement_rate,video.url,transcript,takeaway_1,takeaway_2,takeaway_3,transcript_call_to_action,transcript_curiosity_gap
1051,2024,47,24.350258,12.0,ancathegioi321,7438575079010602257,Lại thèm hàu rồi #mukbang #mukbangvideo #mukba...,13.0,"mukbang,mukbangvideo,mukbangeatingshow",3,0.02428,https://www.tiktok.com/@ancathegioi321/video/7...,,,,,False,False
1260,2025,6,75.287849,1.0,bon.tq1,7468628859911539969,Tết xong rồi nay chuyển qua series cơm nhà nhe...,252.0,"ancungtiktok,learnontiktok,anngonnaugon,tranqu...",4,0.034641,https://www.tiktok.com/@bon.tq1/video/74686288...,sườn non mua về mấy chị đừng có đi ra bình thư...,"món sườn non rim khóm dễ làm, ai cũng có thể n...",sườn nên được trụng sơ và ướp gia vị kỹ trước ...,nêm nếm nước sốt theo khẩu vị cá nhân để món ă...,True,True
1208,2025,3,33.287207,9.0,phongvnguyntrn,7459246331064896786,"Tính ra ăn ăn cái combo này hợp lý ghê, có cả ...",76.0,"phongvureview,dcgr,reviewanngon,ancungtiktok,l...",6,0.029233,https://www.tiktok.com/@phongvnguyntrn/video/7...,ngày anh cứ bán khoảng tầm 4 500 con là chuyện...,"gà nướng chum giòn ngon, được nướng trong chum...","combo gà bao gồm gà nướng, nộm chân gà hoa chu...","quán rộng rãi, sạch sẽ, phục vụ tốt, có bán cả...",True,True
597,2024,24,30.33213,18.0,ancungmaimai,7379267420579499271,Lại là tớ đây. Mời mọi người ăn viên hải sản s...,158.0,"ancungmaimai,ancungtiktok,fyp,eating,mukbang,p...",6,0.087441,https://www.tiktok.com/@ancungmaimai/video/737...,mời mọi người ăn viên hải sản sốt phô mai cùng...,giới thiệu món viên hải sản sốt phô mai chiên.,món ăn có lớp vỏ giòn và nhân phô mai.,mời mọi người cùng thưởng thức món ăn và hẹn g...,False,True
261,2024,8,66.052829,2.0,chiecbungmo97,7339020465945595137,Seri mỗi ngày một món nước. TRÀ TẮC #chiecbung...,28.0,"chiecbungmo97,eating,eatingshow,mukbang,mukban...",17,0.031221,https://www.tiktok.com/@chiecbungmo97/video/73...,em và em đang cười tươi thì coi như là em đã đ...,"bài hát tạo không khí vui vẻ, sôi động.",sử dụng các hiệu ứng âm thanh điện tử.,"lời bài hát đơn giản, dễ nhớ.",False,False


Lưu dataframe cuối cùng thành file CSV

In [19]:
# Save the merged data to a CSV file; the DataFrame index is not written
video_transcript_df.to_csv(
    "../data/interim/weekly_videos_with_transcripts.csv", index=False)