# **Prototyping Data Pipeline**
In this notebook, I'm going to be writing a bunch of functions that prototype a data pipeline for this app. Once I write the functions, I'll move them out of this notebook and into a utility file that the main pipeline script can also access. 

# Setup
The cells below will set up the rest of the notebook.

I'll start by configuring the kernel: 

In [None]:
# Change the working directory to the parent folder
# (presumably so that the `utils.*` imports further down resolve -- TODO confirm)
%cd ..

# Enable the autoreload extension, which will automatically load in new code as it's written.
# Mode 2 reloads all modules before each cell is executed.
%load_ext autoreload
%autoreload 2

Now I'll import some necessary modules:

In [98]:
# General import statements
import pandas as pd
from pytubefix import YouTube, Channel
from google.cloud import bigquery, storage
import traceback
import time
import random
from tqdm import tqdm
import pandas_gbq
import datetime
from pathlib import Path
from google.cloud.exceptions import NotFound
import whisper
import numpy as np

# Importing custom utility functions
import utils.gbq as gbq_utils
import utils.youtube as youtube_utils
import utils.gcs as gcs_utils
import utils.logging as log_utils
import utils.enriching_data as enrichment_utils
import utils.openai as openai_utils

# Indicate whether or not we want tqdm progress bars
# (the loops in this notebook pass `disable=not tqdm_enabled` to tqdm)
tqdm_enabled = True

# Set some constants for the project: the GBQ project and dataset that the
# cells in this notebook read from and write to
GBQ_PROJECT_ID = "neural-needledrop"
GBQ_DATASET_ID = "backend_data"

I'm going to configure the logger for the pipeline below: 

In [None]:
# Define a logger named "pipeline" through the project's logging utilities.
# NOTE(review): `log_to_console=True` presumably echoes records to the console -- confirm in utils.logging
logger = log_utils.get_logger(name="pipeline", log_to_console=True)

We'll also load in a whisper model:

In [None]:
# Load in the Whisper model of choice. The size name lives in its own variable so it's easy to swap;
# the loaded model is the object the transcription cell calls `.transcribe(...)` on.
whisper_model_size = "tiny"
whisper_model = whisper.load_model(whisper_model_size)

# Checking GBQ Table
The **very** first thing I need to do: check the actual `video_metadata` GBQ table to determine the most recent video I've downloaded. 

In [None]:
# Define the query that'll grab the most recent video url from the `video_metadata` table.
# The table name is built from the project constants so it stays in sync with the rest of the notebook.
most_recent_video_url_query = f"""
SELECT
  metadata.url
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.video_metadata` metadata
ORDER BY
  publish_date DESC, scrape_date DESC
LIMIT 1
"""

# Run the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
most_recent_video_url_df = pandas_gbq.read_gbq(
    most_recent_video_url_query, project_id=GBQ_PROJECT_ID
)

# If the query returned no rows, then the table is empty and there is no "most recent" url
if most_recent_video_url_df.empty:
    most_recent_video_url = None

# Otherwise, we can just grab the url from the single returned row
else:
    most_recent_video_url = most_recent_video_url_df.iloc[0]["url"]

# Identifying New Videos
The first portion of the pipeline: determining if there are any videos to work with in the first place! 

I'll start by parameterizing the method: 

In [None]:
# Parameterize the identification method
video_limit = 1000 # If video_limit is None, then we're going to download information for all of the videos

# Define the channel of interest
channel_url = "https://www.youtube.com/c/theneedledrop"

# NOTE(review): this resets the value that the "Checking GBQ Table" cell read from `video_metadata`,
# so the `most_recent_video_url is not None` trimming branch in the test cell below never runs.
# Looks like a prototyping override -- TODO confirm it is intentional before moving this into the pipeline.
most_recent_video_url = None

# Indicate the step size for parsing the videos (how many URLs are requested from the channel per slice)
video_parse_step_size = 350

Now that I've got the method scoped out, I'm going to write it. I'll identify the first couple of videos. 

In [None]:
def get_video_urls_from_channel(channel, most_recent_video_url=None, video_limit=None, video_parse_step_size=10):
    """
    Helper method to identify video URLs from a channel.

    URLs are read from `channel.video_urls` in slices of `video_parse_step_size`, in the order
    that sequence provides them. Collection stops as soon as one of the following happens:

    - a slice comes back empty (there are no more URLs to read),
    - `most_recent_video_url` appears in the slice that was just read, or
    - at least `video_limit` URLs have been collected (only if `video_limit` is not None).

    Args:
        channel: Any object exposing a sliceable `video_urls` sequence (e.g. a pytubefix `Channel`).
        most_recent_video_url: URL to stop at. If None, we keep going until the limit or the end of the channel.
        video_limit: Maximum number of URLs to return. If None, there is no cap.
        video_parse_step_size: How many URLs to request per slice.

    Returns:
        A list of video URLs. When `most_recent_video_url` is found, the rest of the slice that
        contained it is still included; trimming at that URL is left to the caller. The list
        never holds more than `video_limit` entries.
    """

    # Initialize the video URLs
    video_urls = []

    # Offset of the next slice to read from the channel
    video_count = 0

    while True:

        # Fetch the next slice of video URLs
        new_video_urls = channel.video_urls[video_count:video_count + video_parse_step_size]

        # Break out if no new video URLs were found
        if len(new_video_urls) == 0:
            break

        video_urls.extend(new_video_urls)

        # Update the video count
        video_count += video_parse_step_size

        # If we've reached the video limit, then break
        if video_limit is not None and video_count >= video_limit:
            break

        # Only the new slice needs checking: earlier slices were already checked when they were read
        if most_recent_video_url in new_video_urls:
            break

    # The last slice can overshoot the limit when it isn't a multiple of the step size, so enforce the cap
    if video_limit is not None:
        video_urls = video_urls[:video_limit]

    # Return the video URLs
    return video_urls

This method should function to do what I want. Let's test it: 

In [None]:
# This is the list of video URLs we're going to parse. Passing `most_recent_video_url` through lets the
# helper stop reading the channel as soon as that URL shows up (with None it reads up to `video_limit`).
video_urls_to_parse = get_video_urls_from_channel(
    channel=Channel(channel_url),
    most_recent_video_url=most_recent_video_url,
    video_limit=video_limit,
    video_parse_step_size=video_parse_step_size,
)

# If the most_recent_video_url is not None, then we're going to remove it and all of the videos listed after it
if most_recent_video_url is not None:
    try:
        most_recent_video_index = video_urls_to_parse.index(most_recent_video_url)

    # `list.index` raises a ValueError when the URL wasn't among the fetched videos. In that case we keep
    # the full list and print out the traceback so the miss is visible.
    except ValueError:
        traceback.print_exc()

    else:
        video_urls_to_parse = video_urls_to_parse[:most_recent_video_index]

# Print some information about the video URLs we're going to parse
print(f"Identified {len(video_urls_to_parse)} videos to parse.")

### Removing Already Parsed Videos
We're going to upload a temporary table with all of the video URLs to GBQ. 

In [None]:
# Put the candidate URLs into a one-column ("url") DataFrame so they can be loaded into GBQ
video_urls_to_parse_df = pd.DataFrame(data=video_urls_to_parse, columns=["url"])

# Load that DataFrame into a temporary GBQ table, replacing any leftover table with the same name.
# The helper's return value is kept: later cells use it as the table name to query and to delete.
temporary_table_name = gbq_utils.create_temporary_table_in_gbq(
    project_id=GBQ_PROJECT_ID,
    dataset_name=GBQ_DATASET_ID,
    table_name="temporary_video_urls_to_parse",
    dataframe=video_urls_to_parse_df,
    if_exists="replace",
)

With this temporary table in hand, we'll query GBQ to figure out the *actual* videos to download. 

In [None]:
# Create the query to identify the videos that we need to parse. This is an anti-join: keep each
# temporary URL that has no matching row in `video_metadata`.
actual_videos_to_parse_query = f"""
SELECT
  temp_urls.url
FROM
  `{temporary_table_name}` temp_urls
LEFT JOIN
  `{GBQ_DATASET_ID}.video_metadata` metadata
ON
  metadata.url = temp_urls.url
WHERE
  metadata.id IS NULL
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
actual_videos_to_parse_df = pandas_gbq.read_gbq(
    actual_videos_to_parse_query, project_id=GBQ_PROJECT_ID
)

Finally, some cleanup: overwriting `video_urls_to_parse` with the contents of `actual_videos_to_parse_df`, and deleting the temporary table. 

In [None]:
# Report how many videos are left once the already-parsed ones have been filtered out
n_videos_left_to_parse = len(actual_videos_to_parse_df)
print(f"After filtering out videos that have already been parsed, we have {n_videos_left_to_parse} videos to parse.")

# From here on, `video_urls_to_parse` only holds the URLs that still need parsing
video_urls_to_parse = actual_videos_to_parse_df["url"].tolist()

# The temporary table name has three dot-separated parts; split them apart to delete the table
(
    temp_table_project_id,
    temp_table_dataset_id,
    temp_table_name,
) = temporary_table_name.split(".")
gbq_utils.delete_table(
    table_id=temp_table_name,
    dataset_id=temp_table_dataset_id,
    project_id=temp_table_project_id,
)

# Downloading Video Metadata
Below, I'm going to define a method that'll download a video's metadata. 

In [None]:
def parse_metadata_from_video(video_url):
    """
    This method will parse a dictionary containing metadata from a video, given its URL.

    Args:
        video_url: The URL of the YouTube video to parse.

    Returns:
        A dictionary with the keys `id`, `title`, `length`, `channel_id`, `channel_name`,
        `short_description`, `view_ct`, `url`, `small_thumbnail_url`, `large_thumbnail_url`,
        `publish_date`, `description` and `scrape_date`. `url` is the `video_url` that was passed in.
        The thumbnail URLs, `publish_date` and `description` are None if they can't be extracted.

    Raises:
        Exception: If the video's info can't be fetched, or it has no `videoDetails` section.
    """

    # Create a video object
    video = YouTube(video_url)

    # We'll wrap this in a try/except block so that we can catch any errors that occur
    try:
        # Parse the `videoDetails` from the video; this contains a lot of the metadata we're interested in
        video_info_dict = video.vid_info.get("videoDetails")

    # If we run into an Exception this early on, we'll raise an Exception (chained to the original one)
    except Exception as e:
        raise Exception(
            f"Error parsing video metadata for video {video_url}: '{e}'\nTraceback is as follows:\n{traceback.format_exc()}"
        ) from e

    # Without `videoDetails` there's nothing to extract; fail with a clear message instead of an
    # AttributeError on the first `.get` below
    if not video_info_dict:
        raise Exception(
            f"Error parsing video metadata for video {video_url}: no 'videoDetails' found in the video info"
        )

    # The thumbnails are listed under thumbnail -> thumbnails; we take the first and last entries.
    # Guard against either level being missing or empty.
    thumbnails = (video_info_dict.get("thumbnail") or {}).get("thumbnails") or []

    # Extract different pieces of the video metadata
    video_metadata_dict = {
        "id": video_info_dict.get("videoId"),
        "title": video_info_dict.get("title"),
        "length": video_info_dict.get("lengthSeconds"),
        "channel_id": video_info_dict.get("channelId"),
        "channel_name": video_info_dict.get("author"),
        "short_description": video_info_dict.get("shortDescription"),
        "view_ct": video_info_dict.get("viewCount"),
        # The url is always the one we were asked to parse
        "url": video_url,
        "small_thumbnail_url": thumbnails[0].get("url") if thumbnails else None,
        "large_thumbnail_url": thumbnails[-1].get("url") if thumbnails else None,
    }

    # Try and extract the publish_date; fall back to None if that fails
    try:
        video_metadata_dict["publish_date"] = video.publish_date
    except Exception:
        video_metadata_dict["publish_date"] = None

    # Try and extract the full description; fall back to None if that fails
    try:
        video_metadata_dict["description"] = video.description
    except Exception:
        video_metadata_dict["description"] = None

    # Use datetime to get the scrape_date (the current datetime)
    video_metadata_dict["scrape_date"] = datetime.datetime.now()

    # Finally, return the video metadata dictionary
    return video_metadata_dict

Now: we'll need to iterate through each of the videos and download their metadata. 

In [None]:
# Parameterize the video metadata parsing: each pause lasts between `time_to_sleep_between_parsing`
# seconds and that amount plus `sleep_randomization_factor` times as much again
time_to_sleep_between_parsing = 5
sleep_randomization_factor = 3.5

# We'll iterate through each of the videos in the list and parse their metadata
video_metadata_dicts_by_video_url = {}
for video_url in tqdm(video_urls_to_parse, disable=not tqdm_enabled):

    # We'll wrap this in a try/except block so that one failing video doesn't stop the rest
    try:
        # Parse the metadata from the video and store it under its URL
        video_metadata_dicts_by_video_url[video_url] = parse_metadata_from_video(video_url)

    # If we run into an Exception, then we'll print out the traceback and move on
    except Exception:
        traceback.print_exc()

    # Sleep for a random amount of time. This happens whether or not the parse succeeded: a failed
    # attempt has still been made, so it shouldn't let the next one start immediately.
    time_to_sleep = random.uniform(
        time_to_sleep_between_parsing,
        time_to_sleep_between_parsing + (sleep_randomization_factor * time_to_sleep_between_parsing),
    )
    time.sleep(time_to_sleep)

## Storing the Metadata
Now that I've downloaded some metadata about different videos, I need to store it. 

In [None]:
# One row per successfully parsed video: the metadata dictionaries collected above
rows_to_add = list(video_metadata_dicts_by_video_url.values())

# Append those rows to the `video_metadata` table
gbq_utils.add_rows_to_table(
    rows=rows_to_add,
    table_id="video_metadata",
    dataset_id=GBQ_DATASET_ID,
    project_id=GBQ_PROJECT_ID,
)

# Downloading Video Audio
Next up, I need to download some video audio. This one probably needs to go a lot slower than the metadata fetching 😅

### Determining Audio to Download
I need to check with GBQ to see if there are any videos that I need to download. 

In [None]:
# Parameterize the query: the maximum number of videos to pick up in one run
n_max_video_urls = 50

# The query below will determine which videos we need to download audio for: videos in
# `video_metadata` that have no matching row in `audio`
videos_for_audio_parsing_query = f"""
SELECT
  video.url
FROM
  `{GBQ_DATASET_ID}.video_metadata` video
LEFT JOIN
  `{GBQ_DATASET_ID}.audio` audio
ON
  audio.video_url = video.url
WHERE
  audio.audio_gcs_uri IS NULL
LIMIT {n_max_video_urls}
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
videos_for_audio_parsing_df = pandas_gbq.read_gbq(
    videos_for_audio_parsing_query, project_id=GBQ_PROJECT_ID
)

### Downloading Audio
Next, I'm going to use `pytube` to download the audio of these videos.

In [None]:
# Parameterize the download: each pause lasts between `time_to_sleep_between_downloads` seconds and
# that amount plus `sleep_randomization_factor` times as much again
time_to_sleep_between_downloads = 25
sleep_randomization_factor = 3.5
download_directory = Path("temp_data/")

# Make sure the download directory exists before anything gets written into it
download_directory.mkdir(parents=True, exist_ok=True)

# Iterate through the videos and download their audio
for video_url in tqdm(videos_for_audio_parsing_df["url"], disable=not tqdm_enabled):
    # We'll wrap this in a try/except block so that one failing download doesn't stop the rest
    try:
        # Download the audio from the video
        youtube_utils.download_audio_from_video(
            video_url=video_url, data_folder_path=download_directory
        )

    # If we run into an Exception, then we'll print out the traceback and move on
    except Exception:
        traceback.print_exc()

    # Pause between downloads, whether or not this one succeeded. The sleep parameters above were
    # defined for exactly this, but the loop never used them.
    time_to_sleep = random.uniform(
        time_to_sleep_between_downloads,
        time_to_sleep_between_downloads + (sleep_randomization_factor * time_to_sleep_between_downloads),
    )
    time.sleep(time_to_sleep)

### Uploading Audio to GCS
Now that I've downloaded the audio, I need to upload it to GCS. 

I'll start by creating the bucket if it doesn't exist:

In [None]:
# Make sure that the neural-needledrop-audio bucket exists.
# NOTE(review): `delete_if_exists=False` presumably leaves an existing bucket untouched -- confirm in utils.gcs
gcs_utils.create_bucket(
    "neural-needledrop-audio", project_id=GBQ_PROJECT_ID, delete_if_exists=False
)

Next: upload all of the audio. 

In [None]:
# Parameterize the audio upload process
delete_files_after_upload = True

# Name of the GCS bucket that the audio files are uploaded to
audio_bucket_name = "neural-needledrop-audio"

# Iterate through all of the video urls in the videos_for_audio_parsing_df
# (materialized into a list so that tqdm knows the total)
for row in tqdm(
    list(videos_for_audio_parsing_df.itertuples()), disable=not tqdm_enabled
):
    # We'll wrap this in a try/except block so that one bad file doesn't stop the rest
    try:
        # Get the video url
        video_url = row.url

        # Get the video id (everything after "watch?v=" in the url)
        video_id = video_url.split("watch?v=")[-1]

        # Get the path to the downloaded .m4a file
        m4a_file_path = download_directory / f"{video_id}.m4a"

        # If the file doesn't exist (e.g. the download failed), print out a warning and skip it
        if not m4a_file_path.exists():
            print(f"Warning: {m4a_file_path} does not exist. Skipping...")
            continue

        # Convert the audio file to .mp3 using youtube_utils. `with_suffix` swaps only the file
        # extension, unlike a string replace of ".m4a" anywhere in the path.
        mp3_file_path = m4a_file_path.with_suffix(".mp3")
        youtube_utils.convert_m4a_to_mp3(
            input_file_path=str(m4a_file_path),
            output_file_path=str(mp3_file_path),
        )

        # Remove the .m4a file now that we have the .mp3
        m4a_file_path.unlink()

        # Upload the .mp3 file to GCS
        gcs_utils.upload_file_to_bucket(
            file_path=str(mp3_file_path),
            bucket_name=audio_bucket_name,
            project_id=GBQ_PROJECT_ID,
        )

        # Create a dictionary to store the audio metadata
        audio_metadata_dict = {
            "video_url": video_url,
            "audio_gcs_uri": f"gs://{audio_bucket_name}/{mp3_file_path.name}",
            "scrape_date": datetime.datetime.now(),
        }

        # Add the audio metadata to the table
        try:
            gbq_utils.add_rows_to_table(
                project_id=GBQ_PROJECT_ID,
                dataset_id=GBQ_DATASET_ID,
                table_id="audio",
                rows=[audio_metadata_dict],
            )

        # If the `audio` table doesn't exist yet, create it and retry the insert once
        except NotFound:
            gbq_utils.generate_audio_table(
                project_id=GBQ_PROJECT_ID,
                dataset_id=GBQ_DATASET_ID,
                delete_if_exists=False,
            )
            gbq_utils.add_rows_to_table(
                project_id=GBQ_PROJECT_ID,
                dataset_id=GBQ_DATASET_ID,
                table_id="audio",
                rows=[audio_metadata_dict],
            )

        # Delete the audio file if delete_files_after_upload
        if delete_files_after_upload:
            mp3_file_path.unlink()

    # If we run into an Exception, then we'll print out the traceback and move on
    except Exception:
        traceback.print_exc()

# If we're deleting the files after upload, then we'll delete the download_directory as well.
# `rmdir` only removes an empty directory and raises an OSError otherwise (for example when a failed
# upload left a file behind) or when the directory is missing; neither should crash the cell.
if delete_files_after_upload:
    try:
        Path(download_directory).rmdir()
    except OSError:
        print(f"Warning: could not remove {download_directory} (missing or not empty). Leaving it in place.")

Now, a super quick fix that I ought to handle: I'm going to deduplicate the `backend_data.audio` table.

In [None]:
# This query will deduplicate the audio table. It rebuilds the table in place, keeping only one row
# per video_url: the one with the latest scrape_date (row_num = 1 when ordered by scrape_date DESC).
deduplicate_audio_table_query = f"""
CREATE OR REPLACE TABLE `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.audio` AS (
    SELECT
        video_url,
        audio_gcs_uri,
        scrape_date
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (PARTITION BY video_url ORDER BY scrape_date DESC) AS row_num
        FROM
            `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.audio`
    ) ordered_table
    WHERE
        ordered_table.row_num = 1
)
"""

# Execute the query. The returned DataFrame isn't used; the statement is run for its effect on the table.
pandas_gbq.read_gbq(deduplicate_audio_table_query, project_id=GBQ_PROJECT_ID)

# Transcribing Audio with Whisper
Now that I've downloaded some audio, I need to figure out what needs to be transcribed. I can do that by checking the `audio` and `transcriptions` tables. 

In [None]:
# Parameterize the transcription: the maximum number of videos to pick up in one run
n_max_to_transcribe = 3

# This query will determine all of the videos we need to transcribe: rows in `audio` that have no
# matching row in `transcriptions`.
# The column in the `audio` table is `audio_gcs_uri` (that's the name the upload and deduplication
# cells use). It's aliased to `audio_gcr_uri` here because the download cell reads `row.audio_gcr_uri`.
videos_for_transcription_query = f"""
SELECT
  DISTINCT(audio.video_url) AS url,
  audio.audio_gcs_uri AS audio_gcr_uri
FROM
  `{GBQ_DATASET_ID}.audio` audio 
LEFT JOIN
  `{GBQ_DATASET_ID}.transcriptions` transcript
ON
  audio.video_url = transcript.url
WHERE
  transcript.created_at IS NULL
LIMIT {n_max_to_transcribe}
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
videos_for_transcription_df = pandas_gbq.read_gbq(
    videos_for_transcription_query, project_id=GBQ_PROJECT_ID
)

Now, with all of the audio specified, we need to try and download it from `GCS`. 

In [None]:
# Walk over every row of videos_for_transcription_df and pull its audio file down from GCS
# (the rows are materialized into a list so that tqdm knows the total)
audio_rows = list(videos_for_transcription_df.itertuples())
for row in tqdm(audio_rows, disable=not tqdm_enabled):
    # Strip the "gs://" scheme, then split at the first "/": the bucket name comes before it and
    # the file name (which may itself contain slashes) comes after it
    split_gcs_uri = row.audio_gcr_uri.split("gs://")[-1]
    bucket_name, _separator, file_name = split_gcs_uri.partition("/")

    # Download the audio into the temp_data/ folder
    gcs_utils.download_file_from_bucket(
        project_id=GBQ_PROJECT_ID,
        bucket_name=bucket_name,
        file_name=file_name,
        destination_folder="temp_data/",
    )

In [None]:
# We'll store the whisper output in a dictionary, keyed by video url
audio_metadata_dict_by_video_url = {}

# Iterate through each of the files in the `temp_data` directory and transcribe them
for child_file in tqdm(list(Path("temp_data/").iterdir()), disable=not tqdm_enabled):
    # Only .mp3 files are transcribed
    if child_file.suffix != ".mp3":
        continue

    # The file stem is the video id, which lets us rebuild the video url. This is done before the
    # `try` so the error message below always refers to the file that is actually being processed.
    video_url = f"https://www.youtube.com/watch?v={child_file.stem}"

    try:
        # Use whisper to transcribe the audio, and store the result under the video url
        audio_metadata_dict_by_video_url[video_url] = whisper_model.transcribe(
            str(child_file), fp16=False
        )

    # A failed transcription stops the loop; re-raise with context, chained to the original error
    except Exception as e:
        raise Exception(
            f"Error transcribing audio file {child_file} for video {video_url}: '{e}'\nTraceback is as follows:\n{traceback.format_exc()}"
        ) from e

### Uploading Transcription to GBQ
Now that I've transcribed all of these videos, I'm going to upload the transcriptions to GBQ. 

First, I'll transform the DataFrame to add some needed data:

In [None]:
# Create a DataFrame from the audio_metadata_dict_by_video_url (one row per video url)
audio_metadata_df = pd.DataFrame.from_dict(
    audio_metadata_dict_by_video_url, orient="index"
)

# Reset the index into a "url" column
audio_metadata_df.reset_index(inplace=True, names=["url"])

# Explode the "segments" column, so that there's one row per segment
audio_metadata_df = audio_metadata_df.explode("segments")

# Rename the "segments" column to "segment" in the audio_metadata_df
audio_metadata_df = audio_metadata_df.rename(columns={"segments": "segment"})

# A transcription with an empty list of segments explodes into a single row whose segment is NaN.
# There's nothing to upload for those rows, and they'd break the `.get` call below, so drop them.
audio_metadata_df = audio_metadata_df.dropna(subset=["segment"]).reset_index(drop=True)

# Add a "created_at" column to the audio_metadata_df
audio_metadata_df["created_at"] = datetime.datetime.now()

# Alter the "text" column so that it holds the segment's text rather than the whole transcription
audio_metadata_df["text"] = audio_metadata_df["segment"].apply(
    lambda x: x.get("text", None)
)

# Add a "segment_type" column to the audio_metadata_df
audio_metadata_df["segment_type"] = "small_segment"

# We're going to extract some columns from the `segment` dictionary.
# `reindex` selects the same columns as plain indexing, but also copes with an empty DataFrame.
segment_columns_to_keep = ["id", "seek", "start", "end"]
normalized_segments_df = pd.json_normalize(audio_metadata_df["segment"].tolist())
normalized_segments_df = normalized_segments_df.reindex(columns=segment_columns_to_keep)

# Rename all of the columns so that they have "segment_" prepended to them
normalized_segments_df = normalized_segments_df.rename(
    columns={col: f"segment_{col}" for col in normalized_segments_df.columns}
)

# Make the final_transcription_df by putting the two DataFrames side by side; both indexes are
# reset first so that the rows line up by position
final_transcription_df = pd.concat(
    [
        audio_metadata_df.drop(columns=["segment"]).reset_index(drop=True),
        normalized_segments_df.reset_index(drop=True),
    ],
    axis=1,
).copy()

Then, I'll make a temporary table. This will help me ensure that I'm not re-uploading any transcriptions. 

In [None]:
# Define the name of the table we're going to create (the cleanup cell deletes the table by this name)
table_name = "temp_transcriptions"

# Create the table from the final_transcription_df, replacing any leftover table with the same name
gbq_utils.create_temporary_table_in_gbq(
    dataframe=final_transcription_df,
    project_id=GBQ_PROJECT_ID,
    dataset_name=GBQ_DATASET_ID,
    table_name=table_name,
    if_exists="replace"
)

Now, with this temporary table in hand, I'm going to try and identify the videos whose transcriptions haven't been added yet. 

In [None]:
# The following query will determine which transcripts we need to upload: the urls in the temporary
# table that have no matching row in `transcriptions`. The table names are built from
# `GBQ_DATASET_ID` and `table_name` so they match the tables that were created and will be deleted.
transcripts_to_upload_query = f"""
SELECT
  DISTINCT(temp_transcript.url)
FROM
  `{GBQ_DATASET_ID}.{table_name}` temp_transcript
LEFT JOIN
  `{GBQ_DATASET_ID}.transcriptions` transcript
ON
  transcript.url = temp_transcript.url
WHERE
  transcript.created_at IS NULL
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
transcripts_to_upload_df = pandas_gbq.read_gbq(
    transcripts_to_upload_query, project_id=GBQ_PROJECT_ID
)

Now that we've cross-referenced with the table, let's upload them. 

In [None]:
# Create a DataFrame containing the transcripts that we need to upload: the inner merge keeps only
# the rows whose url came back from the cross-reference query
final_transcriptions_to_upload_df = final_transcription_df.merge(
    transcripts_to_upload_df, on="url"
)

# If every transcription is already in the table, there's nothing to do
if final_transcriptions_to_upload_df.empty:
    print("No new transcriptions to upload.")

# Otherwise, use the gbq_utils to add rows to the `backend_data.transcriptions` table. We upload the
# *filtered* DataFrame; uploading `final_transcription_df` would re-upload rows that already exist.
else:
    gbq_utils.add_rows_to_table(
        project_id=GBQ_PROJECT_ID,
        dataset_id=GBQ_DATASET_ID,
        table_id="transcriptions",
        rows=final_transcriptions_to_upload_df.to_dict(orient="records"),
    )

Finally, we'll delete the `temp_transcriptions` table and the `temp_data` directory. 

In [None]:
# Drop the temporary transcription table from GBQ
gbq_utils.delete_table(
    table_id=table_name,
    dataset_id=GBQ_DATASET_ID,
    project_id=GBQ_PROJECT_ID,
)

# Empty the temp_data directory file by file, then remove the directory itself
temp_data_directory = Path("temp_data/")
for child_file in temp_data_directory.iterdir():
    child_file.unlink()
temp_data_directory.rmdir()

# Enriching Video Metadata
The next part of the pipeline involves enriching the video data. 

I'll start by determining which videos need their metadata enriched: 

In [None]:
# The query below will define the videos that we need to enrich: every video in `video_metadata`
# whose url doesn't appear in `enriched_video_metadata` yet
videos_to_enrich_query = f"""
SELECT
  metadata.id,
  metadata.url,
  metadata.title,
  metadata.description
FROM
  `{GBQ_DATASET_ID}.video_metadata` metadata
WHERE NOT EXISTS (
  SELECT 1
  FROM `{GBQ_DATASET_ID}.enriched_video_metadata` enriched_metadata
  WHERE enriched_metadata.url = metadata.url
)
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
videos_to_enrich_df = pandas_gbq.read_gbq(
    videos_to_enrich_query, project_id=GBQ_PROJECT_ID
)

Next, we're going to enrich the video metadata. Right now, this is a pretty simple exercise. There are only two new fields we're looking to add: 

- `video_type`
- `review_score`

In [None]:
# Derive the video type from each video's title
video_titles = videos_to_enrich_df["title"]
videos_to_enrich_df["video_type"] = video_titles.apply(
    enrichment_utils.classify_video_type
)

# Derive the review score from each video's description
video_descriptions = videos_to_enrich_df["description"]
videos_to_enrich_df["review_score"] = video_descriptions.apply(
    enrichment_utils.extract_review_score
)

Now that we've enriched this video metadata, we can upload it to the table. 

In [None]:
# Build the DataFrame to upload: everything in videos_to_enrich_df except the title and description
# columns (`drop` returns a new DataFrame, so videos_to_enrich_df itself is left untouched)
enriched_metadata_to_upload_df = videos_to_enrich_df.drop(
    columns=["title", "description"]
)

# Swap NaN review scores for None
review_scores = enriched_metadata_to_upload_df["review_score"]
enriched_metadata_to_upload_df["review_score"] = review_scores.replace({np.nan: None})

# Turn the DataFrame into a list of row dictionaries and append them to the
# `backend_data.enriched_video_metadata` table
enriched_metadata_rows = enriched_metadata_to_upload_df.to_dict(orient="records")
gbq_utils.add_rows_to_table(
    rows=enriched_metadata_rows,
    table_id="enriched_video_metadata",
    dataset_id=GBQ_DATASET_ID,
    project_id=GBQ_PROJECT_ID,
)

# Embedding Transcriptions
Next up: we're going to embed some of the different audio transcriptions we've got. 

I'll start by determining which pieces of text we need to embed. This will involve checking the `transcriptions` table for video URLs that aren't represented within the `embeddings` table. 

In [36]:
# The query below will determine which transcriptions we need to embed: transcription rows whose
# url has no matching `video_url` in the `embeddings` table
transcriptions_to_embed_query = f"""
SELECT
  transcript.*
FROM
  `{GBQ_DATASET_ID}.transcriptions` transcript
LEFT JOIN
  `{GBQ_DATASET_ID}.embeddings` embedding
ON
  embedding.video_url = transcript.url
WHERE
  embedding.id IS NULL
"""

# Execute the query with pandas-gbq directly (`pd.read_gbq` is deprecated in favor of `pandas_gbq.read_gbq`)
transcriptions_to_embed_df = pandas_gbq.read_gbq(
    transcriptions_to_embed_query, project_id=GBQ_PROJECT_ID
)

Next up: we need to create chunks of segments. There's a **whole** ton of literature on different chunking methods ([see here](https://www.pinecone.io/learn/chunking-strategies/)), but for right now, I'm going to basically ignore all of that in favor of making some fixed-size chunks. 

I can always come back later to re-evaluate the chunking situation. For now, I'm going to try and embed both 4-segment and 8-segment chunks. In the future, I'll revisit this to try and make smarter decisions about it. 

In [58]:
# Define the segment chunk sizes (how many consecutive segments go into one chunk)
segment_chunk_sizes = [4, 8]

# Initialize a list to hold the segment_chunk rows
segment_chunks = []

# Iterate through the transcription rows one URL at a time. `groupby` splits the DataFrame in a single
# pass, and `sort=False` keeps the URLs in the order they first appear.
for url, url_df in transcriptions_to_embed_df.groupby("url", sort=False):
    # Extract the video ID from the URL (everything after "watch?v=")
    video_id = url.split("watch?v=")[-1]

    # Sort the segments by the segment_start
    url_df = url_df.sort_values(by=["segment_start"])

    # Iterate through the defined chunk sizes
    for chunk_size in segment_chunk_sizes:
        # Iterate through the segments in steps of chunk_size
        for i in range(0, len(url_df), chunk_size):
            # Get the chunk of segments (the last chunk of a video may hold fewer than chunk_size)
            segment_chunk_df = url_df.iloc[i : i + chunk_size]

            # Concatenate the text of the segments to form the chunk text
            chunk_text = " ".join(segment_chunk_df["text"].tolist())

            # Determine a new ID for this particular segment chunk
            segment_chunk_id = f"{video_id}_{i}_{i+chunk_size}"

            # Create a dictionary for the segment_chunk row
            segment_chunk_row = {
                "id": segment_chunk_id,
                # Store the full URL, not the bare video ID: the query that decides what still
                # needs embedding joins `embedding.video_url` against `transcript.url`
                "video_url": url,
                "embedding_type": "segment_chunk",
                "start_segment": i,
                # NOTE: end_segment and segment_length are the nominal values, even when the last
                # chunk is shorter; the chunk ID is built from the same numbers
                "end_segment": i + chunk_size,
                "segment_length": chunk_size,
                "text": chunk_text.strip(),
            }

            # Append the segment_chunk_row to the list of segment_chunks
            segment_chunks.append(segment_chunk_row)

# Create a DataFrame that has the segment_chunks
segment_chunks_df = pd.DataFrame(segment_chunks)

Now that we have all of these chunks, we're going to embed them. 

In [65]:
# Pull the chunk texts out of the DataFrame and embed them all in one call
chunk_texts = segment_chunks_df["text"].tolist()
embeddings_col = openai_utils.embed_text_list(chunk_texts)

# Store the embeddings next to the chunks they belong to
segment_chunks_df["embedding"] = embeddings_col

Embedding Texts: 100%|██████████| 139/139 [00:01<00:00, 78.87it/s] 


Now that I've got all of the embeddings, I'll upload them to GCS. I'll check if the bucket exists first:

In [69]:
# Create the embeddings bucket if it doesn't exist. This is the bucket that the `.npy` files are
# uploaded to further down.
gcs_utils.create_bucket(
    bucket_name="neural-needledrop-embeddings", project_id=GBQ_PROJECT_ID
)

Bucket neural-needledrop.neural-needledrop-embeddings created


Next, I'll try to save all of the embeddings as temporary `.npy` files. This will help me upload them to GCS: 

In [110]:
# Make sure the temp_embeddings folder is there
temp_folder_name = "temp_embeddings"
Path(temp_folder_name).mkdir(exist_ok=True, parents=True)

# Save each chunk's embedding to its own .npy file, named after the chunk id, and remember the
# file paths for the upload step
embedding_file_paths = []
chunk_ids_and_embeddings = zip(segment_chunks_df["id"], segment_chunks_df["embedding"])
for chunk_id, chunk_embedding in chunk_ids_and_embeddings:
    emb_filename = f"{temp_folder_name}/{chunk_id}.npy"
    openai_utils.save_as_npy(embedding=chunk_embedding, file_name=emb_filename)
    embedding_file_paths.append(emb_filename)

Finally, we're going to upload all of the `.npy` files to the GCS bucket, and then delete the files:

In [111]:
# One GCS client is created up front and handed to every upload call
gcs_client = storage.Client(project=GBQ_PROJECT_ID)

for file_path in tqdm(embedding_file_paths, disable=not tqdm_enabled):
    local_embedding_file = Path(file_path)

    # Push the .npy file to the embeddings bucket
    gcs_utils.upload_file_to_bucket(
        gcs_client=gcs_client,
        project_id=GBQ_PROJECT_ID,
        bucket_name="neural-needledrop-embeddings",
        file_path=file_path,
    )

    # The local copy is no longer needed once it has been uploaded
    local_embedding_file.unlink()

# Every file has been removed at this point, so the temp_embeddings folder can go too
Path(temp_folder_name).rmdir()

100%|██████████| 139/139 [00:24<00:00,  5.77it/s]


Finally: now that we've uploaded all of the embeddings to GCS, we can add some rows to the `embeddings` table that contain information about these. 

In [114]:
# Build one row per segment chunk. Each row carries the chunk's bookkeeping columns plus the GCS
# location of its .npy file; the embedding vector itself isn't put into the table.
rows_to_upload = [
    {
        "id": row.id,
        "video_url": row.video_url,
        "gcs_uri": f"gs://neural-needledrop-embeddings/{row.id}.npy",
        "embedding_type": row.embedding_type,
        "start_segment": row.start_segment,
        "end_segment": row.end_segment,
        "segment_length": row.segment_length,
    }
    for row in segment_chunks_df.itertuples()
]

# Append the rows to the `backend_data.embeddings` table
gbq_utils.add_rows_to_table(
    rows=rows_to_upload,
    table_id="embeddings",
    dataset_id=GBQ_DATASET_ID,
    project_id=GBQ_PROJECT_ID,
)

Loaded 139 row(s) into backend_data:embeddings.
