# **Uploading Data to Postgres Tables**
Now that I've spent some time initializing the tables, I can upload additional data to the database. 

# Setup
The cells below will set up the rest of the notebook.

I'll start by configuring the kernel: 

In [None]:
# Change the working directory to the parent of the current one.
# NOTE: `%cd ..` is relative, so re-running this cell without restarting the
# kernel moves up one more level each time -- run it once per kernel.
# (Presumably this makes the project-level `utils` package importable from the
# notebook's folder -- TODO confirm against the repo layout.)
%cd ..

# Enable the autoreload extension, which will automatically load in new code as it's written
%load_ext autoreload
%autoreload 2

# Set up some envvars. These are set before the project modules are imported
# in the next cell; presumably `utils.settings` reads them at import time --
# TODO confirm.
%env LOG_TO_CONSOLE=True
%env LOG_LEVEL=INFO
%env TQDM_ENABLED=True

Now I'll import some necessary modules:

In [None]:
# General import statements
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
from google.cloud import storage
from pandas_gbq import read_gbq
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker, declarative_base
from tqdm import tqdm

# Importing modules custom-built for this project
from utils.settings import (
    GBQ_PROJECT_ID,
    GBQ_DATASET_ID,
    POSTGRES_USER,
    POSTGRES_PASSWORD,
    POSTGRES_HOST,
    POSTGRES_PORT,
    POSTGRES_DB,
    LOG_TO_CONSOLE,
)
from utils.logging import get_logger
from utils.gcs import download_file_from_bucket
from utils.postgres import query_postgres, upload_to_table

# Notebook-wide logger; whether it echoes to the console is controlled by the
# LOG_TO_CONSOLE setting imported from utils.settings
logger = get_logger(
    "postgres_notebook",
    log_to_console=LOG_TO_CONSOLE,
)

Finally, we're going to set up the Postgres engine via SQLAlchemy!

In [None]:
# Percent-encode the credentials before embedding them in the URL. Without
# this, a user name or password containing characters such as "@", ":", "/"
# or "%" would be mistaken for URL delimiters when SQLAlchemy parses the string.
# NOTE(review): this assumes the settings hold the raw (not already
# URL-encoded) credentials -- confirm against utils.settings.
encoded_postgres_user = quote(str(POSTGRES_USER), safe="")
encoded_postgres_password = quote(str(POSTGRES_PASSWORD), safe="")

# Create the connection string to the database
postgres_connection_string = (
    f"postgresql://{encoded_postgres_user}:{encoded_postgres_password}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)

# Create the connection engine, plus the metadata / session / declarative base
# objects that go along with it
engine = create_engine(postgres_connection_string)
metadata = MetaData()
session = sessionmaker(bind=engine)()
Base = declarative_base()

With the engine, session, and declarative base created, the notebook is ready to talk to Postgres.

# **Downloading Data from GBQ**

Before I do anything with `postgres`, I'm just going to download all of the data from GBQ. This will save me some time now (since I can more easily check the Postgres table to understand which data to upload), but I should probably change this in the future to optimize performance & speed. 

### **`video_metadata`**
The first table is the `video_metadata` table. I'll download that (and some columns from the `enriched_video_metadata` table) for all of the videos that I've transcribed and embedded. 

In [None]:
# Define a query that'll grab all of the video metadata from the GBQ database.
# Only videos whose URL appears in BOTH the `transcriptions` and `embeddings`
# tables are kept, and two columns from `enriched_video_metadata` are joined on
# by video ID.
video_metadata_query = f"""
-- This query will select metadata for all of the videos that have transcriptions & embeddings
SELECT
  video.*,
  enriched_video.video_type,
  enriched_video.review_score
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.video_metadata` video
JOIN
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.enriched_video_metadata` enriched_video
ON
  video.id = enriched_video.id
WHERE
  video.url IN (SELECT DISTINCT(url) FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions`)
  AND
  video.url IN (SELECT DISTINCT(video_url) AS url FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings`)
"""

# Execute the above query. The project is passed explicitly (matching how the
# GCS client is configured later on) so the query doesn't depend on whichever
# default project the local credentials happen to carry.
video_metadata_df = read_gbq(video_metadata_query, project_id=GBQ_PROJECT_ID)

### **`transcriptions`**
Next, I'm going to download all of the transcriptions for the videos I'd identified above. 

In [None]:
# Declare the query that will download all of the relevant rows from the
# transcription table. The inner SELECT picks out the videos that have both
# transcriptions and embeddings; joining against it restricts the
# transcription rows to those videos.
# NOTE: the inner `video_metadata` reference uses the configured project and
# dataset (it was previously hard-coded to one specific project/dataset, which
# would silently query the wrong place if the settings ever changed).
transcriptions_query = f"""
SELECT 
  transcription.url,
  transcription.text,
  transcription.segment_id,
  transcription.segment_seek,
  transcription.segment_start,
  transcription.segment_end
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions` transcription
JOIN (
  SELECT
    video.url
  FROM
    `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.video_metadata` video
  WHERE
    video.url IN (SELECT DISTINCT(url) FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions`)
    AND
    video.url IN (SELECT DISTINCT(video_url) AS url FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings`)
) video
ON
  video.url = transcription.url
"""

# Execute the above query, passing the project explicitly so the call doesn't
# depend on the default project of the local credentials
transcriptions_df = read_gbq(transcriptions_query, project_id=GBQ_PROJECT_ID)

### **`embeddings`**
Next up, the `embeddings` table! This one will require a *little* more setup, since we'll need to separately download the embeddings themselves from GCS. (My GBQ dataset only has pointers to each of the GCS URLs.)

I'll start by downloading all of the GBQ data:

In [None]:
# Declare the query that will download the `embeddings` table. As with the
# transcriptions, rows are restricted to videos that have both transcriptions
# and embeddings. Grouping by every selected column collapses exact duplicate
# rows in the result.
embeddings_query = f"""
SELECT 
  embedding.id,
  embedding.video_url AS url,
  embedding.embedding_type,
  embedding.start_segment,
  embedding.end_segment,
  embedding.segment_length,
  embedding.gcs_uri
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings` embedding
JOIN (
  SELECT
    video.url
  FROM
    `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.video_metadata` video
  WHERE
    video.url IN (SELECT DISTINCT(url) FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions`)
    AND
    video.url IN (SELECT DISTINCT(video_url) AS url FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings`)
) video
ON
  video.url = embedding.video_url
GROUP BY
  embedding.id,
  embedding.video_url,
  embedding.embedding_type,
  embedding.start_segment,
  embedding.end_segment,
  embedding.segment_length,
  embedding.gcs_uri
"""

# Execute the above query. This downloads ALL of the embedding rows; for a
# quick trial run, append `.head(1000)` to the result to limit how many
# embedding files get downloaded further down.
embeddings_df = read_gbq(embeddings_query, project_id=GBQ_PROJECT_ID)

Now that I've got all of the embeddings metadata, I can download the embeddings themselves. I'll start by creating a temporary directory to store embeddings:

In [None]:
# Scratch folder that will hold the downloaded embedding files; it's created
# (along with any missing parents) if it isn't there yet
temp_emb_directory_path = Path("temp_embeddings")
temp_emb_directory_path.mkdir(parents=True, exist_ok=True)

# Start from a clean slate by clearing out anything left over in the folder
for leftover_file in temp_emb_directory_path.glob("*"):
    leftover_file.unlink()

# GCS client bound to the same project the GBQ data lives in
gcs_client = storage.Client(project=GBQ_PROJECT_ID)


Now, I'll iterate through each of the GCS URIs and download them. 

In [None]:
# Prepare the list of GCS URIs
gcs_uris = embeddings_df["gcs_uri"].unique()


def download_embedding(idx_and_uri):
    """
    Download a single embedding file from GCS into the temporary directory.

    Failures are logged rather than raised, so that one bad URI doesn't abort
    the whole batch.

    Args:
        idx_and_uri: An ``(index, gcs_uri)`` pair, as produced by
            ``enumerate``. Only the URI is used.

    Returns:
        True if the download call finished without raising, False otherwise.
    """
    _, gcs_uri = idx_and_uri
    try:
        # Strip the scheme, then split on the first "/": the part before it is
        # the bucket name, and everything after it is the object name
        bucket_name, _, file_name = gcs_uri.split("gs://")[-1].partition("/")

        # Download the embedding corresponding with this GCS URI
        download_file_from_bucket(
            bucket_name=bucket_name,
            file_name=file_name,
            destination_folder=str(temp_emb_directory_path) + "/",
            project_id=GBQ_PROJECT_ID,
            gcs_client=gcs_client,
            logger=logger,
        )
    except Exception as e:
        # This covers both malformed URIs and failed downloads, so the message
        # names the URI instead of guessing at the cause
        logger.error(f"Error downloading embedding from {gcs_uri!r}: {e}")
        return False
    return True


# Use ThreadPoolExecutor to parallelize the download process
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(download_embedding, idx_uri): idx_uri
        for idx_uri in enumerate(gcs_uris)
    }

    # Collect the URIs whose download reported a failure
    failed_gcs_uris = [
        futures[future][1]
        for future in tqdm(as_completed(futures), total=len(futures))
        if not future.result()
    ]

# Summarize the failures, so they aren't lost among the per-file log lines
if failed_gcs_uris:
    logger.warning(
        f"Failed to download {len(failed_gcs_uris)} of {len(gcs_uris)} embeddings."
    )

Next, I'll load all of the embeddings into RAM. 

In [None]:
# We're going to store the embeddings in a dictionary, where the key is the
# embedding ID (taken from the file name, minus its extension) and the value
# is the list representation of the embedding
embeddings = {}
n_empty_embeddings = 0
n_failed_embeddings = 0
for emb_file in tqdm(list(temp_emb_directory_path.iterdir())):
    try:
        # Load in the .npy file as a numpy array
        embedding = np.load(emb_file)

        # If the embedding is empty (a 0-dimensional array), skip it
        # TODO: This is because it seems like a ton of embeddings are
        # empty. We should figure out why that is.
        if embedding.shape == ():
            n_empty_embeddings += 1
            continue

        # Add the embedding to the dictionary, storing the list representation
        embeddings[emb_file.stem] = embedding.tolist()

    except Exception as e:
        # A single unreadable file shouldn't stop the rest from loading
        n_failed_embeddings += 1
        logger.error(f"Error loading embedding from {emb_file}: {e}")

# Report how many files were skipped, so the "empty embeddings" problem
# mentioned in the TODO above is visible rather than silent
logger.info(
    f"Loaded {len(embeddings)} embeddings "
    f"({n_empty_embeddings} empty, {n_failed_embeddings} failed to load)."
)

Now that I've loaded all of the embeddings, I'll delete all of the files in the temporary folder:

In [None]:
# Delete the temp directory along with everything still inside of it
# (including any nested folders). The existence check keeps this cell safe to
# re-run after the directory has already been removed.
if temp_emb_directory_path.exists():
    shutil.rmtree(temp_emb_directory_path)

Finally, I'll add the embeddings I've loaded to the embedding DataFrame:

In [None]:
# Build "loaded_embeddings_df": the embeddings metadata plus an "embedding"
# column looked up by ID (None when no embedding was loaded for that ID).
# Rows without an embedding are then dropped, and so are repeated IDs.
loaded_embeddings_df = (
    embeddings_df.assign(embedding=embeddings_df["id"].map(embeddings.get))
    .dropna(subset=["embedding"])
    .drop_duplicates(subset=["id"])
)

# Uploading Data to the Postgres Database
Now that I've downloaded all of the data, I'm going to determine which data I need to upload, and then upload it!

### Determining Data to Upload
I need to identify which data is already in each of the `postgres` tables. 

In [None]:
def _fetch_existing_keys(sql):
    """Run `sql` against Postgres using the notebook's shared engine and logger."""
    return query_postgres(sql, engine=engine, logger=logger)


# IDs of the videos already present in the `video_metadata` table
cur_database_video_metadata_df = _fetch_existing_keys("SELECT id FROM video_metadata")

# URLs that already have rows in the `transcriptions` table
cur_database_transcriptions_df = _fetch_existing_keys(
    "SELECT DISTINCT(url) FROM transcriptions"
)

# IDs of the embeddings already present in the `embeddings` table
cur_database_embeddings_df = _fetch_existing_keys("SELECT DISTINCT(id) FROM embeddings")

Now that we've got the data currently in the database, we'll figure out which rows we still need to upload. 

In [None]:
# Flag the downloaded rows whose key is already present in Postgres
video_already_uploaded = video_metadata_df["id"].isin(
    cur_database_video_metadata_df["id"]
)
transcription_already_uploaded = transcriptions_df["url"].isin(
    cur_database_transcriptions_df["url"]
)
embedding_already_uploaded = loaded_embeddings_df["id"].isin(
    cur_database_embeddings_df["id"]
)

# Keep only the rows that still need to be added to each table
video_metadata_df_to_add = video_metadata_df.loc[~video_already_uploaded].copy()
transcriptions_df_to_add = transcriptions_df.loc[~transcription_already_uploaded].copy()
embeddings_df_to_add = loaded_embeddings_df.loc[~embedding_already_uploaded].copy()

# Report how many rows are headed to each table
logger.info(f"Adding {len(video_metadata_df_to_add)} rows to the video_metadata table.")
logger.info(f"Adding {len(transcriptions_df_to_add)} rows to the transcriptions table.")
logger.info(f"Adding {len(embeddings_df_to_add)} rows to the embeddings table.")

### Uploading Data to Postgres Table
Now, we'll upload each of the DataFrames to the Postgres server. 

In [None]:
# Video metadata goes up first...
upload_to_table(video_metadata_df_to_add, "video_metadata", engine=engine, logger=logger)

# ...followed by the transcriptions...
upload_to_table(transcriptions_df_to_add, "transcriptions", engine=engine, logger=logger)

# ...and finally the embeddings. The GCS pointer column is dropped first,
# since the embedding values themselves are now stored in the DataFrame.
embeddings_upload_df = embeddings_df_to_add.drop(columns=["gcs_uri"])
upload_to_table(embeddings_upload_df, "embeddings", engine=engine, logger=logger)