# **Functionalizing the Database Update**
Now that I've got the database auto-deploying via CI/CD, I need to write some sort of script to automatically update things. I'm going to prototype that method here.  

# Setup
The cells below will set up the rest of the notebook.

I'll start by configuring the kernel: 

In [1]:
# Move the kernel's working directory up one level
# (presumably so that the project's `utils` package imported below is importable — confirm)
%cd ..

# Enable the autoreload extension; mode 2 re-imports modules before each cell
# runs, so edits to the project's own modules are picked up without a restart
%load_ext autoreload
%autoreload 2

d:\data\programming\neural-needledrop\database


Now I'll import some necessary modules:

In [19]:
# General import statements
from pandas_gbq import read_gbq
from pathlib import Path
from google.cloud import storage
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
from sqlalchemy import create_engine, MetaData, Column, Integer, String, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base
from sqlalchemy.sql import text
from pgvector.sqlalchemy import Vector
from tqdm import tqdm

# Importing modules custom-built for this project
from utils.settings import (
    POSTGRES_USER,
    POSTGRES_PASSWORD,
    POSTGRES_HOST,
    POSTGRES_PORT,
    POSTGRES_DB,
    LOG_TO_CONSOLE,
    GBQ_PROJECT_ID,
    GBQ_DATASET_ID
)
from utils.logging import get_logger
from utils.postgres import delete_table, create_table
from utils.gcs import download_file_from_bucket
from utils.postgres import query_postgres, upload_to_table, delete_table

# Set up a logger for this notebook; every later cell logs through it
# NOTE(review): `LOG_TO_CONSOLE` is imported from utils.settings above but is not
# used here (console logging is hard-coded to True) — confirm that is intentional
logger = get_logger("postgres_notebook", log_to_console=True)

I'll also set up the Postgres database connector: 

In [4]:
# Percent-encode the credentials before placing them in the URL. Without this,
# a user name or password containing characters such as "@", ":", "/" or "%"
# yields a connection string that is split at the wrong place when parsed.
from urllib.parse import quote

# Create the connection string to the database
postgres_connection_string = (
    "postgresql://"
    f"{quote(str(POSTGRES_USER), safe='')}:{quote(str(POSTGRES_PASSWORD), safe='')}"
    f"@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
)

# Create the connection engine, plus the shared metadata / session / declarative
# base objects used by the rest of the notebook
engine = create_engine(postgres_connection_string)
metadata = MetaData()
session = sessionmaker(bind=engine)()
Base = declarative_base()

# Parameterizing Function
Below, I'm going to set up different arguments for the function. 

In [29]:
# --- Data location ---
# Folder in which the data is stored
database_folder = "data"

# --- Table handling ---
# Whether tables that already exist should be re-created
recreate_tables = False

# --- Upload settings ---
# Chunksize used for the Postgres upload
postgres_upload_chunksize = 5_000

# Upper bound on the number of videos whose embeddings get uploaded
max_n_videos_to_update_embeddings = 1_000

# --- Index creation settings ---
# Postgres settings applied when building indices
postgres_max_parallel_maintenance_workers = 7
postgres_maintenance_work_mem = "6GB"

# Settings specific to the embeddings index
embeddings_index_ivfflat_nlist = 500
recreate_embeddings_index_if_exists = True

# Running the Function
Below, I've got the entire function, runnable via different cells. 

### Optional Table Deletion
If `recreate_tables` is set to `True`, the existing tables are deleted below so that they can be re-created.

In [6]:
# =======================
# OPTIONAL TABLE DELETION
# =======================
# If the user wants to delete the tables, then do so

if recreate_tables:
    # Log that we're deleting the tables
    logger.info("DELETING EACH OF THE POSTGRES TABLES...")
    tables_to_delete = ["video_metadata", "embeddings", "transcriptions"]
    for table in tables_to_delete:
        delete_table(table, engine, logger)

### Table Initialization Check
Below, we'll try and create the tables if they don't exist. 

In [12]:
# =============================
# VIDEO_METADATA INITIALIZATION
# =============================
# Below, we'll initialize the video_metadata table in the database

# Log that we're creating the video_metadata table
logger.info("CREATING THE video_metadata TABLE...")

try:

    # Define the schema that we'll be using for this table
    schema = [
        Column("id", String, primary_key=True),
        Column("title", String),
        Column("length", Integer),
        Column("channel_id", String),
        Column("channel_name", String),
        Column("short_description", String),
        Column("description", String),
        Column("view_ct", Integer),
        Column("url", String),
        Column("small_thumbnail_url", String),
        Column("large_thumbnail_url", String),
        Column("video_type", String),
        Column("review_score", Integer),
        Column("publish_date", DateTime),
        Column("scrape_date", DateTime),
    ]

    # Create the table
    create_table("video_metadata", schema, engine, metadata, logger)

except Exception as e:

    # Log the error
    logger.error(f"An error occurred while creating the video_metadata table: {e}")

# =============================
# TRANSCRIPTIONS INITIALIZATION
# =============================
# Below, we'll initialize the transcriptions table in the database

# Log that
logger.info("CREATING THE transcriptions TABLE...")

try:

    # Define the schema that we'll be using for this table
    transcriptions_table_schema = [
        Column("url", String),
        Column("text", String),
        Column("segment_id", Integer),
        Column("segment_seek", Integer),
        Column("segment_start", Integer),
        Column("segment_end", Integer),
    ]

    # Create the table
    create_table(
        "transcriptions", transcriptions_table_schema, engine, metadata, logger
    )

except Exception as e:

    # Log the error
    logger.error(f"An error occurred while creating the transcriptions table: {e}")

# =========================
# EMBEDDINGS INITIALIZATION
# =========================
# Below, we'll initialize the embeddings table in the database

# Log that
logger.info("CREATING THE embeddings TABLE...")

try:
    # Enable the pgvector Extension
    session.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
    session.commit()

    # Now, we're going to create a table for the embeddings
    embeddings_table_schema = [
        Column("id", String, primary_key=True),
        Column("url", String),
        Column("embedding_type", String),
        Column("start_segment", Integer),
        Column("end_segment", Integer),
        Column("segment_length", Integer),
        Column("embedding", Vector(1536)),
    ]

    # Now, we're going to create a table for the embeddings
    create_table("embeddings", embeddings_table_schema, engine, metadata, logger)

except Exception as e:
    # Log the error
    logger.error(f"An error occurred while creating the embeddings table: {e}")

finally:
    # Close the session
    session.close()

2024-02-18 23:07:51,765 - postgres_notebook - INFO - CREATING THE video_metadata TABLE...
2024-02-18 23:07:51,765 - postgres_notebook - ERROR - Error creating table 'video_metadata': Table 'video_metadata' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.
2024-02-18 23:07:51,769 - postgres_notebook - ERROR - An error occurred while creating the video_metadata table: Table 'video_metadata' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.
2024-02-18 23:07:51,770 - postgres_notebook - INFO - CREATING THE transcriptions TABLE...
2024-02-18 23:07:51,770 - postgres_notebook - ERROR - Error creating table 'transcriptions': Table 'transcriptions' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.
2024-02-18 23:07:51,771 - postgr

### Updating Data
Now, we'll update all the tables using the freshest GBQ data.

In [13]:
# =============================
# CREATING TEMPORARY GBQ TABLES
# =============================
# Below, I'm going to create temporary tables in GBQ that will allow me to compare the current state of the Postgres database with the state of the GBQ database. 
# This will allow me to determine which videos, transcriptions, and embeddings are currently in the Postgres database, but not in the GBQ database.

# Log that we're creating the temporary GBQ tables
logger.info("CREATING TEMPORARY GBQ TABLES...")

# Determine the videos currently in the `video_metadata` table
cur_database_video_metadata_df = query_postgres(
    "SELECT id FROM video_metadata",
    engine=engine,
    logger=logger,
)

# Determine the transcriptions currently in the `transcriptions` table
cur_database_transcriptions_df = query_postgres(
    "SELECT DISTINCT(url) FROM transcriptions",
    engine=engine,
    logger=logger,
)

# Determine the embeddings currently in the `embeddings` table
cur_database_embeddings_df = query_postgres(
    "SELECT DISTINCT(id) FROM embeddings",
    engine=engine,
    logger=logger,
)

# Upload the `cur_database_video_metadata_df` dataframe to a temporary table in GBQ
cur_database_video_metadata_df.to_gbq(
    f"{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_video_metadata",
    if_exists="replace",
)

# Upload the `cur_database_transcriptions_df` dataframe to a temporary table in GBQ
cur_database_transcriptions_df.to_gbq(
    f"{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_transcriptions",
    if_exists="replace",
)

# Upload the `cur_database_embeddings_df` dataframe to a temporary table in GBQ
cur_database_embeddings_df.to_gbq(
    f"{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_embeddings",
    if_exists="replace",
)

2024-02-18 23:08:17,684 - postgres_notebook - INFO - CREATING TEMPORARY GBQ TABLES...


100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 703.62it/s]


### **`video_metadata`**
The first table is the `video_metadata` table. I'll download that (and some columns from the `enriched_video_metadata` table) for all of the videos that I've transcribed and embedded. 

In [14]:
# ====================================
# DETERMINING DELTA FOR video_metadata
# ====================================
# Below, I'm going to determine the delta between the current state of the Postgres database and the state of the GBQ database.

# Log that we're determining the delta for the video_metadata table
logger.info("DETERMINING DELTA FOR video_metadata...")

# Define a query that'll grab all of the video metadata from the GBQ database
video_metadata_query = f"""
-- This query will select metadata for all of the videos that have transcriptions & embeddings
SELECT
  video.*,
  enriched_video.video_type,
  enriched_video.review_score
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.video_metadata` video
JOIN
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.enriched_video_metadata` enriched_video
ON
  video.id = enriched_video.id
WHERE
  video.url IN (SELECT DISTINCT(url) FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions`)
  AND
  video.url IN (SELECT DISTINCT(video_url) AS url FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings`)
  AND
  video.id NOT IN (SELECT id FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_video_metadata`)
"""

# Execute the above query
video_metadata_df = read_gbq(video_metadata_query)

2024-02-18 23:09:00,077 - postgres_notebook - INFO - DETERMINING DELTA FOR video_metadata...


Downloading: 100%|[32m██████████[0m|


### **`transcriptions`**
Next, I'm going to download all of the transcriptions for the videos I'd identified above. 

In [15]:
# ====================================
# DETERMINING DELTA FOR transcriptions
# ====================================
# Below, I'm going to determine the delta between the current state of the Postgres database and the state of the GBQ database.

# Log that we're determining the delta for the transcriptions table
logger.info("DETERMINING DELTA FOR transcriptions...")

# Declare the query that will download all of the relevant rows from the 
# transcription table
transcriptions_query = f"""
SELECT 
  transcription.url,
  transcription.text,
  transcription.segment_id,
  transcription.segment_seek,
  transcription.segment_start,
  transcription.segment_end
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions` transcription
JOIN (
  SELECT
    video.url
  FROM
    `neural-needledrop.backend_data.video_metadata` video
  WHERE
    video.url IN (SELECT DISTINCT(url) FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.transcriptions`)
    AND
    video.url IN (SELECT DISTINCT(video_url) AS url FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings`)
    AND
    video.id NOT IN (SELECT id FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_video_metadata`)
) video
ON
  video.url = transcription.url
"""

# Execute the above query
transcriptions_df = read_gbq(transcriptions_query)

2024-02-18 23:09:13,637 - postgres_notebook - INFO - DETERMINING DELTA FOR transcriptions...


Downloading: 100%|[32m██████████[0m|


### **`embeddings`**
Next up, the `embeddings` table! This one will require a *little* more setup, since we'll need to separately download the embeddings themselves from GCS. (My GBQ dataset only has pointers to each of the GCS URLs.)

I'll start by downloading all of the GBQ data:

In [16]:
# ================================
# DETERMINING DELTA FOR embeddings
# ================================
# Below, I'm going to determine the delta between the current state of the Postgres database and the state of the GBQ database.

# Log that we're determining the delta for the embeddings table
logger.info("DETERMINING DELTA FOR embeddings...")

# Declare the query that will download the `embeddings` table
embeddings_query = f"""
SELECT 
  embedding.id,
  embedding.video_url AS url,
  embedding.embedding_type,
  embedding.start_segment,
  embedding.end_segment,
  embedding.segment_length,
  embedding.gcs_uri
FROM
  `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings` embedding
WHERE
  embedding.id NOT IN (SELECT id FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.cur_pg_db_embeddings`)
  AND embedding.video_url IN (
    SELECT DISTINCT(video_url) 
    FROM `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.embeddings` 
    LIMIT {max_n_videos_to_update_embeddings}
  )
GROUP BY
  embedding.id,
  embedding.video_url,
  embedding.embedding_type,
  embedding.start_segment,
  embedding.end_segment,
  embedding.segment_length,
  embedding.gcs_uri
"""

# Execute the above query
# TODO: UNCOMMENT THIS .head(1000) TO GET ALL OF THE EMBEDDINGS
embeddings_df = read_gbq(embeddings_query)

2024-02-18 23:09:37,656 - postgres_notebook - INFO - DETERMINING DELTA FOR embeddings...


Downloading: 100%|[32m██████████[0m|


I can delete the temporary tables now! 

In [17]:
# Indicate that we're deleting the temporary GBQ tables
logger.info("DELETING TEMPORARY GBQ TABLES...")

# Delete each of the cur_pg_db tables. Cleanup stays best-effort (a failure for
# one table does not stop the others or the notebook), but failures are logged
# instead of being silently discarded, and only `Exception` is caught so that a
# keyboard interrupt still stops the cell.
for temp_table_name in (
    "cur_pg_db_video_metadata",
    "cur_pg_db_transcriptions",
    "cur_pg_db_embeddings",
):
    try:
        read_gbq(
            f"DROP TABLE IF EXISTS `{GBQ_PROJECT_ID}.{GBQ_DATASET_ID}.{temp_table_name}`"
        )
    except Exception as e:
        logger.warning(
            f"Could not delete the temporary GBQ table '{temp_table_name}': {e}"
        )

2024-02-18 23:09:47,090 - postgres_notebook - INFO - DELETING TEMPORARY GBQ TABLES...


Downloading: |[32m          [0m|
Downloading: |[32m          [0m|
Downloading: |[32m          [0m|


Now that I've got all of the embeddings metadata, I can download the embeddings themselves. I'll start by creating a temporary directory to store embeddings:

In [18]:
# ===============================
# DOWNLOADING EMBEDDINGS FROM GCS
# ===============================
# Below, I'm going to download the embeddings from GCS to the local machine

# Log that we're downloading the embeddings from GCS
logger.info("DOWNLOADING EMBEDDINGS FROM GCS...")

# Create a temporary directory to store the embeddings
temp_emb_directory_path = Path("temp_embeddings")
temp_emb_directory_path.mkdir(exist_ok=True, parents=True)

# Remove any files that're already in the directory if it exists
for file in temp_emb_directory_path.glob("*"):
    file.unlink()

# Create a GCS client
gcs_client = storage.Client(
    project=GBQ_PROJECT_ID
)


2024-02-18 23:10:12,269 - postgres_notebook - INFO - DOWNLOADING EMBEDDINGS FROM GCS...


Now, I'll iterate through each of the GCS URIs and download them. 

In [20]:
# Prepare the list of GCS URIs, skipping rows that have no URI at all
gcs_uris = embeddings_df["gcs_uri"].dropna().unique()


def download_embedding(idx_and_uri):
    """
    Download one embedding file from GCS into the temporary embeddings directory.

    Args:
        idx_and_uri: An `(index, gcs_uri)` pair as produced by `enumerate`. Only
            the URI is used; the index is accepted so the call signature stays
            compatible with `enumerate(gcs_uris)`.

    Returns:
        True if the download call completed without raising, False otherwise.
        Failures are logged rather than raised so that one bad URI does not
        abort the whole batch.
    """
    _, gcs_uri = idx_and_uri
    try:
        # Split "gs://<bucket>/<path/to/file>" into the bucket and the object path
        bucket_name, _, file_name = gcs_uri.split("gs://")[-1].partition("/")

        # Download the embedding corresponding with this GCS URI
        download_file_from_bucket(
            bucket_name=bucket_name,
            file_name=file_name,
            destination_folder=str(temp_emb_directory_path) + "/",
            project_id=GBQ_PROJECT_ID,
            gcs_client=gcs_client,
            logger=logger,
        )
        return True

    except Exception as e:
        # This covers both malformed URIs and failed downloads
        logger.error(f"Error downloading embedding from '{gcs_uri}': {e}")
        return False


# Use ThreadPoolExecutor to parallelize the download process
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {
        executor.submit(download_embedding, idx_uri): idx_uri
        for idx_uri in enumerate(gcs_uris)
    }

    # Consume the futures as they finish (driving the progress bar) and count
    # the downloads that reported a failure
    n_failed_downloads = sum(
        not future.result()
        for future in tqdm(as_completed(futures), total=len(futures))
    )

# Surface failures in the log so missing embeddings don't go unnoticed
if n_failed_downloads:
    logger.warning(
        f"{n_failed_downloads} of {len(futures)} embedding downloads failed."
    )

100%|██████████| 3177/3177 [00:45<00:00, 70.45it/s]


Next, I'll load all of the embeddings into RAM. 

In [21]:
# ===========================
# LOADING EMBEDDINGS INTO RAM
# ===========================
# Below, I'm going to load the embeddings into RAM, so that I can then upload them to the Postgres database

# Log that we're loading the embeddings into the Postgres database
logger.info("LOADING EMBEDDINGS INTO RAM...")

# We're going to store the embeddings in a dictionary, where the key is the
# embedding ID and the value is the ndarray of the embedding
embeddings = {}
for idx, emb_file in tqdm(list(enumerate(list(temp_emb_directory_path.iterdir())))):
    try:
        # Load in the .npy file as a numpy array
        embedding = np.load(emb_file)
        
        # If the embedding is empty, skip it
        # TODO: This is because it seems like a ton of embeddings are 
        # empty. We should figure out why that is.
        if embedding.shape == ():
            continue
        
        # Get the embedding ID
        embedding_id = emb_file.stem
        
        # Add the embedding to the dictionary, storing the list representation
        embeddings[embedding_id] = embedding.tolist()
        
    except Exception as e:
        print(f"Error loading embedding: {e}")
        pass

2024-02-18 23:11:16,136 - postgres_notebook - INFO - LOADING EMBEDDINGS INTO POSTGRES...
100%|██████████| 3177/3177 [00:26<00:00, 119.66it/s]


Now that I've loaded all of the embeddings, I'll delete all of the files in the temporary folder:

In [22]:
# Empty the temporary directory first...
downloaded_files = list(temp_emb_directory_path.glob("*"))
for downloaded_file in downloaded_files:
    downloaded_file.unlink()

# ...and then remove the now-empty directory itself
temp_emb_directory_path.rmdir()

Finally, I'll add the embeddings I've loaded to the embedding DataFrame:

In [23]:
# Start from a copy of the embeddings metadata and attach, per row, the embedding
# that was loaded for that ID (None when nothing was loaded for it)
loaded_embeddings_df = embeddings_df.copy()
loaded_embeddings_df["embedding"] = loaded_embeddings_df["id"].map(embeddings.get)

# Keep only the rows that actually received an embedding...
loaded_embeddings_df = loaded_embeddings_df.dropna(subset=["embedding"])

# ...and keep a single row per embedding ID
loaded_embeddings_df = loaded_embeddings_df.drop_duplicates(subset=["id"])

### Uploading Data to the Postgres Database
Now that I've downloaded all of the data, I'm going to determine which data I need to upload, and then upload it!

Now that we know what's currently in the database, we'll figure out which rows we still need to upload. 

In [25]:
# ==============================
# ADDING ROWS TO POSTGRES TABLES
# ==============================
# Below, I'm going to add the rows to the Postgres tables

# Log that we're adding rows to the Postgres tables
logger.info("ADDING ROWS TO POSTGRES TABLES...")

# Determine which rows of the video_metadata_df we need to add to the database
video_metadata_df_to_add = video_metadata_df[
    ~video_metadata_df["id"].isin(cur_database_video_metadata_df["id"])
].copy()

# Determine which rows of the transcriptions_df we need to add to the database
transcriptions_df_to_add = transcriptions_df[
    ~transcriptions_df["url"].isin(cur_database_transcriptions_df["url"])
].copy()

# Determine which rows of the embeddings_df we need to add to the database
embeddings_df_to_add = loaded_embeddings_df[
    ~loaded_embeddings_df["id"].isin(cur_database_embeddings_df["id"])
].copy()

# Log some information about the number of rows we're adding
logger.info(
    f"Adding {len(video_metadata_df_to_add)} rows to the video_metadata table."
)
logger.info(
    f"Adding {len(transcriptions_df_to_add)} rows to the transcriptions table."
)
logger.info(
    f"Adding {len(embeddings_df_to_add)} rows to the embeddings table."
)

2024-02-18 23:12:48,909 - postgres_notebook - INFO - ADDING ROWS TO POSTGRES TABLES...


2024-02-18 23:12:48,956 - postgres_notebook - INFO - Adding 56 rows to the video_metadata table.
2024-02-18 23:12:48,958 - postgres_notebook - INFO - Adding 9476 rows to the transcriptions table.
2024-02-18 23:12:48,958 - postgres_notebook - INFO - Adding 3177 rows to the embeddings table.


### Uploading Data to Postgres Table
Now, we'll upload each of the DataFrames to the Postgres server. 

In [26]:
# Keyword arguments shared by every upload below
shared_upload_kwargs = {"engine": engine, "logger": logger}

# Video metadata goes first
upload_to_table(video_metadata_df_to_add, "video_metadata", **shared_upload_kwargs)

# Then the transcriptions
upload_to_table(transcriptions_df_to_add, "transcriptions", **shared_upload_kwargs)

# Finally the embeddings: the GCS pointer column is dropped before uploading,
# and this upload is chunked
embeddings_rows_to_upload = embeddings_df_to_add.drop(columns=["gcs_uri"])
upload_to_table(
    embeddings_rows_to_upload,
    "embeddings",
    chunksize=postgres_upload_chunksize,
    **shared_upload_kwargs,
)

### Table Indices
Below, I'm going to recreate the table indices. This will ensure that I'm able to run fast queries. 

In [30]:
# =========================
# EMBEDDINGS INDEX CREATION
# =========================
# Below, I'm going to create the index for the embeddings table

query_postgres(
    f"SET max_parallel_maintenance_workers = {postgres_max_parallel_maintenance_workers}; -- plus leader",
    engine=engine,
    logger=logger,
)
query_postgres(
    f"SET maintenance_work_mem = '{postgres_maintenance_work_mem}';",
    engine=engine,
    logger=logger,
)

# If the user wants to re-create the embeddings index, then do so
if recreate_embeddings_index_if_exists:
    # Log that we're re-creating the embeddings index
    logger.info("RE-CREATING THE EMBEDDINGS INDEX...")

    # Drop the index if it already exists
    query_postgres(
        "DROP INDEX IF EXISTS embeddings_embedding_idx;",
        engine=engine,
        logger=logger,
    )

    # Run the query that will create the IVFFlat index
    query_postgres(
        f"""CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {embeddings_index_ivfflat_nlist});""",
        engine=engine,
        logger=logger,
    )

else:
    # Try to create the index if it doesn't exist
    try:
        # Log that we're creating the embeddings index
        logger.info("CREATING THE EMBEDDINGS INDEX...")

        # Run the query that will create the IVFFlat index
        query_postgres(
            f"""CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {embeddings_index_ivfflat_nlist});""",
            engine=engine,
            logger=logger,
        )
    except Exception as e:
        # Log the error if the index already exists
        logger.error(f"An error occurred while creating the embeddings index: {e}")

2024-02-18 23:25:39,360 - postgres_notebook - INFO - RE-CREATING THE EMBEDDINGS INDEX...


### Creating the `embeddings_to_text` Table

In [None]:
# ===========================
# CREATING EMBEDDINGS TO TEXT
# ===========================
# Below, I'm going to create a table that maps embeddings to text

table_creation_query = f"""
CREATE TABLE embeddings_to_text AS (
    WITH 
    embedding_to_text_flattened AS (
        SELECT
        embeddings.id,
        embeddings.url,
        embeddings.start_segment,
        embeddings.end_segment,
        transcriptions.text,
        transcriptions.segment_id,
        transcriptions.segment_start,
        transcriptions.segment_end
        FROM
        embeddings
        LEFT JOIN
        transcriptions
        ON
        transcriptions.segment_id >= embeddings.start_segment
        AND
        transcriptions.segment_id < embeddings.end_segment
        AND
        transcriptions.url = embeddings.url
        ORDER BY
        url DESC,
        segment_id ASC
    ),
    
    embedding_to_text AS (
        SELECT
            id,
            url,
            start_segment,
            end_segment,
            ARRAY_TO_STRING(ARRAY_AGG(emb.text ORDER BY emb.segment_id), '') AS text,
            MIN(emb.segment_start) AS segment_start,
            MAX(emb.segment_end) AS segment_end
        FROM
            embedding_to_text_flattened emb
        GROUP BY
            id,
            url,
            start_segment,
            end_segment
    )

    SELECT * FROM embedding_to_text
)
"""

# Execute the above query
query_postgres(
    table_creation_query,
    engine=engine,
    logger=logger,
)