# Import Libraries

In [1]:
import openai
import os
import dotenv
from typing import List
from dotenv import load_dotenv
from langchain.schema.document import Document
from helper_functions import embed_documents
from langchain.vectorstores.pgvector import PGVector
from langchain.embeddings.openai import OpenAIEmbeddings

## Connect Database to Supabase

In [2]:
import vecs

# NOTE(review): this connection string embeds a plaintext password and a
# private-network host. Consider loading it from the environment (e.g. a .env
# file) instead of committing it with the notebook.
DB_CONNECTION = "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres"
COLLECTION_NAME = "documents"

# create vector store client
vx = vecs.create_client(DB_CONNECTION)

# fetch the collection (creating it if it does not exist yet); vectors stored
# in it have 1536 dimensions
docs = vx.get_or_create_collection(name=COLLECTION_NAME, dimension=1536)

# Define Chunk SRT Files Function

In [10]:
import re
rex = re.compile(r"\d+\n(\d{2}:\d{2}:\d{2}),\d{3} --> (\d{2}:\d{2}:\d{2}),\d{3}")


def chunk_srt_files(full_text, chunk_length):
    """Group consecutive SRT cues into overlapping text chunks.

    `full_text` is the raw content of an .srt file. Cue headers (index line
    plus the `start --> end` line) are located with `rex`; only the
    HH:MM:SS part of each timestamp is kept, the milliseconds are dropped.

    Cue texts are accumulated, separated by single spaces, until the
    accumulated text is longer than `chunk_length` characters or the last
    cue has been reached. The cue that closes a chunk is reused as the
    first cue of the next chunk, so neighbouring chunks overlap by one cue.

    Returns a list of `(start, end, text)` tuples; an input without any cue
    header yields an empty list.
    """
    # Splitting on a pattern with two capture groups gives
    # [preamble, start, end, text, start, end, text, ...]; drop the preamble.
    fields = rex.split(full_text)[1:]

    # Regroup the flat field list into one (start, end, text) tuple per cue.
    cues = [
        (fields[k], fields[k + 1], fields[k + 2].strip())
        for k in range(0, len(fields), 3)
    ]

    chunks = []
    first_cue = 0  # index of the cue whose start time opens the current chunk
    buffer = ""
    last_index = len(cues) - 1
    for idx, (_, cue_end, cue_text) in enumerate(cues):
        buffer = buffer + " " + cue_text

        # Keep accumulating while the chunk is still short and more cues follow.
        if len(buffer) <= chunk_length and idx != last_index:
            continue

        chunks.append((cues[first_cue][0], cue_end, buffer.strip()))

        # Seed the next chunk with the cue that just closed this one (overlap).
        first_cue = idx
        buffer = cue_text

    return chunks

# Define Process Video Function

In [14]:
def read_txt_file(file_path):
    """Return the entire text of the file at `file_path`, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text

In [15]:
# NOTE: use the srt files directly from whisper, not ones we have modified
# you can find these in this folder: jupyteach-ai/videos/transcripts_tiny

def process_video(path_to_transcript, chunk_length=1000):
    """Chunk one SRT transcript, wrap the chunks as Documents and embed them.

    Args:
        path_to_transcript: Path of the .srt file to read. It is also stored
            as the "source" entry of every document's metadata.
        chunk_length: Character threshold handed to `chunk_srt_files`.
            Defaults to 1000, the value that used to be hard-coded here.

    Returns:
        Whatever `embed_documents` returns for the list of documents built
        from the transcript (that helper lives in `helper_functions`).
    """
    # step 1: read in the srt_content from the file at `path_to_transcript`
    srt_content = read_txt_file(path_to_transcript)

    # step 2: split the transcript into (start, end, txt) chunks
    chunks = chunk_srt_files(srt_content, chunk_length)

    # step 3: convert each (start, end, txt) tuple into a
    #         langchain.schema.document.Document carrying its provenance
    docs = [
        Document(
            page_content=text,
            metadata={
                "source": path_to_transcript,
                "chunk_number": i,
                "timestamps": f"{start} --> {end}",
            },
        )
        for i, (start, end, text) in enumerate(chunks)
    ]

    # step 4: call `embed_documents` to create/store embeddings for the video
    return embed_documents(docs)

## Call Function on SRT Files

#### 2.1

In [16]:
# Each call below runs `process_video` on one Whisper transcript and keeps
# whatever `embed_documents` returned under a per-video name.
# The transcript paths are relative, so the notebook presumably has to be run
# from the directory that contains `videos/` — confirm before re-running.
# NOTE(review): re-running a cell calls `embed_documents` again for the same
# video; verify whether that helper de-duplicates before re-executing cells.
# NOTE(review): the sections appear in string-sorted order
# (2.1, 2.10, ..., 2.13, 2.2, ..., 2.9), not in numeric order.
pandas_intro = process_video("videos/transcripts_tiny/2.1.1 pandas intro.srt")

In [21]:
pandas_core = process_video("videos/transcripts_tiny/2.1.2 pandas core functionality.srt")

#### 2.10

In [28]:
markov_chains_1 = process_video("videos/transcripts_tiny/2.10.1 Markov Chains 1 (1 of 4).srt")

In [31]:
markov_chains_2 = process_video("videos/transcripts_tiny/2.10.2 Markov Chains 2 (2 of 4).srt")

In [32]:
markov_chains_3 = process_video("videos/transcripts_tiny/2.10.3 Markov Chains 3 (3 of 4).srt")

In [37]:
markov_chains_4 = process_video("videos/transcripts_tiny/2.10.4 Markov Chains 4 (4 of 4).srt")

#### 2.11

In [38]:
lake_model = process_video("videos/transcripts_tiny/2.11.1 Lake Model.srt")

In [40]:
lake_model_data_1 = process_video("videos/transcripts_tiny/2.11.2 Lake Model Meets Data (1 of 2).srt")

In [43]:
lake_model_data_2 = process_video("videos/transcripts_tiny/2.11.3 Lake Model Meets Data (2 of 2).srt")

#### 2.12

In [44]:
web_scraping = process_video("videos/transcripts_tiny/2.12.1 Web Scraping Intro.srt")

#### 2.13

In [47]:
sharing_results = process_video("videos/transcripts_tiny/2.13.1 Sharing Results.srt")

#### 2.2

In [48]:
pandas_index = process_video("videos/transcripts_tiny/2.2.1 the pandas index.srt")

In [49]:
data_storage = process_video("videos/transcripts_tiny/2.2.2 data storage formats with pandas.srt")

#### 2.3

In [53]:
data_cleaning = process_video("videos/transcripts_tiny/2.3.1 cleaning data with pandas.srt")

In [56]:
reshaping_data = process_video("videos/transcripts_tiny/2.3.2 Reshaping data with pandas.srt")

In [59]:
merging_datasets = process_video("videos/transcripts_tiny/2.3.3 Merging datasets with Pandas.srt")

#### 2.4

In [60]:
pandas_core_review = process_video("videos/transcripts_tiny/2.4.1 Pandas Core Review (1 of 2).srt")

In [63]:
pandas_review = process_video("videos/transcripts_tiny/2.4.2 Pandas Review UN Population data (2 of 2).srt")

#### 2.5

In [64]:
groupby_operations = process_video("videos/transcripts_tiny/2.5.1 Groupby Operations.srt")

#### 2.6

In [65]:
accessing_data = process_video("videos/transcripts_tiny/2.6.1 Accessing Data via APIs (1 of 4).srt")

In [68]:
labor_markets = process_video("videos/transcripts_tiny/2.6.2 Data on Labor Markets (2 of 4).srt")

In [69]:
bls_api = process_video("videos/transcripts_tiny/2.6.3 Using the BLS Api (3 of 4).srt")

In [70]:
exploring_labor_market = process_video("videos/transcripts_tiny/2.6.4 Exploring Labor Market Data from BLS (4 of 4).srt")

#### 2.7

In [73]:
plotting_intro = process_video("videos/transcripts_tiny/2.7.1 Plotting Introduction.srt")

In [74]:
visual_rules = process_video("videos/transcripts_tiny/2.7.2 Visualization Rules.srt")

In [75]:
web_plotting = process_video("videos/transcripts_tiny/2.7.3 Web plotting introduction.srt")

In [76]:
altair = process_video("videos/transcripts_tiny/2.7.4 Altair.srt")

#### 2.8

In [78]:
intro_sql_1 = process_video("videos/transcripts_tiny/2.8.1 Intro to SQL (part 1 of 2).srt")

In [79]:
intro_sql_2 = process_video("videos/transcripts_tiny/2.8.2 Intro to SQL (part 2 of 2).srt")

#### 2.9

In [80]:
temporal_data = process_video("videos/transcripts_tiny/2.9.1 Temporal Data In pandas.srt")