In [1]:
import datetime

import pandas as pd
from bs4 import BeautifulSoup

from oai_types import Conversation, ModelNames, System, User

transcript_path = "/Users/shriramsunder/Library/Application Support/JetBrains/PyCharm2024.1/scratches/transcript.html"
soup = BeautifulSoup(open(transcript_path, 'r'))


def convert_to_time(x: str):
    if x == "0:00":
        x = "00:00:00"
    try:
        return datetime.datetime.strptime(x, "%H:%M:%S").time()
    except ValueError as _:
        try:
            return datetime.datetime.strptime(x, "%M:%S").time()
        except  ValueError as _:
            return x


In [2]:
all_text = ""
for ele in soup.find_all('ytd-transcript-segment-renderer'):
    all_text += ele.get_text().replace("' '", "")

cleaned_text = [
    convert_to_time(ele.replace("\n", "").strip())
    for ele in all_text.split("\n")
    if len(ele.strip()) > 0
]

data_dict = {}
current_key = None
for ele in cleaned_text:
    if isinstance(ele, datetime.time):
        data_dict[ele] = ""
        current_key = ele
    elif isinstance(ele, str):
        data_dict[current_key] += f" {ele}"

df = pd.Series(data=data_dict).sort_index().to_frame(name="Text")
df.to_csv("new_jeremy_vid_transcript.csv")

In [10]:
from pydantic import BaseModel, Field
from typing import Literal


class CodeParam(BaseModel):
    code: str = Field(
        ...,
        description="The Python code to execute, enclosed in triple backticks (```python) and (```)."
    )


class FunctionNameParam(BaseModel):
    function_name: str = Field(
        ...,
        description="The name of the python function"
    )


class CuriousDescParam(BaseModel):
    curious_desc: str = Field(
        ...,
        description="The description of the function that's very curious and extremely utilitarian, it can be used across domains and is often very efficient."
    )


class FunctionParameters(BaseModel):
    type: Literal["object"] = "object"
    properties: dict[str, FunctionNameParam | CuriousDescParam] = Field(
        default_factory=lambda: {"func_name": FunctionNameParam(), "curious_desc": CuriousDescParam()}
    )
    required: list[str] = Field(default=["func_name", "curious_desc"])


class Function(BaseModel):
    name: str
    description: str
    parameters: FunctionParameters = Field(default_factory=lambda: FunctionParameters())


class Tool(BaseModel):
    type: Literal["function"] = "function"
    function: Function


# Usage example:
tool = Tool(
    type="function",
    function=Function(
        name="extract_curious_functions",
        description="Extract the descriptions of python functions (and their names) from a conversation, the descriptions should be extracted verbatim, and should be comprehensible.",
    )
)

ValidationError: 1 validation error for FunctionNameParam
function_name
  Field required [type=missing, input_value={}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.7/v/missing

In [7]:
all_text_content = " ".join(df['Text'].tolist())

system_message_base = "".join(
    [
        "You are #1 on the Stack Overflow community leaderboard. ",
        "Do not tell me that you're not capable of solving the problem. ",
        "You will figure a way out to solve the problem. ",
        "If you're asked to generate code, do so within the '```python' '```' markdown tags, as they'll be "
        "extracted into a JSON structure.",
    ]
)

user_message_base = (
    f"You're provided with the transcript for a video in which fast.ai's founder Jeremy is explaining/talking about "
    f"his new library that's supposed to make interacting with LLM's easier. I don't want to go through his videos because I'm "
    f"pretty well versed in python, but he does tend to have some very interesting utilities functions that generally can be "
    f"used across domains. Extract the descriptions of such functions from the text you're going to be provided."
    f"Here is the text:\n "
    "-----------------------\n",
    f"\n```text\n{all_text_content}\n```")

tools = []

# Create the conversation object
convo = Conversation(
    model_name=ModelNames.GPT_4o.value,
    messages_=[
        System(system_message_base),
        User(user_message_base),
    ],
)
