In [2]:
!pip install -r requirements.txt --quiet

# YouTube Indexing

This is a simple example of how to use OpenAI's Whisper to transcribe YouTube videos, Pinecone to index the transcripts, and the OpenAI API to answer questions about any of those videos.

In [1]:
import ast
import os
import tempfile
from uuid import uuid4

import numpy as np
import openai
import pandas as pd
import pinecone
import tiktoken
import whisper
from dotenv import load_dotenv
from pytube import YouTube

# load_dotenv() runs before the lookups below; presumably it fills the process
# environment from a local .env file (python-dotenv) -- confirm if in doubt.
load_dotenv()

# Each lookup yields None when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")

  from tqdm.autonotebook import tqdm


## Transcribing YouTube Videos

In [132]:
# URLs of the videos to transcribe and index. The commented-out list is an
# alternative multi-video set; the active assignment below uses a single video.
# YOUTUBE_VIDEOS = [
#     "https://www.youtube.com/watch?v=nNwE0sQq39w",
#     "https://www.youtube.com/watch?v=rUBw_F5uV4Q",
#     "https://www.youtube.com/watch?v=Y1-s37zrm1M",
#     "https://www.youtube.com/watch?v=9uTlRae2uQs",
#     "https://www.youtube.com/watch?v=7N_hJLl-BK8",
# ]
YOUTUBE_VIDEOS = ["https://www.youtube.com/watch?v=cdiD-9MMpb0"]

In [133]:
def transcribe(youtube_url, model):
    """Download a video's audio track and transcribe it.

    Args:
        youtube_url: URL of the YouTube video.
        model: Object exposing ``transcribe(path, fp16=...)`` that returns a
            mapping with a ``"text"`` entry (here, a loaded Whisper model).

    Returns:
        A ``(title, youtube_url, text)`` tuple, where ``title`` is the
        downloaded file's name without its extension and ``text`` is the
        whitespace-stripped transcript.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    youtube = YouTube(youtube_url)

    audio = youtube.streams.filter(only_audio=True).first()
    if audio is None:
        # Without this guard the failure surfaces as an opaque AttributeError.
        raise ValueError(f"No audio-only stream found for {youtube_url}")

    # The download lives inside the temporary directory, so the transcription
    # has to happen before the ``with`` block exits and removes it.
    with tempfile.TemporaryDirectory() as tmpdir:
        file = audio.download(output_path=tmpdir)
        # splitext copes with any extension length (".mp4", ".webm", ...);
        # slicing off a fixed four characters does not.
        title = os.path.splitext(os.path.basename(file))[0]
        result = model.transcribe(file, fp16=False)

    return title, youtube_url, result["text"].strip()


# Load the Whisper model once and reuse it for every video.
model = whisper.load_model("base")
transcriptions = [transcribe(video_url, model) for video_url in YOUTUBE_VIDEOS]

# Persist the raw transcripts so later cells can start from the CSV.
df = pd.DataFrame(transcriptions, columns=["title", "url", "text"])
df.to_csv("text.csv")

df.head()

Unnamed: 0,title,url,text
0,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,I think it's possible that physics has exploit...


## Tokenizing The Text

In [26]:
# Token budget per chunk; transcripts above it are split further down.
MAX_TOKENS = 500

tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the saved transcripts and record how many tokens each one encodes to.
df = pd.read_csv("text.csv", index_col=0)
df["tokens"] = df["text"].map(lambda transcript: len(tokenizer.encode(transcript)))

df.head()

Unnamed: 0,title,url,text,tokens
0,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,I think it's possible that physics has exploit...,47003


In [38]:
def split_into_many(text, max_tokens, encode=None):
    """Split ``text`` into chunks of whole sentences within a token budget.

    Sentences are delimited by ``". "``. Consecutive sentences are packed into
    a chunk until adding the next one would push the running token count past
    ``max_tokens``; the chunk is then closed and a new one started.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        encode: Optional callable mapping a string to a sequence of tokens.
            Defaults to the module-level ``tokenizer.encode``.

    Returns:
        A list of chunk strings. Every sentence keeps its original trailing
        punctuation, so no spurious or doubled periods are produced. A single
        sentence longer than ``max_tokens`` is dropped, and the chunk being
        built is closed at that point so no chunk spans the gap.
    """
    if encode is None:
        encode = tokenizer.encode

    pieces = text.split(". ")
    tail_position = len(pieces) - 1

    chunks = []
    current = []
    tokens_so_far = 0

    for position, piece in enumerate(pieces):
        # Blank fragments (empty text, text ending in ". ") carry no content.
        if not piece.strip():
            continue

        n_tokens = len(encode(" " + piece))

        # split() removed the period from every piece except the last one,
        # which still carries whatever punctuation the text ended with.
        sentence = piece if position == tail_position else piece + "."

        if tokens_so_far + n_tokens > max_tokens:
            # Only emit a chunk that actually holds sentences; flushing an
            # empty one would produce a chunk consisting of a lone ".".
            if current:
                chunks.append(" ".join(current))
            current = []
            tokens_so_far = 0

        # A sentence that cannot fit in any chunk on its own is skipped.
        if n_tokens > max_tokens:
            continue

        current.append(sentence)
        # The extra token accounts for the sentence's period.
        tokens_so_far += n_tokens + 1

    if current:
        chunks.append(" ".join(current))

    return chunks


# Build one (title, url, text) record per chunk: short transcripts pass through
# unchanged, long ones are split into pieces of at most MAX_TOKENS tokens.
data = []
columns_needed = ["title", "url", "text", "tokens"]
for title, url, text, tokens in df[columns_needed].itertuples(index=False, name=None):
    pieces = [text] if tokens <= MAX_TOKENS else split_into_many(text, MAX_TOKENS)
    data.extend((title, url, piece) for piece in pieces)

df = pd.DataFrame(data, columns=["title", "url", "text"])
df["tokens"] = df["text"].map(lambda passage: len(tokenizer.encode(passage)))
df

Unnamed: 0,title,url,text,tokens
0,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,I think it's possible that physics has exploit...,489
1,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,So basically I'm underselling it by a lot beca...,500
2,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"And when you give them a hard enough problem, ...",500
3,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"Okay, so artificial neural networks are doing ...",497
4,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"I almost understand everything else, I think i...",490
...,...,...,...,...
100,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,It seems like there's a huge incentive to auto...,12
101,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"Yeah, it's very confusing. I don't know if you...",461
102,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,Are you excited about that feature? Just AI's ...,477
103,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"And then for that, you sort of backtrack and s...",496


## Generate Embeddings

In [39]:
EMBEDDING_ENGINE = "text-embedding-ada-002"

# One embedding request per chunk; the vector is the first entry of "data".
df["embeddings"] = df["text"].map(
    lambda passage: openai.Embedding.create(
        input=passage, engine=EMBEDDING_ENGINE
    )["data"][0]["embedding"]
)

# Save the vectors so the indexing step can reload them from disk.
df.to_csv("embeddings.csv")
df.head()

Unnamed: 0,title,url,text,tokens,embeddings
0,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,I think it's possible that physics has exploit...,489,"[-0.004084109328687191, 0.0062974547035992146,..."
1,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,So basically I'm underselling it by a lot beca...,500,"[-0.009419179521501064, -0.005321971606463194,..."
2,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"And when you give them a hard enough problem, ...",500,"[-0.00832328386604786, -0.0038060909137129784,..."
3,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"Okay, so artificial neural networks are doing ...",497,"[-0.023266108706593513, -0.0021498766727745533..."
4,Andrej Karpathy Tesla AI Self-Driving Optimus ...,https://www.youtube.com/watch?v=cdiD-9MMpb0,"I almost understand everything else, I think i...",490,"[0.0174504816532135, -0.01404550950974226, -0...."


## Indexing

In [6]:
PINECONE_INDEX = "youtube"

# The CSV stores each embedding as the text form of a Python list, so it has
# to be parsed back. ast.literal_eval accepts only literals, whereas eval
# would execute any code that found its way into the file.
df = pd.read_csv("embeddings.csv", index_col=0)
df["embeddings"] = df["embeddings"].apply(ast.literal_eval).apply(np.array)

# The index dimension is taken from the first stored vector.
embedding_dimension = len(df.iloc[0]["embeddings"])

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
pinecone.whoami()

WhoAmIResponse(username='aeea18e', user_label='youtube', projectname='1ed26d1')

In [7]:
# Create the index only when it is not already listed; otherwise reuse it.
index_is_missing = PINECONE_INDEX not in pinecone.list_indexes()
if index_is_missing:
    index_options = {
        "dimension": embedding_dimension,
        "metric": "cosine",
        "metadata_config": {"indexed": ["title", "url"]},
    }
    pinecone.create_index(PINECONE_INDEX, **index_options)

index = pinecone.Index(PINECONE_INDEX)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [23]:
# Give every chunk a fresh random identifier.
df["id"] = [str(uuid4()) for _ in df.index]

batch_size = 2
items = df.to_dict(orient="records")

# Upload the records in small batches of (id, vector, metadata) tuples.
for start in range(0, len(items), batch_size):
    batch = items[start : start + batch_size]
    vectors = [
        (
            record["id"],
            list(record["embeddings"]),
            {"title": record["title"], "url": record["url"], "text": record["text"]},
        )
        for record in batch
    ]
    index.upsert(vectors=vectors)


In [24]:
# Re-check the index statistics now that the vectors have been upserted.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 105}},
 'total_vector_count': 105}

## Querying The Model

In [2]:
# Upper bound on the length of the prompt template with context filled in.
PROMPT_LIMIT = 3000 # measured in characters, not tokens
# Chat model name passed to openai.ChatCompletion.create in get_answer.
COMPLETION_MODEL = "gpt-3.5-turbo"

# Prompt template. get_prompt substitutes the retrieved passages for
# [[CONTEXT]] and the user's query for [[QUESTION]].
PROMPT = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: [[CONTEXT]]

Question: [[QUESTION]]
"""


def get_prompt(query, prompt_limit=PROMPT_LIMIT):
    """Build the chat prompt for ``query`` from the best-matching passages.

    The query is embedded, the three nearest passages are fetched from the
    index, and passages are added to the context in ranked order for as long
    as the template with context filled in stays shorter than
    ``prompt_limit`` characters.

    Args:
        query: The user's question.
        prompt_limit: Maximum length, in characters, of the template with
            the context substituted (the question is not counted).

    Returns:
        The prompt string with context and question filled in. The context is
        empty when the index returns no matches or when even the top passage
        would exceed ``prompt_limit``.
    """
    response = openai.Embedding.create(input=[query], engine=EMBEDDING_ENGINE)
    embedding = response["data"][0]["embedding"]

    response = index.query(embedding, top_k=3, include_metadata=True)
    passages = [m["metadata"]["text"] for m in response["matches"]]

    separator = "\n\n \n\n"
    selected = []
    for passage in passages:
        candidate = PROMPT.replace("[[CONTEXT]]", separator.join(selected + [passage]))
        # Stop at the first passage that does not fit: matches are ranked, so
        # skipping ahead would favour a weaker passage over a stronger one.
        if len(candidate) >= prompt_limit:
            break
        selected.append(passage)

    # Built unconditionally, so a prompt is returned for any number of matches.
    return PROMPT.replace("[[CONTEXT]]", separator.join(selected)).replace(
        "[[QUESTION]]", query
    )

def get_answer(query):
    """Return the chat model's reply to ``query``.

    The query is turned into a context-grounded prompt by ``get_prompt`` and
    sent as a single user message with temperature 0; the content of the
    first returned choice is the answer.
    """
    request = {
        "model": COMPLETION_MODEL,
        "messages": [{"role": "user", "content": get_prompt(query)}],
        "temperature": 0,
    }
    completion = openai.ChatCompletion.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [66]:
# Example question answered from the indexed transcript.
get_answer("What do they say about Elon Musk?")

"They talk about what the speaker has learned from working with Elon Musk, including how to run organizations efficiently and fight entropy in an organization. They also mention that Elon is a very efficient warrior in the fight against entropy in organizations and that he hates meetings and encourages people to skip them if they're not useful."

In [55]:
# A second example question against the same index.
get_answer("What do they say about problems with research papers?")

"They say that when you go to a conference or journal, no one discusses anything that's there because it's already irrelevant. The delay in publishing details of breakthrough performance can slow down the community's progress. They also mention that some prestigious venues still have value, but there is a little bit of delay that is part of their objective function."

In [5]:
# Cleanup: deletes the index named by PINECONE_INDEX. After this, the query
# cells above will not work until the index is created and populated again.
pinecone.delete_index(PINECONE_INDEX)