In [1]:
import os
import getpass

import openai

import pandas as pd

import lancedb
from lancedb.context import contextualize
from lancedb.embeddings import with_embeddings

from langchain.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB
from langchain.chains import LLMChain, ConversationChain
from langchain.llms import OpenAI

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder
)
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key: ········


In [3]:
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
data

Found cached dataset json (/Users/mukul/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

In [4]:
df = contextualize(data.to_pandas()).groupby('title').text_col('text').window(20).stride(4).to_df()
df.head()

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end
177622,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0
177626,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t33.0,"So when you're done, you probably want to keep...",33.0,45.0
177630,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t66.0,You can download multiple size variants all th...,66.0,77.0
177634,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t89.0,So just how much memory are we talking here? W...,89.0,98.0
177638,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t110.0,So I know there are probably like a couple of ...,110.0,119.0


In [5]:
df.iloc[0]['text']

"Imagine an AI where all in the same model you could translate languages, write code, solve crossword puzzles, be a chatbot, and do a whole bunch of other crazy things. This sort of an AI would certainly require a supercomputer of hundreds of A100 GPUs and months of training, even on all that power. We would need a team of researchers, the best of the best. We're talking about a project in the realm of say, 5 plus million dollars. So when you're done, you probably want to keep it to yourself. Maybe you'll sell access via an API, but definitely you can't share the actual model because of AI safety or something like that. What if I told you though, for the last year, a group of over a thousand researchers has been quietly working on their own version of a 176 billion parameter model trained on the nuclear powered supercomputer, the Jonset, and is available now for you to download free of charge. You can download multiple size variants all the way up to 176 billion parameters for free. I'

In [6]:
def get_embeddings(text):
    res = openai.Embedding.create(input=text, engine='text-embedding-ada-002')
    embeddings = [record['embedding'] for record in res['data']]
    
    return embeddings

In [7]:
data = with_embeddings(func=get_embeddings, data=df, show_progress=True)
data.to_pandas().head()

  0%|                                                                                                                  | 0/49 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [74]:
data.shape

(48935, 10)

In [76]:
db = lancedb.connect('/tmp/lancedb')
table = db.create_table('my_table', data)
len(table)

48935

In [121]:
query = ("Why is the sky blue?")

emb = get_embeddings(query)[0]
context = table.search(emb).limit(3).to_df()

In [122]:
ct = ""
for item in context['text'].tolist():
    ct += f'"{item}"'
    ct += '\n\n'
    
print(ct)

"I like turtles. Okay, what about what is the color of the sky? What is the color of the sky? The color of the blue. Okay, okay. Did you vote for Hillary or Donald? Did you vote for Hillary or Donald? I voted for the president. Okay, okay. Let's ask some more pressing questions that we really need to know the answers for. What is the best song ever? What is the best song ever? Derrick, Sandstorm. No arguments there. What is the universe? What is the universe? The universe is a lie. At least I have my friend so you know are you my pal, buddy? Are you my pal, buddy? He is."

"What is going on everybody welcome to a new tutorial series where I show you how you can make your very own machine friend that you can talk to. For example I have my friend here, let's see what they have to say. So first we'll start off with some pretty pressing questions like do you like pancakes? We'd all like to know. I like turtles. Okay, what about what is the color of the sky? What is the color of the sky? Th

In [123]:
llm = OpenAI(model_name='gpt-3.5-turbo', temperature=0)

prompt_str = """
Answer the query based on the following context. If you do not know the answer
just say you don't know. Do not make up false information. If that answer cannot found
using the provide context, just say you don't know.

QUESTION: {query}

CONTEXT: {context}
"""

prompt = PromptTemplate(
    input_variables=['query', 'context'],
    template=prompt_str
)

print(prompt.format(query=query, context=ct))


Answer the query based on the following context. If you do not know the answer
just say you don't know. Do not make up false information. If that answer cannot found
using the provide context, just say you don't know.

QUESTION: Why is the sky blue?

CONTEXT: "I like turtles. Okay, what about what is the color of the sky? What is the color of the sky? The color of the blue. Okay, okay. Did you vote for Hillary or Donald? Did you vote for Hillary or Donald? I voted for the president. Okay, okay. Let's ask some more pressing questions that we really need to know the answers for. What is the best song ever? What is the best song ever? Derrick, Sandstorm. No arguments there. What is the universe? What is the universe? The universe is a lie. At least I have my friend so you know are you my pal, buddy? Are you my pal, buddy? He is."

"What is going on everybody welcome to a new tutorial series where I show you how you can make your very own machine friend that you can talk to. For example I



In [124]:
chain = LLMChain(llm=llm, prompt=prompt)
chain.predict(query=query, context=ct)

"I don't know."

In [143]:
system_prompt_template = """
The following is a conversation between a human and an AI.
Answer the question based on the context provided. If you do not know the answer
just say you don't know. Do not make up false information. If the answer cannot found
using the provide context, just say you don't know.
"""

prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt_template),
    MessagesPlaceholder(variable_name='history'),
    HumanMessagePromptTemplate.from_template('{input}')
])

llm = ChatOpenAI(temperature=0)
memory = ConversationBufferMemory(return_messages=True)
conversation = ConversationChain(memory=memory, prompt=prompt, llm=llm)

def get_context(query):
    emb = get_embeddings(query)[0]
    context = table.search(emb).limit(3).to_df()
    
    ct = ""
    for item in context['text'].tolist():
        ct += f'"{item}"'
        ct += '\n\n'
        
    return ct

for i in range(10):
    query = input()
    context = get_context(query)
    
    user_input = f'QUESTION: {query} \n\n CONTEXT: {context}'
    
    res = conversation.predict(input=user_input)
    print(res)
    print()

 What are transfomers?


Transformers are a classic model that can be used to map one string into another string. They are used to do tasks like translating problems into solutions.



 Can provide some more details?


I apologize, but the provided context does not provide enough information to answer the question.



 How do I trained a transfomer model and what kind of data do I need?


To train a transformer model, you need to collect and preprocess data, initialize the Hugging Face Transformers framework, encode input data to get input ID and attention tensors, build the full model architecture, set the optimizer, metrics, and loss, and begin training. The type of data needed depends on the specific task the transformer model is being trained for. For example, in the provided context, the IMDB Movie Review dataset was used for sentiment analysis.



 Can you use use images with transfomers


InvalidRequestError: This model's maximum context length is 4097 tokens. However, your messages resulted in 4157 tokens. Please reduce the length of the messages.

In [146]:
print(conversation.memory.buffer)

[HumanMessage(content='QUESTION: What are transfomers? \n\n CONTEXT: "And I have learned it at some point, if you\'re trying to get into PAC base bounce, this is a I believe over 60 pages introduction to it that seems to be quite well written, introducing you to all the important concepts in it. So if you\'re interested, give it a try. Even face met whatever research releases, Xformers, hackable and optimized transformers building blocks supporting a composable construction. So if you\'re into transformers, and if you would like to recombine them, try out different things inside of them, Xformers might be a great library on doing that. So you see all of these boxes here, essentially, this library makes it pretty easy to just rearrange them, connect them differently, and so on. Xformerb is a speech processing universal performance benchmark. This means that this benchmark has a bunch of speech tasks, so tasks in machine learning, where the input is a piece of speech. But the goal here i