In [9]:
import requests
from bs4 import BeautifulSoup
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

In [10]:
from dotenv import dotenv_values
config = dotenv_values(".env")
openai.api_key = config["OPENAI_API_KEY"]

In [11]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

In [12]:
def get_html_content(url):
    response = requests.get(url)
    return response.content

In [13]:
def remove_newlines(text):
    cleaned_text = text.replace('\n', '')
    return cleaned_text

In [14]:
def get_plain_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(["script"]):
        script.extract()
    return soup.get_text()
     

In [15]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000)
# text_splitter = CharacterTextSplitter(separator = "\n\n",chunk_size=2000,)

In [16]:
def scrape_text_from_url(url):
    html_content = get_html_content(url)
    plain_text_with_newline = get_plain_text(html_content)
    plain_text = remove_newlines(plain_text_with_newline)
    splitted_text = text_splitter.split_text(plain_text)
    return splitted_text
     

In [9]:
url = input("Enter the URL to scrape text from: ")
plain_text_chunks = scrape_text_from_url(url)
print(plain_text_chunks)

In [1]:
len(plain_text_chunks)

NameError: name 'plain_text_chunks' is not defined

In [13]:
plain_text_chunks[9]

'he came on in the 75th minute during a friendly against José Mourinho\'s Porto on 16 November 2003.[22][35] His performance, creating two chances and a shot on goal, impressed the technical staff, and he subsequently began training daily with the club\'s reserve side, Barcelona B, as well as weekly with the first team.[36] After his first training session with the senior squad, Barça\'s new star player, Ronaldinho, told his teammates that he believed the 16-year-old would become an even better player than himself.[37] Ronaldinho soon befriended Messi, whom he called "little brother", which greatly eased his transition into the first team.[38][39] Messi playing against Málaga in 2005To gain further match experience, Messi joined Barcelona C in addition to the Juveniles A, playing his first game for the third team on 29 November. He helped save them from the relegation zone of the Tercera División, scoring five goals in ten games, including a hat-trick in eight minutes during a Copa del

In [14]:
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [15]:
total_tokens = sum([len(enc.encode(chunk)) for chunk in plain_text_chunks])
print(total_tokens)

80175


In [16]:
cost = (total_tokens/1000) * 0.0004 
print(cost)

0.03207


CONNECTING PINECONE


In [17]:
import pinecone
pinecone.init(api_key= config["PINECONE_KEY"],
              environment="us-west4-gcp-free")


ADD DATA TO SUPABASE


In [20]:
index = pinecone.Index("chris")

In [20]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 29}},
 'total_vector_count': 29}

In [55]:
def addData(web_data,url):
    id  = index.describe_index_stats()['total_vector_count']
    for i in range(len(web_data)):
        chunk=web_data[i]
        chunkInfo=(str(id+i),
                get_embedding(chunk,model="text-embedding-ada-002"),
                {'title': url,'context': chunk})
        index.upsert(vectors=[chunkInfo])
     

In [56]:
addData(plain_text_chunks,url)

QUERY EMBEDDINGS


In [18]:
def find_match(query,k):
    query_em = get_embedding(query,model= "text-embedding-ada-002")
    result = index.query(query_em, top_k=k, includeMetadata=True)
    
    return [result['matches'][i]['metadata']['title'] for i in range(k)],[result['matches'][i]['metadata']['context'] for i in range(k)]

In [21]:
find_match("how old is chris martin?",2)

(['https://ewzrjnlqbkhwhkwjjsrl.supabase.co',
  'https://en.wikipedia.org/wiki/Chris_Martin'],
 ['Coldplay. For other people, see Chris Martin (disambiguation).English singer-songwriter (born 1977)Chris MartinMartin performing with Coldplay in 2017BornChristopher Anthony John Martin (1977-03-02) 2 March 1977 (age\xa046)Exeter, Devon, EnglandAlma\xa0materUniversity College LondonOccupationsSingersongwritermusicianproducerphilanthropistYears\xa0active1997–presentSpouseGwyneth Paltrow\u200b \u200b(m.\xa02003; div.\xa02016)\u200bPartner(s)Dakota Johnson(2017–present)Children2AwardsFull listMusical careerOriginLondon, EnglandGenresAlternative rockpop rockpost-BritpoppopInstrumentsVocalspianokeyboardsguitarharmonicaLabelsParlophoneAtlanticCapitolMember ofColdplayMusical artistWebsitecoldplay.comSignatureChristopher Anthony John Martin (born 2 March 1977) is an English singer-songwriter and musician. He is best known as the lead vocalist, pianist, rhythm guitarist and co-founder of the rock b

PROMPT CREATION


In [22]:
def create_prompt(context,query):
    header = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    return header + context + "\n\n" + query + "\n"

def generate_answer(prompt):
    response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop = [' END']
    )
    return (response.choices[0].text).strip()

In [23]:
query = "how old is Chris Martin?"
docs,res = find_match(query,2)

In [24]:
res

['Coldplay. For other people, see Chris Martin (disambiguation).English singer-songwriter (born 1977)Chris MartinMartin performing with Coldplay in 2017BornChristopher Anthony John Martin (1977-03-02) 2 March 1977 (age\xa046)Exeter, Devon, EnglandAlma\xa0materUniversity College LondonOccupationsSingersongwritermusicianproducerphilanthropistYears\xa0active1997–presentSpouseGwyneth Paltrow\u200b \u200b(m.\xa02003; div.\xa02016)\u200bPartner(s)Dakota Johnson(2017–present)Children2AwardsFull listMusical careerOriginLondon, EnglandGenresAlternative rockpop rockpost-BritpoppopInstrumentsVocalspianokeyboardsguitarharmonicaLabelsParlophoneAtlanticCapitolMember ofColdplayMusical artistWebsitecoldplay.comSignatureChristopher Anthony John Martin (born 2 March 1977) is an English singer-songwriter and musician. He is best known as the lead vocalist, pianist, rhythm guitarist and co-founder of the rock band Coldplay. Born in Exeter, Devon, he went to University College London, where he formed the b

In [25]:
context= "\n\n".join(res)
prompt = create_prompt(context,query)
print(prompt)
     

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' 
Coldplay. For other people, see Chris Martin (disambiguation).English singer-songwriter (born 1977)Chris MartinMartin performing with Coldplay in 2017BornChristopher Anthony John Martin (1977-03-02) 2 March 1977 (age 46)Exeter, Devon, EnglandAlma materUniversity College LondonOccupationsSingersongwritermusicianproducerphilanthropistYears active1997–presentSpouseGwyneth Paltrow​ ​(m. 2003; div. 2016)​Partner(s)Dakota Johnson(2017–present)Children2AwardsFull listMusical careerOriginLondon, EnglandGenresAlternative rockpop rockpost-BritpoppopInstrumentsVocalspianokeyboardsguitarharmonicaLabelsParlophoneAtlanticCapitolMember ofColdplayMusical artistWebsitecoldplay.comSignatureChristopher Anthony John Martin (born 2 March 1977) is an English singer-songwriter and 

In [26]:
reply = generate_answer(prompt)
print(reply)

46
