In [45]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [27]:
def get_html_content(url, timeout=10):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The undecoded response body (``response.content``).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    # Without an explicit timeout, requests will wait indefinitely on a
    # server that never answers.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses instead of handing an error page's HTML
    # to the rest of the pipeline as if it were the requested content.
    response.raise_for_status()
    return response.content

In [28]:
def get_plain_text(html_content):
    """Parse HTML and return its text with embedded code removed.

    ``<script>`` and ``<style>`` elements are dropped from the parsed tree
    before the text is extracted, so JavaScript and CSS source cannot end up
    in the returned string.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The text of the remaining document as produced by
        ``BeautifulSoup.get_text()`` with its default arguments.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Calling the soup object is shorthand for find_all(); remove every
    # element whose contents are code rather than readable text.
    for code_element in soup(["script", "style"]):
        code_element.decompose()
    return soup.get_text()

In [33]:
import re

def remove_newlines(html_text, replacement=""):
    """Replace every newline character in ``html_text``.

    Args:
        html_text: Text to clean.
        replacement: String substituted for each ``"\\n"``. The default
            (empty string) deletes newlines outright, which glues together
            words that were separated only by a line break; pass ``" "`` to
            keep them apart.

    Returns:
        A copy of ``html_text`` with each ``"\\n"`` replaced. Carriage
        returns (``"\\r"``) are left untouched.
    """
    # A literal single-character substitution needs no regular expression.
    return html_text.replace("\n", replacement)

In [36]:
# Upper bound on the length of each chunk produced by the splitter
# (presumably measured in characters, the splitter's default — confirm).
CHUNK_SIZE = 2000
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

In [37]:
def scrape_text_from_url(url):
    """Fetch ``url`` and return its text as a list of chunks.

    Pipeline: download the page with ``get_html_content``, reduce it to text
    with ``get_plain_text``, strip newlines with ``remove_newlines``, then
    split the result with the module-level ``text_splitter``.
    """
    raw_html = get_html_content(url)
    flattened_text = remove_newlines(get_plain_text(raw_html))
    return text_splitter.split_text(flattened_text)

In [None]:
# Strip surrounding whitespace: a pasted URL often carries a trailing space
# or newline, and this exact string is later stored as metadata for every chunk.
url = input("Enter the URL to scrape text from: ").strip()
plain_text_chunks = scrape_text_from_url(url)
print(plain_text_chunks)

In [39]:
# Number of chunks the splitter produced for the scraped page.
len(plain_text_chunks)

29

In [40]:
# Spot-check a single chunk to see what the cleaned text looks like.
# The index is hard-coded: this raises IndexError for pages with fewer than 11 chunks.
plain_text_chunks[10]

'Coldplay[edit]Main articles: Coldplay discography and songsParachutes (2000)A Rush of Blood to the Head (2002)X&Y (2005)Viva la Vida or Death and All His Friends (2008)Mylo Xyloto (2011)Ghost Stories (2014)A Head Full of Dreams (2015)Everyday Life (2019)Music of the Spheres (2021)Solo credits[edit]List of solo credits, showing year of release, song title, artist name, album title and role in the projectYearSongArtistAlbumRoleRef.2002"Where Is My Boy?"FaultlineYour Love Means EverythingCo-writer\xa0· featured artist[89]"Your Love Means Everything Part 2""Gold in Them Hills"Ron SexsmithCobblestone RunwayPiano[90]2003"Sliding"Ian McCullochSlidelingBacking vocals[91]"Arthur"Piano\xa0· backing vocals"See It in a Boy\'s Eyes"JameliaThank YouCo-writer\xa0· piano\xa0· backing vocals[92]2004"Everybody\'s Happy Nowadays"Ash"Orpheus"Backing vocals[93]"Do They Know It\'s Christmas?"Band Aid 20—Featured artist[94]2006"All Good Things (Come to an End)"Nelly FurtadoLooseCo-writer[95]"In the Sun"Mich

GENERATE EMBEDDINGS


In [42]:
import tiktoken

In [41]:
# Tokenizer for the same model name that get_embedding uses by default; used
# below to count tokens before any embedding request is made.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [43]:
# Tokenize each chunk and add up the per-chunk token counts.
total_tokens = sum(len(enc.encode(chunk)) for chunk in plain_text_chunks)
print(total_tokens)

15480


In [44]:
# Rate applied per 1,000 tokens when estimating the embedding cost.
# NOTE(review): presumably USD for text-embedding-ada-002 at the time of
# writing — confirm against current pricing before relying on the figure.
PRICE_PER_1K_TOKENS = 0.0004
cost = (total_tokens / 1000) * PRICE_PER_1K_TOKENS
print(cost)

0.0061920000000000005


In [46]:
import openai

In [47]:
from dotenv import dotenv_values
# Read settings from the local .env file; `config` is also where the
# Pinecone key is looked up later in the notebook.
config = dotenv_values(".env")
# Set the key on the openai module so later openai calls are authenticated.
# Raises KeyError if OPENAI_API_KEY is missing from the loaded values.
openai.api_key = config["OPENAI_API_KEY"]

In [48]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced with spaces before the request. The
    text is sent as a one-element batch to ``openai.Embedding.create`` and
    the ``embedding`` field of the first returned item is handed back.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']


CONNECT TO PINECONE


In [50]:
import pinecone

In [51]:
# Configure the Pinecone client with the key read from .env.
# The environment name is hard-coded; it must match the project that owns the index.
pinecone.init(api_key= config["PINECONE_KEY"],
              environment="us-west4-gcp-free")

In [55]:
# Handle to the Pinecone index named "chris"; addData and find_match use it
# as a module-level global.
index = pinecone.Index("chris")

In [56]:
# Show the index's current statistics; addData reads `total_vector_count`
# from this same call to pick the ids for new vectors.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 206}},
 'total_vector_count': 206}

In [57]:
def addData(web_data, url, batch_size=100):
    """Embed each text chunk and upsert it into the module-level ``index``.

    Args:
        web_data: Sequence of text chunks to store.
        url: Source URL, saved as the ``title`` metadata of every chunk.
        batch_size: Maximum number of vectors sent per ``upsert`` request.

    Each vector is stored as ``(id, embedding, metadata)`` where ``metadata``
    holds the source ``url`` under ``title`` and the chunk text under
    ``context``.
    """
    if not web_data:
        return  # nothing to embed, so skip the stats round-trip entirely

    # New ids continue from the current vector count, read once up front.
    # NOTE(review): this assumes existing ids are exactly "0".."count-1"; if
    # vectors were ever deleted or stored under other ids, a new id may
    # overwrite an existing entry — confirm.
    start_id = index.describe_index_stats()['total_vector_count']

    pending = []
    for offset, chunk in enumerate(web_data):
        pending.append((
            str(start_id + offset),
            get_embedding(chunk, model="text-embedding-ada-002"),
            {'title': url, 'context': chunk},
        ))
        # Send vectors in groups rather than one network request per chunk.
        if len(pending) >= batch_size:
            index.upsert(vectors=pending)
            pending = []
    if pending:
        index.upsert(vectors=pending)
     

In [58]:
# Embed every scraped chunk and upload it to the index, tagged with the source URL.
# NOTE(review): ids are derived from the index's current vector count, so
# re-running this cell likely stores the same chunks again under new ids.
addData(plain_text_chunks,url)

QUERY EMBEDDINGS


In [59]:
def find_match(query, k):
    """Return the sources and texts of the stored chunks closest to ``query``.

    Args:
        query: Natural-language question to embed and search with.
        k: Maximum number of matches to request from the index.

    Returns:
        A ``(titles, contexts)`` tuple of two parallel lists built from each
        match's ``title`` and ``context`` metadata. The lists hold fewer than
        ``k`` items when the index returns fewer matches.
    """
    query_em = get_embedding(query, model="text-embedding-ada-002")
    result = index.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what actually came back: indexing with range(k) raises
    # IndexError whenever the index returns fewer than k matches.
    matches = result['matches'][:k]
    titles = [match['metadata']['title'] for match in matches]
    contexts = [match['metadata']['context'] for match in matches]
    return titles, contexts

In [62]:
# Sanity-check retrieval: request the top 2 matches; the result is a
# (titles, contexts) tuple.
find_match("Does Chris martin have children?",2)

(['https://en.wikipedia.org/wiki/Chris_Martin',
  'https://ewzrjnlqbkhwhkwjjsrl.supabase.co'],
 ['Coldplay. For other people, see Chris Martin (disambiguation).English singer-songwriter (born 1977)Chris MartinMartin performing with Coldplay in 2017BornChristopher Anthony John Martin (1977-03-02) 2 March 1977 (age\xa046)Exeter, Devon, EnglandAlma\xa0materUniversity College LondonOccupationsSingersongwritermusicianproducerphilanthropistYears\xa0active1997–presentSpouseGwyneth Paltrow\u200b \u200b(m.\xa02003; div.\xa02016)\u200bPartner(s)Dakota Johnson(2017–present)Children2AwardsFull listMusical careerOriginLondon, EnglandGenresAlternative rockpop rockpost-BritpoppopInstrumentsVocalspianokeyboardsguitarharmonicaLabelsParlophoneAtlanticCapitolMember ofColdplayMusical artistWebsitecoldplay.comSignatureChristopher Anthony John Martin (born 2 March 1977) is an English singer-songwriter and musician. He is best known as the lead vocalist, pianist, rhythm guitarist and co-founder of the rock b

PROMPT CREATION


In [63]:
def create_prompt(context, query):
    """Assemble the completion prompt from retrieved context and a question.

    The returned string is laid out as: the fixed instruction line,
    ``context``, a blank line, ``query``, and a trailing newline.
    """
    instructions = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    pieces = (instructions, context, "\n\n", query, "\n")
    return "".join(pieces)

def generate_answer(prompt):
    """Send ``prompt`` to the completion endpoint and return the answer text.

    The request uses the ``text-davinci-003`` model with temperature 0, at
    most 256 completion tokens, and ``' END'`` as the stop sequence. The
    text of the first returned choice is stripped of surrounding whitespace
    before being returned.
    """
    request_options = dict(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=[' END'],
    )
    completion = openai.Completion.create(**request_options)
    answer_text = completion.choices[0].text
    return answer_text.strip()

In [70]:
# Retrieve the two best-matching chunks for the question: `docs` receives
# the matches' titles, `res` their chunk texts.
query = "Who is chris martin's wife?"
docs,res = find_match(query,2)

In [71]:
# Join the retrieved passages, separated by blank lines, into one context
# string, then build and display the final prompt.
context= "\n\n".join(res)
prompt = create_prompt(context,query)
print(prompt)
     

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' 
Coldplay. For other people, see Chris Martin (disambiguation).English singer-songwriter (born 1977)Chris MartinMartin performing with Coldplay in 2017BornChristopher Anthony John Martin (1977-03-02) 2 March 1977 (age 46)Exeter, Devon, EnglandAlma materUniversity College LondonOccupationsSingersongwritermusicianproducerphilanthropistYears active1997–presentSpouseGwyneth Paltrow​ ​(m. 2003; div. 2016)​Partner(s)Dakota Johnson(2017–present)Children2AwardsFull listMusical careerOriginLondon, EnglandGenresAlternative rockpop rockpost-BritpoppopInstrumentsVocalspianokeyboardsguitarharmonicaLabelsParlophoneAtlanticCapitolMember ofColdplayMusical artistWebsitecoldplay.comSignatureChristopher Anthony John Martin (born 2 March 1977) is an English singer-songwriter and 

In [72]:
# Send the assembled prompt to the completion model and print its answer.
reply = generate_answer(prompt)
print(reply)

Gwyneth Paltrow​ ​(m. 2003; div. 2016)​
