In [None]:
!pip install -q langchain==0.0.208 deeplake openai tiktoken python-dotenv newspaper3k

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.7 MB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m25.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from dotenv import load_dotenv

# Create the .env file with plain Python instead of `!echo "..." > .env`:
# shell quoting and redirection differ between platforms (Windows cmd, for
# example, writes the surrounding double quotes into the file), whereas this
# produces the same two lines everywhere.
# Replace the <...> placeholders with your own keys and never commit .env.
with open(".env", "w", encoding="utf-8") as env_file:
    env_file.write("OPENAI_API_KEY='<OPENAI_API_KEY>'\n")
    env_file.write("ACTIVELOOP_TOKEN='<ACTIVELOOP_TOKEN>'\n")

# Load the variables defined in .env into the process environment; as the last
# expression of the cell, its boolean result is displayed.
load_dotenv()

True

In [None]:
import requests
from newspaper import Article # https://github.com/codelucas/newspaper
import time

# Browser-like User-Agent sent with every request made through `session` below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/"
]

session = requests.Session()
pages_content = [] # where we save the scraped articles, as {"url": ..., "text": ...} dicts

for url in article_urls:
    try:
        time.sleep(2) # sleep two seconds for gentle scraping
        response = session.get(url, headers=headers, timeout=10)

        if response.status_code == 200:
            article = Article(url)
            # Hand newspaper the HTML we already fetched. Calling download()
            # with no argument would request the page a second time, and that
            # request would not go through `session`, `headers` or `timeout`.
            article.download(input_html=response.text)
            article.parse() # parse HTML to extract the article text
            pages_content.append({ "url": url, "text": article.text })
        else:
            print(f"Failed to fetch article at {url}")
    except Exception as e:
        # Deliberately broad best-effort handling: if anything goes wrong for
        # one article we report it and move on, so that the remaining
        # articles can still be processed.
        print(f"Error occurred while fetching article at {url}: {e}")

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

# Location of the vector store on Activeloop: hub://<organization id>/<dataset name>.
my_activeloop_org_id = "<Your_Organization_Id>" # TODO: use your organization id here
my_activeloop_dataset_name = "langchain_course_qabot_with_source"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Embedding model handed to the vector store as its embedding function.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Connect to the Deep Lake dataset at `dataset_path`
# (in the recorded run the dataset is reported as created and then loaded).
db = DeepLake(
    embedding_function=embeddings,
    dataset_path=dataset_path,
)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/ala/langchain_course_qabot_with_source


 

hub://ala/langchain_course_qabot_with_source loaded successfully.




In [None]:
# We split the article texts into small chunks. While doing so, we keep track of each
# chunk's metadata (i.e. the URL it comes from). Each metadata entry is a dictionary, and
# the document source must be stored under the "source" key so that the
# RetrievalQAWithSourcesChain class can later retrieve it automatically
# from the metadata dictionary.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of at most 1000 characters, with 100 characters of overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Two parallel lists: all_texts[i] is a chunk and all_metadatas[i] records the
# URL of the article that chunk was cut from.
all_texts, all_metadatas = [], []
for page in pages_content:
    page_chunks = text_splitter.split_text(page["text"])
    all_texts.extend(page_chunks)
    # one fresh metadata dict per chunk
    all_metadatas.extend([{ "source": page["url"] } for _ in page_chunks])

In [None]:
# Store every chunk in the Deep Lake dataset together with its metadata dict;
# the call's return value is displayed as the cell output.
db.add_texts(texts=all_texts, metadatas=all_metadatas)

Evaluating ingest: 100%|██████████| 1/1 [00:21<00:00
/

Dataset(path='hub://ala/langchain_course_qabot_with_source', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype     shape      dtype  compression
  -------   -------   -------    -------  ------- 
 embedding  generic  (49, 1536)  float32   None   
    ids      text     (49, 1)      str     None   
 metadata    json     (49, 1)      str     None   
   text      text     (49, 1)      str     None   


 

['a9ac1d0c-ffe6-11ed-8434-0242ac1c000c',
 'a9ac1ece-ffe6-11ed-8434-0242ac1c000c',
 'a9ac1faa-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2054-ffe6-11ed-8434-0242ac1c000c',
 'a9ac20f4-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2180-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2216-ffe6-11ed-8434-0242ac1c000c',
 'a9ac22a2-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2338-ffe6-11ed-8434-0242ac1c000c',
 'a9ac23ce-ffe6-11ed-8434-0242ac1c000c',
 'a9ac245a-ffe6-11ed-8434-0242ac1c000c',
 'a9ac24e6-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2572-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2608-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2694-ffe6-11ed-8434-0242ac1c000c',
 'a9ac272a-ffe6-11ed-8434-0242ac1c000c',
 'a9ac27c0-ffe6-11ed-8434-0242ac1c000c',
 'a9ac284c-ffe6-11ed-8434-0242ac1c000c',
 'a9ac28ce-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2964-ffe6-11ed-8434-0242ac1c000c',
 'a9ac29e6-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2a72-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2b08-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2b94-ffe6-11ed-8434-0242ac1c000c',
 'a9ac2c20-ffe6-

In [None]:
# We create a RetrievalQAWithSourcesChain, which is very similar to a
# standard retrieval QA chain but also keeps track of the sources of the
# retrieved documents.

from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

# Completion model used to write the answers, with temperature set to 0.
# NOTE(review): OpenAI appears to have retired "text-davinci-003" since this
# notebook was written — confirm the model is still available before running.
llm = OpenAI(temperature=0, model_name="text-davinci-003")

# Question-answering chain on top of the Deep Lake retriever.
# NOTE(review): "stuff" presumably places all retrieved chunks into a single
# prompt — confirm against the LangChain documentation.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=db.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

In [None]:
# Ask the chain a question. The response object is a dictionary containing an
# "answer" field with the textual answer, and a "sources" field: one string made
# of the concatenation of the metadata["source"] strings of the retrieved documents.
d_response = chain({"question": "What does Geoffrey Hinton think about recent trends in AI?"})

print("Response:")
print(d_response["answer"])
print("Sources:")
# Split the concatenated string back into individual entries. Blank entries are
# dropped so that an empty "sources" string does not print a bare "- " bullet.
cited_sources = [s.strip() for s in d_response["sources"].split(", ") if s.strip()]
for source in cited_sources:
    print("- " + source)
if not cited_sources:
    print("- (no sources returned)")

Response:
 Geoffrey Hinton has expressed concerns about the potential dangers of AI, such as false text, images, and videos created by AI, and the impact of AI on the job market. He believes that AI has the potential to replace humans as the dominant species on Earth.

Sources:
- https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/
- https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/
