In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
openai_api_key = os.environ["OPENAI_API_KEY"]

## Basic RAG app with the vector database DeepLake

**Load de DeepLake credentials**

In [2]:
os.environ["ACTIVELOOP_TOKEN"] = os.environ["DEEPLAKE_API_KEY"]

In [3]:
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"]

**Name the new database you will create**

In [4]:
my_activeloop_dataset_name = "basic-rag-with-deeplake"

**Load dependencies**

In [5]:
from langchain.embeddings import OpenAIEmbeddings

In [6]:
from langchain.vectorstores import DeepLake

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
from langchain.chains import RetrievalQA

**Create the external knowledge document**

In [9]:
usa_curious_facts = [
    """
    The US celebrates Independence Day from the British Empire 
    on July 4. However, the country’s Declaration of Independence 
    was passed on July 2. It was only officially ratified on July 4.
    """,
    """
    The very first documented European to arrive in North America was 
    the Spaniard Juan Ponce de León, who landed in Florida in 1513.
    """
]

**Divide the document in smaller chunks of text**

In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 0
)

In [11]:
doc_chunks = text_splitter.create_documents(usa_curious_facts)

In [12]:
print(f"Now you have {len(doc_chunks)} chunks.")

Now you have 2 chunks.


**Create the DeepLake vector database**

In [13]:
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [14]:
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

In [15]:
#!pip install "deeplake[enterprise]"

In [16]:
db = DeepLake(
    dataset_path=dataset_path,
    embedding=embeddings
)

Deep Lake Dataset in hub://julio4ai/basic-rag-with-deeplake already exists, loading from the storage


/

**Load the chunks, will transformed into embeddings**

In [17]:
db.add_documents(doc_chunks)

Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1

Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (10, 1536)  float32   None   
    id        text      (10, 1)      str     None   
 metadata     json      (10, 1)      str     None   
   text       text      (10, 1)      str     None   





['c58da3a8-c122-11ee-8703-1e00d92e2031',
 'c58da4b6-c122-11ee-8703-1e00d92e2031']

**Create the QA Chain**

In [18]:
from langchain_openai import OpenAI

In [19]:
llm = OpenAI()

In [20]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever()
)

**Ask the App about the document**

In [21]:
qa_chain.run("When was actually passed the U.S. Declaration of Independence?")

  warn_deprecated(


' The Declaration of Independence was passed on July 2, but officially ratified on July 4.'

**Add new data to the vector database**

In [22]:
additional_usa_curious_facts = [
    """
    Alaska is the largest state in the US, and used to belong 
    to the Russian Empire before the US purchased it.
    """,
    """
    Big cities and regions have their own style of pizza: Chicago 
    Deep-Dish, New York Style, Detroit Pizza, St Louis-Style, and 
    New England Beach Pizza are just a few different varieties.
    """
]

In [23]:
additional_doc_chunks = text_splitter.create_documents(additional_usa_curious_facts)

In [24]:
db.add_documents(additional_doc_chunks)

Creating 2 embeddings in 1 batches of size 2:: 100%|█| 1/1

Dataset(path='hub://julio4ai/basic-rag-with-deeplake', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
 embedding  embedding  (12, 1536)  float32   None   
    id        text      (12, 1)      str     None   
 metadata     json      (12, 1)      str     None   
   text       text      (12, 1)      str     None   





['0a1db684-c123-11ee-8703-1e00d92e2031',
 '0a1db76a-c123-11ee-8703-1e00d92e2031']

**Ask the app about the new data**

In [25]:
qa_chain.run("What is the largest state in the US?")

' Alaska'

In [26]:
qa_chain.run("Tell me 3 states with their own style of pizza")

'\nChicago, New York, and Detroit'