In [None]:
import os
import sys

import openai

# Add the directory two levels above the working directory to the import path.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Load variables from the located .env file; the boolean result is discarded.
_ = load_dotenv(find_dotenv())

# Raises KeyError if OPENAI_API_KEY is still missing from the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.document_loaders import PyPDFLoader

# Source PDFs. The CNN article appears twice on purpose to simulate
# messy, duplicated input data.
pdf_paths = [
    "docs/cnn article.pdf",
    "docs/cnn article.pdf",
    "docs/nytimes article 1.pdf",
    "docs/nytimes article 2.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Concatenate everything each loader returns into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of size 1500 with an overlap of 150 between neighbouring chunks
# (presumably measured in characters -- confirm against the splitter's docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
# Break the loaded documents into chunks with the splitter configured above.
splits = text_splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the split step.
len(splits)

## Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding client built with library defaults; it is reused below both for the
# toy sentences and for populating the vector store.
# NOTE(review): presumably picks up the OpenAI key from the environment -- confirm.
embedding = OpenAIEmbeddings()

In [None]:
# Two near-synonymous sentences plus one unrelated sentence, used to
# illustrate how embedding similarity behaves.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [None]:
# Embed the three sentences in order; one embed_query call per sentence.
embedding1, embedding2, embedding3 = map(
    embedding.embed_query,
    (sentence1, sentence2, sentence3),
)

In [None]:
import numpy as np

In [None]:
# Dot product of the "dogs" and "canines" embeddings (the two similar sentences).
# NOTE(review): this equals cosine similarity only if the vectors are unit-length -- confirm for the model in use.
np.dot(embedding1, embedding2)

In [None]:
# Dot product of the "dogs" and "weather" embeddings (unrelated sentences);
# compare with the sentence1/sentence2 score.
np.dot(embedding1, embedding3)

In [None]:
# Dot product of the "canines" and "weather" embeddings (unrelated sentences).
np.dot(embedding2, embedding3)

## Vectorstores

In [None]:
# Requires the chromadb package; if it is missing, install it first,
# e.g. with `%pip install chromadb`.
from langchain.vectorstores import Chroma

In [None]:
# Directory handed to Chroma below as its persist location (relative to the working directory).
persist_directory = 'docs/chroma/'

In [None]:
# Remove old database files, if any, so the store is rebuilt from scratch.
# shutil.rmtree replaces the POSIX-only `rm -rf` shell call and reuses
# `persist_directory`, so the path deleted is the one Chroma writes to.
# ignore_errors=True matches `rm -f`: a missing directory is not an error.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [None]:
# Build the Chroma store from the chunks, embedding them with the client
# created earlier and using `persist_directory` as the on-disk location.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [None]:
# Sanity check: number of entries in the store, expected to match len(splits).
# NOTE(review): `_collection` is underscore-prefixed (private by convention) and may change between library versions.
print(vectordb._collection.count())

### Similarity Search

In [None]:
# Query text for the first similarity search.
question = "is there a quick and easy skillet dinner?"

In [None]:
# Retrieve up to k=3 chunks for the question.
# NOTE: this rebinds `docs`, which previously held the loaded PDF documents;
# re-running the split cell after this one would split the search results instead.
docs = vectordb.similarity_search(question,k=3)

In [None]:
# Number of hits returned; should be 3 (the k requested) if the store holds at least that many chunks.
len(docs)

In [None]:
# Text of the first hit.
docs[0].page_content

In [None]:
# Explicitly flush the store to `persist_directory`.
# NOTE(review): newer Chroma releases reportedly persist automatically and deprecate this call -- confirm for the installed version.
vectordb.persist()

### Failure modes

In [None]:
# Query used to demonstrate the duplicate-results failure mode.
question = "retaliatory strikes came in response to a drone strike?"

In [None]:
# Retrieve up to k=5 chunks; rebinds `docs` to this new result list.
docs = vectordb.similarity_search(question,k=5)

The results contain duplicates because we loaded the CNN article twice; compare the first two hits:

In [None]:
# First hit; compare with the next cell to see the duplicate.
docs[0]

In [None]:
# Second hit; compare with docs[0] in the previous cell.
docs[1]