In [None]:
# Environment setup: extend the import path and configure the OpenAI API key.
import os
import openai
import sys
# Make modules located two levels above the current working directory importable.
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# The return value of load_dotenv is deliberately discarded by binding it to `_`.
_ = load_dotenv(find_dotenv()) # read local .env file

# Indexing os.environ (instead of using .get) raises KeyError right here if the key is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.document_loaders import PyPDFLoader

# Source PDFs. The CNN article is listed twice on purpose so the corpus
# contains duplicated (messy) data.
pdf_paths = (
    "docs/cnn article.pdf",
    "docs/cnn article.pdf",
    "docs/nytimes article 1.pdf",
    "docs/nytimes article 2.pdf",
)

# One loader per path, in the order listed above.
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Concatenate whatever each loader returns into a single flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size of 1500 with an overlap of 150 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [None]:
# Break the loaded documents into overlapping chunks using the splitter configured earlier.
splits = text_splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(splits)

### Embeddings

In [None]:
# Embedding client shared by the sentence comparisons and the vector store later in the notebook.
from langchain_openai import OpenAIEmbeddings
# Built with default arguments; presumably picks up OPENAI_API_KEY from the environment — TODO confirm.
embedding = OpenAIEmbeddings()

In [None]:
# Three probe sentences: the first two are near-synonyms, the third is unrelated.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside",
)

In [None]:
# Embed each probe sentence, in order, with the shared embedding client.
embedding1, embedding2, embedding3 = [
    embedding.embed_query(text)
    for text in (sentence1, sentence2, sentence3)
]

In [11]:
import numpy as np

In [12]:
# Dot product used as a similarity score between the two near-synonymous sentences.
# NOTE(review): this equals cosine similarity only if the embeddings are unit-length — confirm for the model in use.
np.dot(embedding1, embedding2)

0.9631675619330522

In [13]:
# Similarity of the "dogs" sentence against the unrelated weather sentence.
np.dot(embedding1, embedding3)

0.7710630976675937

In [14]:
# Similarity of the "canines" sentence against the unrelated weather sentence.
np.dot(embedding2, embedding3)

0.7596682675219122

## Vectorstores

In [30]:
# ! pip install chromadb
from langchain.vectorstores import Chroma

In [16]:
# Directory handed to Chroma as `persist_directory` when the vector store is built.
persist_directory = 'docs/chroma/'

In [17]:
# Remove old database files, if any.
# shutil.rmtree replaces the original `!rm -rf` shell escape: it does not need a
# POSIX shell, so the cell behaves the same on every OS, and ignore_errors=True
# mirrors the `-f` flag by treating a missing directory as a no-op.
import shutil

shutil.rmtree("./docs/chroma", ignore_errors=True)

In [18]:
# Build the Chroma vector store from the chunks in `splits`, using `embedding`
# as the embedding function and `persist_directory` as the storage location.
vectordb = Chroma.from_documents(
    splits,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [31]:
# Print how many entries the underlying collection holds.
# NOTE(review): `_collection` is a private (underscore-prefixed) attribute, so this may break across library versions.
print(vectordb._collection.count())

38


### Similarity Search

In [23]:
# Query whose wording closely matches a phrase in one of the indexed recipe articles.
question = "is there a quick and easy skillet dinner?"

In [24]:
# Fetch 3 chunks from the vector store for the question.
# NOTE(review): this rebinds `docs`, which previously held the loaded PDF documents; re-running
# the split cell after this point would chunk search results instead of the source documents.
docs = vectordb.similarity_search(question,k=3)

In [25]:
# Check how many results came back (k=3 was requested).
len(docs)

3

In [26]:
# Text of the first returned chunk.
docs[0].page_content

'Cheesy\nGreen\nChile\nBean\nBake\nBy\nAli\nSlagle\nUpdated\nJan.\n22,\n2024\nKerri\nBrewer\nfor\nThe\nNew\nYork\nTimes.\nFood\nStylist:\nBarrett\nWashburne.\nTotal\nTime\n25\nminutes\nPrep\nTime\n10\nminutes\nCook\nTime\n15\nminutes\nRating\n4(937)\nNotes\nRead\n126\ncommunity\nnotes\nThis\nquick\nand\neasy\nskillet\ndinner\nis\nsomewhere\nbetween\nan\neasy\ntake\non\nenchiladas\nverdes\n(no\nrolling\nrequired)\nand\na\ncomplete-meal\nrendition\nof\nchile\ncon\nqueso\n.\nThe\nbase\nis\na\nmixture\nof\ncharred\ngreen\nchiles,\npinto\nbeans\nand\nstore-bought\nor\nhomemade\nsalsa\nverde\n,\nwhich\ndelivers\nthe\nbrightness\nof\ntomatillos,\nchiles,\ncilantro\nand\nlime.\nBlanket\nthe\nbean-and-chile\nmixture\nwith\nMonterey\nJack\ncheese,\nthen\nbroil\nuntil\nmolten.\nServe\nwith\ntortillas,\ntortilla\nchips,\nrice,\na\nbaked\nsweet\npotato\nor\nfried\neggs.\nYou\ncould\nalso\nadd\nMexican\nchorizo,\nbacon\nor\nspinach\nwith\nthe\nchiles,\nor\nleftover\npulled\nchicken\nwith\nthe\nbeans

In [27]:
# Explicitly persist the vector store.
# NOTE(review): newer Chroma releases reportedly persist automatically and deprecate this call — confirm against the installed version.
vectordb.persist()

### Failure modes

In [32]:
# Query phrased almost verbatim like a sentence in the CNN article, which was loaded twice.
question = "retaliatory strikes came in response to a drone strike?"

In [33]:
# Fetch 5 chunks this time so duplicated chunks can show up side by side.
# NOTE(review): rebinds `docs` again; the originally loaded PDF documents are no longer reachable under this name.
docs = vectordb.similarity_search(question,k=5)

There are duplicate results because we loaded the CNN article twice:

In [34]:
# First returned result.
docs[0]

Document(page_content='The\nUS\nconducted\nmajor\nairstrikes\non\n85\ntargets\nin\nIraq\nand\nSyria\non\nFriday,\nthe\nstart\nof\nwhat\nwill\nlikely\nbe\na\nseries\nof\nlarger-scale\nUS\nstrikes\non\nIranian-backed\nmilitias\nwho\nhave\ncarried\nout\nattacks\non\nUS\ntroops\nin\nthe\nMiddle\nEast.\nThe\nretaliatory\nstrikes\ncame\nin\nresponse\nto\na\ndrone\nstrike\nby\nIran-backed\nmilitants\non\na\nUS\nmilitary\noutpost\nin\nJordan\non\nSunday,\nwhich\nkilled\nthree\nUS\nservice\nmembers\nand\nwounded\nmore\nthan\n40\nothers.\nUS\nCentral\nCommand\nconfirmed\nin\na\nstatement\nthat\nairstrikes\nwere\ncarried\nout\nin\nIraq\nand\nSyria\n“against\nIran’s\nIslamic\nRevolutionary\nGuards\nCorps\n(IRGC)\nQuds\nForce\nand\naffiliated\nmilitia\ngroups.”\n“U.S.\nmilitary\nforces\nstruck\nmore\nthan\n85\ntargets,\nwith\nnumerous\naircraft\nto\ninclude\nlong-range\nbombers\nflown\nfrom\nUnited\nStates.\nThe\nairstrikes\nemployed\nmore\nthan\n125\nprecision\nmunitions,”\nthe\nstatement\nsaid.\n

In [35]:
# Second returned result — compare its content with the first.
docs[1]

Document(page_content='The\nUS\nconducted\nmajor\nairstrikes\non\n85\ntargets\nin\nIraq\nand\nSyria\non\nFriday,\nthe\nstart\nof\nwhat\nwill\nlikely\nbe\na\nseries\nof\nlarger-scale\nUS\nstrikes\non\nIranian-backed\nmilitias\nwho\nhave\ncarried\nout\nattacks\non\nUS\ntroops\nin\nthe\nMiddle\nEast.\nThe\nretaliatory\nstrikes\ncame\nin\nresponse\nto\na\ndrone\nstrike\nby\nIran-backed\nmilitants\non\na\nUS\nmilitary\noutpost\nin\nJordan\non\nSunday,\nwhich\nkilled\nthree\nUS\nservice\nmembers\nand\nwounded\nmore\nthan\n40\nothers.\nUS\nCentral\nCommand\nconfirmed\nin\na\nstatement\nthat\nairstrikes\nwere\ncarried\nout\nin\nIraq\nand\nSyria\n“against\nIran’s\nIslamic\nRevolutionary\nGuards\nCorps\n(IRGC)\nQuds\nForce\nand\naffiliated\nmilitia\ngroups.”\n“U.S.\nmilitary\nforces\nstruck\nmore\nthan\n85\ntargets,\nwith\nnumerous\naircraft\nto\ninclude\nlong-range\nbombers\nflown\nfrom\nUnited\nStates.\nThe\nairstrikes\nemployed\nmore\nthan\n125\nprecision\nmunitions,”\nthe\nstatement\nsaid.\n