In [1]:
%load_ext dotenv
%dotenv ../../05_src/.secrets

In [74]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
import os
from openai import OpenAI



OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")
else:
    client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [57]:
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="misconceptions")

In [43]:

# loader = UnstructuredURLLoader(["http://www.shadowpoetry.com/resources/famous/mayaangelou.html"])
# loader = UnstructuredURLLoader(["https://www.rd.com/list/interesting-facts/"])
loader = UnstructuredHTMLLoader("./documents/List of common misconceptions about science, technology, and mathematics - Wikipedia.htm")

In [44]:
data = loader.load()

In [45]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500, 
    chunk_overlap=25, 
    length_function = len, 
    add_start_index = True
)

In [46]:
documents = text_splitter.split_documents(data)

In [47]:
documents

[Document(metadata={'source': './documents/List of common misconceptions about science, technology, and mathematics - Wikipedia.htm', 'start_index': 0}, page_content='List of common misconceptions about science, technology, and mathematics\n\nবাংলা\n\nEdit links\n\nFrom Wikipedia, the free encyclopedia\n\nEach entry on this list of common misconceptions is worded as a correction; the misconceptions themselves are implied rather than stated. These entries are concise summaries; the main subject articles can be consulted for more detail.\n\nMain article: List of common misconceptions\n\nAstronomy and spaceflight\n\n[edit]'),
 Document(metadata={'source': './documents/List of common misconceptions about science, technology, and mathematics - Wikipedia.htm', 'start_index': 440}, page_content='[edit]\n\nThere is no scientific evidence that the motion of stars, planets, and other celestial bodies influences the fates of humans, and astrology has repeatedly been shown to have no explanatory p

In [48]:
documents = [doc.page_content for doc in documents]

In [49]:
embeddings = [get_embedding(doc) for doc in documents]

In [56]:
ids = [f'id{i}' for i in range(len(documents))]

In [51]:
def query_chromadb(query, top_n=2):
    query_embedding = get_embedding(query)
    results = collection.query(
        query_embeddings = [query_embedding],
        n_results = top_n
    )
    return [(id, score, text) for id, score, text in zip(results['ids'][0], results['distances'][0], results['documents'][0])]

In [58]:
collection.add(
    embeddings = embeddings,
    documents = documents, 
    ids = ids
)

In [60]:
query_chromadb("Are bats blind?", top_n=3)

[('id14',
  0.23180054128170013,
  'Mammals\n\n[edit]\n\nBats are not blind. While about 70% of bat species, mainly in the microbat family, use echolocation to navigate, all bat species have eyes and are capable of sight. In addition, almost all bats in the megabat or fruit bat family cannot echolocate and have excellent night vision.[29]'),
 ('id255',
  0.327432245016098,
  '^ a. Di Silvestro, Roger (February 1, 2003). "The Truth About Animal Clichés". National Wildlife Federation. Retrieved October 31, 2011. b. "Blind as a Bat?". Geneva, New York: Hobart and William Smith Colleges. June 12, 2003. Archived from the original (Press release) on June 7, 2008. Retrieved April 7, 2009.'),
 ('id287',
  0.4602612555027008,
  '^ a. Chatfield, Matthew (January 4, 2008). "Some scientist once proved that bees can\'t fly...?".. naturenet.net. The Ranger\'s Blog. b. Ivars Peterson (September 13, 2004). "Flight of the Bumblebee". Ivars Peterson\'s MathTrek. Mathematical Association of America. Retr

# Tokenization

In [71]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 100, chunk_overlap=10
)

In [72]:
documents = text_splitter.split_documents(data)

Created a chunk of size 132, which is longer than the specified 100
Created a chunk of size 110, which is longer than the specified 100
Created a chunk of size 111, which is longer than the specified 100
Created a chunk of size 116, which is longer than the specified 100
Created a chunk of size 101, which is longer than the specified 100
Created a chunk of size 109, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100
Created a chunk of size 141, which is longer than the specified 100
Created a chunk of size 125, which is longer than the specified 100
Created a chunk of size 127, which is longer than the specified 100
Created a chunk of size 112, which is longer than the specified 100
Created a chunk of size 139, which is longer than the specified 100
Created a chunk of size 106, which is longer than the specified 100
Created a chunk of size 132, which is longer than the specified 100
Created a chunk of size 203, which is longer tha

In [65]:
documents = [doc for doc in documents]

In [77]:
from langchain.vectorstores import Chroma

db = Chroma.from_documents(documents, OpenAIEmbeddings())

  db = Chroma.from_documents(documents, OpenAIEmbeddings())


In [79]:
def query_documents(query, top_n=2):
    docs = db.similarity_search(query, top_n)
    return docs

In [80]:
query_documents("Are bats blind?")

[Document(metadata={'source': './documents/List of common misconceptions about science, technology, and mathematics - Wikipedia.htm'}, page_content='Mammals\n\n[edit]\n\nBats are not blind. While about 70% of bat species, mainly in the microbat family, use echolocation to navigate, all bat species have eyes and are capable of sight. In addition, almost all bats in the megabat or fruit bat family cannot echolocate and have excellent night vision.[29]'),
 Document(metadata={'source': './documents/List of common misconceptions about science, technology, and mathematics - Wikipedia.htm'}, page_content='^ a. Di Silvestro, Roger (February 1, 2003). "The Truth About Animal Clichés". National Wildlife Federation. Retrieved October 31, 2011. b. "Blind as a Bat?". Geneva, New York: Hobart and William Smith Colleges. June 12, 2003. Archived from the original (Press release) on June 7, 2008. Retrieved April 7, 2009.')]