In [51]:
# Helper to read local files
import os
from dotenv import load_dotenv

# Vector Support
# from langchain.vectorstores import FAISS >> instead of FAISS we will use Chroma
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Model and chain
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

# Text splitters
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

# Schemas
from langchain.schema import HumanMessage, SystemMessage, AIMessage

In [23]:
# Read the OpenAI credentials (from a .env file when present) and build the
# embedding model and chat model shared by every cell below.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'YourAPIKey')

# Embedding model used by the vector store.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    disallowed_special=(),
)
# Chat model used by the question-answering chains.
llm = ChatOpenAI(
    model_name='gpt-4',
    temperature=0.5,
    openai_api_key=OPENAI_API_KEY,
)


In [28]:
# Load the sources of the `thefuzz` library as documents for the vector store.
# The corpus lives in `data/thefuzz`, a sibling of the notebook's working
# directory. os.path.dirname keeps the result absolute even when the working
# directory sits directly below the filesystem root.
path = os.path.join(os.path.dirname(os.getcwd()), "data", "thefuzz")

documents = []
skipped_files = []  # (file path, error repr) for every file that failed to load
for dirpath, dirnames, filenames in os.walk(path):
    # Go through each file
    for file in filenames:
        file_path = os.path.join(dirpath, file)
        try:
            # Load up the file as a doc and split
            loader = TextLoader(file_path, encoding='utf-8')
            documents.extend(loader.load_and_split())
        except Exception as e:
            # Best effort: an unreadable file must not abort the whole load,
            # but remember it so the failure is visible in the summary below.
            skipped_files.append((file_path, repr(e)))

if not os.path.isdir(path):
    print(f"WARNING: data directory not found: {path}")
print(f"Loaded {len(documents)} document chunks; skipped {len(skipped_files)} files.")
for skipped_path, reason in skipped_files:
    print(f"  skipped {skipped_path}: {reason}")

[Document(page_content='import unittest\nimport re\nimport pycodestyle\n\nfrom thefuzz import fuzz\nfrom thefuzz import process\nfrom thefuzz import utils\nfrom thefuzz.string_processing import StringProcessor\n\n\nclass StringProcessingTest(unittest.TestCase):\n    def test_replace_non_letters_non_numbers_with_whitespace(self):\n        strings = ["new york mets - atlanta braves", "Cães danados",\n                   "New York //// Mets $$$", "Ça va?"]\n        for string in strings:\n            proc_string = StringProcessor.replace_non_letters_non_numbers_with_whitespace(string)\n            regex = re.compile(r"(?ui)[\\W]")\n            for expr in regex.finditer(proc_string):\n                self.assertEqual(expr.group(), " ")\n\n    def test_dont_condense_whitespace(self):\n        s1 = "new york mets - atlanta braves"\n        s2 = "new york mets atlanta braves"\n        p1 = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s1)\n        p2 = StringProcessor.replac

In [30]:
## creating the vectorstores

# Directory holding the persistent Chroma DB.
CHROMA_DIR = "./chroma_db"

# Reuse the persisted store when it already holds data; build it from the
# loaded documents only when nothing has been persisted yet. This replaces
# manually (un)commenting the create/load lines, so a fresh kernel run works
# on a new checkout and a re-run does not embed the corpus again.
# NOTE(review): running the creation step repeatedly against the same
# directory presumably appends the same documents again, which would explain
# duplicate hits in similarity searches; confirm.
has_persisted_store = os.path.isdir(CHROMA_DIR) and bool(os.listdir(CHROMA_DIR))

if has_persisted_store or not documents:
    # load a persistent chroma DB (also the fallback when there is nothing to index)
    db = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
else:
    # create a persistent chroma DB
    db = Chroma.from_documents(documents, embeddings, persist_directory=CHROMA_DIR)

# Chroma.as_retriever() + retrievalQA chain
This example turns the Chroma wrapper object into a retriever and passes it directly to a RetrievalQA chain. The default chain_type is "stuff".

In [35]:
# RetrievalQA over the Chroma store, relying on the default chain_type (stuff).
query = "What is thefuzz"
retriever = db.as_retriever()  # a VectorStoreRetriever over the store
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True,
)
result = qa_chain.run(query)
print(result)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
The `thefuzz` is a Python library that provides fuzzy string matching and similarity functions. It is often used for tasks such as string matching, searching, and deduplication. The library offers a variety of algorithms and methods to calculate the similarity between strings, which allows users to find matches even when there are slight differences or variations in the strings. Some common use cases include search and retrieval, data cleaning and preprocessing, natural language processing, and spell checking and correction.


In [None]:
# chain_type = refine (overrides the default, stuff);

In [44]:
# Same question as the stuff-chain run, but with chain_type='refine'
# instead of the default (stuff).
query = "What is thefuzz"
retriever = db.as_retriever()  # a VectorStoreRetriever over the store
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='refine',
    retriever=retriever,
    verbose=True,
)
result = qa_chain.run(query)
print(result)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
The term "thefuzz" does not appear to be related to the context provided. In the context of Python programming, `thefuzz` is a library that provides tools for fuzzy string matching and similarity calculations. However, in the context of Joseph Mourinho being a football coach, "thefuzz" does not have a clear relevance. If you could provide more specific details or clarify the connection, I would be able to give a more accurate answer.


# Similarity search + loadQA Chain(chain_type='stuff')

In [62]:
# Fetch the five chunks most similar to the question and print a short
# preview of each one.
query = "Show me the common use cases of thefuzz, along with code examples"
retrieved_docs = db.similarity_search(query=query, k=5)
for document in retrieved_docs:
    preview = document.page_content[:75]
    print(preview, '\n---SEPARATOR---', sep='\n')
# Note: the printed previews show the same document being matched more than once.

The `thefuzz` library is a Python library that provides fuzzy string matchi

---SEPARATOR---
The `thefuzz` library is a Python library that provides fuzzy string matchi

---SEPARATOR---
------------------------------------------------------------
OVERIVEW:
----

---SEPARATOR---
------------------------------------------------------------
OVERIVEW:
----

---SEPARATOR---
New Notes:

Milestone based learning path.

Milestone 1: Getting Started
- 

---SEPARATOR---


In [63]:
# Answer the question with a "stuff" QA chain fed by the documents returned
# from the similarity search.
qa_chain = load_qa_chain(llm=llm, chain_type="stuff", verbose=True)
result = qa_chain.run(input_documents=retrieved_docs, question=query)
result



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
The `thefuzz` library is a Python library that provides fuzzy string matching and similarity functions. It is commonly used for tasks such as string matching, searching, and deduplication. The library offers various algorithms and methods to calculate the similarity between strings, allowing users to find matches even when there are slight differences or variations in the strings.

Some common use cases of the `thefuzz` library include:

1. Search and retrieval: The library can be used to search for similar strings in a large dataset. This is useful in applications like search engines, autocomplete suggestions, and data deduplication.

2. Data cleaning and preprocessing: T

'Sure, here are some common use cases of the `thefuzz` library along with code examples:\n\n1. **Search and Retrieval**\n   \n   You can use the `thefuzz` library to search for similar strings in a large dataset. The `process.extract()` function can be used to find the strings that are most similar to a given query string.\n   \n   ```python\n   from thefuzz import process\n   \n   choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]\n   print(process.extract("new york jets", choices, limit=2))\n   ```\n   This will output the 2 choices that are most similar to "new york jets".\n\n2. **Data Cleaning and Preprocessing**\n\n   The `thefuzz` library can be used to identify and handle variations and inconsistencies in strings, which is useful for data cleaning and preprocessing. The `fuzz.ratio()` function can be used to calculate the similarity between two strings.\n   \n   ```python\n   from thefuzz import fuzz\n   \n   print(fuzz.ratio("new york mets", "ne

# MMR + loadQAChains

In [None]:
# API sketch: maximal-marginal-relevance (MMR) search on the vector store.
# Uses the `db` store and an explicit k (the same k as the similarity search)
# so the cell runs on a fresh kernel.
results = db.max_marginal_relevance_search(query, k=5)

In [60]:
# Retrieve ten chunks with maximal-marginal-relevance (MMR) search.
query = "Show me the common use cases of thefuzz, with code examples"
mmr_retrieved_docs = db.max_marginal_relevance_search(query, k=10)

# Print a short preview per hit instead of dumping every full Document, using
# the same preview format as the similarity-search cell so the two result
# sets can be compared.
for document in mmr_retrieved_docs:
    print(document.page_content[:75])
    print('\n---SEPARATOR---')



In [64]:
# Answer the question with a "stuff" QA chain fed by the documents returned
# from the MMR search.
qa_chain = load_qa_chain(llm=llm, chain_type="stuff", verbose=True)
result = qa_chain.run(input_documents=mmr_retrieved_docs, question=query)
print(result)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
The `thefuzz` library is a Python library that provides fuzzy string matching and similarity functions. It is commonly used for tasks such as string matching, searching, and deduplication. The library offers various algorithms and methods to calculate the similarity between strings, allowing users to find matches even when there are slight differences or variations in the strings.

Some common use cases of the `thefuzz` library include:

1. Search and retrieval: The library can be used to search for similar strings in a large dataset. This is useful in applications like search engines, autocomplete suggestions, and data deduplication.

2. Data cleaning and preprocessing: T