In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, CSVLoader
from langchain.document_loaders import DirectoryLoader

In [None]:
# Load the CSV files in /content/ that match the "./*final.csv" glob.
# The sample output further down shows CSVLoader documents carrying a 'row'
# and a 'source' entry in their metadata.
loader = DirectoryLoader(path='/content/', glob="./*final.csv", loader_cls=CSVLoader)

documents = loader.load()

# Fail fast when nothing was loaded: otherwise the problem only shows up later,
# as a confusing error while embedding an empty list of chunks.
if not documents:
    raise FileNotFoundError(
        "No documents were loaded from '/content/' with glob './*final.csv'"
    )

In [None]:
# Split the loaded documents into overlapping chunks
# (chunk_size=500, chunk_overlap=200) ready for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=500,
)
texts = text_splitter.split_documents(documents)

In [None]:
!pip -q install langchain openai tiktoken chromadb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.8/479.8 kB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.9/92.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
from getpass import getpass

# Keep the key out of the notebook source: read it from the OPENAI_API_KEY
# environment variable, and fall back to a non-echoing prompt when it is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
# Embed the chunks and store them in a Chroma vector store.
# Passing persist_directory keeps the embeddings on disk.
persist_directory = 'db'

# OpenAI embeddings are used to vectorise the chunks.
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

# NOTE(review): re-running this cell against an existing 'db' directory
# presumably embeds and stores the same chunks again - confirm before re-running.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the vector store to disk, then drop the in-memory reference so that
# the next cell has to reopen it from persist_directory.
vectordb.persist()
vectordb = None

In [None]:
# Reopen the persisted database from disk and use it as before.
# (A very large database would typically be stored in a cloud-hosted database instead.)
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Wrap the vector store in a retriever, using its default search settings.
retriever = vectordb.as_retriever()

In [None]:
# Smoke-test retrieval on its own, using a retailer name as the query.
docs = retriever.get_relevant_documents(
    "TARGET"
)

In [None]:
# Display the first retrieved document (page content plus metadata) as text.
str(docs[0])

'page_content=": 380\\nBRAND: LOREAL PARIS COSMETICS\\nPRODUCT_CATEGORY: Makeup\\nOFFER: L\'Oréal Paris Makeup, spend $35 at Target\\nRETAILER: TARGET" metadata={\'row\': 380, \'source\': \'/content/final.csv\'}'

In [None]:
# Build a RetrievalQA chain (chain_type "stuff") over the retriever, backed by
# an OpenAI LLM. The retrieved source documents are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=openai_api_key),
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by one line per source document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of objects that expose a
            ``metadata`` mapping).

    Each source line shows ``metadata['source']``. When the metadata also
    carries a 'row' entry, it is appended so that several chunks from the same
    file can be told apart. A document without a 'source' entry is reported as
    '<unknown source>' instead of raising ``KeyError``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        metadata = source.metadata or {}
        origin = metadata.get('source', '<unknown source>')
        row = metadata.get('row')
        print(origin if row is None else f"{origin} (row {row})")

In [None]:
# Run a one-word query through the chain and print the answer with its sources.
query = "amazon"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 Amazon is a retailer.


Sources:
/content/final.csv
/content/final.csv
/content/final.csv
/content/final.csv


In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [None]:
# Rebuild the retriever with search_kwargs k=3.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

In [None]:
# Chat model for the second chain: gpt-3.5-turbo-16k at temperature 0.
turbo_llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-3.5-turbo-16k',
    temperature=0,
)

In [None]:
# Recreate the question-answering chain, this time on top of the chat model and
# the k=3 retriever. Source documents are still returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=turbo_llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
## Cite sources
# NOTE(review): this redefines process_llm_response from an earlier cell; the
# two definitions should be kept identical (or this one removed).
def process_llm_response(llm_response):
    """Print the chain's answer followed by one line per source document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of objects that expose a
            ``metadata`` mapping).

    Each source line shows ``metadata['source']``. When the metadata also
    carries a 'row' entry, it is appended so that several chunks from the same
    file can be told apart. A document without a 'source' entry is reported as
    '<unknown source>' instead of raising ``KeyError``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        metadata = source.metadata or {}
        origin = metadata.get('source', '<unknown source>')
        row = metadata.get('row')
        print(origin if row is None else f"{origin} (row {row})")

In [None]:
# full example
query = "Amazon"

# Two fixes to the prompt text:
#   * A backslash at the end of a line inside a (non-raw) triple-quoted string
#     removes the newline, so the original text read "...databaseto identify...".
#     The continuation now has a space before it, and the other stray
#     backslashes are dropped.
#   * The prompt says the query is "in curly braces", so it is now actually
#     wrapped in literal braces ({{ and }} are escaped braces in an f-string).
# NOTE(review): the whole instruction text is passed as the chain's query, so it
# is presumably also what the retriever searches with - consider moving the
# instructions into the chain's prompt template and passing only `query`.
prompt = f'''
You are an intelligent AI bot with capability to search based on user query and look in the database \
to identify all relevant offers for the same. If you dont find offers, then return all the information you find related to the query.

Consider below instructions for instance

Instruction:
•	If a user searches for a category (ex. diapers) the tool should return a list of offers that are relevant to that category.
•	If a user searches for a brand (ex. Huggies) the tool should return a list of offers that are relevant to that brand.
•	If a user searches for a retailer (ex. Target) the tool should return a list of offers that are relevant to that retailer.


Here is the user query in curly braces:
{{{query}}}

'''

llm_response = qa_chain(prompt)
process_llm_response(llm_response)

Based on the given context, there is an offer available for the retailer "Amazon" for the brand "KRADLE" in the product category "Dog Supplies". The offer is for Kradle, select varieties, and it is available online at Amazon.


Sources:
/content/final.csv
/content/final.csv
/content/final.csv


In [None]:
# Archive the persisted Chroma directory (db/) recursively into db.zip.
# NOTE(review): the captured listing shows a stray `db/.zip` entry inside the
# directory - check whether that file is meant to be shipped.
!zip -r db.zip db/


  adding: db/ (stored 0%)
  adding: db/b6288cab-e144-4c3e-accc-c71fafc786bd/ (stored 0%)
  adding: db/b6288cab-e144-4c3e-accc-c71fafc786bd/length.bin (deflated 47%)
  adding: db/b6288cab-e144-4c3e-accc-c71fafc786bd/link_lists.bin (stored 0%)
  adding: db/b6288cab-e144-4c3e-accc-c71fafc786bd/header.bin (deflated 61%)
  adding: db/b6288cab-e144-4c3e-accc-c71fafc786bd/data_level0.bin (deflated 100%)
  adding: db/.zip (stored 0%)
  adding: db/chroma.sqlite3 (deflated 41%)
