# AI Assitant Chatbot Using Langchain and RAG

In [1]:
# one time pip installs

# !pip install chromadb tiktoken pypdf docx2txt regex

In [2]:
# import necessary libraries

import os # for openai
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader, TextLoader, PyPDFLoader, Docx2txtLoader
from langchain.vectorstores import DocArrayInMemorySearch, Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
import re # regex

In [3]:
# openai apikey

api = 'config.py'
def get_file_contents(api):
    """ Given a filename,
        return the contents of that file
    """
    try:
        with open(api, 'r') as f:
            # It's assumed our file contains a single line,
            # with our API key
            return f.read().strip()
    except FileNotFoundError:
        print("'%s' file not found" % filename)

In [4]:
# set the llm and apikey

api_key = get_file_contents(api)
os.environ["OPENAI_API_KEY"] = api_key

llm = OpenAI() # default in langchain is gpt-3.5-turbo

## First iteration: validation of Langchain doc loader according to file type

**Em Thompson**

In [8]:

file = "data/emthompson1.pdf"

# instatiate document loader according to file type (based on regex for validation)
re_doc = '\.(docx|DOCX)$'
re_pdf = '\.(pdf|PDF)$'
re_txt = '\.(txt|TXT)$'

if re.search(re_doc, file) != None:
    loader = Docx2txtLoader(file_path = file)
elif re.search(re_pdf, file) != None:
    loader = PyPDFLoader(file_path = file)
elif re.search(re_txt, file) != None:
    loader = TextLoader(file_path = file)
else: print("Please upload a .docx, .pdf. or .txt file.")

documents = loader.load()

# split the documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=5)
texts = text_splitter.split_documents(documents)

# select which embeddings we want to use
embeddings = OpenAIEmbeddings()

# create the vectorestore to use as the index
db = Chroma.from_documents(texts, embeddings)

# expose this index in a retriever interface
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k":5}) # default: cosine similarity; k = 2 is alright, because the article is small

# create a chain to answer questions
qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                 chain_type="stuff", 
                                 retriever=retriever,
                                 return_source_documents=True)

In [9]:
query = "Would Emily be interested in plastic packaging?"
result = qa({"query": query})
result['result']

" No, Emily's commitment to environmental consciousness and sustainable practices would likely mean that she would not be interested in plastic packaging."

In [10]:
query = "How old is Emily?"
result = qa({"query": query})
result['result']

' Emily is 35 years old.'

**Anne Frank**

In [14]:

af_file = "data/sample.docx"

# instatiate document loader according to file type (based on regex for validation)
re_doc = '\.(docx|DOCX)$'
re_pdf = '\.(pdf|PDF)$'
re_txt = '\.(txt|TXT)$'

if re.search(re_doc, af_file) != None:
    af_loader = Docx2txtLoader(file_path = af_file)
elif re.search(re_pdf, af_file) != None:
    af_loader = PyPDFLoader(file_path = af_file)
elif re.search(re_txt, af_file) != None:
    af_loader = TextLoader(file_path = af_file)
else: print("Please upload a .docx, .pdf. or .txt file.")

af_documents = af_loader.load()

# split the documents into chunks
af_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=5)
af_texts = af_text_splitter.split_documents(af_documents)

# select which embeddings we want to use
af_embeddings = OpenAIEmbeddings()

# create the vectorestore to use as the index
af_db = Chroma.from_documents(af_texts, af_embeddings)

# expose this index in a retriever interface
af_retriever = af_db.as_retriever(search_type="similarity", search_kwargs={"k":5}) # default: cosine similarity; k = 2 is alright, because the article is small

# create a chain to answer questions
af_qa = RetrievalQA.from_chain_type(llm=OpenAI(), 
                                 chain_type="stuff", 
                                 retriever=af_retriever,
                                 return_source_documents=True)

In [16]:
af_query = "When was Anne's first entry"
af_result = af_qa({"query": query})
af_result['result']

' Emily is 35 years old (born on June 15, 1985).'

**Observations**

* Need to restart kernel before loading new document
* Unable to get correct answers for word document - could it be due to formatting? or chunk size?

**Chunk size = 300**

In [8]:
af_query = "When was Anne's first entry"
af_result = af_qa({"query": af_query})
af_result['result']

' Wednesday, July 8, 1942'

In [9]:
af_query = "When is Anne's birthday"
af_result = af_qa({"query": af_query})
af_result['result']

" Anne does not mention her birthday in this context, so I don't know."

In [10]:
af_query = "When was Anne born?"
af_result = af_qa({"query": af_query})
af_result['result']

" I don't know."

In [12]:
af_query = "When was my sister Margot born?"
af_result = af_qa({"query": af_query})
af_result['result']

" I don't know."

In [13]:
af_query = "When did we celebrate my birthday?"
af_result = af_qa({"query": af_query})
af_result['result']

' June 20, 1942'

**Chunk size = 1000**

In [15]:
af_query = "When did we celebrate my birthday?"
af_result = af_qa({"query": af_query})
af_result['result']

' We celebrated my birthday on June 20, 1942.'

In [16]:
af_query = "My father was managing director of which company?"
af_result = af_qa({"query": af_query})
af_result['result']

' The Dutch Opekta Company.'

In [17]:
af_query = "What did my father work as"
af_result = af_qa({"query": af_query})
af_result['result']

' Father worked as a business partner and owner of a company dealing in spices and spice substitutes.'

In [18]:
af_query = "When was my sister Margot born?"
af_result = af_qa({"query": af_query})
af_result['result']

' Margot was born in Frankfurt am Main in Germany in 1926.'

In [19]:
af_query = "When was I born?"
af_result = af_qa({"query": af_query})
af_result['result']

' June 12, 1929.'

In [20]:
af_query = "Where did I live until I was four?"
af_result = af_qa({"query": af_query})
af_result['result']

' I lived in Frankfurt until I was four.'

In [21]:
af_query = "When did my father marry my mother?"
af_result = af_qa({"query": af_query})
af_result['result']

' He married my mother when he was thirty-six and she was twenty-five.'