# Making a chatbot with LangChain, ChromaDB & OpenAI

 # Importing dependencies

In [28]:
import openai
import langchain
import chromadb

In [29]:
import os
from getpass import getpass

# SECURITY: an API key must never be hard-coded in a notebook, because anyone
# who can read the file (or its version history) can use it.
# The key is read from the OPENAI_API_KEY environment variable; if it is not
# set, the user is prompted for it (getpass does not echo the typed value).
# NOTE(review): the key that was previously committed in this cell should be
# treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [31]:
# Change into the project directory so that the relative paths used later in
# the notebook ("datasets/...", "my-embeddings") resolve against it.
# The location can be overridden with the GUIDE_BOT_DIR environment variable;
# when it is unset, the original path is used.
project_dir = os.environ.get("GUIDE_BOT_DIR", "D:/guide-bot")
os.chdir(project_dir)

# Display the working directory to confirm the change
os.getcwd()


'D:\\guide-bot'

# Loading data with Langchain document loaders

In [40]:
# Import the PyPDFLoader class from the langchain.document_loaders module
from pathlib import Path

from langchain.document_loaders import PyPDFLoader

# Build the path with pathlib rather than the literal "datasets\quran.pdf":
# "\q" is an invalid escape sequence (Python warns about it), and a backslash
# separator only works on Windows. On Windows this yields the same string as
# before; elsewhere it yields "datasets/quran.pdf".
pdf_path = Path("datasets") / "quran.pdf"

# Create the loader for the PDF located in the "datasets" directory.
# The path is passed as a plain string.
loader = PyPDFLoader(str(pdf_path))

# Load the PDF and split it into Document objects, stored in 'pages'.
# NOTE(review): load_and_split may also apply LangChain's default text
# splitter, so 'pages' can hold more than one entry per PDF page; confirm
# against the installed LangChain version.
pages = loader.load_and_split()

In [45]:
# Keep only the first 50 entries of 'pages' for the rest of the notebook
pages = pages[:50]

# Splitting data into smaller chunks using text splitter

In [46]:
# RecursiveCharacterTextSplitter is provided by langchain.text_splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration:
#   chunk_size=1000   -> upper bound on the size of each chunk
#   chunk_overlap=200 -> amount of text shared between neighbouring chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Break every entry of 'pages' into smaller chunks according to the
# configuration above; the resulting list is kept in 'documents'
documents = splitter.split_documents(pages)

In [47]:
# Number of chunks produced by splitter.split_documents(pages)
len(documents)

214

# Counting Tokens to estimate cost

In [48]:
# Import the necessary function for encoding from the tiktoken library
import tiktoken

In [49]:

# Get the encoder for the "text-embedding-ada-002" model
encoder = tiktoken.encoding_for_model("text-embedding-ada-002")

# Price in USD per 1000 tokens used for the estimate below.
# NOTE(review): this value was a bare literal in the cost formula; verify it
# against OpenAI's current price list for the embedding model before relying
# on the estimate.
PRICE_PER_1K_TOKENS = 0.004

# Token count of every chunk: encode its text and take the length of the
# resulting token list
doc_tokens = [len(encoder.encode(doc.page_content)) for doc in documents]

# Total number of tokens across all chunks
total_tokens = sum(doc_tokens)

# Estimated cost: pricing is per 1000 tokens, so scale the total accordingly
cost = (total_tokens / 1000) * PRICE_PER_1K_TOKENS

print(cost)

0.18675999999999998


In [50]:
# Total token count across all chunks (the sum of the per-chunk counts in doc_tokens)
total_tokens

46690

# Embedding data using Chromadb

In [51]:
#Import chroma
from langchain.vectorstores import Chroma
#import OpenAIEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings

In [52]:
# Embedding function handed to Chroma (OpenAIEmbeddings with default settings)
embedding_function = OpenAIEmbeddings()

# Build the Chroma store from the chunks in 'documents', using the embedding
# function above and the 'my-embeddings' directory for persistence
db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_function,
    persist_directory='my-embeddings',
)

# Write the store to disk
db.persist()

# Query the database

In [54]:
# Retrieve the stored chunks matching the query, each paired with its score
results = db.similarity_search_with_score('who is god')

# Print every hit: its score, then its text followed by a separator line
for doc, score in results:
    print('score', score)
    print(doc.page_content, '----------------------', sep='\n')

score 0.36694127321243286
“Allah , there is no god but He, the Living, the Eternal. He sent 
down to you the Book with the Truth, confirming what came 
before it; and He sent down the Torah and the Gospel.”  
 
                                                                                      Verses 3: 2-3
----------------------
score 0.3767910599708557
(Ali ‘Imran ) 
In the name of Allah,  
the Gracious, the Merciful  
1. Alif, Lam, Meem.  
2. Allah, there is no god but He, the Living, the 
Eternal.  
3. He sent down to you the Book with the 
Truth, confirming what came before it; and 
He sent down the Torah and the Gospel.  
4. Aforetime, as guidance for mankind; and 
He sent down the Criterion. Those who have rejected Allah’s signs will have a severe pun-
ishment. Allah is Mighty, Abl e to take re-
venge.  
5. Nothing is hidden from Allah, on earth or 
in the heaven.  
6. It is He who forms you in the wombs as He 
wills. There is no god except He, the Al-
mighty, the Wise.  
7. I

In [55]:
from langchain.prompts import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI

In [56]:
#Set the question variable
question='who were Bani Israel??'

#Query the database and store the results as 'context_docs'
context_docs=db.similarity_search(question)

#Create a prompt with two variables 'context' and 'question'
#(the template previously opened with four quote characters, which put a stray
# '"' at the very start of the prompt sent to the model)
prompt=PromptTemplate(
    template="""Use the following pieces of context to answer the questions at the end.If
    you don't know the answer, just say don't know. do not try to make up the answer.
    
    <context>
    {context}
    </context>
    
    Question: {question}
    Helpful Answer,formatted in markdown:""",
    input_variables=["context","question"]
)

#create an LLM with ChatOpenAI (temperature=0)
llm=ChatOpenAI(temperature=0)

#create the chain
qa_chain=LLMChain(llm=llm,prompt=prompt)

#call the chain
#The retrieved chunks are joined with a real newline ("\n"); the previous
#"/n" inserted the two literal characters '/' and 'n' between chunks.
result=qa_chain({
    "question":question,
    "context":"\n".join(doc.page_content for doc in context_docs)
})

#print the result (the chain's answer is read from the 'text' key)
print(result['text'])

Bani Israel refers to the Children of Israel, who are mentioned in the context provided. They are the descendants of the prophet Jacob (also known as Israel) and are considered to be the chosen people of God in the Abrahamic religions.
