# Setup environment

## Install dependencies

In [1]:
# One-time dependency installation: uncomment the lines below and run this cell once.
# NOTE(review): later cells also import langchain_community, langchain_openai and
# PyPDFLoader, which this list does not install — presumably `langchain-community`,
# `langchain-openai` and `pypdf` are needed too; confirm against the environment.
# !pip install openai
# !python -m pip install python-dotenv
# !pip install --upgrade langchain
# !pip install faiss-cpu
# !pip install langchain-huggingface
# !pip install sentence-transformers

In [7]:
# NOTE(review): `!` runs the command in a separate shell process, so sourcing the
# activate script here presumably does not change the environment of the running
# kernel — confirm, and select the virtualenv's kernel instead if that is the intent.
!source .env/bin/activate

## Initialize config dictionary

In [23]:
import json
# Load the notebook settings (embedding model, vector DB path, OpenAI key and
# model name) from the JSON config file.
# The file is opened explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open("./resources/config.json", encoding="utf-8") as f:
    config = json.load(f)

# Show which embedding model the rest of the notebook will use.
print(config["embedding_model"])

sentence-transformers/all-mpnet-base-v2


## Create embeddings using an open-source Hugging Face embedding model

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
# Create the loader for the sample PDF that will be indexed.
pdf_path = "./sample_data/jeff102.pdf"
loader = PyPDFLoader(pdf_path)

In [8]:
# Load the PDF through the loader created earlier; the result is indexable and
# holds langchain Document objects.
document = loader.load()

In [9]:
# Inspect the type of the first loaded item to confirm what the loader returned
type(document[0])

langchain_core.documents.base.Document

In [10]:
# Split the loaded documents into smaller chunks; consecutive chunks overlap by
# 100 characters for more efficient retrieval.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(document)

In [11]:
# Report how many chunks were produced and the type of their container.
print(len(chunks), type(chunks), sep="\n")
# Type of an individual chunk.
print(type(chunks[0]))

# Print the first two chunks: they show the overlapping characters between
# neighbouring chunks as well as the attached metadata.
for position in (0, 1):
    print(chunks[position])

34
<class 'list'>
<class 'langchain_core.documents.base.Document'>
page_content='BEFORE YOU READ\n•‘Apartheid’ is a political system that separates people according\nto their race. Can you say which of the three countries named\nbelow had such a political system until very recently?\n(i)  United States of America (ii)  South Africa (iii)  Australia\n•Have you heard of Nelson Mandela? Mandela, and his African\nNational Congress, spent a lifetime fighting against apartheid.\nMandela had to spend thirty years in prison. Finally, democratic\nelections were held in South Africa in 1994, and Mandela became\nthe first black President of a new nation.\nIn this extract fr om his autobiography, Long W alk to Fr eedom ,\nMandela speaks about a historic occasion, ‘the inauguration’. Can\nyou guess what the occasion might be? Check your guess with\nthis news item (from the BBC) of  10 May 1994.\n     Mandela Becomes South Africa’s First Black President\nNelson Mandela has become South Africa’s firs

In [18]:
# Create the embedding client for the model named in the config.
embedding_model_name = config['embedding_model']
embedding_client = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [12]:
# Display the client's repr to check which model and settings were loaded
embedding_client

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [16]:
# Embed the chunks into a FAISS vector store, then persist the index to the
# path given in the config.
vectorstore = FAISS.from_documents(documents=chunks, embedding=embedding_client)
index_dir = config['vector_db_path']
vectorstore.save_local(index_dir)

## Perform the retrieval process

In [31]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [25]:
# Initialize the OpenAI chat model; both the API key and the model name are
# read from the config.
# NOTE(review): the API key lives in the JSON config file — presumably that file
# is kept out of version control; confirm.
openai_settings = {
    "api_key": config['openai_api_key'],
    "model": config['openai_model'],
}
llm = ChatOpenAI(**openai_settings)

In [26]:
# Load the FAISS index back from the configured path (the same config key used
# when saving it), reusing the same embedding client.
# NOTE(review): allow_dangerous_deserialization=True opts in to deserializing
# stored data (pickle-based per the LangChain FAISS docs — confirm); only do this
# for an index this notebook wrote itself, never for an untrusted path.
vectorstore = FAISS.load_local(config['vector_db_path'], embedding_client, allow_dangerous_deserialization=True)

In [27]:
# Initialize a retriever over the vector store (no arguments passed, so the
# library defaults apply)
retriever = vectorstore.as_retriever()

In [41]:
# System instructions for the model. The {context} placeholder is where the
# retrieved documents are inserted; the user's question arrives as {input}
# in the human message below.
system_prompt = """
    You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer
    the question. Please be stick with the context and embeddings given to you. If you don't know the answer, say that you
    don't know.Keep the answer concise.
    \n\n
    {context}
    """

# Pair the system instructions with the human turn to form the chat prompt.
message_templates = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(message_templates)

In [42]:
# Assemble the pipeline: a chain that feeds retrieved documents into the prompt
# for the LLM, wrapped by a retrieval chain that fetches those documents first.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [47]:
# Run one question through the retrieval chain and display the raw result
# (the input, the retrieved context documents, and the answer).
question = "What does tiger thinks?"
results = rag_chain.invoke({"input": question})
results

{'input': 'What does tiger thinks?',
 'context': [Document(page_content='The tiger behind the bars of his cage growls,\nThe tiger behind the bars of his cage snarls,\nThe tiger behind the bars of his cage roars.\nThen he thinks.\nIt would be nice not to be behind bars all\nThe time\nBecause they spoil my view\nI wish I were wild, not on show.\nBut if I were wild, hunters might shoot me,\nBut if I were wild, food might poison me,\nBut if I were wild, water might drown me.\nThen he stops thinking\nAnd...\nThe tiger behind the bars of his cage growls,\nThe tiger behind the bars of his cage snarls,\nThe tiger behind the bars of his cage roars .\nPETER NIBLETT\n30First Flight\nRationalised 2023-24', metadata={'source': './sample_data/jeff102.pdf', 'page': 14}),
  Document(page_content='A T A TA TA TA Tiger in the Zooiger in the Zooiger in the Zooiger in the Zooiger in the Zoo\nThis poem contrasts a tiger in the zoo with the tiger in its natural\nhabitat. The poem moves from the zoo to the j

In [48]:
# Print the question that was asked and the answer text from the chain result
print(f"Question: {results['input']}")
print(f"Answer: {results['answer']}")

Question: What does tiger thinks?
Answer: The tiger thinks that it would be nice not to be behind bars all the time because it spoils his view. He wishes he were wild, not on show, but he also worries about the dangers of being wild, such as hunters, poisoned food, and drowning in water.


In [37]:
# Print the question and answer currently held in `results`.
# NOTE(review): this cell repeats an earlier print cell, and the output saved
# under it is for a different question than the one passed to rag_chain.invoke
# in this notebook — presumably left over from an earlier run with an edited
# question; re-run top to bottom or remove.
print(f"Question: {results['input']}")
print(f"Answer: {results['answer']}")

Question: what are earthen pots?
Answer: Earthen pots are containers made from clay or soil that have been shaped and then fired at high temperatures to harden them. They are commonly used for cooking, storage, and decoration, and are known for their natural insulating properties and ability to retain moisture.


In [46]:
# Print the question and answer currently held in `results`.
# NOTE(review): this cell repeats an earlier print cell, and the output saved
# under it is for a different question than the one passed to rag_chain.invoke
# in this notebook — presumably left over from an earlier run with an edited
# question; re-run top to bottom or remove.
print(f"Question: {results['input']}")
print(f"Answer: {results['answer']}")

Question: What is emancipation?
Answer: Emancipation is the process of being set free from legal, social, or political restrictions; it often refers to the liberation of individuals from slavery or oppression, allowing them to enjoy freedom and equal rights.
