# RAG Master's Thesis Chatbot

i200762

Muhammad Umar Waseem

## Imports

In [195]:
import os
import getpass
import pickle

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

## Model Setup

In [176]:
# Reuse a key already exported in the environment; otherwise ask for one via
# getpass so it is never hardcoded in the notebook.
if "GOOGLE_API_KEY" in os.environ:
    print("Google API Key already set from env")
else:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Provide your Google API Key")

Google API Key already set from env


In [178]:
# Chat model handed to the document-combining chain; the API key is read from
# the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Embedding model used when building the vector store (no key is passed
# explicitly here).
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

## Data Preprocessing

In [180]:
# Load the thesis spreadsheet. The path is relative, so dataset.xlsx must sit
# in the notebook's working directory.
loader = UnstructuredExcelLoader("dataset.xlsx")
docs = loader.load()

In [181]:
# Split the loaded documents into overlapping chunks
# (chunk_size=1500, chunk_overlap=500) for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=500)
documents = text_splitter.split_documents(docs)

In [182]:
# Number of chunks produced by the splitter.
print(len(documents))

995


In [219]:
# Inspect the text of the first chunk as a sanity check on the load/split steps.
print(documents[0].page_content)

MS Thesis Completed in Spring-2023




S. #
Thesis Title and Access Link (to PDF Report)
Link to the MS Thesis Report
Thesis Abstract


## Vector Database

In [196]:
# Build the FAISS vector store from the chunks using the embedding model.
vectordb = FAISS.from_documents(
    documents=documents,
    embedding=embeddings,
)

In [197]:
# Persist the vector store to vectorstore.pkl so it can be reloaded from disk.
# NOTE(review): LangChain's FAISS wrapper documents save_local()/load_local()
# for persistence; if switching, change the save and load cells together.
with open("vectorstore.pkl", "wb") as store_file:
    pickle.dump(vectordb, store_file)

## Langchain Prompting Utilities

In [200]:
# Reload the vector store from vectorstore.pkl.
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# unpickle a vectorstore.pkl that this notebook wrote itself, never one
# obtained from an untrusted source.
with open("vectorstore.pkl", "rb") as store_file:
    my_vector_database = pickle.load(store_file)

# Retriever over the reloaded store; k=5 presumably caps the number of chunks
# returned per query (verify against the retriever documentation).
retriever = my_vector_database.as_retriever(search_kwargs={"k": 5})

In [201]:
# Prompt text with two placeholders: {context} and {input}. The chain output
# further down carries matching 'context' and 'input' keys.
template = """
You are a helpful AI assistant.
Answer based on the context provided. 
context: {context}
input: {input}
answer:
"""

# Wrap the raw text in a PromptTemplate, then print the object, the variables
# it detected, and the template string to confirm both placeholders were found.
prompt = PromptTemplate.from_template(template)
print(prompt)
print("\nInput Variables: ", prompt.input_variables)
print("\nPrompt Template: ", prompt.template)

input_variables=['context', 'input'] template='\nYou are a helpful AI assistant.\nAnswer based on the context provided. \ncontext: {context}\ninput: {input}\nanswer:\n'

Input Variables:  ['context', 'input']

Prompt Template:  
You are a helpful AI assistant.
Answer based on the context provided. 
context: {context}
input: {input}
answer:



In [202]:
# Chain that feeds documents plus the prompt to the LLM.
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG pipeline: the retriever supplies documents to the chain above.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

## Get Contextual Answer

In [221]:
# Ask a question through the retrieval chain; the chain expects the question
# under the "input" key.
query = "What was Laraib Afzal thesis?"
response = retrieval_chain.invoke({ "input": query })

# The raw response is a dict holding the input, the retrieved context
# documents and the answer; it is printed in full for inspection, followed by
# just the question and the model's answer.
print(response)
print("\n\nUser Query: ", query)
print("Model Response: ", response["answer"])

{'input': 'What was Laraib Afzal thesis?', 'context': [Document(page_content='1\n21I-2191 Laraib Afzal\\n\\n\\n\\nProsodic alignment for Automatic dubbing\\n\\n\nhttps://drive.google.com/file/d/11nobbbLcTsNzHQHrs5mscGYP1bQ487wM/view?usp=drive_link', metadata={'source': 'dataset.xlsx'}), Document(page_content='24\nMuhammad Haroon Rasheed\\n\\nReading Comprehension Question Answering for Urdu\nhttps://drive.google.com/file/d/1LdXqw8xU96UDp34RghKF4uvQ3DqzxDO1/view?usp=sharing', metadata={'source': 'dataset.xlsx'}), Document(page_content='16\nAbdul Rafay\\n\\nParaphrasing of Urdu Paragraphs using Natural Language\nhttps://drive.google.com/file/d/1KweNaPzR9iupqfjrUz-UEpTpR5mSDaRx/view?usp=sharing', metadata={'source': 'dataset.xlsx'}), Document(page_content='`\n\n\n\n\nS. #\nThesis Title and Access Link (to PDF Report)\nLink to the MS Thesis Report\nThesis Abstract', metadata={'source': 'dataset.xlsx'}), Document(page_content='47\nUsama Khalid\\n\\nLanguage Model for Roman Urdu Trnsfer Lear