#### Import Dependencies

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.callbacks import get_openai_callback

#### Change Root Directory

In [2]:
import os

# Later cells use paths written relative to the project root ("data/...").
# A bare os.chdir("../") is not idempotent: every re-run of this cell climbs
# one more directory and the relative paths stop resolving.  Only step up
# while the data directory is not visible from the current directory.
# NOTE(review): presumes the notebook sits one level below the root and that
# the notebook's own folder has no "data" directory — confirm.
if not os.path.isdir("data"):
    os.chdir("../")

#### Load PDF Data

In [3]:
file_path = "data/AI_Robotics.pdf"

In [4]:
loader = PyPDFLoader(file_path)
docs = loader.load()
print(len(docs))

56


In [5]:
docs

[Document(metadata={'source': 'data/AI_Robotics.pdf', 'page': 0}, page_content='Javier Andreu Perez, Fani Deligianni, Daniele Ravi and Guang-Zhong Yang  Artiﬁcial Intelligence and Robotics'),
 Document(metadata={'source': 'data/AI_Robotics.pdf', 'page': 1}, page_content='// Artificial Intelligence and Robotics\n'),
 Document(metadata={'source': 'data/AI_Robotics.pdf', 'page': 2}, page_content='UKRAS.ORG\nArtificial Intelligence and Robotics    //\n'),
 Document(metadata={'source': 'data/AI_Robotics.pdf', 'page': 3}, page_content="// Artificial Intelligence and Robotics\nWelcome to the UK-RAS White Paper \nSeries on Robotics and Autonomous \nSystems (RAS). This is one of the core \nactivities of UK-RAS Network, funded by \nthe Engineering and Physical Sciences \nResearch Council (EPSRC). By bringing \ntogether academic centres of excellence, \nindustry, government, funding bodies and \ncharities, the Network provides academic \nleadership, expands collaboration with \nindustry while int

In [6]:
print(docs[0].page_content[0:100])
print(docs[0].metadata)

Javier Andreu Perez, Fani Deligianni, Daniele Ravi and Guang-Zhong Yang  Artiﬁcial Intelligence and 
{'source': 'data/AI_Robotics.pdf', 'page': 0}


#### Set OpenAI API Key

In [7]:
# Fail fast: indexing os.environ raises KeyError in this cell when the
# variable is not exported.
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible
# notebook; the OpenAI/LangChain clients presumably read the environment
# variable themselves — confirm before removing or passing it explicitly.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

#### Initialize LLM

In [82]:
# Chat model used by the built-in retrieval chain: gpt-4o-mini is chosen as
# the low-resource option, with temperature=0 and max_tokens=150 limiting the
# length of each reply.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, max_tokens=150)

#### Initialize Vector Store and Retriever

In [83]:
# Break the loaded PDF pages into overlapping chunks (chunk_size=1000,
# chunk_overlap=200), so neighbouring chunks share some text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# Embed every chunk with text-embedding-3-small and index the vectors in FAISS.
vectorstore = FAISS.from_documents(documents=splits, embedding=OpenAIEmbeddings(model='text-embedding-3-small'))
# Retriever view over the store, left on its default search settings.
retriever = vectorstore.as_retriever()

#### Using Built-in Chains

In [98]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the built-in chain.  The adjacent string literals are
# concatenated into one prompt; "{context}" is the template slot that the
# stuff-documents chain fills with the retrieved chunks.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, reply with "
    "Data Not Available. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
# An alternative, stricter wording ("If the answer is not explicitly stated in
# the context, reply with 'Data Not Available'") was tried here and is left
# disabled; get_answer_from_llm uses that wording for the custom chain.

# Two-message chat template: the system prompt above plus the user's question,
# which arrives under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# question_answer_chain answers from the documents it is given; rag_chain puts
# the retriever in front of it.  Later cells call
# rag_chain.invoke({"input": ...}) and read the reply from results['answer'].
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [99]:
# PDF included Q&A
with get_openai_callback() as cb:
    results = rag_chain.invoke({"input": "What is robotics?"})
print(f"total_tokens : {cb.total_tokens}, total_cost : {cb.total_cost}")
results['answer']

total_tokens : 626, total_cost : 0.00013394999999999998


"Robotics is a field that combines advances in mechatronics, electrical engineering, and computing to develop machines with sophisticated sensorimotor functions, allowing them to adapt to changing environments. It involves the integration of robots into existing environments, enabling them to perform specialized autonomous tasks such as navigating, manipulating objects, and collaborating. The goal is to optimize the level of autonomy through learning and enhance the robots' ability to perceive, plan, and execute tasks."

In [97]:
# PDF included Q&A
with get_openai_callback() as cb:
    results = rag_chain.invoke({"input": "What is robotics?"})
print(f"total_tokens : {cb.total_tokens}, total_cost : {cb.total_cost}")
results['answer']

total_tokens : 612, total_cost : 0.0001314


'Robotics is a field that builds on advances in mechatronics, electrical engineering, and computing to develop increasingly sophisticated sensorimotor functions that enable machines to adapt to their ever-changing environment. It involves the integration of machines into existing environments, allowing for autonomy in perceiving, planning, and executing tasks such as manipulating, navigating, and collaborating. Robotics aims to optimize the level of autonomy through learning and includes applications in various specialized autonomous tasks.'

In [86]:
with get_openai_callback() as cb:
    results = rag_chain.invoke({"input": "Can you tell me what is Mechanical Engineering?"})
print(f"total_tokens : {cb.total_tokens}, total_cost : {cb.total_cost}")
results['answer']

total_tokens : 646, total_cost : 9.87e-05


'Data Not Available.'

In [87]:
with get_openai_callback() as cb:
    results = rag_chain.invoke({"input": "explain the ARTIFICIAL INTELLIGENCE AND THE BIG BRAIN"})
print(f"total_tokens : {cb.total_tokens}, total_cost : {cb.total_cost}")
results['answer']

total_tokens : 920, total_cost : 0.0001713


'"Artificial Intelligence and the Big Brain" refers to the efforts to simulate the human brain\'s complexity and dynamics through advanced computational models. Projects like the Blue Brain Project aim to replicate brain functions to enhance our understanding of intelligence and consciousness. However, there is ongoing debate about whether these simulations can truly emulate conscious, intelligent behavior, given our limited understanding of the brain\'s workings.'

#### Creating Custom Chains

In [118]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` occupies under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        The number of tokens produced by encoding `string`.  The same number
        is also printed as ``Token Used :: <count>``.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    token_count = len(token_ids)
    print(f"Token Used :: {token_count}")
    return token_count

num_tokens_from_string("testing word count", "cl100k_base")

Token Used :: 3


3

In [109]:

from openai import OpenAI

def get_answer_from_llm(question, context):
    """Ask gpt-4o-mini to answer `question` from the supplied `context`.

    A fresh OpenAI client is created on every call.  The system message tells
    the model to reply with 'Data Not Available' when the context does not
    explicitly contain the answer.

    Args:
        question: The user's question.
        context: Text the model should base its answer on.

    Returns:
        The message content of the first completion choice.
    """
    api = OpenAI()
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that answers questions based on the given context. If the answer is not explicitly stated in the context, reply with 'Data Not Available'.",
    }
    user_message = {
        "role": "user",
        "content": f"Context: {context}\n\nQuestion: {question}",
    }
    completion = api.chat.completions.create(
        model="gpt-4o-mini",
        max_tokens=150,
        temperature=0,
        messages=[system_message, user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [110]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import numpy as np

def process_data_to_vectorstore(documents):
    """Chunk `documents`, embed the chunks and index them in FAISS.

    Args:
        documents: Loaded documents to split (passed to
            RecursiveCharacterTextSplitter.split_documents).

    Returns:
        The FAISS vector store built from the chunks, embedded with
        text-embedding-3-small.
    """
    embedder = OpenAIEmbeddings(model='text-embedding-3-small')
    # Same chunking parameters as the built-in chain's splitter.
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = chunker.split_documents(documents)
    return FAISS.from_documents(documents=chunks, embedding=embedder)

In [111]:
def check_answer_confidence(question, docs, answer):
    """Choose the final answer for `question`.

    If the question text appears verbatim (case-insensitively) inside one of
    the retrieved documents, that document's stripped text is returned.
    Otherwise the generated answer is returned, unless it is missing or too
    short to be a real answer.

    Args:
        question: The user's question.
        docs: Retrieved context.  May be a single string (treated as one
            document), or an iterable whose items are strings or objects
            exposing a ``page_content`` attribute.  ``None`` means no context.
        answer: The reply generated by the LLM; may be ``None``.

    Returns:
        The matching document text, the generated answer, or the literal
        "Data Not Available".
    """
    # Iterating a bare string walks it one character at a time, so a
    # multi-character question could never match; treat it as one document.
    if isinstance(docs, str):
        docs = [docs]
    elif docs is None:
        docs = []

    needle = question.lower()
    # "" is a substring of every string: a blank question must not be
    # reported as a verbatim hit on the first document.
    if needle.strip():
        for doc in docs:
            text = doc if isinstance(doc, str) else getattr(doc, "page_content", "")
            if text and needle in text.lower():
                return text.strip()

    # The prompt already asks the model to say "Data Not Available"; replies
    # that are missing or shorter than 10 characters are normalised to it.
    if answer is None or len(answer) < 10:
        return "Data Not Available"
    return answer

In [112]:
def get_loaded_document(file_path):
    """Load the PDF at `file_path` with PyPDFLoader.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` produces (the loaded
        documents).
    """
    return PyPDFLoader(file_path).load()

In [113]:
def custom_chain(pdf_file,questions):
    """Answer each question from the contents of one PDF.

    The PDF is loaded and indexed once.  For every question, the three most
    similar chunks are retrieved, joined into a context string, sent to the
    LLM, and the reply is post-checked by check_answer_confidence.

    Args:
        pdf_file: Path of the PDF to load.
        questions: Iterable of question strings.

    Returns:
        A dict mapping each question to its final answer.
    """
    documents = get_loaded_document(pdf_file)
    vector_store = process_data_to_vectorstore(documents)
    answers = {}
    for question in questions:
        hits = vector_store.similarity_search(query=question, k=3)
        passages = [hit.page_content for hit in hits]
        context = " ".join(passages)
        # NOTE(review): the count uses cl100k_base, while tiktoken appears to
        # map the gpt-4o model family to o200k_base — the printed figure is
        # likely only an approximation for gpt-4o-mini; confirm.
        num_tokens_from_string(context, "cl100k_base")
        generated_response = get_answer_from_llm(question, context)
        # Pass the individual passages, not the joined string: iterating a
        # string yields single characters, which made the verbatim-match
        # branch of check_answer_confidence unreachable.
        answers[question] = check_answer_confidence(question, passages, generated_response)
    return answers

In [114]:
questions = ['what is robotics', 'explain the ARTIFICIAL INTELLIGENCE AND THE BIG BRAIN', 'Can you tell me what is Mechanical Engineering?']

In [115]:
answers = custom_chain(file_path,questions)
answers

Token Used :: 275
Token Used :: 579
Token Used :: 396


{'what is robotics': "Robotics is the field that builds on advances in mechatronics, electrical engineering, and computing to develop increasingly sophisticated sensorimotor functions in machines. These functions enable machines to adapt to their ever-changing environment. Robotics involves the integration of machines into existing environments, allowing for greater autonomy in tasks such as perceiving, planning, and executing actions like manipulating, navigating, and collaborating. The convergence of AI and robotics aims to optimize the level of autonomy through learning, enhancing the machines' ability to predict future outcomes in various interactions.",
 'explain the ARTIFICIAL INTELLIGENCE AND THE BIG BRAIN': 'The section titled "ARTIFICIAL INTELLIGENCE AND THE BIG BRAIN" discusses the advancements in simulating the human brain and the implications for artificial intelligence (AI). It highlights that creating a computer as fast and complex as the human brain is becoming increasin

In [116]:
questions_two = ['what is the use of robotics', 'who is messi', 'Can you tell me what is Chemical Engineering?']

In [117]:
answers = custom_chain(file_path,questions_two)
answers

Token Used :: 400
Token Used :: 372
Token Used :: 560


{'what is the use of robotics': 'The use of robotics includes the following objectives as defined by the US military: \n\n1) Increase knowledge abilities in operations’ theatres.\n2) Reduce the amount of charge carried by the soldier.\n3) Improve logistics capacity.\n4) Facilitate movement and manoeuvring.\n5) Increase the protection of forces.\n\nAdditionally, robotics is developing increasingly sophisticated sensorimotor functions that allow machines to adapt to their environment, and they are becoming more autonomous, capable of interacting, executing tasks, and making complex decisions.',
 'who is messi': 'Data Not Available.',
 'Can you tell me what is Chemical Engineering?': 'Data Not Available.'}