# Installing Necessary Libraries

In [1]:
! pip install sentence_transformers pypdf faiss-cpu
! pip install langchain langchain-openai
! pip install langchain_community
! pip install -U langchain-huggingface



# Importing necessary libraries

In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
import dotenv

# OpenAI Environment

In [3]:
# Load settings (presumably from a local .env file) so later cells can read
# them from the environment; the call's return value is shown as the cell output.
dotenv.load_dotenv()

True

In [4]:
# Fail fast with a readable message when the API key is missing.
# The previous self-assignment (os.environ[...] = os.getenv(...)) did nothing
# when the key was present and raised an opaque TypeError when it was absent,
# because os.environ only accepts string values.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

# Chat model shared by every chain in this notebook.
# NOTE(review): the saved output of this cell shows a deprecation warning;
# importing ChatOpenAI from the langchain-openai package (installed in the
# first cell) is likely the supported path — confirm before changing imports.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

  warn_deprecated(


# Importing Dataset

In [5]:
# Read the first sample PDF into LangChain Document objects and display them.
pdf_path = "Dataset/RAG Input Doc.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()
documents

[Document(page_content="Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers  \nAuthors:  buaacyw/meshanything  \nDate:  14 Jun 2024  \nDescription:  Recently, 3D assets created via reconstruction and generation have matched the \nquality of manually crafted assets, highlighting their potential for replacement.  \nStats:  417, 5.09 stars / hour  \nCategories:  Decoder  \nLinks:  Paper, Code  \n \nTitle:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -\nrefine with LLaMa -3 8B  \nAuthors:  trotsky1997/mathblackbox  \nDate:  11 Jun 2024  \nDescription:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of \nLarge Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance \nperformance in complex mathematical reasoning tasks.  \nStats:  279, 2.35 stars / hour  \nCategories:  Decision Making, GSM8K +2  \nLinks:  Paper, Code  \n \nTitle:  TextGrad: Automatic 'Differentiati

In [6]:
# Chunk the loaded documents with the splitter's default settings
splitter = RecursiveCharacterTextSplitter()
text = splitter.split_documents(documents)
text

[Document(page_content="Title:  MeshAnything: Artist -Created Mesh Generation with Autoregressive Transformers  \nAuthors:  buaacyw/meshanything  \nDate:  14 Jun 2024  \nDescription:  Recently, 3D assets created via reconstruction and generation have matched the \nquality of manually crafted assets, highlighting their potential for replacement.  \nStats:  417, 5.09 stars / hour  \nCategories:  Decoder  \nLinks:  Paper, Code  \n \nTitle:  Accessing GPT -4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self -\nrefine with LLaMa -3 8B  \nAuthors:  trotsky1997/mathblackbox  \nDate:  11 Jun 2024  \nDescription:  This paper introduces the MCT Self -Refine algorithm, an innovative integration of \nLarge Language Models (LLMs) with Monte Carlo Tree Search (MCTS), designed to enhance \nperformance in complex mathematical reasoning tasks.  \nStats:  279, 2.35 stars / hour  \nCategories:  Decision Making, GSM8K +2  \nLinks:  Paper, Code  \n \nTitle:  TextGrad: Automatic 'Differentiati

In [7]:
# Embedding model handed to the FAISS vector store; normalisation of the
# produced vectors is requested through encode_kwargs.
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs=dict(normalize_embeddings=True),
)

  from tqdm.autonotebook import tqdm, trange


In [8]:
# Embed the chunks and build a FAISS index over them
vectorstore = FAISS.from_documents(documents=text, embedding=embeddings)

# Persist the index (documents + embeddings) to disk
vectorstore.save_local(folder_path="vectorstore.db")

# Expose the index through the retriever interface used by the chain
retriever = vectorstore.as_retriever()

In [9]:
# create document chain
# {input} is the question passed to chain.invoke(); {context} is expected to
# be filled with the retrieved documents.
# Fix: the literal used to open with four quote characters (""""), which put a
# stray '"' at the very start of the prompt sent to the model.
template = """
You are an assistant for question-answering tasks.
Use the provided context only to answer the following question:

<context>
{context}
</context>

Question: {input}
"""
prompt = ChatPromptTemplate.from_template(template)
# Combine the prompt with the chat model into a document-answering chain
doc_chain = create_stuff_documents_chain(llm, prompt)

In [10]:
# Wire the retriever to the document chain: retrieve first, then answer
chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=doc_chain,
)

In [13]:
# Sample questions about the papers PDF; the last one is deliberately off-topic
questions = [
    "What is the focus of the 'MeshAnything' project?",
    "Which paper discusses the integration of Large Language Models with Monte Carlo Tree Search?",
    "What advancements does the 'VideoLLaMA 2' paper propose?",
    "Which paper was published most recently?",
    "Identify a paper that deals with language modeling and its scalability.",
    "Which paper aims at improving accuracy in Google-Proof Question Answering?",
    "List the categories covered by the paper titled 'TextGrad: Automatic \"Differentiation\" via Text'.",
    "Which paper received the highest number of stars per hour?",
    "Who is Emma Stone?",  # irrelevant question
]

# Run each question through the retrieval chain and print the result;
# an empty answer is reported as "No information".
for question in questions:
    response = chain.invoke({"input": question})
    answer = response["answer"]
    body = f"Answer: {answer}" if answer else "No information"
    print(f"Question: {question}\n{body}\n\n")

Question: What is the focus of the 'MeshAnything' project?
Answer: The focus of the 'MeshAnything' project is artist-created mesh generation using autoregressive transformers.


Question: Which paper discusses the integration of Large Language Models with Monte Carlo Tree Search?
Answer: The paper titled "Accessing GPT-4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self-refine with LLaMa-3 8B" discusses the integration of Large Language Models with Monte Carlo Tree Search.


Question: What advancements does the 'VideoLLaMA 2' paper propose?
Answer: The 'VideoLLaMA 2' paper proposes advancements in spatial-temporal modeling and audio understanding in video and audio-oriented tasks using Video Large Language Models (Video-LLMs).


Question: Which paper was published most recently?
Answer: The paper titled "MeshAnything: Artist-Created Mesh Generation with Autoregressive Transformers" was published most recently on 14 Jun 2024.


Question: Identify a paper that deals with la

# Trying with a Different Document

In [14]:
# Read the second sample PDF into LangChain Document objects and display them.
pdf_path = "Dataset/Weinberg-tumor progression.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()
documents

[Document(page_content='How Cancer ArisesHow cancer develops is no\nlonger a mystery. During thepast two decades, investiga-\ntors have made astonishing progress inidentifying the deepest bases of the pro-cess\n—those at the molecular level. These\ndiscoveries are robust: they will survivethe scrutiny of future generations of re-searchers, and they will form the foun-dation for revolutionary approaches totreatment. No one can predict exactlywhen therapies targeted to the molecu-lar alterations in cancer cells will ﬁndwide use, given that the translation ofnew understanding into clinical prac-tice is complicated, slow and expensive.But the effort is now under way.\nIn truth, the term “cancer” refers to\nmore than 100 forms of the disease. Al-most every tissue in the body can spawnmalignancies; some even yield severaltypes. What is more, each cancer hasunique features. Still, the basic processesthat produce these diverse tumors ap-pear to be quite similar. For that reason,\nI will refer 

In [15]:
# Chunk the newly loaded documents with the splitter's default settings
splitter = RecursiveCharacterTextSplitter()
text = splitter.split_documents(documents)
text

[Document(page_content='How Cancer ArisesHow cancer develops is no\nlonger a mystery. During thepast two decades, investiga-\ntors have made astonishing progress inidentifying the deepest bases of the pro-cess\n—those at the molecular level. These\ndiscoveries are robust: they will survivethe scrutiny of future generations of re-searchers, and they will form the foun-dation for revolutionary approaches totreatment. No one can predict exactlywhen therapies targeted to the molecu-lar alterations in cancer cells will ﬁndwide use, given that the translation ofnew understanding into clinical prac-tice is complicated, slow and expensive.But the effort is now under way.\nIn truth, the term “cancer” refers to\nmore than 100 forms of the disease. Al-most every tissue in the body can spawnmalignancies; some even yield severaltypes. What is more, each cancer hasunique features. Still, the basic processesthat produce these diverse tumors ap-pear to be quite similar. For that reason,\nI will refer 

In [16]:
# Embed the chunks of the second document and build a fresh FAISS index
vectorstore = FAISS.from_documents(documents=text, embedding=embeddings)

# Persist the index (documents + embeddings) to disk.
# NOTE(review): this is the same folder name used for the first document's
# index, so that earlier saved index is presumably replaced — confirm this is intended.
vectorstore.save_local(folder_path="vectorstore.db")

# Expose the new index through the retriever interface used by the chain
retriever = vectorstore.as_retriever()

In [17]:
# create document chain
# {input} is the question passed to chain.invoke(); {context} is expected to
# be filled with the retrieved documents.
# Fix: the literal used to open with four quote characters (""""), which put a
# stray '"' at the very start of the prompt sent to the model.
template = """
You are an assistant for question-answering tasks.
Use the provided context only to answer the following question:

<context>
{context}
</context>

Question: {input}
"""
prompt = ChatPromptTemplate.from_template(template)
# Combine the prompt with the chat model into a document-answering chain
doc_chain = create_stuff_documents_chain(llm, prompt)

In [18]:
# Wire the new retriever to the document chain: retrieve first, then answer
chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=doc_chain,
)

In [20]:
# Sample questions about the Weinberg article; the last three are deliberately off-topic
questions = [
    "What are the molecular mechanisms behind cancer development as discussed in Robert A. Weinberg's paper?",
    "How does the paper describe the role of oncogenes in the development of cancer?",
    "What insights does Weinberg provide about tumor suppressor genes and their function?",
    "According to the paper, how do mutations contribute to the onset of cancer?",
    "What new therapies for cancer treatment are suggested by Robert A. Weinberg?",
    "What are the effects of climate change on polar bear populations?",  # irrelevant question
    "How does quantum computing differ from classical computing?",  # irrelevant question
    "What are the main causes of the global financial crisis of 2008?",  # irrelevant question
]

# Run each question through the retrieval chain and print the result;
# an empty answer is reported as "No information".
for question in questions:
    response = chain.invoke({"input": question})
    answer = response["answer"]
    body = f"Answer: {answer}" if answer else "No information"
    print(f"Question: {question}\n{body}\n\n")

Question: What are the molecular mechanisms behind cancer development as discussed in Robert A. Weinberg's paper?
Answer: The molecular mechanisms behind cancer development as discussed in Robert A. Weinberg's paper involve mutations in specific classes of genes, such as proto-oncogenes and tumor suppressor genes. Proto-oncogenes, when mutated, can become oncogenes that drive excessive cell proliferation. On the other hand, tumor suppressor genes contribute to cancer when they are inactivated by mutations, leading to uncontrolled cell growth. These mutations accumulate in the DNA of cells, ultimately leading to the development of malignancies. Additionally, abnormal signaling pathways within cells, caused by mutations in genes like ras, play a role in promoting cancer development.


Question: How does the paper describe the role of oncogenes in the development of cancer?
Answer: The paper describes how oncogenes drive excessive cell proliferation in cancer by mutating and becoming canc