## Detailed Article Explanation

The detailed code explanation for this article is available at the following link:

https://www.daniweb.com/programming/computer-science/tutorials/541151/extracting-information-from-research-papers-using-langchain-openai

For my other articles for Daniweb.com, please see this link:

https://www.daniweb.com/members/1235222/usmanmalik57

## Downloading and Importing Required Libraries

In [None]:
# Install the third-party packages for this notebook via IPython shell escapes.
# NOTE(review): tiktoken is never imported directly in the cells below; presumably
# it is needed by the OpenAI embeddings at runtime — confirm.
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken
!pip install rich

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
import os
# Placeholder value only: replace it with a real key before running, and do not
# commit a real key with the notebook.
# NOTE(review): presumably the OpenAI embeddings/LLM objects created in later
# cells read this environment variable — confirm.
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY_HERE"

## Reading and Chunking Text Documents

In [None]:
# Open the PDF to analyse. The path is an absolute, machine-specific Windows path
# (raw string so the backslashes are taken literally); point it at your own copy.
pdf_reader = PdfReader(r'D:\Datasets\1907.11692.pdf')

In [None]:
from typing_extensions import Concatenate

# Concatenate the extracted text of every page into one string.
# A page whose extract_text() result is empty/falsy contributes nothing, and
# str.join avoids building the result through repeated `+=` on a growing string.
pdf_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [None]:
# Bare expression: shows the concatenated text as the cell output for a quick visual check.
pdf_text

In [None]:
# Split the extracted text on newlines using a chunk size of 1000 and an
# overlap of 200, both measured with len().
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
text_chunks = splitter.split_text(pdf_text)

# Report how many chunks were produced, then show the first one as a sample.
print(
    f"Total chunks {len(text_chunks)}",
    "============================",
    text_chunks[0],
    sep="\n",
)

In [None]:
# Build a FAISS vector store from the text chunks, using OpenAIEmbeddings as the
# embedding model. The resulting store is what later cells search for context.
# NOTE(review): presumably this calls the OpenAI API for the chunks and therefore
# needs a valid OPENAI_API_KEY — confirm.
embeddings = OpenAIEmbeddings()
embedding_vectors = FAISS.from_texts(text_chunks, embeddings)

## Extracting Information from Research Papers

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Question-answering chain backed by a default-configured OpenAI LLM.
# NOTE(review): the "stuff" chain type presumably places all supplied documents
# into a single prompt — confirm against the LangChain docs.
qa_chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
question = "Can you give me a list of datasets used in this paper?"

# Retrieve the stored chunks most similar to the question ...
research_paper = embedding_vectors.similarity_search(question)

# ... and let the chain answer from them. As the last expression of the cell,
# the answer is shown as the cell output.
qa_chain.run(input_documents=research_paper, question=question)

In [None]:
question = "Can you summarize the benchmark results from the paper?"

# Retrieve the stored chunks most similar to the question ...
research_paper = embedding_vectors.similarity_search(question)

# ... and let the chain answer from them. As the last expression of the cell,
# the answer is shown as the cell output.
qa_chain.run(input_documents=research_paper, question=question)

In [None]:
def get_answer(questions):
    """Answer every question in *questions* from the indexed document.

    Each question is first used to search the module-level
    ``embedding_vectors`` store; the chunks that search returns are then
    handed, together with the question, to the module-level ``qa_chain``.

    Args:
        questions: Iterable of question strings.

    Returns:
        A list with one ``qa_chain.run`` result per question, in input order.
    """

    def answer_one(query):
        # Retrieval comes first so the chain only sees chunks matched to this query.
        context_chunks = embedding_vectors.similarity_search(query)
        return qa_chain.run(input_documents=context_chunks, question=query)

    return [answer_one(query) for query in questions]

In [None]:
from rich import print
from rich.markup import escape

questions = [
    "Can you give me a list of datasets used in this paper?",
    "What are the evaluation metrics used in the paper?",
    "Can you summarize the benchmark results from the paper?",
]

answers = get_answer(questions)

# Walk questions and answers in lockstep rather than indexing both lists;
# numbering starts at 1 for display.
for number, (asked, answered) in enumerate(zip(questions, answers), start=1):
    print(f"[bold]Question: {number}: {asked} [/bold]")
    # The answer is model-generated text: escape it so any square brackets in it
    # are printed literally and not interpreted as Rich markup tags.
    print(f"[bold]Answer:[/bold] {escape(str(answered))}")
    print("==================================================")

