In [2]:
from langchain_community.document_loaders import PyPDFLoader

In [3]:
# Build a PDF loader for the paper used throughout this notebook.
# The path is relative, so it resolves against the kernel's working directory.
loader = PyPDFLoader(file_path='../data/LLM.pdf')

In [4]:
# Read the PDF into a list of Document objects. The output further down shows
# each Document carrying 'source' and 'page' metadata alongside its page text.
documents = loader.load()

In [5]:
# Inspect what was loaded.
# NOTE(review): this echoes the full text of every Document; `len(documents)`
# or `documents[:1]` would give a much shorter preview.
documents

[Document(metadata={'source': '../data/LLM.pdf', 'page': 0}, page_content='Original Investigation| Statistics and Research Methods\nPerformance of a Large Language Model in Screening Citations\nTakehikoOami,MD,PhD;YoheiOkada,MD,PhD;Taka-akiNakada,MD,PhD\nAbstract\nIMPORTANCE Largelanguagemodels(LLMs)arepromisingastoolsforcitationscreeningin\nsystematicreviews.H owever,theirapplicabilityhasnotyetbeendetermined.\nOBJECTIVE ToevaluatetheaccuracyandefficiencyofanLLMintitleandabstractliterature\nscreening.\nDESIGN, SETTING, AND PARTICIPANTSThisprospectivediagnosticstudyusedthedatafromthe\ntitleandabstractscreeningprocessfor5clinicalquestions(CQs)inthedevelopmentoftheJapanese\nClinicalPracticeGuidelinesforManagementofSepsisandSepticShock.TheLLMdecidedtoinclude\norexcludecitationsbasedontheinclusionandexclusioncriteriaintermsofpatient,population,\nproblem;intervention;comparison;andstudydesignoftheselectedCQandwascomparedwiththe\nconventionalmethodfortitleandabstractscreening.Thisstudywascond

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
# Splitter that cuts the loaded text into pieces of at most 1000 units with a
# 50-unit overlap between neighbouring pieces (units are characters under the
# splitter's default length function).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)

In [10]:
# Split the loaded documents into smaller chunks for embedding. The output
# further down shows the chunks keeping the 'source'/'page' metadata.
chunks = splitter.split_documents(documents)

In [11]:
# Inspect the resulting chunks.
# NOTE(review): this echoes every chunk in full; `len(chunks)` or `chunks[:1]`
# would be a lighter sanity check.
chunks

[Document(metadata={'source': '../data/LLM.pdf', 'page': 0}, page_content='Original Investigation| Statistics and Research Methods\nPerformance of a Large Language Model in Screening Citations\nTakehikoOami,MD,PhD;YoheiOkada,MD,PhD;Taka-akiNakada,MD,PhD\nAbstract\nIMPORTANCE Largelanguagemodels(LLMs)arepromisingastoolsforcitationscreeningin\nsystematicreviews.H owever,theirapplicabilityhasnotyetbeendetermined.\nOBJECTIVE ToevaluatetheaccuracyandefficiencyofanLLMintitleandabstractliterature\nscreening.\nDESIGN, SETTING, AND PARTICIPANTSThisprospectivediagnosticstudyusedthedatafromthe\ntitleandabstractscreeningprocessfor5clinicalquestions(CQs)inthedevelopmentoftheJapanese\nClinicalPracticeGuidelinesforManagementofSepsisandSepticShock.TheLLMdecidedtoinclude\norexcludecitationsbasedontheinclusionandexclusioncriteriaintermsofpatient,population,\nproblem;intervention;comparison;andstudydesignoftheselectedCQandwascomparedwiththe\nconventionalmethodfortitleandabstractscreening.Thisstudywascond

In [12]:
from langchain_community.vectorstores import FAISS

In [13]:
from langchain_ollama.embeddings import OllamaEmbeddings

In [14]:
# Embedding client backed by Ollama (presumably a locally running server —
# confirm the endpoint for your setup).
# NOTE(review): 'llama3.2:1b' is a text-generation model; Ollama also ships
# dedicated embedding models (e.g. 'nomic-embed-text') that may retrieve more
# relevant chunks. Switching requires pulling that model first — TODO evaluate.
embeddings = OllamaEmbeddings(model='llama3.2:1b')

In [17]:
# Embed the chunks with the Ollama embedding client and index them in a
# FAISS vector store.
db = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [18]:
# Confirm the vector store object was created (shows only its repr).
db

<langchain_community.vectorstores.faiss.FAISS at 0x19db31651c0>

In [19]:
from langchain_ollama.llms import OllamaLLM

In [20]:
# Text-generation model, served through Ollama, that will write the answers.
llm = OllamaLLM(model='llama3.2:1b')

In [21]:
from langchain_core.prompts import ChatPromptTemplate

In [32]:
# Prompt for the answer-generation step.
#   {context} - filled with the retrieved document text by the stuff-documents chain
#   {input}   - the user's question
# The template is written flush-left so no stray indentation is sent to the
# model, and the label reads "Question" (singular) to match the single
# {input} value and the wording of the instruction line.
prompt = ChatPromptTemplate.from_template(
    """Answer the following question using only the context provided below.

<context>
{context}
</context>

Question: {input}
"""
)

In [33]:
from langchain.chains.combine_documents import create_stuff_documents_chain

In [34]:
# Chain that places the supplied documents into the prompt's {context} slot
# and sends the filled-in prompt to the LLM.
chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [35]:
# Expose the FAISS store as a retriever. No search options are passed, so the
# library defaults apply (the repr further down shows search_kwargs={}).
retriever = db.as_retriever()

In [36]:
# Show the retriever's configuration (tags, backing vector store, search_kwargs).
retriever

VectorStoreRetriever(tags=['FAISS', 'OllamaEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000019DB31651C0>, search_kwargs={})

In [37]:
from langchain.chains.retrieval import create_retrieval_chain

In [38]:
# Full RAG pipeline: the retriever fetches documents for the incoming 'input',
# then the stuff-documents chain answers using them as 'context'.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=chain,
)

In [45]:
# Run the pipeline on one question; the chain expects it under the 'input' key.
response = retrieval_chain.invoke(
    {'input': "Give summary of the discussion"}
)

In [47]:
# Show the raw result dict: it echoes 'input' and lists the retrieved
# 'context' documents.
# NOTE(review): per the LangChain docs the generated text should be under
# response['answer'] — not visible in the truncated output here; confirm.
response

{'input': 'Give summary of the discussion',
 'context': [Document(metadata={'source': '../data/LLM.pdf', 'page': 3}, page_content='Figure 1. Schematic Diagram of Systematic Review Using Large Language Model (LLM)–Assisted Citation Screening and the Conventional Method\nRecords identified via databases and registers\n6831 4312 1146 5464 2417\nRecords removed before screening\nDuplicate records\nremoved 1197 894 108 1138 164\nTitle and abstract screening\n5634 3418 1038 4326 2253\nConventional citation screening LLM-assisted citation screeningTime measurement\nRecords excluded\n5522 3401 1024 4256 2214\nReports not retrieved\n0002 1 2\nFull-text screening\n(standard reference in secondary analysis) Index results by\nLLM-assisted citation screening\n112 17 14 68 17\nRecords excluded\nTotal\nDifferent language\nDifferent publication type\nDifferent population\nDifferent study design\nDuplicate  reports\n104\n3\n61\n23\n11\n6\n13\n1\n0\n0\n12\n0\n10\n1\n6\n1\n2\n0\n51\n0\n21\n2\n25\n3\n9\n0