In [1]:
! pip install -q torch transformers accelerate transformers sentence-transformers faiss-cpu

In [2]:
! pip install -q langchain langchain-community jq

In [19]:
from langchain_community.document_loaders import PyPDFLoader
import os


# loop through PDFs and load them with PyPDFLoader
file_path = "../app/pdfs"
docs = []
for file in os.listdir(file_path):
    if file.endswith('.pdf'):
        pdf_path = os.path.join(file_path, file)
        loader = PyPDFLoader(pdf_path)
        docs.extend(loader.load())

docs



[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-02-23T04:20:41-05:00', 'author': 'Zoey Le', 'moddate': '2025-02-23T04:20:41-05:00', 'source': '../app/pdfs\\bio_chemistry.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='Modified Guide RNAs  \n1. Document ID : US 20250059532 A1  \n2. Date Published : 2025-02-20 \n3. Inventor Information  \na. Smith; Amy Madison Rhoden   \nb. Morrissey; David V.   \nc. Strapps; Walter  \n4. Assignee Information : Intellia Therapeutics, Inc.  \n5. Abstract : This disclosure relates to modified single and dual guide RNAs having \nimproved in vitro and in vivo activity in gene editing methods.  \n6. Summary :  \na. This disclosure relates to the field of gene editing using CRISPR/Cas systems, a \npart of the prokaryotic immune system that recognizes and cuts exogenous \ngenetic elements. The CRISPR/Cas system relies on a single nuclease, termed \nCR

In [20]:
# inspect the content of a document
docs[2].page_content

"SEQUENCES AND PROMOTERS FOR USE IN PLANT CELLS AND METHODS OF MAKING AND \nUSING SUCH SEQUENCES  \n1. Document ID : US 20240352473 A1  \n2. Date Published : 2024-10-24 \n3. Inventor Information : \na. Avisar; Dror  \nb. Azulay; Shelly  \n4. Abstract: The present disclosure is directed to a novel sequence constructed from viral \nelements for use as a transgenic promoter; for example, in transgenic plants. More \nspecifically, the present disclosure is directed to a chimeric transgenic promoter sequence \ncomprising a portion derived from the Figwort Mosaic Vims (FMV/FiMV) genome and a \nportion derived from the Cassava Vein Mosaic Virus (CsVMV) genome. The present \ndisclosure provides methods and compositions for the making and using such a transgenic \npromoter.  \n5. FIELD OF THE INVENTION : The present invention relates in general to nucleic acid \nsequences which may serve as promoters for transgenic expression. More specifically, the \ninvention relates to sequence elements deri

In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split documents into smaller chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0)

chunked_docs = splitter.split_documents(docs)

In [22]:
# inspect a chunk
chunked_docs

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-02-23T04:20:41-05:00', 'author': 'Zoey Le', 'moddate': '2025-02-23T04:20:41-05:00', 'source': '../app/pdfs\\bio_chemistry.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='Modified Guide RNAs  \n1. Document ID : US 20250059532 A1  \n2. Date Published : 2025-02-20 \n3. Inventor Information  \na. Smith; Amy Madison Rhoden   \nb. Morrissey; David V.   \nc. Strapps; Walter  \n4. Assignee Information : Intellia Therapeutics, Inc.  \n5. Abstract : This disclosure relates to modified single and dual guide RNAs having \nimproved in vitro and in vivo activity in gene editing methods.  \n6. Summary :  \na. This disclosure relates to the field of gene editing using CRISPR/Cas systems, a \npart of the prokaryotic immune system that recognizes and cuts exogenous \ngenetic elements. The CRISPR/Cas system relies on a single nuclease, termed \nCR

In [23]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# initialize vectorstore with FAISS and HuggingFace embeddings
db = FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))

  db = FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))
  from tqdm.autonotebook import tqdm, trange


In [25]:
# create a retriever using the vectorstore
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 1})

In [26]:
# test the retriever
test = retriever.invoke('Paleo Breakfast')
print(len(test))

print(test)

1
[Document(id='fb0c6920-6155-4eff-ba49-8561ae7af9fe', metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-02-23T04:20:41-05:00', 'author': 'Zoey Le', 'moddate': '2025-02-23T04:20:41-05:00', 'source': '../app/pdfs\\bio_chemistry.pdf', 'total_pages': 5, 'page': 3, 'page_label': '4'}, page_content='COMPOSITIONS AND METHODS FOR ENHANCING ADOPTIVE T CELL THERAPEUTICS  \n1. Document ID : US 20240270802 A1  \n2. Date Published : 2024-08-15 \n3. Inventor Information  \na. ROYBAL; Kole  \nb. GARCIA; Julie  \nc. ZHU; Iowis  \nd. CHOI; Jaehyuk  \ne. DANIELS; Jay  \n4. Abstract: The present disclosure relates generally to compositions and methods for \nimproving T cell therapy. In particular, the disclosure provides polypeptides and \nrecombinant nucleic acid constructs and/or recombinant nucleic acids encoding \npolypeptides having mut ations capable of altering T cell signaling, cytokine production, \nand/or in vivo pe

In [27]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name ="TinyLlama/TinyLlama-1.1B-Chat-v1.0"

save_directory = "model_directory"

# directly load model since we saved it in the previous notebook
model = AutoModelForCausalLM.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

In [28]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

# create a text generation pipeline
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [29]:
from langchain_core.runnables import RunnablePassthrough

# extend llm chain with RAG
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

In [30]:
question = "What are some patents related to biochemistry?"

In [31]:
# invoke LLM chain for baseline response
result = llm_chain.invoke({"context": "", "question": question})

result

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n\n\n</s>\n<|user|>\nWhat are some patents related to biochemistry?\n</s>\n<|assistant|>\n\n 1. The discovery of DNA and its role in genetics, which led to the development of modern genetic research techniques such as sequencing and gene therapy.\n\n 2. The development of recombinant DNA technology, which allowed for the creation of genetically modified organisms (GMOs) with desirable traits.\n\n 3. The identification of enzymes involved in metabolic pathways, which has led to the development of new drugs and treatments for a wide range of diseases.\n\n 4. The discovery of proteins, which play crucial roles in cellular function and disease. For example, proteins involved in immune response, inflammation, and cancer have been the subject of extensive research.\n\n 5. The development of new methods for studying biological systems at the molecular level, including microscopy techniques like co

In [32]:
# invoke RAG chain for response based on knowledge base
result = rag_chain.invoke(question)

result

'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(id=\'fb0c6920-6155-4eff-ba49-8561ae7af9fe\', metadata={\'producer\': \'Microsoft® Word for Microsoft 365\', \'creator\': \'Microsoft® Word for Microsoft 365\', \'creationdate\': \'2025-02-23T04:20:41-05:00\', \'author\': \'Zoey Le\', \'moddate\': \'2025-02-23T04:20:41-05:00\', \'source\': \'../app/pdfs\\\\bio_chemistry.pdf\', \'total_pages\': 5, \'page\': 3, \'page_label\': \'4\'}, page_content=\'COMPOSITIONS AND METHODS FOR ENHANCING ADOPTIVE T CELL THERAPEUTICS  \\n1. Document ID : US 20240270802 A1  \\n2. Date Published : 2024-08-15 \\n3. Inventor Information  \\na. ROYBAL; Kole  \\nb. GARCIA; Julie  \\nc. ZHU; Iowis  \\nd. CHOI; Jaehyuk  \\ne. DANIELS; Jay  \\n4. Abstract: The present disclosure relates generally to compositions and methods for \\nimproving T cell therapy. In particular, the disclosure provides polypeptides and \\nrecombinant nucleic acid constructs and/or re

In [33]:
from pprint import pprint

pprint(result)

('\n'
 '<|system|>\n'
 'Answer the question based on your knowledge. Use the following context to '
 'help:\n'
 '\n'
 "[Document(id='fb0c6920-6155-4eff-ba49-8561ae7af9fe', metadata={'producer': "
 "'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for "
 "Microsoft 365', 'creationdate': '2025-02-23T04:20:41-05:00', 'author': 'Zoey "
 "Le', 'moddate': '2025-02-23T04:20:41-05:00', 'source': "
 "'../app/pdfs\\\\bio_chemistry.pdf', 'total_pages': 5, 'page': 3, "
 "'page_label': '4'}, page_content='COMPOSITIONS AND METHODS FOR ENHANCING "
 'ADOPTIVE T CELL THERAPEUTICS  \\n1. Document ID : US 20240270802 A1  \\n2. '
 'Date Published : 2024-08-15 \\n3. Inventor Information  \\na. ROYBAL; Kole  '
 '\\nb. GARCIA; Julie  \\nc. ZHU; Iowis  \\nd. CHOI; Jaehyuk  \\ne. DANIELS; '
 'Jay  \\n4. Abstract: The present disclosure relates generally to '
 'compositions and methods for \\nimproving T cell therapy. In particular, the '
 'disclosure provides polypeptides and \\nrecombinant nuc