In [97]:
import importlib
import pandas as pd
import sys
import os 
import itertools
import warnings
import PyPDF2

# add parent path to python path so that we can import from src
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
importlib.reload(importlib.import_module('src.utils'))

from src.config import DATA_PATH, RAW_PATH, PROCESSED_PATH, OUTPUTS_PATH
from src.utils import read_pdf,read_all_pdfs_concatenate

warnings.filterwarnings("ignore") # To ignore all future warnings


## reading pdf files 

In [25]:
pdf_file_path = "../data/raw/"
pdf_data = read_all_pdfs_concatenate(pdf_file_path)

In [26]:
print(pdf_data)

For educational purposes only   
[45378] Colonoscopy, flexible; diagnostic  • Colorectal cancer screening, as indicated by 1 or more of the following: o Patient has average-risk or higher, as indicated by ALL of the following § Age 45 years or older § No colonoscopy in past 10 years o High risk family history, as indicated by 1 or more of the following: § Colorectal cancer diagnosed in one or more first-degree relatives of any age and ALL of the following: • Age 40 years or older • Symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding) § Family member with colonic adenomatous polyposis of unknown etiology o Juvenile polyposis syndrome diagnosis indicated by 1 or more of the following: § Age 12 years or older and symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia) § Age younger than 12 years and symptomatic (eg, abdominal pain, iron deficiency anemia, rectal bleeding, telangiectasia) 

Co:Helm  
MEDICAL RECORD Patient Name: James 

## Step 1 — Process data and build vector store

In [84]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings

# Load PDF file from data path
loader = DirectoryLoader('../data/raw/',
                         glob="*.pdf",
                         loader_cls=PyPDFLoader)
documents = loader.load()

# Split text from PDF into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                               chunk_overlap=50)
texts = text_splitter.split_documents(documents)

# Load embeddings for BiomedNLP-PubMedBERT model
embeddings = HuggingFaceEmbeddings(model_name='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
                                   model_kwargs={'device': 'cpu'})

# Build and persist FAISS vector store
vectorstore = FAISS.from_documents(texts, embeddings)
vectorstore.save_local('../models/vectorstore/db_faiss')

No sentence-transformers model found with name C:\Users\Administrator/.cache\torch\sentence_transformers\microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext. Creating a new one with MEAN pooling.


## Step 2 — Testing the QA model 

In [65]:
qa_prompt = """Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Context: {context}
Question: {question}
Only return the helpful answer below and nothing else.
"""

In [1]:
from langchain.llms import HuggingFacePipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100
)
pipe.save_pretrained('../models/llm/')

local_llm = HuggingFacePipeline(pipeline=pipe)


  from .autonotebook import tqdm as notebook_tqdm


In [94]:
from langchain.chains import RetrievalQA

def build_retrieval_qa(llm, prompt, vectordb):
    dbqa = RetrievalQA.from_chain_type(llm=llm,
                                       chain_type='stuff',
                                       retriever=vectordb.as_retriever(search_kwargs={'k':10}),
                                       return_source_documents=True)
    return dbqa

vectordb = FAISS.load_local('../models/vectorstore/db_faiss', embeddings)
dbqa = build_retrieval_qa(local_llm, qa_prompt, vectordb)

In [95]:
response = dbqa({'query': 'Give me Patient Names in all documents'})

Token indices sequence length is longer than the specified maximum sequence length for this model (1067 > 512). Running this sequence through the model will result in indexing errors


In [96]:
response

{'query': 'Give me Patient Names in all documents',
 'result': 'James Maddison',
 'source_documents': [Document(page_content='PATIENT MEDICAL RECORD Name: James Maddison Date of Birth: 03/15/1965 Gender: Male Address: 1234 Sunset Blvd, Los Angeles, California 90026 Contact Number: (214) 555-0123 Emergency Contact: (214) 555-0456 MEDICAL HISTORY: • Family History: Father had colorectal cancer at a significantly advanced age. • Personal Medical History: Hypertension, reportedly managed with medication. • Medications: Lisinopril 10 mg daily, and possibly other medications not clearly listed. • Allergies: No known drug', metadata={'source': '..\\data\\raw\\medical-record-2.pdf', 'page': 0}),
  Document(page_content='cancer at age 68. • Personal Medical History: Hypertension, managed with medication. • Medications: Lisinopril 10mg daily. • Allergies: No known drug allergies.  ALLERGIES • Allergies not reviewed (last reviewed 11/28/2022) • NKDA', metadata={'source': '..\\data\\raw\\medical-r