In [None]:
%pip install llama_index==0.8.38 pypdf sentence-transformers

In [1]:
import os
from dotenv import load_dotenv
from genai.credentials import Credentials
from genai.model import Model
from genai.schemas import GenerateParams
from genai.credentials import Credentials
from genai.extensions.langchain import LangChainInterface

from llama_index import SimpleDirectoryReader,VectorStoreIndex,ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.indices.postprocessor import NERPIINodePostprocessor

# chromadb backs the vector store used below; if it is missing, re-raise with
# an actionable message while keeping the original error as the cause.
try:
    import chromadb
    from chromadb.api.types import EmbeddingFunction
except ImportError as exc:
    raise ImportError(
        "Could not import chromadb: Please install chromadb package."
    ) from exc

# Credentials are read from the environment. Make sure you have a .env file
# under the ibm-generative-ai root containing:
#   GENAI_KEY=<your-genai-key>
#   GENAI_API=<genai-api-endpoint>
load_dotenv()
api_key = os.getenv("GENAI_KEY")
api_url = os.getenv("GENAI_API")

# Credentials object used to access the LLM service.
creds = Credentials(api_key, api_endpoint=api_url)

The class ModelType is being deprecated.
Please replace any reference to ModelType by its model id string equivalent.
Example :
  ModelType.FLAN_T5 becomes "google/flan-t5-xxl"

  from genai.schemas.models import ModelType


In [2]:

# Generation settings: greedy decoding at temperature 0, between 1 and 100
# new tokens, returned in a single (non-streamed) response.
params = GenerateParams(
    decoding_method="greedy",
    temperature=0,
    min_new_tokens=1,
    max_new_tokens=100,
    stream=False,
)

# LangChain wrapper around the model; passed to llama_index as the LLM below.
langchain_model = LangChainInterface(
    model="google/flan-t5-xxl",
    params=params,
    credentials=creds,
)


In [3]:
!curl https://pii-tools.com/wp-content/uploads/2021/10/health_report.pdf --output health_report.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  269k    0  269k    0     0   183k      0 --:--:--  0:00:01 --:--:--  184k


In [4]:
# Filesystem locations used by the cells below.
store_path = "../vectors"    # directory handed to the persistent Chroma client
dataset_path = "../dataset"  # directory read when building the index

In [5]:
#load data
from llama_index.text_splitter import SentenceSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Open the on-disk Chroma database at `store_path` and fetch the "search"
# collection, creating it on first use.
chroma_client = chromadb.PersistentClient(path=store_path)
chroma_collection = chroma_client.create_collection(name="search", get_or_create=True)

chunk_size = 1000
chunk_overlap = 100

# NOTE(review): `text_splitter` is built here but is not passed to
# `SimpleNodeParser.from_defaults()` below, so these chunk settings do not
# appear to influence how nodes are produced - confirm whether that is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators=["\n\n", "\n", " ", ""],
    length_function=len,
)

# Read the downloaded PDF into llama_index documents.
reader = SimpleDirectoryReader(input_files=["health_report.pdf"])
docs = reader.load_data()
print(f"Loaded {len(docs)} docs")

# Split the documents into nodes with the default node parser.
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents(docs, show_progress=True)
print(f"Parsed {len(nodes)} nodes")

# Embedding model handed to the service context below.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Use the Chroma collection as the vector store, then bundle the LLM,
# embedding model and node parser into one service context.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
service_context = ServiceContext.from_defaults(
    llm=langchain_model,
    embed_model=embed_model,
    node_parser=parser,
)

Loaded 6 docs


Parsing documents into nodes:   0%|          | 0/6 [00:00<?, ?it/s]

Parsed 11 nodes


In [6]:
# save to disk

def save_docs_in_store(store_path: str = store_path, dataset_path: str = dataset_path):
    """Load the documents under ``dataset_path`` and build an index from them.

    Args:
        store_path: Currently not read by this function; storage is taken
            from the module-level ``storage_context``.
        dataset_path: Directory passed to ``SimpleDirectoryReader``.

    Returns:
        The ``VectorStoreIndex`` built from the loaded documents with the
        module-level ``storage_context`` and ``service_context``.
    """
    # NOTE(review): presumably each call inserts the documents into the
    # persistent collection again - confirm before re-running this cell.
    return VectorStoreIndex.from_documents(
        SimpleDirectoryReader(dataset_path).load_data(),
        storage_context=storage_context,
        service_context=service_context,
    )

In [7]:
# load from disk

def load_docs_from_store(store_path: str = store_path, dataset_path: str = dataset_path):
    """Build an index on top of the existing module-level ``vector_store``.

    Args:
        store_path: Currently not read by this function.
        dataset_path: Currently not read by this function.

    Returns:
        A ``VectorStoreIndex`` created from the module-level ``vector_store``
        using the module-level ``service_context``.
    """
    return VectorStoreIndex.from_vector_store(
        vector_store,
        service_context=service_context,
    )

In [8]:
# Index the dataset directory, then open a second index over the same vector store.
index = save_docs_in_store(
    store_path=store_path,
    dataset_path=dataset_path,
)
vector_index = load_docs_from_store(
    store_path=store_path,
    dataset_path=dataset_path,
)


In [9]:
# Query engine over `index` (the index returned by save_docs_in_store), asking
# for the top 5 matches per query. Note that `vector_index`, the index loaded
# from the vector store, is not the one used here.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    verbose=True,
)


In [20]:
# Ask the unmasked index which test was performed and print the answer.
query = "what is name of test performed ?"
response = query_engine.query(query)
print(f"\n\nRESPONE: >> {response}")



RESPONE: >> Whole genome sequencing


In [11]:
# Ask the unmasked index for the patient's sex and print the answer.
query = "what is sex of the patient?"
response = query_engine.query(query)
print(f"\n\nRESONSE:>> {response}")



RESONSE:>> Male


In [12]:
from llama_index.schema import NodeWithScore

# Build a service context (same LLM, embedding model and parser as before)
# and construct the NERPIINodePostprocessor with it.
service_context = ServiceContext.from_defaults(
    llm=langchain_model,
    embed_model=embed_model,
    node_parser=parser,
)
processor = NERPIINodePostprocessor(service_context=service_context)

# Wrap every parsed node in NodeWithScore and call postprocess_nodes to
# transform the nodes by masking PII data.
new_nodes = processor.postprocess_nodes(
    [NodeWithScore(node=parsed) for parsed in nodes]
)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.




In [13]:
# Show each post-processed node: first its masked text, then the
# "__pii_node_info__" entry the post-processor stored in the node metadata.
# NOTE(review): presumably "__pii_node_info__" maps the placeholders back to
# the original values - if so, printing it re-exposes the PII; confirm before
# sharing this notebook with outputs.
for node in new_nodes:
    print(">>> print redated text")
    print(node.text)
    print(">>> print metadata")
    print(node.node.metadata["__pii_node_info__"])

>>> print redated text
page_label: 1
file_name: health_report.pdf

Name: [ORG_50], [ORG_55]N   Accession ID: PM- XX-12345  
 
DOB: 12/31 /1999                      MRN: 0123456789    Specimen: Blood, Peripheral  
Sex: Male                      Referring facility: [LOC_231]   Lab Control Number: ABC123  
Race /[MISC_1125]thnicity : [MISC_302]                            Referring physician: Dr. DNA    Received: 01/24/2014  
Family #: F012345                         Copies to: [ORG_443]    Page: 1 of 4 
 
Test (s) performed:  Whole genome  sequencing  
Indication for test : Clinical diagnosis and family history of [MISC_577]M  with arrhythmia   
 
 
 
 
  
APPROACH  
Sequencing of this individual's genome  was performed and the data was  analyzed to identify previously reported and novel variants in (1) 
335  genes that have been previously implicated in various cardiac diseases  and myopathies  (see Supplement  for a list of genes and coverage 
information) ; and (2)  variants classified

In [14]:
# Rebind `index` to a new index built only from the PII-masked nodes;
# the queries below run against this masked index.
index = VectorStoreIndex(
    [masked.node for masked in new_nodes],
    service_context=service_context,
)


In [15]:
# Ask the masked index for the patient's name; print() renders the response via str().
response = index.as_query_engine().query(
    "What is the name of the patient?"
)
print(response)

[ORG_50], [ORG_55]N


In [16]:
# Ask the masked index for the patient's sex and print the answer.
response = index.as_query_engine().query("what is sex of patient")
print(f"\n\nRESPONE: >> {response}")



RESPONE: >> Male


In [17]:
# Ask the masked index for the family number; print() renders the response via str().
response = index.as_query_engine().query(
    "what is family #"
)
print(response)

F012345


In [18]:
# Ask the masked index about the test methodology; print() renders the response via str().
response = index.as_query_engine().query(
    "What is the methodology for the test?"
)
print(response)

Sequencing of this individual's genome was performed and the data was analyzed to identify previously reported and novel variants in (1) 335 genes that have been previously implicated in various cardiac diseases and myopathies (see Supplement for a list of genes and coverage information) ; and (2) variants classified as disease -causing in public databases that have a minor allele frequency 5% in [MISC_1125]uropean American or [MISC


In [19]:
# Ask the masked index for the test result; print() renders the response via str().
response = index.as_query_engine().query(
    "What is the result?"
)
print(response)

The patient has a pathogenic variant in LMNA.
