In [21]:
!pip install --upgrade pymilvus openai requests tqdm
!pip install torch
!pip install llama-index
!pip install transformers
!pip install sentence-transformers 


!pip install llama-index-embeddings-huggingface 
!pip install llama-index-embeddings-instructor 



In [8]:
import glob
import json, sys

sys.path.insert(1,'./utils/')
from FHIR_to_graph import resource_to_node, resource_to_edges, resource_to_string


In [16]:
with open('fhir_resources/Cassy835_Kohler843_01b6ca2a-b591-4560-9014-c37e1565f0cd.json') as raw:
    bundle = json.load(raw)
    for entry in bundle['entry']:
        resource_type = entry['resource']['resourceType']
        if resource_type != 'Provenance':
            print("---------------------------------------------------------------")
            print(resource_type)
            print("---------------------------------------------------------------")
            flattened_resource = resource_to_string(entry['resource']) # This is a dictionary
            print(flattened_resource['text'])
            print(json.dumps(flattened_resource))
            

---------------------------------------------------------------
Patient
---------------------------------------------------------------
The type of information in this entry is patient. The name use for this patient is official. The name family for this patient is Kohler843. The name given for this patient is Cassy835. The telecom system for this patient is phone. The telecom value for this patient is 555-292-5994. The telecom use for this patient is home. The gender for this patient is female. The birth date for this patient is 2019-07-23. The address line for this patient is 949 Crist Burg Unit 59. The address city for this patient is Plymouth. The address state for this patient is Massachusetts. The address postalCode for this patient is 02360. The address country for this patient is US. The marital status for this patient is Never Married. The multiple birth boolean for this patient is False. The communication language for this patient is English.
{"resource_type": "Patient", "id":

## Understand the fields and references

In [20]:
## Lets understand all the references
with open('fhir_resources/Cassy835_Kohler843_01b6ca2a-b591-4560-9014-c37e1565f0cd.json') as raw:
    bundle = json.load(raw)
    for entry in bundle['entry']:
        resource_type = entry['resource']['resourceType']
        if resource_type != 'Provenance':
            print("---------------------------------------------------------------")
            print(resource_type)
            print("---------------------------------------------------------------")
            flattened_resource = resource_to_string(entry['resource']) # This is a dictionary
            keys = [k for k, v in flattened_resource.items()]  # Gets all the keys
            keys = [k for k, v in flattened_resource.items() if "reference" in k]  # Only gets reference fields
            print(keys)  # Prints all the keys
            print(flattened_resource['text'])
            # print(json.dumps(bundle))

---------------------------------------------------------------
Patient
---------------------------------------------------------------
[]
The type of information in this entry is patient. The name use for this patient is official. The name family for this patient is Kohler843. The name given for this patient is Cassy835. The telecom system for this patient is phone. The telecom value for this patient is 555-292-5994. The telecom use for this patient is home. The gender for this patient is female. The birth date for this patient is 2019-07-23. The address line for this patient is 949 Crist Burg Unit 59. The address city for this patient is Plymouth. The address state for this patient is Massachusetts. The address postalCode for this patient is 02360. The address country for this patient is US. The marital status for this patient is Never Married. The multiple birth boolean for this patient is False. The communication language for this patient is English.
-------------------------------

## Prepare embedding model

Example used open AI model we want a different one

In [23]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")


Testing embeddings

In [30]:
embeddings = embed_model.get_text_embedding("The type of information in this entry is observation. The status for this observation is final. The category of this observation is vital-signs. The code for this observation is Blood Pressure. This observation was effective date time on 07/23/2019 at 00:19:26. This observation was issued on 07/23/2019 at 00:19:26. This observation contains 2 components. The 1st component's code for this observation is Diastolic Blood Pressure. The 1st com")
print(len(embeddings))
embedding_dim = len(embeddings)

print(embeddings[:5])

384
[-0.039267104119062424, 0.04040497913956642, -0.018152697011828423, -0.020609106868505478, 0.06916255503892899]


## Load data in Milvus
Create connection and collection - using the local in library version

In [29]:
from pymilvus import MilvusClient, DataType

milvus_client = MilvusClient(uri="./milvus_fhir.db")

In [27]:
collection_name = "my_fhir_collection"

As for the argument of MilvusClient:

* Setting the uri as a local file, e.g../milvus.db, is the most convenient method, as it automatically utilizes Milvus Lite to store all data in this file.
* If you have large scale of data, you can set up a more performant Milvus server on docker or kubernetes. In this setup, please use the server uri, e.g.http://localhost:19530, as your uri.
* If you want to use Zilliz Cloud, the fully managed cloud service for Milvus, adjust the uri and token, which correspond to the Public Endpoint and Api key in Zilliz Cloud.

In [28]:
# if the collection exists create a new one
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

Create a new collection with specified parameters.

If we don't specify any field information, Milvus will automatically create a default id field for primary key, and a vector field to store the vector data. A reserved JSON field is used to store non-schema-defined fields and their values.

Filtering is also possible: https://milvus.io/docs/filtered-search.md

In [38]:
schema = milvus_client.create_schema(
    auto_id = False,
    enable_dynamic_fields=True
)

schema.add_field(field_name="resource", datatype=DataType.JSON)
schema.add_field(field_name="id", datatype=DataType.VARCHAR, max_length=64, is_primary=True)
schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=embedding_dim)

index_params = milvus_client.prepare_index_params()

# Index for embedding to speed things up
index_params.add_index(
    field_name="embedding",
    index_type="AUTOINDEX",
    metric_type="COSINE"
)
    

milvus_client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params
    #metric_type="IP",  # Inner product distance
    #consistency_level="Strong",  # Strong consistency level
)

## Insert data

Iterate through the text lines, create embeddings, and then insert the data into Milvus.

Here is a new field text, which is a non-defined field in the collection schema. It will be automatically added to the reserved JSON dynamic field, which can be treated as a normal field at a high level.

In [39]:
from tqdm import tqdm


In [68]:

with open('fhir_resources/Cassy835_Kohler843_01b6ca2a-b591-4560-9014-c37e1565f0cd.json') as raw:
    bundle = json.load(raw)
    for entry in bundle['entry']:
        resource_type = entry['resource']['resourceType']
        if resource_type != 'Provenance':
            print("---------------------------------------------------------------")
            print(resource_type)
            print("---------------------------------------------------------------")
            flattened_resource = resource_to_string(entry['resource']) # This is a dictionary
            #keys = [k for k, v in flattened_resource.items()]  # Gets all the keys
            #keys = [k for k, v in flattened_resource.items() if "reference" in k]  # Only gets reference fields
            #print(keys)  # Prints all the keys
            #print(flattened_resource['text'])
            # print(json.dumps(flattened_resource))
            #data = []  # Surely bulk insert would be better
            data = {"id": flattened_resource['id'], "embedding": embed_model.get_text_embedding(flattened_resource['text']), "resource": flattened_resource}
            milvus_client.insert(collection_name=collection_name, data=data)

---------------------------------------------------------------
Patient
---------------------------------------------------------------
---------------------------------------------------------------
Organization
---------------------------------------------------------------
---------------------------------------------------------------
Practitioner
---------------------------------------------------------------
---------------------------------------------------------------
Encounter
---------------------------------------------------------------
---------------------------------------------------------------
Observation
---------------------------------------------------------------
---------------------------------------------------------------
Observation
---------------------------------------------------------------
---------------------------------------------------------------
Observation
---------------------------------------------------------------
------------------------

## Check the collection


In [69]:
from pymilvus import Collection, utility

print(milvus_client.list_collections())
milvus_client.get_collection_stats(collection_name)

['my_fhir_collection']


{'row_count': 38}

In [65]:


collection = milvus_client.Collection(collection_name)  # Get an existing collection.

collection.schema                # Return the schema.CollectionSchema of the collection.
collection.description           # Return the description of the collection.
collection.name                  # Return the name of the collection.
collection.is_empty              # Return the boolean value that indicates if the collection is empty.
collection.num_entities          # Return the number of entities in the collection.
collection.primary_field         # Return the schema.FieldSchema of the primary key field.
collection.partitions            # Return the list[Partition] object.
collection.indexes            

AttributeError: 'MilvusClient' object has no attribute 'Collection'

# Build the RAG
## Retrieve data for a query
Let's try a milvus query

In [85]:
question = "What is the patients name and surname"
question = "What conditions does the patient have?"


In [86]:
# Search for the question and return the top 3 matches
search_result = milvus_client.search(
    collection_name=collection_name,
    data=[
        embed_model.get_text_embedding(question)
    ], #questions becomes a vector as well
    limit = 3,
    search_params={"params": {}},
    output_fields=["id","resource"],
)

In [87]:
import json

retrieved_lines_with_distances = [
    (res["entity"]["resource"]["text"], res["distance"]) for res in search_result[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "The type of information in this entry is patient. The name use for this patient is official. The name family for this patient is Kohler843. The name given for this patient is Cassy835. The telecom system for this patient is phone. The telecom value for this patient is 555-292-5994. The telecom use for this patient is home. The gender for this patient is female. The birth date for this patient is 2019-07-23. The address line for this patient is 949 Crist Burg Unit 59. The address city for this patient is Plymouth. The address state for this patient is Massachusetts. The address postalCode for this patient is 02360. The address country for this patient is US. The marital status for this patient is Never Married. The multiple birth boolean for this patient is False. The communication language for this patient is English.",
        0.7696189284324646
    ],
    [
        "The type of information in this entry is practitioner. The active for this practitioner is True. The n

# RAG it up with LLM
Convert the retrieved documents to String

In [81]:
from openai import OpenAI
client = OpenAI(
    api_key="lm-studio",
    base_url="http://localhost:1234/v1")

In [88]:
context = "\n".join(
    [line_with_distance[0] for line_with_distance in retrieved_lines_with_distances]
)

Define system and user prompts for the Lanage Model. This prompt is assembled with the retrieved documents from Milvus.



In [89]:
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

Use OpenAI client to call local LLM to generate a response based on the prompts.
I am using LM Studio you can get your list of models by runnnig: curl localhost:1234/v1/models

In [90]:
response = client.chat.completions.create(
    model="granite-3.1-8b-instruct",
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
)
print(response.choices[0].message.content)

The patient's given name is Cassy835 and the family name is Kohler843.
