In [1]:
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
# from owlready2.pymedtermino2.icd10_french import *


In [2]:
DB_NAME = "../../../data/pym.sqlite3"
ZIP_NAME = "../../../data/umls-2024AA-full.zip"

!! NB In order to start, the user should download zip file with concepts, inctructions: `https://owlready2.readthedocs.io/en/latest/pymedtermino2.html`

# Load from zip (do once)

Notes:
* we can omit terminologies, than all will be imported
* To import also suppressed/deprecated concept, add the following parameter: remove_suppressed = “”.
* french (commented) is optional (do not need for our purposes).

In [4]:
default_world.set_backend(filename = DB_NAME)
import_umls(ZIP_NAME, terminologies = ["ICD10", "SNOMEDCT_US", "CUI"])
# import_icd10_french()
default_world.save()

Importing UMLS from ../../../data/umls-2024AA-full.zip with Python version 3.7.12 and Owlready version 2-0.47...
Full UMLS release - importing UMLS from inner Zip file 2024AA-full/2024aa-1-meta.nlm...
  Parsing 2024AA/META/MRSTY.RRF.gz as MRSTY with encoding UTF-8
  Parsing 2024AA/META/MRRANK.RRF.gz as MRRANK with encoding UTF-8
  Parsing 2024AA/META/MRCONSO.RRF.aa.gz as MRCONSO with encoding UTF-8
  Parsing 2024AA/META/MRCONSO.RRF.ab.gz as MRCONSO with encoding UTF-8
  Parsing 2024AA/META/MRDEF.RRF.gz as MRDEF with encoding UTF-8
Full UMLS release - importing UMLS from inner Zip file 2024AA-full/2024aa-2-meta.nlm...
  Parsing 2024AA/META/MRREL.RRF.aa.gz as MRREL with encoding UTF-8
  Parsing 2024AA/META/MRREL.RRF.ab.gz as MRREL with encoding UTF-8
  Parsing 2024AA/META/MRREL.RRF.ac.gz as MRREL with encoding UTF-8
  Parsing 2024AA/META/MRREL.RRF.ad.gz as MRREL with encoding UTF-8
  Parsing 2024AA/META/MRSAT.RRF.aa.gz as MRSAT with encoding UTF-8
  Parsing 2024AA/META/MRSAT.RRF.ab.gz as

# Check does it work

In [3]:
default_world.set_backend(filename = DB_NAME)
PYM = get_ontology("http://PYM/").load()
SNOMEDCT_US = PYM["SNOMEDCT_US"]

In [4]:
concept = SNOMEDCT_US[302509004]
concept

SNOMEDCT_US["302509004"] # Entire heart

In [10]:
concept.name, concept.label, concept.synonyms, concept.terminology, 

('302509004',
 [locstr('Entire heart', 'en')],
 [locstr('Entire heart (body structure)', 'en')],
 PYM["SNOMEDCT_US"] # US Edition of SNOMED CT)

In [28]:
str(concept.label[0])

'Entire heart'

In [11]:
concept.children, concept.parents

([SNOMEDCT_US["195591003"] # Entire transplanted heart],
 [SNOMEDCT_US["116004006"] # Entire hollow viscus,
  SNOMEDCT_US["187639008"] # Entire thoracic viscus,
  SNOMEDCT_US["80891009"] # Heart structure])

In [20]:
concept

SNOMEDCT_US["302509004"] # Entire heart

In [22]:
concept.get_class_properties()

{PYM.finding_site_of,
 PYM.type_id,
 PYM.ctv3id,
 PYM.effective_time,
 PYM.active,
 PYM.entire_anatomy_structure_of,
 PYM.synonyms,
 PYM.terminology,
 PYM.definition_status_id,
 rdf-schema.label,
 PYM.subset_member,
 PYM.term_type,
 PYM.unifieds,
 PYM.icd-o-3_code,
 PYM.case_significance_id}

__Summary__:
    
I need to get all triplets:
node, property, node

If I have one node, i can get the name via: `str(concept.label[0])`, and i can get it's properties via: `concept.get_class_properties()`, which returns set. And i can access it by calling (specific name).
I need list of all `(concept, prop, concept)` for the current concept...

# Extracting concept

In [43]:
import random

# BANNED_PROPS = ['icd-o-3_code', 'term_type', 'subset_member', 'label', 'terminology', 'synonyms', 'unifieds']
RANDOM_SEED=30_239_566


def get_concept_triplets(concept, seed=None):
    triplets = []
    
    # Set random seed for reproducibility if provided
    if seed is not None:
        random.seed(seed)
    
    # Iterate through all properties of the concept
    for prop in concept.get_class_properties():
        # Skip the banned properties
        if prop.name in BANNED_PROPS:
            continue
        
        # Get the linked concept(s) via the property
        related_concepts = getattr(concept, prop.name)
        
        # If related concepts are a list or set, select one randomly
        if isinstance(related_concepts, list) or isinstance(related_concepts, set):
            if related_concepts:
                selected_concept = random.choice(list(related_concepts))
                if hasattr(selected_concept, 'label') and selected_concept.label:
                    triplets.append((str(concept.label[0]), prop.name, str(selected_concept.label[0])))
                else:
                    triplets.append((str(concept.label[0]), prop.name, str(selected_concept)))
        else:
            # If it's a single concept
            if hasattr(related_concepts, 'label') and related_concepts.label:
                triplets.append((str(concept.label[0]), prop.name, str(related_concepts.label[0])))
            else:
                triplets.append((str(concept.label[0]), prop.name, str(related_concepts)))
    
    return triplets

In [44]:
get_concept_triplets(concept, seed=RANDOM_SEED)

[('Entire heart', 'case_significance_id', '900000000000448009'),
 ('Entire heart', 'definition_status_id', '900000000000074008'),
 ('Entire heart', 'entire_anatomy_structure_of', 'Heart structure'),
 ('Entire heart', 'active', '1'),
 ('Entire heart', 'effective_time', '20020131'),
 ('Entire heart', 'ctv3id', 'Xa8SM'),
 ('Entire heart', 'type_id', '900000000000003001'),
 ('Entire heart', 'finding_site_of', 'Mesocardia')]

In [50]:
from tqdm import tqdm

# Function to get triplets for all concepts in the ontology
def get_all_concept_triplets(ontology, seed=None):
    all_triplets = []

    # Iterate over all concepts (classes) in the ontology
    for concept in tqdm(ontology.classes()):
        triplets = get_concept_triplets(concept, seed)  # Use the function from before
        all_triplets.extend(triplets)  # Add the triplets to the full list
    
    return all_triplets

In [None]:
all_triplets = get_all_concept_triplets(PYM, seed=RANDOM_SEED)
# for triplet in all_triplets:
#     print(triplet)

87560it [01:03, 1300.83it/s]