In [1]:
# Star imports stay first so the explicit imports below can never be shadowed by them.
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
# from owlready2.pymedtermino2.icd10_french import *

import random

import pandas as pd
from tqdm import tqdm

In [None]:
# --- Paths -----------------------------------------------------------------
# SQLite file handed to the Owlready2 backend.
DB_NAME = "../../../data/pym.sqlite3"
# UMLS release archive (not referenced by the cells visible in this notebook).
ZIP_NAME = "../../../data/umls-2024AA-full.zip"
# Destination of the extracted triplets (tab-separated).
SAVE_TO = "../../../data/triplets.tsv"

# --- Extraction settings -----------------------------------------------------
# Seed passed to the triplet extraction so that random picks are repeatable.
RANDOM_SEED = 30_239_566

# Property names that are ignored when building triplets.
BANNED_PROPS = [
    'icd-o-3_code',
    'term_type',
    'subset_member',
    'label',
    'terminology',
    'synonyms',
    'unifieds',
]

In [3]:
# Point Owlready2's default world at the SQLite file before any ontology is requested.
# NOTE(review): assumes the UMLS terminologies were already imported into DB_NAME;
# no import step is visible in this notebook -- TODO confirm.
default_world.set_backend(filename=DB_NAME)

# Load the ontology registered under the "http://PYM/" IRI.
PYM = get_ontology("http://PYM/").load()

# Extracting concept triplets

In [6]:
def _entity_name(entity):
    """Return the first label of ``entity`` as a string, or ``str(entity)`` if it has none."""
    labels = getattr(entity, 'label', None)
    if labels:
        return str(labels[0])
    return str(entity)


def get_concept_triplets(concept, seed=None, banned_props=None):
    """Build ``(subject, property, object)`` string triplets for one concept.

    For every property returned by ``concept.get_class_properties()`` whose
    name is not banned, the value stored on the concept under that property
    name is read. A list or set contributes one randomly chosen member;
    any other value is used as is. Subject and object are rendered with
    their first label when they have one, otherwise with ``str()``.

    Args:
        concept: Object exposing ``label``, ``get_class_properties()`` and one
            attribute per property name.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives the picks, so the same seed always yields the same picks
            and the module-level generator is left untouched. When ``None``,
            picks are drawn from the module-level ``random`` generator.
        banned_props: Optional collection of property names to skip.
            Defaults to the module-level ``BANNED_PROPS``.

    Returns:
        A list of ``(subject_label, property_name, object_name)`` tuples.
        The list is empty when the concept has no label, because there is
        then no subject name to emit.
    """
    if banned_props is None:
        banned_props = BANNED_PROPS

    # Without a label there is no subject name; return early instead of
    # failing with IndexError halfway through the property loop.
    subject_labels = getattr(concept, 'label', None)
    if not subject_labels:
        return []
    subject = str(subject_labels[0])

    # A private generator gives the same picks as seeding the global one,
    # without clobbering random state used elsewhere in the notebook.
    rng = random if seed is None else random.Random(seed)

    triplets = []
    for prop in concept.get_class_properties():
        if prop.name in banned_props:
            continue

        value = getattr(concept, prop.name)
        if isinstance(value, (list, set)):
            if not value:
                continue  # nothing to pick from
            value = rng.choice(list(value))
        elif value is None:
            continue  # an unset property would otherwise be emitted as the string "None"

        triplets.append((subject, prop.name, _entity_name(value)))

    return triplets

def get_all_concept_triplets(ontology, seed=None):
    """Collect the triplets of every class in ``ontology``.

    Args:
        ontology: Object whose ``classes()`` method yields the concepts to
            process.
        seed: Optional seed. When given, the module-level ``random``
            generator is seeded once before the traversal, so a run is
            reproducible as long as ``classes()`` yields concepts in the
            same order.

    Returns:
        A flat list with the triplets of all concepts, in traversal order.
    """
    # Seed once for the whole traversal. Re-seeding with the same value before
    # every concept would restart the generator each time, so every concept
    # would consume the identical sequence of random numbers.
    if seed is not None:
        random.seed(seed)

    all_triplets = []
    for concept in tqdm(ontology.classes()):
        try:
            # No per-concept seed: keep drawing from the generator seeded above.
            all_triplets.extend(get_concept_triplets(concept))
        except IndexError:
            # Best effort: a concept for which triplet extraction raises
            # IndexError (e.g. indexing an empty label list) is skipped.
            continue
    return all_triplets

In [None]:
# Extract the triplets of every class in PYM; RANDOM_SEED is passed so the
# random picks are repeatable.
all_triplets = get_all_concept_triplets(PYM, seed=RANDOM_SEED)
# To inspect the result, look at a small slice (e.g. all_triplets[:5]) rather
# than printing every triplet.

525160it [03:59, 2525.25it/s]

## Save the triplets to a TSV file

In [None]:
# One row per extracted triplet. `pd` is pandas, imported in the notebook's import cell.
df = pd.DataFrame(all_triplets, columns=['Concept', 'Property', 'Related Concept'])
# Write tab-separated output (SAVE_TO ends in .tsv), without the index column.
df.to_csv(SAVE_TO, index=False, sep='\t')