In [1]:
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
# from owlready2.pymedtermino2.icd10_french import *

from tqdm import tqdm
import pandas as pd

from collections import defaultdict
import random
import networkx as nx
from tqdm import tqdm

import re
import random

In [2]:
DB_NAME = "../../../data/pym.sqlite3"
ZIP_NAME = "../../../data/umls-2024AA-full.zip"
SAVE_TO = "../../../data/created_data/seqs.tsv"
MID_SAVE_TO = "../../../data/created_data/mid.tsv"


# previously deleted:     'term_type', 'terminology', 
# added: 'ctv3id', originals'
BANNED_PROPS = [
    'icd-o-3_code',  #ok almost empty
    'ctv3id', #ok unique for all, which is bad, full memorization
    'subset_member',  #ok unique for all, which is bad, full memorization
    'label', #ok almost the same
    'synonyms',  #ok almost the same to the parent node
    'unifieds', #ok almost the same to the parent node
    'originals'  #ok almost the same to the parent node
               ]
RANDOM_SEED=30_239_566

BANNED_GROUP = ['case_significance_id',
 'groups',
 'type_id',
 'ctv3id',
 'effective_time',
 'unifieds',
 'active',
 'synonyms',
 'terminology',
 'subset_member',
 'definition_status_id',
 'term_type']

# mapped_to: example -- 'ICD10["I51.8"] # Other ill-defined heart diseases\n'


# Read KG  and build graph

In [3]:
default_world.set_backend(filename = DB_NAME)
PYM = get_ontology("http://PYM/").load()

In [4]:
# Graph initialization from SNOMED data
def build_graph(ontology, output_file=MID_SAVE_TO):
    G = nx.Graph()
    for concept in tqdm(ontology.classes()):
        for prop in concept.get_class_properties():
            if prop.name in BANNED_PROPS or prop.name in BANNED_GROUP:
                continue
            related_concepts = getattr(concept, prop.name, [])
            if not isinstance(related_concepts, (list, set)):
                related_concepts = [related_concepts, ]
            for rc in related_concepts:
                G.add_edge(concept, rc, relationship=prop.name)
                # Add edge with attribute for reversed relationship
                G.add_edge(rc, concept, relationship=f"reversed_{prop.name}")
    return G

In [5]:
# ! head -20 ../../../data/created_data/seqs10000.tsv

In [6]:
ontology = PYM
G = build_graph(ontology)

1220039it [06:57, 2920.28it/s]


# Algorithms for traversal

In [8]:
# Function to beautify nodes
def beautify_node(node_str):
    patterns = [r'SNOMEDCT_US\[".*?"\] #\s*', r'ICD10\[".*?"\] #\s*']
    cleaned_node = node_str
    for pattern in patterns:
        cleaned_node = re.sub(pattern, '', cleaned_node)
    return cleaned_node.strip()



# Function to format and save a sequence
def save_sequence(sequence, file):
    file.write("\t".join(sequence) + "\n")

## Sparse

In [15]:
# Create a subgraph around a random starting node
def create_subgraph(G, start_node, hops=3):
    bfs_nodes = nx.single_source_shortest_path_length(G, start_node, cutoff=hops)
    subgraph_nodes = list(bfs_nodes.keys())
    return G.subgraph(subgraph_nodes)

def beautify_graph_nodes(graph):
    """Pre-beautify all nodes in the graph."""
    mapping = {node: beautify_node(str(node)) for node in graph.nodes}
    return nx.relabel_nodes(graph, mapping)


# Generate a sequence from the subgraph with global ambiguity avoidance
def generate_sequence_from_subgraph(subgraph, edge_count_range=(3, 5)):
    global visited_pairs  # Access global visited_pairs
    local_triplets = set()  # Track node-edge-node triplets for the current sequence
    sequence = []
    
    # Randomly select the target number of edges within the range
    target_edge_count = random.randint(edge_count_range[0], edge_count_range[1])
    current_edge_count = 0  # Counter for edges in the sequence
    
    # Random starting node
    start_node = random.choice(list(subgraph.nodes))
    current_node = start_node

    while current_edge_count < target_edge_count:
        neighbors = list(subgraph.neighbors(current_node))
        
        # Filter neighbors to exclude already visited (node, edge) pairs
        valid_neighbors = []
        for next_node in neighbors:
            edge_data = subgraph.get_edge_data(current_node, next_node)
            edge_name = edge_data.get('relationship', 'No relationship') if edge_data else 'No relationship'
            
            # Create the triplet for the current context
            triplet = (current_node, edge_name, next_node)
            
            # Check for both global and local ambiguity
            if (current_node, edge_name) not in visited_pairs and triplet not in local_triplets:
                valid_neighbors.append((next_node, edge_name, triplet))

        if not valid_neighbors:  # If no valid neighbors, terminate or restart
            break  # Terminate sequence generation
        
        # Randomly select a valid neighbor
        next_node, edge_name, triplet = random.choice(valid_neighbors)
        
        # Add the triplet to the local visited set
        local_triplets.add(triplet)
        
        # Add the (node, edge) pair to the global visited set
        visited_pairs.add((current_node, edge_name))
        
        # Beautify and add nodes/edges to the sequence
        sequence.append(str(current_node))
        sequence.append(edge_name)
        current_node = next_node
        current_edge_count += 1

    # Add the final node to the sequence
    sequence.append(str(current_node))
    
    return sequence


In [117]:
# Global set to track visited node+edge pairs across all sequences
visited_pairs = set()

graph = beautify_graph_nodes(G.copy())

# Open the file once, clear its content initially, and write all sequences
with open(SAVE_TO, "w") as f:  # Open in write mode to clear and write
    for i in tqdm(range(100)):
        start_node = random.choice(list(graph.nodes))  # Random starting node
        subgraph = create_subgraph(graph, start_node, hops=4)  # Create a subgraph around the node
        sequence = generate_sequence_from_subgraph(subgraph, edge_count_range=(3, 5))  # Generate sequence

        # Save the sequence to the file
        save_sequence(sequence, f)

100%|█████████████████████████████████████████| 100/100 [01:42<00:00,  1.03s/it]


In [118]:
! cat ../../../data/created_data/seqs.tsv

Apocrine miliaria of axilla	reversed_has_interpretation	Present	reversed_has_interpretation	Idiopathic paroxysmal cold hemoglobinuria	reversed_is_interpreted_by	Hemolysis	reversed_is_interpreted_by	Evans syndrome
Congenital deformity of hip, unspecified	reversed_mapped_to	Congenital deformity of right hip joint	reversed_has_pathological_process	Pathological developmental process	reversed_has_pathological_process	groups.3591735_1	reversed_has_associated_morphology	Deformity
Moll's gland cyst	reversed_inactivation_indicator	723277005	reversed_inactivation_indicator	Tendon of semispinalis cervicis	reversed_inactivation_indicator	723277005
groups.3861562_1	reversed_has_presentation_strength_numerator_unit	mg	reversed_has_presentation_strength_numerator_unit	Eplerenone 25 mg oral tablet	reversed_has_dose_form	Oral tablet	reversed_has_dose_form	groups.3705040_0
groups.3820769_1	reversed_has_associated_morphology	Malposition	reversed_has_direct_morphology	groups.3553138_1	reversed_has_dir

In [127]:
! ls ../../../data/out_models/models_20250106_*

../../../data/out_models/models_20250106_163111_lay_act_4_8.pkl
../../../data/out_models/models_20250106_190539_lay_act_8_12.pkl
../../../data/out_models/models_20250106_193839_lay_act_0_4.pkl


In [128]:
! rm ../../../data/out_models/models_20250106_*

# Condensed

In [None]:
Insufficient requesting detail

In [12]:
G.neighbors('Failed medical induction of labour')

NetworkXError: The node Failed medical induction of labour is not in the graph.

In [19]:
def has_unvisited_edges(node, graph, visited_pairs):
    """Check if a node has any unvisited edges."""
    for neighbor in graph.neighbors(node):
        edge_data = graph.get_edge_data(node, neighbor)
        edge_name = edge_data.get('relationship', 'No relationship') if edge_data else 'No relationship'
        if (node, edge_name) not in visited_pairs:
            return True  # Found an unvisited edge
    return False  # All edges for this node are visited

# Main function to create condensed sequences
def create_condensed_sequences(graph, save_to, num_sequences, edge_count_range=(3, 5), min_component_size=5):
    global visited_pairs
    visited_pairs = set()
    cur_component_visited_nodes = set()  # Track all visited nodes within the current connectivity component
    successful_sequences = 0  # Counter for successful sequences
    # Precompute all original neighbors
    original_neighbors = {node: set(graph.neighbors(node)) for node in graph.nodes}
    components = [comp for comp in nx.connected_components(graph) if len(comp) >= min_component_size]
    with open(save_to, "w") as f, tqdm(total=num_sequences) as pbar:
        while successful_sequences < num_sequences:
            # Generate nearby nodes, considering visited nodes
            if cur_component_visited_nodes:
                start_nodes = set()
                for node in cur_component_visited_nodes:
                    if has_unvisited_edges(node, graph, visited_pairs):
                        start_nodes.add(node)  # Add the current node if it has any unvisited edges

                    # Check neighbors for their edges
                    for neighbor in graph.neighbors(node):
                        if has_unvisited_edges(neighbor, graph, visited_pairs):
                            start_nodes.add(neighbor)  # Add neighbor if it has any unvisited edges
            else:
                # Start a new component
                cur_component_visited_nodes = set()  # Reset for a new component
                component = random.choice(components)
                start_nodes = list(component)
                print('Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.')
                
            if not start_nodes:
                # Fallback to a random component if no nearby nodes
                cur_component_visited_nodes = set()  # Reset for a new component
                component = random.choice(components)
                start_nodes = list(component)
                print('Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.')
            
             # Randomly select a starting node
            start_node = random.choice(list(start_nodes))
            subgraph = create_subgraph(graph, start_node, hops=edge_count_range[1])  # Create a subgraph around the node
            # Generate sequence
            sequence = generate_sequence_from_subgraph(subgraph, edge_count_range=edge_count_range)
            
            # Skip too-short sequences
            if len(sequence) <= edge_count_range[0]:
                continue

            # Save the sequence
            save_sequence(sequence, f)

            # Update progress bar and counters
            successful_sequences += 1
            pbar.update(1)

            # Update last sequence nodes
            cur_component_visited_nodes = cur_component_visited_nodes | set(sequence[::2])  # Add all visited nodes



# Function to format and save a sequence
def save_sequence(sequence, file):
    file.write("\t".join(sequence) + "\n")


# In[117]:

ROWS=10000

# Example Usage
visited_pairs = set()
SAVE_TO = "../../../data/created_data/condensed_seqs.tsv"
graph = beautify_graph_nodes(G.copy())  # Work with a copy of the graph to avoid modifying the original
create_condensed_sequences(graph, SAVE_TO, num_sequences=10000, edge_count_range=(3, 5), min_component_size=5)

  0%|                                       | 30/10000 [00:00<00:45, 220.92it/s]

Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to th

  1%|▍                                     | 100/10000 [00:00<00:36, 273.10it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the 

  2%|▌                                     | 156/10000 [00:00<00:45, 216.70it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to th

  2%|▉                                     | 246/10000 [00:00<00:30, 317.57it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to th

  3%|█                                     | 281/10000 [00:01<00:36, 266.16it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go

  3%|█▎                                    | 346/10000 [00:01<00:43, 221.52it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be emp

  4%|█▍                                    | 373/10000 [00:01<00:53, 178.73it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty

  4%|█▋                                    | 441/10000 [00:01<00:38, 247.88it/s]

Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: we took the max from previously selected `cur_component_visited_nodes`.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.
Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be emp

Go to the new connectivity component: previously selected `cur_component_visited_nodes` turned out to be empty.


  5%|██                                     | 532/10000 [02:32<45:08,  3.50it/s]


KeyboardInterrupt: 

In [22]:
! head -9 ../../../data/created_data/condensed_seqs.tsv

Fall due to failure of rail	reversed_mapped_to	Fall from, out of or through building or structure causing accidental injury	reversed_mapped_to	Fall from flagpole	reversed_mapped_to	Fall from, out of or through building or structure causing accidental injury	reversed_mapped_from	Fall from building	reversed_mapped_from	Fall from, out of or through building or structure causing accidental injury
Ammonium molybdate	reversed_is_modification_of	groups.3865316_0	reversed_is_modification_of	Ammonium molybdate
Mechanical asphyxia accident	reversed_mapped_to	Unspecified threat to breathing as an external cause of morbidity and mortality	reversed_mapped_to	Mechanical asphyxia accident
groups.3652591_0	reversed_mapped_to	Unspecified threat to breathing as an external cause of morbidity and mortality	reversed_mapped_from	Smothering	reversed_mapped_from	Unspecified threat to breathing as an external cause of morbidity and mortality
groups.3569888_0	reversed_mapped_to	Pedal cyclist injured in col

In [None]:
# # Sequence generation functions
# def generate_condensed_sequences(G, num_samples, length_range, seed=None):
#     random.seed(seed)
#     sequences = []
#     clusters = nx.algorithms.community.greedy_modularity_communities(G)
#     for _ in tqdm(range(num_samples)):
#         cluster = random.choice(clusters)
#         start_node = random.choice(list(cluster))
#         sequence = [start_node]
#         while len(sequence) < random.randint(*length_range):
#             neighbors = list(G.neighbors(sequence[-1]))
#             neighbors = [n for n in neighbors if n in cluster]
#             if not neighbors:
#                 break
#             sequence.append(random.choice(neighbors))
#         if len(sequence) >= length_range[0]:
#             sequences.append(sequence)
#     return sequences

# def generate_sparse_sequences(G, num_samples, length_range, seed=None):
#     random.seed(seed)
#     sequences = []
#     for _ in tqdm(range(num_samples)):
#         start_node = random.choice(list(G.nodes))
#         sequence = [start_node]
#         while len(sequence) < random.randint(*length_range):
#             neighbors = list(G.neighbors(sequence[-1]))
#             neighbors = [n for n in neighbors if n not in sequence]  # Avoid dense local regions
#             if not neighbors:
#                 break
#             sequence.append(random.choice(neighbors))
#         if len(sequence) >= length_range[0]:
#             sequences.append(sequence)
#     return sequences

# # Convert sequences to a human-readable format
# def format_sequence(sequence):
#     return " -> ".join([str(node) for node in sequence])

# # Main workflow
# def create_datasets(ontology, num_samples=20000, length_range=(4, 7), seed=42):
#     G = build_graph(ontology)
#     condensed = generate_condensed_sequences(G, num_samples, length_range, seed)
#     sparse = generate_sparse_sequences(G, num_samples, length_range, seed)
#     # Save datasets to files
#     with open("condensed_dataset.txt", "w") as f:
#         for seq in condensed:
#             f.write(format_sequence(seq) + "\n")
#     with open("sparse_dataset.txt", "w") as f:
#         for seq in sparse:
#             f.write(format_sequence(seq) + "\n")

# # Call the function
# create_datasets(PYM, num_samples=20000, length_range=(4, 7), seed=RANDOM_SEED)