# OWL → HeteroData Conversion

**Simple conversion**: OWL knowledge graph → PyTorch Geometric HeteroData

**Input**: `GSE54514_enriched_ontology_degfilterv2.9.owl`  
**Output**: `hetero_graph_han.pt`

In [3]:
from pathlib import Path
from rdflib import Graph
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import HeteroData
import sys

# Treat the directory two levels above the current working directory as the
# project root, and put it at the front of the module search path.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent.parent
sys.path.insert(0, str(project_root))

print(f"✓ Project root: {project_root}")

✓ Project root: /Users/silviatrottet/Documents/M2_GENIOMHE/Deep Learning/2526-m2geniomhe-GNN-sepsis


## 1. Load OWL File

In [4]:
# Path of the enriched ontology inside the project's model-execution folder.
owl_file = project_root.joinpath(
    "models",
    "executions",
    "GSE54514_enriched_ontology_degfilterv2.9",
    "GSE54514_enriched_ontology_degfilterv2.9.owl",
)

print(f"Loading OWL: {owl_file.name}...")

# Parse the file as RDF/XML into an in-memory rdflib graph; `len(g)` is the
# number of triples it holds.
g = Graph()
g.parse(str(owl_file), format="xml")

print(f"✓ Loaded {len(g):,} triples")

Loading OWL: GSE54514_enriched_ontology_degfilterv2.9.owl...
✓ Loaded 3,431,963 triples


## 2. Extract Triples

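Extract the triples relevant to our heterogeneous graph: patient and protein nodes, patient→protein and protein–protein edges, and each patient's disease-status label.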

In [5]:
# Helper function to extract node type
def get_node_type(uri):
    """Classify a URI as a patient or protein node by substring match.

    Args:
        uri: Any object; its ``str()`` form is inspected.

    Returns:
        ``'patient'`` if the text contains ``'Sample_'``; otherwise
        ``'protein'`` if it contains ``'Protein_'``; otherwise ``None``.
    """
    text = str(uri)
    # Order matters: 'Sample_' is checked first, so it wins when both occur.
    for marker, node_type in (('Sample_', 'patient'), ('Protein_', 'protein')):
        if marker in text:
            return node_type
    return None

# Helper to extract node ID
def get_node_id(uri):
    """Return the local name of a URI.

    The local name is the text after the last ``'/'`` and, within that
    segment, after the last ``'#'``. Text containing neither separator is
    returned unchanged.

    Args:
        uri: Any object; its ``str()`` form is used.

    Returns:
        The trailing identifier as a string (possibly empty).
    """
    last_segment = str(uri).rpartition('/')[2]
    return last_segment.rpartition('#')[2]

In [6]:
# Accumulators filled in a single pass over every triple of the RDF graph.
patient_ids = set()
protein_ids = set()
patient_protein_edges = []  # (patient ID, protein ID) pairs
protein_protein_edges = []  # (protein ID, protein ID) pairs
patient_labels = {}         # patient ID -> 0/1 label

print("Extracting triples...")

for subject, predicate, obj in g:
    subj_type, obj_type = get_node_type(subject), get_node_type(obj)

    # Local predicate name: text after the last '#', then after the last '/'.
    pred_name = str(predicate).rpartition('#')[2].rpartition('/')[2]

    # Resolve each endpoint's ID once; None when the endpoint is neither a
    # patient nor a protein.
    subj_id = get_node_id(subject) if subj_type is not None else None
    obj_id = get_node_id(obj) if obj_type is not None else None

    if subj_type == 'patient':
        # Patients are registered only when they appear as a subject.
        patient_ids.add(subj_id)

        # Label is 1 when the status text contains 'sepsis' (case-insensitive),
        # else 0. If a patient has several status triples, the last one
        # visited wins.
        if pred_name == 'hasDiseaseStatus':
            patient_labels[subj_id] = int('sepsis' in str(obj).lower())

        # NOTE(review): every predicate linking a patient to a protein yields
        # an edge, so a pair connected through several predicates appears
        # more than once -- confirm this is intended.
        if obj_type == 'protein':
            patient_protein_edges.append((subj_id, obj_id))

    elif subj_type == 'protein':
        protein_ids.add(subj_id)

        # Only physical-interaction triples become protein-protein edges.
        if obj_type == 'protein' and pred_name == 'hasPhysicalInteractionWith':
            protein_protein_edges.append((subj_id, obj_id))

    # Proteins are registered whether they appear as subject or as object.
    if obj_type == 'protein':
        protein_ids.add(obj_id)

# Freeze node order: sorted IDs give every node a stable position.
patient_ids = sorted(patient_ids)
protein_ids = sorted(protein_ids)

print(f"\n✓ Patients: {len(patient_ids)}")
print(f"✓ Proteins: {len(protein_ids)}")
print(f"✓ Patient→Protein edges: {len(patient_protein_edges)}")
print(f"✓ Protein↔Protein edges: {len(protein_protein_edges)}")
print(f"✓ Patient labels: {len(patient_labels)}")

Extracting triples...

✓ Patients: 163
✓ Proteins: 1295
✓ Patient→Protein edges: 277784
✓ Protein↔Protein edges: 2821
✓ Patient labels: 163


## 3. Create Node Features

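Simple features to start with: each node gets a one-hot (identity) vector, so no expression values or other attributes are used as input at this stage.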

In [7]:
# Each node type gets an identity matrix as features: row i is the one-hot
# indicator of node i, so the feature width equals the node count.
num_patients, num_proteins = len(patient_ids), len(protein_ids)

patient_features = torch.eye(num_patients, dtype=torch.float)
protein_features = torch.eye(num_proteins, dtype=torch.float)

print(f"Patient features: {patient_features.shape}")
print(f"Protein features: {protein_features.shape}")

Patient features: torch.Size([163, 163])
Protein features: torch.Size([1295, 1295])


In [8]:
# Patient labels, aligned with the order of `patient_ids`.
# Fail loudly when a patient has no recorded disease status: falling back to
# 0 would silently label that patient as healthy and corrupt the targets.
unlabeled_patients = [pid for pid in patient_ids if pid not in patient_labels]
if unlabeled_patients:
    raise ValueError(
        f"{len(unlabeled_patients)} patient(s) have no disease-status label, "
        f"e.g. {unlabeled_patients[:5]}"
    )

labels = torch.tensor([patient_labels[pid] for pid in patient_ids], dtype=torch.long)

# Class balance: label 1 is reported as sepsis, label 0 as healthy.
print(f"\nLabels distribution:")
print(f"  Sepsis: {(labels == 1).sum()}")
print(f"  Healthy: {(labels == 0).sum()}")


Labels distribution:
  Sepsis: 127
  Healthy: 36


## 4. Build Edge Indices

In [9]:
# Map every node ID to its position in the corresponding ID list.
patient_to_idx = dict(zip(patient_ids, range(len(patient_ids))))
protein_to_idx = dict(zip(protein_ids, range(len(protein_ids))))

# Translate (patient ID, protein ID) pairs into parallel source/target index
# lists; a pair with an unknown endpoint is skipped.
patient_protein_src = []
patient_protein_dst = []

for patient, protein in patient_protein_edges:
    src_idx = patient_to_idx.get(patient)
    dst_idx = protein_to_idx.get(protein)
    if src_idx is None or dst_idx is None:
        continue
    patient_protein_src.append(src_idx)
    patient_protein_dst.append(dst_idx)

# Row 0 holds patient indices, row 1 holds protein indices.
edge_index_patient_protein = torch.tensor(
    [patient_protein_src, patient_protein_dst],
    dtype=torch.long,
)

print(f"Patient→Protein edge_index: {edge_index_patient_protein.shape}")

Patient→Protein edge_index: torch.Size([2, 277784])


In [10]:
# Translate (protein ID, protein ID) pairs into parallel source/target index
# lists; a pair with an unknown endpoint is skipped. Each pair is stored once,
# in the direction it was collected.
protein_protein_src = []
protein_protein_dst = []

for prot1, prot2 in protein_protein_edges:
    src_idx = protein_to_idx.get(prot1)
    dst_idx = protein_to_idx.get(prot2)
    if src_idx is None or dst_idx is None:
        continue
    protein_protein_src.append(src_idx)
    protein_protein_dst.append(dst_idx)

# Row 0 holds source protein indices, row 1 holds target protein indices.
edge_index_protein_protein = torch.tensor(
    [protein_protein_src, protein_protein_dst],
    dtype=torch.long,
)

print(f"Protein↔Protein edge_index: {edge_index_protein_protein.shape}")

Protein↔Protein edge_index: torch.Size([2, 2821])


## 5. Create HeteroData

In [11]:
# Assemble the heterogeneous graph: two node types and two edge types.
data = HeteroData()

# Node stores: patients carry features and labels, proteins carry features.
patient_store = data['patient']
patient_store.x = patient_features
patient_store.y = labels
data['protein'].x = protein_features

# Edge stores, keyed by (source type, relation, target type).
# NOTE(review): only the patient -> protein direction is stored here; no
# protein -> patient relation is created in this cell. Confirm whether the
# downstream model adds reverse edges itself.
data[('patient', 'expresses', 'protein')].edge_index = edge_index_patient_protein
data[('protein', 'interacts', 'protein')].edge_index = edge_index_protein_protein

banner = "="*70
print("\n" + banner)
print("HeteroData Created")
print(banner)
print(data)


HeteroData Created
HeteroData(
  patient={
    x=[163, 163],
    y=[163],
  },
  protein={ x=[1295, 1295] },
  (patient, expresses, protein)={ edge_index=[2, 277784] },
  (protein, interacts, protein)={ edge_index=[2, 2821] }
)


## 6. Add Train/Val/Test Splits

In [12]:
# 70/15/15 train/val/test split over a permutation of the patient indices
# drawn from a RandomState seeded with 42. Both sizes are truncated with
# int(), so the test set receives whatever remains.
# NOTE(review): the permutation ignores the labels, so the split is not
# stratified by class -- confirm this is acceptable.
shuffler = np.random.RandomState(42)
indices = shuffler.permutation(num_patients)

train_size = int(0.70 * num_patients)
val_size = int(0.15 * num_patients)
val_end = train_size + val_size

# One boolean mask per split, all initially False.
train_mask, val_mask, test_mask = (
    torch.zeros(num_patients, dtype=torch.bool) for _ in range(3)
)

train_mask[indices[:train_size]] = True
val_mask[indices[train_size:val_end]] = True
test_mask[indices[val_end:]] = True

# Attach the masks to the patient node store.
data['patient'].train_mask = train_mask
data['patient'].val_mask = val_mask
data['patient'].test_mask = test_mask

print(f"Train: {train_mask.sum()}")
print(f"Val:   {val_mask.sum()}")
print(f"Test:  {test_mask.sum()}")

Train: 114
Val:   24
Test:  25


## 7. Save

In [14]:
# Write the graph to <project_root>/data/han, creating the folder if needed.
output_dir = project_root.joinpath("data", "han")
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir.joinpath("hetero_graph_han.pt")
torch.save(data, output_path)

# Report the destination and the on-disk size in kibibytes.
print(f"\n Saved to: {output_path}")
print(f"Size: {output_path.stat().st_size / 1024:.2f} KB")


 Saved to: /Users/silviatrottet/Documents/M2_GENIOMHE/Deep Learning/2526-m2geniomhe-GNN-sepsis/data/han/hetero_graph_han.pt
Size: 11044.76 KB
