In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import ollama
import re


In [2]:
# Spreadsheet holding the clinical cohort, resolved relative to the working directory.
excel_path = 'Clinical_cohort.xlsx'

# Load the sheet; later cells read its 'Symptoms' column.
df = pd.read_excel(excel_path)

# List the column names, then show the first five rows as the cell output.
print(df.columns)
df.head(5)


Index(['Case number', 'Case Group (1=pubmed,2=Stanford)', 'Source Identifier',
       'Number/PMID', 'Symptoms', 'OMIM link', 'OMIM-Diagnosis',
       'Diagnosis-pubmedcases', 'Gene 1', 'Gene 1 Mutation'],
      dtype='object')


Unnamed: 0,Case number,"Case Group (1=pubmed,2=Stanford)",Source Identifier,Number/PMID,Symptoms,OMIM link,OMIM-Diagnosis,Diagnosis-pubmedcases,Gene 1,Gene 1 Mutation
0,1,2,,lyg10,"tonic seizures, febrile seizures during, abno...",https://www.omim.org/clinicalSynopsis/611867?h...,"CHROMOSOME 22q11.2 DELETION SYNDROME, DISTAL",,22q11.2,2.4Mb deletion
1,2,2,,lybb32,"seizure, developmental delay, speech delay, c...",https://www.omim.org/clinicalSynopsis/611867?h...,"CHROMOSOME 22q11.2 DELETION SYNDROME, DISTAL",,22q11.2 deletion,
2,3,2,,lybb166,"sizure onset since infant, febrile seizure, in...",https://www.omim.org/clinicalSynopsis/613443?h...,"NEURODEVELOPMENTAL DISORDER WITH HYPOTONIA, ST...",,5q14.3,"5q14.3(87,148,105-88,698,587)x1 deletion"
3,4,2,,lybb132,"refractory epilepsy, intellectual disability, ...",https://omim.org/clinicalSynopsis/606854?highl...,"CORTICAL DYSPLASIA, COMPLEX, WITH OTHER BRAIN ...",,ADGRG1,"c.407T>A (p.Leu136*),heterozygous,Pathogenic"
4,5,2,,lyd52,"intellectual disability, learning disability, ...",https://omim.org/entry/604352,"FEBRILE SEIZURES, FAMILIAL, 4; FEB4",,ADGRV1,"c.12211C>T (p.R4071X),heterozygous,Pathogenic"


**Extract and normalize symptom strings**

* Convert to string
* Split one cell into multiple symptoms
* Strip whitespace
* Drop empty entries
* Deduplicate
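* Split on `,` — the code cell below calls `re.split(',', raw)`, so a `;`-delimited entry like the example in the next bullet would also be cut at the commas inside its parentheses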
* Example: Epilepsy (present in 92.2% of patients); Median age at seizure onset: 2 years 6 months; Multiple seizure types (generalized onset tonic-clonic, absences, myoclonic, focal-onset, atonic, myoclonic-atonic, tonic, epileptic spasms, myoclonic-clonic); Single seizure types (generalized onset tonic-clonic, absences, focal, myoclonic, myoclonic-atonic); History of febrile seizures; Status epilepticus (SE) (convulsive, nonconvulsive, or both); Prevalent epilepsy type: generalized (75.5%), combined generalized and focal (22.3%), focal (2.2%); Aggressive behavior; Attention Deficit Hyperactivity Disorder (ADHD); Autism spectrum disorder (ASD) / autistic features; Specific Learning Disorder (SLD); Obsessive-Compulsive Disorder (OCD); Psychotic symptoms; Tourette's syndrome; Bipolar disorder; Childhood-onset schizophrenia; Brain MRI findings: normal (majority), or cerebellar hypoplasia, cerebellar vermis hypoplasia, cerebellar atrophy, Arnold-Chiari type I malformation, white matter hyperintensity.

In [3]:
# 1) Ensure the column holds strings.
# Missing cells are replaced by "" *before* the cast: a bare `astype(str)` turns
# NaN into the literal text "nan", which would then be counted (and later
# embedded) as if it were a real symptom. Empty strings are filtered out below.
df['Symptoms'] = df['Symptoms'].fillna('').astype(str)

# 2) Split every cell on ',' and keep the stripped, non-empty pieces.
all_symptoms = [
    piece.strip()
    for raw in df['Symptoms']
    for piece in raw.split(',')
    if piece.strip()
]

print("Total symptom entries (with duplicates):", len(all_symptoms))

# 3) Deduplicate while preserving first-seen order
#    (dict keys are unique and keep insertion order).
unique_symptoms = list(dict.fromkeys(all_symptoms))

print("Unique symptoms:", len(unique_symptoms))
print("First 10 unique symptoms:", unique_symptoms[:10])


Total symptom entries (with duplicates): 642
Unique symptoms: 336
First 10 unique symptoms: ['tonic seizures', 'febrile seizures during', 'abnormal EEG', 'developmental delays', 'anxiety', 'Short statue', 'dysmorphic face', 'cleft palate', 'seizure', 'developmental delay']


In [4]:
# embed each unique symptom with Ollama
def embed_symptom(text: str, model: str = "mxbai-embed-large"):
    """Return the embedding Ollama produces for ``text``.

    Args:
        text: The symptom string, sent to Ollama as the prompt.
        model: Name of the Ollama embedding model to query.

    Returns:
        The value stored under the ``"embedding"`` key of the Ollama response.
    """
    response = ollama.embeddings(prompt=text, model=model)
    return response["embedding"]

# symptom text -> embedding vector, filled in the loop below.
symptom_embeddings = {}
# (symptom, error text) for every symptom whose embedding call raised, so the
# gaps can be inspected or retried instead of scrolling past in the log.
failed_symptoms = []

tqdm_bar = tqdm(unique_symptoms, desc="Embedding symptoms")

for symptom in tqdm_bar:
    try:
        symptom_embeddings[symptom] = embed_symptom(symptom)
    except Exception as e:
        # Best-effort: one failing symptom must not abort the whole run.
        # tqdm.write prints the message without breaking the progress bar.
        failed_symptoms.append((symptom, str(e)))
        tqdm.write(f"Error embedding '{symptom}': {e}")

# Make missing embeddings visible; nothing extra is printed when all succeed.
if failed_symptoms:
    print(
        f"{len(failed_symptoms)} of {len(unique_symptoms)} symptoms could not be "
        "embedded; see `failed_symptoms`"
    )


Embedding symptoms: 100%|██████████| 336/336 [00:07<00:00, 44.98it/s]


In [5]:
# Quick sanity check of the embedding dictionary built above.
print("Number of keys in dict:", len(symptom_embeddings))

# Look at the first inserted key and its vector.
first_symptom = next(iter(symptom_embeddings))
first_vector = symptom_embeddings[first_symptom]

print("Example symptom:", first_symptom)
print("Embedding length:", len(first_vector))
print("First 5 dims:", first_vector[:5])


Number of keys in dict: 336
Example symptom: tonic seizures
Embedding length: 1024
First 5 dims: [-0.020388107746839523, 0.1989801973104477, -0.02324994094669819, 0.5181179642677307, -0.848914623260498]


In [6]:
# Persist the symptom -> embedding dictionary so later cells can reload it
# from disk instead of calling Ollama again.
output_path = "clinical_symptom_embeddings_dict_mxbai.pkl"

with open(output_path, mode="wb") as pickle_out:
    pickle.dump(symptom_embeddings, pickle_out)

print("Saved symptom embedding dict to:", output_path)


Saved symptom embedding dict to: clinical_symptom_embeddings_dict_mxbai.pkl


In [7]:
import pickle

# Reload the pickle to confirm it round-trips.
# NOTE: pickle.load can execute arbitrary code, so only use it on trusted
# files; this path is the one the saving cell above writes to.
with open(r"clinical_symptom_embeddings_dict_mxbai.pkl", "rb") as pickle_in:
    pubmed_symptom_dict = pickle.load(pickle_in)

# Preview at most the first five entries; the printed counter is 1-based.
preview_entries = list(pubmed_symptom_dict.items())[:5]
for i, (sym, emb) in enumerate(preview_entries):
    print(i+1, "Symptom:", sym)
    print("   Embedding length:", len(emb))
    print("   First 5 dims:", emb[:5])


1 Symptom: tonic seizures
   Embedding length: 1024
   First 5 dims: [-0.020388107746839523, 0.1989801973104477, -0.02324994094669819, 0.5181179642677307, -0.848914623260498]
2 Symptom: febrile seizures during
   Embedding length: 1024
   First 5 dims: [0.21442380547523499, -0.0074622430838644505, -0.11287715286016464, 0.5921085476875305, -0.1798083782196045]
3 Symptom: abnormal EEG
   Embedding length: 1024
   First 5 dims: [-0.6618313193321228, 0.3917617201805115, -0.5478911399841309, 0.29417574405670166, -0.3873158395290375]
4 Symptom: developmental delays
   Embedding length: 1024
   First 5 dims: [-0.2365935891866684, 0.2842962145805359, -0.6109287142753601, -0.12643812596797943, -0.23573502898216248]
5 Symptom: anxiety
   Embedding length: 1024
   First 5 dims: [-0.3422442376613617, 0.2198868840932846, -0.007962829433381557, 0.6413687467575073, 0.47278720140457153]


**Cluster Centroid Assignment**

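For each symptom loaded from `clinical_symptom_embeddings_dict_mxbai.pkl` (held in the `pubmed_*` variables below):
* Compute its cosine similarity to every cluster centroid saved in `cluster_centroids_mxbai_10000.json`
* Assign it to the closest cluster (the one with the highest similarity)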

In [8]:
import pickle
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm
import ollama
import pandas as pd


In [9]:
# Pickle holding the symptom -> embedding dictionary.
pubmed_pkl_path = "clinical_symptom_embeddings_dict_mxbai.pkl"

with open(pubmed_pkl_path, "rb") as pickle_in:
    pubmed_sym2emb = pickle.load(pickle_in)

print("Number of PubMed symptoms:", len(pubmed_sym2emb))

# A dict yields keys and values in the same order, so row i of the matrix is
# the embedding of pubmed_symptoms[i].
pubmed_symptoms = [*pubmed_sym2emb]
pubmed_matrix = np.array(list(pubmed_sym2emb.values()), dtype=np.float32)

print("PubMed embedding matrix shape:", pubmed_matrix.shape)  # (n_symptoms, dim)


Number of PubMed symptoms: 336
PubMed embedding matrix shape: (336, 1024)


Assign each PubMed and Clinical symptom to the nearest cluster

In [11]:
# Load the cluster centroids saved in cluster_centroids_mxbai_10000.json and compute their cosine similarity to every symptom embedding.
import json
import numpy as np

# Read the centroid file.
centroid_json_path = "cluster_centroids_mxbai_10000.json"

with open(centroid_json_path, "r") as json_in:
    centroid_data = json.load(json_in)

# Every value is expected to look like:
#   "0": { "cluster_id": 0, "terms": [...], "centroid": [ ... ] }
# Ordering by cluster_id keeps row j of `centroids` aligned with cluster_ids[j].
centroid_records = list(centroid_data.values())
centroid_records.sort(key=lambda record: record["cluster_id"])

cluster_ids = []
centroid_rows = []
for record in centroid_records:
    cluster_ids.append(record["cluster_id"])
    centroid_rows.append(record["centroid"])
centroids = np.array(centroid_rows, dtype=np.float32)

print("Number of clusters:", len(cluster_ids))
print("Centroids matrix shape:", centroids.shape)  # (n_clusters, dim)

def normalize_rows(mat: np.ndarray) -> np.ndarray:
    """Scale each row of ``mat`` to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they come
    back as all zeros rather than NaNs.

    Args:
        mat: Array whose row vectors lie along axis 1.

    Returns:
        A new array of the same shape; ``mat`` itself is not modified.
    """
    row_lengths = np.linalg.norm(mat, axis=1, keepdims=True)
    # Swap zero lengths for 1.0 in place to avoid dividing by zero.
    np.putmask(row_lengths, row_lengths == 0, 1.0)
    return mat / row_lengths

# Inputs from the earlier loading cell:
#   pubmed_symptoms - list of symptom strings
#   pubmed_matrix   - one embedding row per symptom
# Both sides are scaled to unit length so a plain dot product is the cosine similarity.
pubmed_norm = normalize_rows(pubmed_matrix)
centroids_norm = normalize_rows(centroids)

# The matrix product needs both sides to share the embedding dimension.
assert pubmed_norm.shape[1] == centroids_norm.shape[1], (
    f"Dim mismatch: symptoms dim {pubmed_norm.shape[1]} vs centroids dim {centroids_norm.shape[1]}"
)

# similarity[i, j] = cosine similarity between pubmed_symptoms[i] and cluster_ids[j]
similarity = np.matmul(pubmed_norm, centroids_norm.T)   # (n_symptoms, n_clusters)

print("Similarity matrix shape:", similarity.shape)




Number of clusters: 10000
Centroids matrix shape: (10000, 1024)
Similarity matrix shape: (336, 10000)


In [13]:
# TODO: correct here
# For every symptom (row), pick the centroid column with the highest cosine similarity.
best_cluster_idx = np.argmax(similarity, axis=1)     # positions into cluster_ids
best_similarity = np.max(similarity, axis=1)

# Translate column positions back into the cluster ids read from the JSON.
assigned_cluster_ids = [cluster_ids[pos] for pos in best_cluster_idx]

print("Example assignments:")
preview_rows = zip(pubmed_symptoms[:5], assigned_cluster_ids[:5], best_similarity[:5])
for s, cid, sim in preview_rows:
    print(f"  {s!r} -> cluster {cid}, sim={sim:.4f}")


Example assignments:
  'tonic seizures' -> cluster 1990, sim=0.9624
  'febrile seizures during' -> cluster 1012, sim=0.9622
  'abnormal EEG' -> cluster 4997, sim=0.9807
  'developmental delays' -> cluster 7156, sim=0.9631
  'anxiety' -> cluster 2435, sim=1.0000


In [14]:
# One row per symptom: the cluster it was assigned to and its cosine
# similarity to that cluster's centroid.
assign_df = pd.DataFrame({
    "symptom": pubmed_symptoms,
    "cluster_id": assigned_cluster_ids,
    "similarity": best_similarity,
})

output_assign_path = "clinical_symptom_cluster_assignments.csv"
assign_df.to_csv(output_assign_path, index=False)
print("Saved assignments to:", output_assign_path)

# The preview goes last: Jupyter renders only the final expression of a cell,
# so a mid-cell `assign_df.head()` is evaluated but never displayed.
assign_df.head()


Saved assignments to: clinical_symptom_cluster_assignments.csv
