In [1]:
# Use the cosine similarity metric to determine the distance matrix between different conditions

In [2]:
import json
import os
import numpy as np
from scipy.spatial import distance

In [3]:
# use the initial condition definition, not any of the modified ones
condition_file = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/definitions/symcat_conditions.json"

In [4]:
with open(condition_file) as fp:
    conditions = json.load(fp)

In [13]:
symptoms_list = set([])

In [14]:
for condition_name in conditions:
    cond_symptoms = set(conditions.get(condition_name).get("symptoms").keys())
    symptoms_list = symptoms_list | cond_symptoms

In [17]:
symptoms_list = sorted(list(symptoms_list))

In [19]:
symptom_vector = {item: idx for idx,item in enumerate(symptoms_list)}

In [21]:
condition_names = sorted(conditions.keys())

In [39]:
num_conditions = len(condition_names)
num_symptoms = len(symptom_vector)

In [40]:
condition_symptom_matrix = np.zeros((num_conditions, num_symptoms))

In [41]:
condition_symptom_matrix.shape

(801, 376)

In [42]:
for jdx, condition_name in enumerate(condition_names):
    condition_symptoms = conditions.get(condition_name).get("symptoms")
    symptom_indices = []
    symptom_prob = []
    for key, value in condition_symptoms.items():
        idx = symptom_vector.get(key)
        prob = value.get("probability")
        symptom_indices.append(idx)
        symptom_prob.append(prob/100)
    
    condition_symptom_matrix[jdx, symptom_indices] = symptom_prob

In [43]:
# now compute the distance matrix

In [44]:
distance_matrix = distance.pdist(condition_symptom_matrix, 'cosine')

In [45]:
distance_matrix.shape

(320400,)

In [46]:
full_distance_matrix = np.zeros((num_conditions, num_conditions))

In [85]:
start_idx = 0
for idx in range(num_conditions-1):
    gap = num_conditions - idx - 1
    end_idx = start_idx + gap
    full_distance_matrix[idx, idx+1:] = distance_matrix[start_idx:end_idx]
    full_distance_matrix[idx+1:, idx] = distance_matrix[start_idx:end_idx]

In [89]:
# and we have a full_distance_matrix
# does it make sense though ? Check most similar conditions to appendicitis!

In [90]:
condition_vector = {name: idx for idx, name in enumerate(condition_names)}

In [91]:
condition_vector["appendicitis"]

50

In [92]:
appendicitis_distance = full_distance_matrix[50, :]

In [95]:
app_dist_sorted = np.argsort(appendicitis_distance)

In [100]:
np_cond_names = np.array(condition_names)

In [101]:
app_close_conditions = np_cond_names[app_dist_sorted]

In [105]:
app_close_conditions[-10:]

array(['open-wound-of-the-abdomen', 'open-wound-from-surgical-incision',
       'open-wound-due-to-trauma', 'onychomycosis', 'celiac-disease',
       'graves-disease', 'guillain-barre-syndrome',
       'central-atherosclerosis', 'normal-pressure-hydrocephalus',
       'edward-syndrome'], dtype='<U54')

In [107]:
# cosine similarity does not seem to help too much
# we get the weirdest diseases as being more similar
# so we'll try plain old hamming distance

In [108]:
from copy import deepcopy

In [109]:
n_condition_matrix = deepcopy(condition_symptom_matrix)

In [110]:
n_condition_matrix[n_condition_matrix>0] = 1

In [112]:
hamming_distance_matrix = distance.pdist(n_condition_matrix, 'hamming')

In [113]:
full_hamming_distance_matrix = np.zeros((num_conditions, num_conditions))

In [114]:
start_idx = 0
for idx in range(num_conditions-1):
    gap = num_conditions - idx - 1
    end_idx = start_idx + gap
    full_hamming_distance_matrix[idx, idx+1:] = hamming_distance_matrix[start_idx:end_idx]
    full_hamming_distance_matrix[idx+1:, idx] = hamming_distance_matrix[start_idx:end_idx]

In [115]:
appendicitis_h_dist = full_hamming_distance_matrix[50, :]

In [116]:
app_hdist_sorted = np.argsort(appendicitis_h_dist)

In [117]:
app_close_h_conditions = np_cond_names[app_hdist_sorted]

In [118]:
app_close_h_conditions[0]

'appendicitis'

In [125]:
app_close_h_conditions[:20]

array(['appendicitis', 'injury-to-the-abdomen', 'pelvic-organ-prolapse',
       'interstitial-lung-disease', 'paroxysmal-ventricular-tachycardia',
       'pulmonic-valve-disease', 'alzheimer-disease', 'cerebral-edema',
       'autism', 'lichen-simplex', 'hormone-disorder', 'histoplasmosis',
       'lipoma', 'primary-insomnia', 'conjunctivitis',
       'fungal-infection-of-the-skin', 'peritonitis', 'cystic-fibrosis',
       'paronychia', 'intestinal-obstruction'], dtype='<U54')

In [126]:
app_close_conditions[:20]

array(['appendicitis', 'benign-kidney-cyst', 'alcohol-intoxication',
       'lipoma', 'lichen-simplex', 'varicose-veins', 'hormone-disorder',
       'histoplasmosis', 'primary-insomnia', 'poisoning-due-to-gas',
       'cystic-fibrosis', 'paroxysmal-ventricular-tachycardia',
       'insect-bite', 'injury-of-the-ankle', 'moyamoya-disease',
       'alzheimer-disease', 'autism', 'poisoning-due-to-antidepressants',
       'cysticercosis', 'cholesteatoma'], dtype='<U54')

In [129]:
appendicitis_h_dist_sorted =  appendicitis_h_dist[app_hdist_sorted]

In [130]:
appendicitis_h_dist_sorted[1]

0.031914893617021274

In [131]:
appendicitis_h_dist_sorted[-1]

0.06382978723404255

In [133]:
np.median(appendicitis_h_dist_sorted[1:])

0.05851063829787234

In [134]:
euc_dist = distance.pdist(condition_symptom_matrix, 'euclidean')

In [135]:
euc_dist_mat = np.zeros((num_conditions, num_conditions))

In [136]:
start_idx = 0
for idx in range(num_conditions-1):
    gap = num_conditions - idx - 1
    end_idx = start_idx + gap
    euc_dist_mat[idx, idx+1:] = euc_dist[start_idx:end_idx]
    euc_dist_mat[idx+1:, idx] = euc_dist[start_idx:end_idx]

In [137]:
euc_append_dist = euc_dist_mat[50, :]

In [138]:
euc_sorted = np.argsort(euc_append_dist)

In [139]:
app_euc_sim_cond = np_cond_names[euc_sorted]

In [142]:
app_euc_sim_cond[:10]

array(['appendicitis', 'lichen-simplex',
       'paroxysmal-ventricular-tachycardia',
       'poisoning-due-to-antidepressants', 'benign-kidney-cyst',
       'alcohol-intoxication', 'diaper-rash', 'diabetic-ketoacidosis',
       'necrotizing-fasciitis', 'interstitial-lung-disease'], dtype='<U54')

In [143]:
euc_append_dist[euc_sorted][:10]

array([0.        , 0.69433421, 0.70014284, 0.7109149 , 0.71189887,
       0.71189887, 0.74128267, 0.76759364, 0.79006329, 0.802683  ])