## 1 Load and Print Parameters

In [1]:
from gensim.models import Word2Vec
from src.utils import resolve_path

# Restore the saved radiology Word2Vec model from the embedding cache.
model_path = resolve_path("embedding_cache/w2v/variants/Radiology/best_Radiology.model")
w2v = Word2Vec.load(str(model_path))

# Report the model's training settings and corpus counters, one per line.
# Every label carries its own trailing padding so the values line up in a column.
print("Word2Vec Model Parameters:")
print("----------------------------------")
for label, attribute in [
    ("vector_size:        ", "vector_size"),
    ("window:             ", "window"),
    ("min_count:          ", "min_count"),
    ("workers:            ", "workers"),
    ("sg (skip-gram=1):   ", "sg"),
    ("hs (hierarchical):  ", "hs"),
    ("negative sampling:  ", "negative"),
    ("epochs:             ", "epochs"),
    ("sample:             ", "sample"),
    ("alpha:              ", "alpha"),
    ("min_alpha:          ", "min_alpha"),
    ("cbow_mean:          ", "cbow_mean"),
    ("total_words:        ", "corpus_total_words"),
    ("total_examples:     ", "corpus_count"),
]:
    print(f"{label}{getattr(w2v, attribute)}")

Word2Vec Model Parameters:
----------------------------------
vector_size:        100
window:             10
min_count:          2
workers:            12
sg (skip-gram=1):   1
hs (hierarchical):  0
negative sampling:  10
epochs:             15
sample:             0.001
alpha:              0.025
min_alpha:          0.0001
cbow_mean:          1
total_words:        45728312
total_examples:     5206


## 2 Define a helper to embed whole documents

In [2]:
import numpy as np

def embed_document(text, w2v_model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    The text is lowercased and split on whitespace; tokens that are not
    present in ``w2v_model.wv`` are ignored.
    NOTE(review): punctuation stays attached to tokens (e.g. "edema.") --
    confirm this matches the tokenization used when the model was trained.

    Parameters
    ----------
    text : str or None
        Raw document text. Anything that is not a string (``None`` for a
        patient without a note, a float NaN from a missing CSV cell) is
        treated as an empty document instead of raising.
    w2v_model : object
        Model exposing ``wv`` (supports ``in`` and ``[]`` by token) and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched word vectors, or a zero vector of
        length ``w2v_model.vector_size`` when nothing matched.
    """
    # Missing notes arrive as None / NaN; they have no tokens to embed.
    if not isinstance(text, str):
        return np.zeros(w2v_model.vector_size)

    keyed_vectors = w2v_model.wv
    vectors = [keyed_vectors[w] for w in text.lower().split() if w in keyed_vectors]
    if not vectors:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)


## 3 Cosine similarity function

In [3]:
from numpy.linalg import norm

def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    When either vector has zero norm the similarity is reported as 0.0
    rather than dividing by zero.
    """
    norm_a = norm(a)
    if norm_a == 0:
        return 0.0

    norm_b = norm(b)
    if norm_b == 0:
        return 0.0

    return np.dot(a, b) / (norm_a * norm_b)


## 4 Get any two patients' radiology notes

In [16]:
import pandas as pd

# Per-note table; later cells read its subject_id, note_type_1 and text columns.
notes_csv = resolve_path("data/interim/data_full_notes_interim.csv")
df = pd.read_csv(notes_csv)

# NLP-ready table; later cells read its subject_id, Radiology_notes and
# hospital_expire_flag columns.
nlp_ready_csv = resolve_path("data/interim/data_nlp_ready.csv")
df_full = pd.read_csv(nlp_ready_csv)


In [18]:
# Peek at the note-type label carried by each row of the per-note table.
df['note_type_1'].values

array(['radiology', 'radiology', 'radiology', ..., 'discharge',
       'discharge', 'discharge'], dtype=object)

In [6]:
def get_patient_note(df, subject_id):
    """Return the text of the first radiology note stored for a subject.

    Rows are matched on ``subject_id`` and on ``note_type_1`` equal to
    'radiology' (compared case-insensitively). Returns ``None`` when no row
    matches; otherwise the ``text`` value of the first matching row.
    """
    is_patient = df['subject_id'] == subject_id
    is_radiology = df['note_type_1'].str.lower() == 'radiology'
    matches = df.loc[is_patient & is_radiology]

    if not matches.empty:
        return matches['text'].values[0]
    return None


In [21]:
def get_patient_note_full(df_full, subject_id):
    """Return the ``Radiology_notes`` value of a subject's first row.

    Rows are matched on ``subject_id`` only. Returns ``None`` when the
    subject does not appear in ``df_full``.
    """
    patient_rows = df_full.loc[df_full['subject_id'] == subject_id]

    if not patient_rows.empty:
        return patient_rows['Radiology_notes'].values[0]
    return None

In [7]:
# Example: fetch and print the first radiology note stored for subject 14923562.
note_1 = get_patient_note(df, 14923562)
print(note_1)


EXAMINATION:  CHEST (PORTABLE AP)

INDICATION:  ___ year old man with s/p liver transplant with prolonged course,
currently trached with MDR E coli PNA, increasing O2 requirements recently//
?interval change     ?interval change

IMPRESSION: 

Compared to chest radiographs ___ through ___ one at 23:48.

Moderate pulmonary edema seen best in the right lung has worsened.  There is
substantial new right basal consolidation, and increase in left perihilar
consolidation, either combinations of edema and atelectasis or pneumonia. 
Cardiac silhouette is partially obscured but moderate cardiomegaly appears to
have increased.

Tracheostomy tube in standard placement.  Left dual channel dialysis catheters
end in the SVC and close to the superior cavoatrial junction.  Right jugular
line ends at the thoracic inlet, all unchanged.

There may be new fluid in the previously air containing externalized right
pleural space, a function of cardiac decompensation.



In [22]:
# Same subject, this time read from the Radiology_notes column of the NLP-ready table.
note_1_full = get_patient_note_full(df_full, 14923562)
print(note_1_full)

EXAMINATION: CHEST (PORTABLE AP) INDICATION:  year old man with sp liver transplant with prolonged course, currently trached with MDR E coli PNA, increasing O2 requirements recently ?interval change ?interval change IMPRESSION: Compared to chest radiographs  through  one at 23:48. Moderate pulmonary edema seen best in the right lung has worsened. There is substantial new right basal consolidation, and increase in left perihilar consolidation, either combinations of edema and atelectasis or pneumonia. Cardiac silhouette is partially obscured but moderate cardiomegaly appears to have increased. Tracheostomy tube in standard placement. Left dual channel dialysis catheters end in the SVC and close to the superior cavoatrial junction. Right jugular line ends at the thoracic inlet, all unchanged. There may be new fluid in the previously air containing externalized right pleural space, a function of cardiac decompensation. EXAMINATION: CHEST (PORTABLE AP) INDICATION:  year old man with LLL co

In [8]:
# Second example patient: first radiology note stored for subject 11619087.
note_2 = get_patient_note(df, 11619087)
print(note_2)

INDICATION:  ___ year old woman with UTI, likely drug resistant, great
difficulty with peripheral venous access.// picc placement

COMPARISON:  ___

TECHNIQUE:  OPERATORS: Dr. ___ and Dr.
___ radiologist performed the procedure. Dr. ___
___ supervised the trainee during the key components of the procedure
and has reviewed and agrees with the trainee's findings.
ANESTHESIA: 1% lidocaine was injected in the skin and subcutaneous tissues
overlying the access site.
CONTRAST: 5 ml of Optiray contrast.
FLUOROSCOPY TIME AND DOSE: 3.54 min, 16.8 mGy

PROCEDURE:
1. Double lumen PICC placement through the right brachial vein.

PROCEDURE DETAILS: Using sterile technique and local anesthesia, the right
brachial vein was punctured under direct ultrasound guidance using a
micropuncture set. Permanent ultrasound images were obtained before and after
intravenous access, which confirmed vein patency.  The guidewire could not be
advanced easily.  The inner of a micropuncture sheath was advanced over the

## 5 Compare Cosine Similarity

In [9]:
# Embed the two example notes fetched above (note_1, note_2) and compare
# their mean word vectors.
vec_1 = embed_document(note_1, w2v)
vec_2 = embed_document(note_2, w2v)

similarity = cosine_sim(vec_1, vec_2)
similarity


0.7661022

## 6 Rank Pairwise Similarity Scores for the First 200 Patients

In [60]:
import itertools

# First 200 unique subject IDs (not a random draw), to keep the run time manageable.
patient_ids = df['subject_id'].unique()[:200]

# Look up and embed each patient's note exactly once. Doing this inside the
# pair loop would repeat the DataFrame scan and the embedding for every one
# of the ~n^2/2 pairs.
patient_vectors = {}
for pid in patient_ids:
    note = get_patient_note(df, pid)
    # Skip patients with no radiology note (None) or missing text (NaN);
    # embedding them would otherwise raise.
    if isinstance(note, str):
        patient_vectors[pid] = embed_document(note, w2v)

# Score every unordered pair; dicts preserve insertion order, so pairs come
# out in the same order as combinations over patient_ids.
results = [
    (a, b, cosine_sim(vec_a, vec_b))
    for (a, vec_a), (b, vec_b) in itertools.combinations(patient_vectors.items(), 2)
]

# Sort descending by similarity and show the five most similar pairs.
results_sorted = sorted(results, key=lambda x: x[2], reverse=True)
results_sorted[:5]



[(18486555, 14788557, 0.9972115),
 (12214410, 14788557, 0.9971013),
 (18486555, 12214410, 0.9961083),
 (12542700, 12683473, 0.9923077),
 (10804922, 14788557, 0.9915437)]

In [50]:
# Re-rank the same pair scores from least to most similar; show the bottom five.
results_sorted_low = sorted(results, key=lambda scored_pair: scored_pair[2])
results_sorted_low[:5]


[(17978114, 11863401, 0.55611366),
 (15552419, 11863401, 0.5781456),
 (12468016, 11863401, 0.5928373),
 (11619087, 11863401, 0.5985415),
 (17978114, 14174955, 0.6112414)]

### High Cosine Similarity

In [54]:
# One of the top-ranked pairs from the descending list: (18486555, 12214410, 0.9961083)
note_A, note_B = (get_patient_note(df, subject) for subject in (18486555, 12214410))
vec_A, vec_B = (embed_document(note, w2v) for note in (note_A, note_B))

similarity1 = cosine_sim(vec_A, vec_B)
similarity1

0.9961083

In [55]:
# Full text of the first note in the high-similarity pair.
print(note_A)

EXAMINATION:  BILAT LOWER EXT VEINS

INDICATION:  ___ year old man with leg weakness and calf tenderness.  // ? DVT

TECHNIQUE:  Grey scale, color, and spectral Doppler evaluation was performed
on the bilateral lower extremity veins.

COMPARISON:  None.

FINDINGS: 

There is normal compressibility, flow, and augmentation of the bilateral
common femoral, femoral, and popliteal veins. Normal color flow and
compressibility are demonstrated in the posterior tibial and peroneal veins.

There is normal respiratory variation in the common femoral veins bilaterally.

No evidence of medial popliteal fossa (___) cyst.

IMPRESSION: 

No evidence of deep venous thrombosis in the right or left lower extremity
veins.



In [56]:
# Full text of the second note in the high-similarity pair.
print(note_B)

EXAMINATION:  UNILAT LOWER EXT VEINS RIGHT

INDICATION:  ___ with R leg swelling.  DVT?

TECHNIQUE:  Grey scale, color, and spectral Doppler evaluation was performed
on the right lower extremity veins.

COMPARISON:  None.

FINDINGS: 

There is normal compressibility, flow, and augmentation of the right common
femoral, femoral, and popliteal veins. Normal color flow and compressibility
are demonstrated in the tibial and peroneal veins.

There is normal respiratory variation in the common femoral veins bilaterally.

No evidence of medial popliteal fossa (___) cyst.

IMPRESSION: 

No evidence of deep venous thrombosis in the right lower extremity veins.



### Low Cosine Similarity

In [57]:
# Lowest-scoring pair from the ascending list: (17978114, 11863401, 0.55611366)
note_C, note_D = (get_patient_note(df, subject) for subject in (17978114, 11863401))
vec_C, vec_D = (embed_document(note, w2v) for note in (note_C, note_D))

similarity2 = cosine_sim(vec_C, vec_D)
similarity2

0.55611366

In [58]:
# Full text of the first note in the low-similarity pair.
print(note_C)

TRANSJUGULAR LIVER BIOPSY

INDICATION:  ___ woman with liver dysfunction, elevated INR.

OPERATORS:  Drs. ___ (fellow), ___ (resident) and ___
___ (attending physician).  Dr. ___ was present during key moments of the
procedure.

SEDATION:  Moderate sedation in divided doses of intravenous ___ mcg fentanyl
and 2.5 mg of Versed over 50 minutes, during which patient's hemodynamic
status was continuously monitored by a trained radiology nurse.

PROCEDURE AND FINDINGS:  Consent was obtained from the patient after
explaining the benefits, risks, and alternatives.  The patient was placed
supine on the imaging table in the interventional suite.  Timeout was
performed as per ___ protocol.

Under aseptic conditions, sonographic guidance, and after infiltrating the
skin and subcutaneous tissues with 1% lidocaine, a micropuncture needle was
placed in the patent right internal jugular vein.  A 0.018 wire was advanced
through the needle and into the SVC.  After making an incision, needle was
exchang

In [59]:
# Full text of the second note in the low-similarity pair.
print(note_D)

EXAMINATION:  CHEST (PORTABLE AP)

INDICATION:  ___ year old man with alcoholic hepatitis, w/mild hypoxia and
sob// consolidation, pleural effusion, pulm edema      consolidation, pleural
effusion, pulm edema

IMPRESSION: 

Heart size and mediastinum are overall normal in size.  Left retrocardiac
consolidation might be consistent with hernia versus varices versus infectious
process.  Lung volumes are low.  Bibasal atelectasis is present.  Bilateral
pleural effusion cannot be excluded.  There is no pulmonary edema.  There is
no pneumothorax.



## 7 Check for Information Leakage

Patient 2 below shows potential information leakage: the word 'death' appears in the text notes because of a brain-death protocol.

In [27]:
# Keep only rows whose hospital_expire_flag is 1.
died_df = df_full[df_full['hospital_expire_flag'] == 1]

# Draw five distinct subjects; the fixed random_state makes the draw repeatable.
sample_patients = died_df['subject_id'].drop_duplicates().sample(5, random_state=42)

for sid in sample_patients:
    # First Radiology_notes entry stored for this subject.
    note = died_df.loc[died_df['subject_id'] == sid, 'Radiology_notes'].values[0]
    # Interpolate the value directly: slicing it first (note[:]) was a no-op
    # for strings and raised TypeError when the entry was missing (NaN).
    print(f"SUBJECT ID {sid}:\n{note}\n\n")


SUBJECT ID 10007818:
EXAMINATION: CT CHEST WCONTRAST INDICATION:  man with cirrhosis and bacteremia with rising leukocytosis. Evaluate for source of infection. TECHNIQUE: Multidetector helical scanning of the chest was coordinated with intravenous infusion of nonionic iodinated contrast agent reconstructed as contiguous 5- and 1.25-mm thick axial, 2.5-mm thick coronal and parasagittal, and 8 x 8 mm MIPs axial images. DOSE: See same-day CTA ABD AND PELVIS for dose report. COMPARISON: Ultrasound , arteriogram , CT , CT . FINDINGS: The thoracic aorta is normal in caliber with minimal atherosclerotic calcifications at the aortic arch. Incidentally noted is a two vessel takeoff from the aortic arch. The pulmonary artery is normal in caliber. Contrast bolus timing is not optimized to evaluate for pulmonary embolism but there is no large central PE. No pathologically enlarged supraclavicular, axillary, mediastinal or hilar lymph nodes are identified ranging up to 8 mm in the right lower parat