In [1]:
import pandas as pd
import jsonlines

In [2]:
# Column labels are supplied explicitly via `names=`; with no `header=` given,
# pandas then treats the first line of each CSV as data rather than a header.
ED_REPORT_COLUMNS = ["KL_ID", "text", "date_of_birth", "MRN", "visit_occurence_ID"]
ALL_REPORT_COLUMNS = ["patient_ID", "MRN", "ENC_CSN_ID", "RD_ID", "KL_ID", "text"]

# NOTE(review): these are absolute, user-specific paths -- consider reading them
# from a configurable data directory so the notebook runs on other machines.
pt_data = pd.read_csv("/home/vs428/project/Moore_data/CT_26545.csv", names=ED_REPORT_COLUMNS)
all_pt_data = pd.read_csv("/home/vs428/project/Moore_data/CT_93966.csv", names=ALL_REPORT_COLUMNS)

In [7]:
# Convert the date-of-birth column to a datetime dtype and write it back onto
# the same frame (re-running this cell on an already-converted column is a no-op).
dob_as_datetime = pt_data['date_of_birth'].astype("datetime64[ns]")
pt_data['date_of_birth'] = dob_as_datetime

# Initial Metrics

## For ED Patient Data

This dataset is a list of ED patients from 2014–2021. **TODO:** confirm whether the list is complete.

In [3]:
# Row count and KL_ID cardinality for the ED extract.
ed_row_count = pt_data.shape[0]
print("# of rows: ", ed_row_count)
ed_kl_id_count = pt_data['KL_ID'].nunique()
print("# of unique KL_IDs: ", ed_kl_id_count)

# Every row must carry its own distinct KL_ID.
assert ed_kl_id_count == ed_row_count

print("# of unique patients by MRN: ", pt_data['MRN'].nunique())
# NOTE(review): the label below says "patients", but the value is the number
# of distinct visit_occurence_ID values.
print("# of unique patients by Visit ID: ", pt_data['visit_occurence_ID'].nunique())

# Group once by visit and reuse the grouping for both the count and the ratio.
rows_by_visit = pt_data.groupby("visit_occurence_ID")
multi_row_visit_count = (rows_by_visit.size() > 1).sum()
print("# of Visits with more than one CT: ", multi_row_visit_count)
print("Proportion of Visits with more than one CT: ", multi_row_visit_count/rows_by_visit.ngroups)



# of rows:  26545
# of unique KL_IDs:  26545
# of unique patients by MRN:  17166
# of unique patients by Visit ID:  26055
# of Visits with more than one CT:  391
Proportion of Visits with more than one CT:  0.015006716561120707


## For All Patient Data

This dataset contains all CT reports from across the whole health system (not just the ED) for the patient cohort in the ED file above.

In [65]:
# Row count, then the number of distinct values in each identifier column.
print("# of rows: ", all_pt_data.shape[0])
for label, column in (
    ("# of unique KL_IDs: ", 'KL_ID'),
    ("# of unique patients by MRN: ", 'MRN'),
    ("# of unique patients by Encounter ID: ", 'ENC_CSN_ID'),
):
    print(label, all_pt_data[column].nunique())
print("\n-------------------------------------------------------\n")

# NOTE(review): the three values below are row counts (sums over per-row
# boolean masks), even though their labels say "patients".
rows_with_unknown_mrn = (~all_pt_data['MRN'].isin(pt_data['MRN'])).sum()
print("# of patients that exist in this dataset that aren't in the original dataset: ", rows_with_unknown_mrn)
# Sanity check: every MRN in this frame must also appear in the ED extract.
assert rows_with_unknown_mrn == 0

print("# of Patients w/out KL_ID: ", all_pt_data['KL_ID'].isna().sum())
# `isin` compares whole report strings for exact equality.
print("# of patients that overlap with all patient data by text: ", pt_data['text'].isin(all_pt_data['text']).sum())


# of rows:  93966
# of unique KL_IDs:  25589
# of unique patients by MRN:  16324
# of unique patients by Encounter ID:  34626

-------------------------------------------------------

# of patients that exist in this dataset that aren't in the original dataset:  0
# of Patients w/out KL_ID:  0
# of patients that overlap with all patient data by text:  0


### Exemplar Selection Showing Lack of Match Between Datasets

In [38]:
# Select the ED-extract rows for the exemplar MRN / KL_ID pair.
pt_data[pt_data['MRN'].eq("MR1515627") & pt_data['KL_ID'].eq("KL_12943")]

Unnamed: 0,KL_ID,text,date_of_birth,MRN,visit_occurence_ID
0,KL_12943,"Reported And Signed By: Amandeep Singh, MD** O...",2060-10-20,MR1515627,118291311


In [67]:
# Same exemplar MRN / KL_ID pair, looked up in the system-wide frame.
# (Drop the KL_ID condition to list every row for this MRN instead.)
exemplar_in_all = all_pt_data['MRN'].eq("MR1515627") & all_pt_data['KL_ID'].eq("KL_12943")
all_pt_data[exemplar_in_all]

Unnamed: 0,patient_ID,MRN,ENC_CSN_ID,RD_ID,KL_ID,text
0,39171732,MR1515627,183181823,RD_13507,KL_12943,
1,39171732,MR1515627,183288981,RD_13507,KL_12943,CTA CHEST (PE) W IV CONTRAST INDICATION: CP o...


In [52]:
# Report text for the exemplar MRN / KL_ID pair in the ED extract.
# `squeeze()` collapses a single matching row down to the bare string.
ed_exemplar_mask = pt_data['MRN'].eq("MR1515627") & pt_data['KL_ID'].eq("KL_12943")
pt_data.loc[ed_exemplar_mask, 'text'].squeeze()

'Reported And Signed By: Amandeep Singh, MD** ORIGINAL REPORT **  CTA CHEST (PE) W IV CONTRAST   INDICATION: Known pulmonary embolism, on the Lovenox, increased work of breathing and altered mental status.  COMPARISON: CT chest 11/12/2018.  TECHNIQUE: CT images of the chest were obtained from the lung bases through the apices after the intravenous administration of 80cc of Omnipaque-350 contrast. Multiplanar MIPs are provided.   FINDINGS:  There is adequate opacification of the pulmonary arterial tree without evidence of pulmonary embolus. The previously seen subsegmental emboli in the right upper lobe not visualized. The aorta and main pulmonary artery are normal in caliber.  The tracheobronchial tree is patent. No suspicious pulmonary nodule is seen. Scattered punctate calcified granulomas are noted. There is no pneumothorax or pleural effusion.  There is no mediastinal, axillary or hilar adenopathy. No aggressive osseous lesion is identified.  IMPRESSION:   No pulmonary embolism.  R

In [57]:
# Report text for the same exemplar pair in the system-wide frame, skipping
# rows whose text is missing. `squeeze()` collapses a single match to a string.
all_exemplar_mask = (
    all_pt_data['MRN'].eq("MR1515627")
    & all_pt_data['text'].notna()
    & all_pt_data['KL_ID'].eq("KL_12943")
)
all_pt_data.loc[all_exemplar_mask, 'text'].squeeze()

'CTA CHEST (PE) W IV CONTRAST  INDICATION: CP or SOB  COMPARISON: CTA of the chest dated 12/1/2018.  TECHNIQUE: CT images of the chest were obtained from the lung bases through the apices after the intravenous administration of 80cc of Omnipaque-350 contrast. Multiplanar MIPs are provided.    FINDINGS: There is no evidence of filling defects in the pulmonary arteries to suspect pulmonary embolism.  These findings were corroborated in the MIP images. The central pulmonary arteries are normal in caliber.  The heart is normal in size; specifically, the right ventricle is normal. There is no evidence of pericardial effusion. There is no aneurysm or dissection of the thoracic aorta.   There is no pulmonary consolidation either atelectasis.. There is no pleural effusion or pneumothorax.  There is no axillary, mediastinal, or hilar lymphadenopathy.   Limited evaluation of the upper abdomen demonstrates diffuse hepatic steatosis..  No aggressive osseous lesions are seen. '

In [29]:
# Display the system-wide frame; the notebook renders only the first and last
# rows and elides the middle.
# NOTE(review): the rendered rows include identifiers and report text --
# consider `.head()` and clearing the output before sharing.
all_pt_data

Unnamed: 0,patient_ID,MRN,ENC_CSN_ID,RD_ID,KL_ID,text
0,39171732,MR1515627,183181823,RD_13507,KL_12943,
1,39171732,MR1515627,183288981,RD_13507,KL_12943,CTA CHEST (PE) W IV CONTRAST INDICATION: CP o...
2,39171732,MR1515627,183181823,RD_13507,KL_20625,
3,39171732,MR1515627,183288981,RD_13507,KL_20625,CTA CHEST (PE) W IV CONTRAST INDICATION: CP o...
4,29530916,MR6010698,182917563,RD_10347,KL_14997,
...,...,...,...,...,...,...
93961,13483478,MR2519855,266564050,RD_5644,KL_9751,Yale Radiology and Biomedical Imaging** ORIGIN...
93962,44392700,MR5721053,268196656,RD_16598,KL_18849,Yale Radiology and Biomedical Imaging** ORIGIN...
93963,13134339,MR2538725,265763539,RD_4586,KL_24924,Yale Radiology and Biomedical Imaging** ORIGIN...
93964,10719287,MR2279041,218245067,RD_1332,KL_532,"\ CT CHEST ABDOMEN PELVIS W IV CONTRAST, CT 2D..."


In [82]:
# Search the system-wide reports for a sentence fragment taken from the ED
# exemplar report. The fragment is plain text, so match it literally:
# with the default regex matching, the "." would match any character.
# Rows with missing text are treated as non-matches (`na=False`).
all_pt_data[all_pt_data['text'].str.contains(
    "suspicious pulmonary nodule is seen. Scattered punctate calcified",
    regex=False,
    na=False,
)]

Unnamed: 0,patient_ID,MRN,ENC_CSN_ID,RD_ID,KL_ID,text
