In [1]:
import os
import lightgbm
import re

import pandas as pd
import numpy as np
import dask.dataframe as dd

from femr.datasets import PatientDatabase

from src.default_paths import path_root, path_extract
from src.io import read_pkl, read_msgpack
from src.utils import list_dir

In [2]:
path_models = os.path.join(path_root, "data/adapter_models/count_mimic")
path_features = os.path.join(path_root, "data/features/count_mimic")
path_labels = os.path.join(path_root, "data/labels")
path_concept_table = "/hpf/projects/lsung/phi/data/mimic_omop_parquet/concept"

df_concepts = dd.read_parquet(path_concept_table).compute()

In [3]:
topN = 50
tasks = list_dir(path_labels)
for task in tasks:
    model = read_pkl(os.path.join(path_models, task, "model.pkl"))
    model_info = read_pkl(os.path.join(path_models, task, "model_info.pkl"))
    featurizer = read_pkl(os.path.join(path_features, task, "preprocessed_featurizers.pkl"))
    featurized_patients = read_pkl(os.path.join(path_features, task, "featurized_patients.pkl"))
    features = featurized_patients[0]
    labels = featurized_patients[2]

    imp = model.feature_importances_
    ind_topN = np.argpartition(imp, -topN)[-topN:]
    imp_topN = imp[ind_topN]
    imp_rank = (-imp_topN).argsort().argsort()+1

    df_results = pd.DataFrame()

    for i, ind in enumerate(ind_topN): 
        col = featurizer.get_column_name(ind)
        rank = imp_rank[i]
        quantile_bin = np.nan
        imp = imp_topN[i]
        n_pos = (features[:, ind]!=0).sum()
        n_prev = (features[:, ind]!=0).mean()
        mean_pos = features[np.where(labels==True)[0],ind].mean()
        mean_neg = features[np.where(labels==False)[0],ind].mean()
        in_sk_dict = 0
        in_stanford_dict = 0
        
        if "AgeFeaturizer" in col:
            concept_vocabulary = np.nan,
            concept_code = np.nan,
            time_bin = np.nan,
            concept_name = "Age",
            concept_domain = np.nan,
            
        else:
            concept_vocabulary = col.split("),")[1].split("/")[0]
            concept_code = col.split("),")[1].split("/")[1].split("_")[0]
            time_bin = col.split("),")[1].split("/")[1].split("_")[1].split(",")[0]
            re_res = re.search(r'(?<=\[)(.*?)(?=\))', col)
            

            if re_res:
                quantile_bin = re_res.groups()[0]

            if "[" in concept_code:
                concept_code = concept_code.split(" ")[0]

            concept_name = df_concepts.query("concept_code==@concept_code")["concept_name"].values[0]
            concept_domain = df_concepts.query("concept_code==@concept_code")["domain_id"].values[0]
            
        df_results = pd.concat((
            df_results,
            pd.DataFrame({
                "rank": [rank],
                "index": ind,
                "importance": imp,
                "num_not_zero": n_pos,
                "prev_not_zero": n_prev,
                "mean_pos_label": mean_pos,
                "mean_neg_label": mean_neg,
                "concept_vocabulary": concept_vocabulary,
                "concept_domain": concept_domain,
                "concept_code": concept_code,
                "concept_name": concept_name,
                "quantile_bin": quantile_bin,
                "time_bin": time_bin,
            }),
        ))

    df_results = df_results.set_index("rank").sort_index()
    
    print(f"\n\n==============={task}=================")
    display(df_results)





Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,2698,44055,1.0,0.353331,-0.135551,,,,Age,,
2,64813,534,24353,0.552786,5.682112,6.50804,NUCC,Visit,261Q00000X,Ambulatory Clinic / Center,,36500 days
3,21926,475,6537,0.148383,0.244699,0.115295,SNOMED,Procedure,735414004,Insertion of catheter into peripheral vein,,1 day
4,18994,354,11151,0.253115,0.021695,0.341897,SNOMED,Observation,405606005,Labour and delivery unit,,1 day
5,23453,351,36048,0.81825,1.03553,0.951131,Visit,Visit,ERIP,Emergency Room and Inpatient Visit,,1 day
6,10491,301,5011,0.113744,0.008187,0.15424,RxNorm,Drug,197806,ibuprofen 600 MG Oral Tablet,,1 day
7,17893,288,1974,0.044808,0.088907,0.027889,SNOMED,Observation,309958005,Psychiatry department,,1 day
8,9285,282,22479,0.510249,0.695702,0.439102,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
9,11419,272,11552,0.262218,0.322964,0.238945,RxNorm,Drug,313782,acetaminophen 325 MG Oral Tablet,,1 day
10,10207,267,7941,0.180252,0.115677,0.205025,RxNorm,Drug,1740467,2 ML ondansetron 2 MG/ML Injection,,1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,27083,43950,1.0,0.506909,-0.00448,,,,Age,,
2,17883,3338,967,0.022002,0.122078,0.021118,SNOMED,Observation,309993003,Surgical transplant department,,1 day
3,79752,3260,13149,0.299181,1.272727,1.433031,LOINC,Measurement,2075-0,Chloride [Moles/volume] in Serum or Plasma,"104.0, 106.0",36500 days
4,9271,3074,22381,0.509238,0.693506,0.507609,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
5,78814,3001,3648,0.083003,0.174026,0.082199,mimiciv_per_ethnicity,Race,UNKNOWN,UNKNOWN,,36500 days
6,56517,2910,26682,0.607099,0.477922,0.608241,Gender,Gender,F,FEMALE,,36500 days
7,9756,2206,11688,0.265939,0.38961,0.264846,RxNorm,Drug,1361615,"heparin sodium, porcine 5000 UNT/ML Injectable...",,1 day
8,11933,2167,12725,0.289534,0.387013,0.288672,RxNorm,Drug,727820,10 ML sodium chloride 9 MG/ML Prefilled Syringe,,1 day
9,9324,2083,3427,0.077975,0.103896,0.077746,Medicare Specialty,Visit,A0,Hospital,,1 day
10,24368,1932,1986,0.045188,0.124675,0.046964,LOINC,Measurement,2075-0,Chloride [Moles/volume] in Serum or Plasma,"95.0, 98.0",1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,34287,44055,1.0,0.724329,-0.02728,,,,Age,,
2,9285,3765,22479,0.510249,0.785491,0.499882,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
3,78926,3166,3658,0.083033,0.268918,0.076032,mimiciv_per_ethnicity,Race,UNKNOWN,UNKNOWN,,36500 days
4,56609,2751,26725,0.606628,0.454034,0.612375,Gender,Gender,F,FEMALE,,36500 days
5,9770,2528,11737,0.266417,0.417761,0.260717,RxNorm,Drug,1361615,"heparin sodium, porcine 5000 UNT/ML Injectable...",,1 day
6,11951,2298,12770,0.289865,0.398374,0.285778,RxNorm,Drug,727820,10 ML sodium chloride 9 MG/ML Prefilled Syringe,,1 day
7,64753,2251,23186,0.526297,4.852407,3.688101,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,36500 days
8,64818,1972,27401,0.621973,0.565979,0.624081,Race,Observation,5,Sample collection duration,,36500 days
9,25110,1811,1855,0.042106,0.361476,0.040018,LOINC,Measurement,2777-1,Phosphate [Mass/volume] in Serum or Plasma,"4.800000190734863, inf",1 day
10,17903,1805,979,0.022222,0.060038,0.020798,SNOMED,Observation,309993003,Surgical transplant department,,1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,5829,44042,1.0,-0.083433,0.000494,,,,Age,,
2,38225,2280,11432,0.25957,0.104247,0.266085,RxNorm,Drug,197806,ibuprofen 600 MG Oral Tablet,,7 days
3,79020,1700,3134,0.071159,0.397683,0.199187,LOINC,Measurement,11556-8,Oxygen [Partial pressure] in Blood,"294.90000000000055, inf",36500 days
4,51188,1427,5975,0.135666,0.324324,0.135555,Visit,Visit,IP,Inpatient Visit,,7 days
5,20905,1416,614,0.013941,0.088803,0.013498,SNOMED,Condition,609496007,Complication occurring during pregnancy,,1 day
6,67960,1404,268,0.006085,0.042471,0.013042,RxNorm,Drug,896222,14 ACTUAT fluticasone propionate 0.5 MG/ACTUAT...,,36500 days
7,39060,1303,389,0.008832,0.061776,0.008885,RxNorm,Drug,312134,oxazepam 15 MG Oral Capsule,,7 days
8,73447,1295,220,0.004995,0.054054,0.008908,SNOMED,Condition,313435000,Insulin-dependent diabetes without complication,,36500 days
9,21394,1219,111,0.00252,0.007722,0.00249,SNOMED,Condition,698816006,Chronic occlusion of artery of extremity,,1 day
10,38949,1179,716,0.016257,0.042471,0.018432,RxNorm,Geography,310385,Grebin,,7 days






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,23013,43891,1.0,0.522423,-0.004502,,,,Age,,
2,17871,3087,2435,0.055478,0.152,0.054647,SNOMED,Observation,309971008,Cardiac surgery department,,1 day
3,25154,2721,2815,0.064136,0.28,0.072456,LOINC,Measurement,2823-3,Potassium [Moles/volume] in Serum or Plasma,"4.900000095367432, inf",1 day
4,9265,2619,22328,0.508715,0.72,0.506894,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
5,64730,1798,27295,0.621881,0.618667,0.621909,Race,Observation,5,Sample collection duration,,36500 days
6,1,1689,7744,0.176437,0.157333,0.17674,CMS Place of Service,Procedure,23,Removal and restoration of teeth,,1 day
7,78825,1561,3646,0.083069,0.16,0.082406,mimiciv_per_ethnicity,Race,UNKNOWN,UNKNOWN,,36500 days
8,56539,1555,26643,0.607026,0.421333,0.608627,Gender,Gender,F,FEMALE,,36500 days
9,23420,1540,35910,0.818163,0.992,0.972332,Visit,Visit,ERIP,Emergency Room and Inpatient Visit,,1 day
10,27515,1493,2094,0.047709,0.464,0.13202,LOINC,Measurement,9279-1,Respiratory rate,"28.0, inf",1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3265,43682,1.0,0.463534,-0.03362,,,,Age,,
2,64480,424,24139,0.552607,7.435003,6.146951,NUCC,Visit,261Q00000X,Ambulatory Clinic / Center,,36500 days
3,26792,365,879,0.020123,0.186527,0.014044,LOINC,Measurement,718-7,Hemoglobin [Mass/volume] in Blood,"-inf, 7.800000190734863",1 day
4,56329,351,26513,0.606955,0.547393,0.611275,Gender,Gender,F,FEMALE,,36500 days
5,21825,324,6349,0.145346,0.27759,0.138676,SNOMED,Procedure,735414004,Insertion of catheter into peripheral vein,,1 day
6,11879,307,10176,0.232956,0.179418,0.23684,RxNorm,Drug,727633,sodium chloride Prefilled Syringe,,1 day
7,11882,301,12608,0.288631,0.424171,0.278801,RxNorm,Drug,727820,10 ML sodium chloride 9 MG/ML Prefilled Syringe,,1 day
8,23349,296,7805,0.178678,0.16283,0.181374,Visit,Visit,IP,Inpatient Visit,,1 day
9,9898,281,3185,0.072913,0.149628,0.067349,RxNorm,Drug,1658717,"250 ML heparin sodium, porcine 100 UNT/ML Inje...",,1 day
10,26156,272,903,0.020672,0.180772,0.014093,LOINC,Measurement,4544-3,Hematocrit [Volume Fraction] of Blood by Autom...,"-inf, 24.200000762939453",1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,25366,43983,1.0,0.557166,-0.009259,,,,Age,,
2,9284,3534,22411,0.509538,0.735744,0.505778,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
3,64744,3146,23149,0.526317,6.012517,3.687939,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,36500 days
4,23449,2669,35983,0.818112,1.023644,0.973327,Visit,Visit,ERIP,Emergency Room and Inpatient Visit,,1 day
5,17894,2575,2438,0.055431,0.109875,0.054526,SNOMED,Observation,309971008,Cardiac surgery department,,1 day
6,24604,2526,1589,0.036128,0.147427,0.035688,LOINC,Measurement,2345-7,Glucose [Mass/volume] in Serum or Plasma,"-inf, 83.0",1 day
7,64809,2361,27367,0.622218,0.631432,0.622065,Race,Observation,5,Sample collection duration,,36500 days
8,24479,2094,2001,0.045495,0.168289,0.049672,LOINC,Measurement,2160-0,Creatinine [Mass/volume] in Serum or Plasma,"1.600000023841858, 2.4000000953674316",1 day
9,24480,2048,1958,0.044517,0.260083,0.052908,LOINC,Measurement,2160-0,Creatinine [Mass/volume] in Serum or Plasma,"2.4000000953674316, inf",1 day
10,11091,2028,1735,0.039447,0.207232,0.036659,RxNorm,Drug,285018,insulin glargine 100 UNT/ML Injectable Solutio...,,1 day






Unnamed: 0_level_0,index,importance,num_not_zero,prev_not_zero,mean_pos_label,mean_neg_label,concept_vocabulary,concept_domain,concept_code,concept_name,quantile_bin,time_bin
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,28902,43912,1.0,0.321071,-0.010122,,,,Age,,
2,9265,4397,22352,0.509018,0.73994,0.501738,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,1 day
3,78807,2930,3623,0.082506,0.135618,0.080832,mimiciv_per_ethnicity,Race,UNKNOWN,UNKNOWN,,36500 days
4,64649,2900,23113,0.526348,8.251863,3.581865,LOINC,Measurement,77147-7,Glomerular filtration rate/1.73 sq M.predicted...,,36500 days
5,9748,2733,11705,0.266556,0.254844,0.266925,RxNorm,Drug,1361615,"heparin sodium, porcine 5000 UNT/ML Injectable...",,1 day
6,27102,2608,3491,0.0795,0.120715,0.089429,LOINC,Measurement,777-3,Platelets [#/volume] in Blood by Automated count,"115.0, 153.0",1 day
7,82484,2583,1487,0.033863,10.44784,0.376533,LOINC,Measurement,777-3,Platelets [#/volume] in Blood by Automated count,"-inf, 64.0",36500 days
8,56522,2460,26674,0.607442,0.429955,0.613037,Gender,Gender,F,FEMALE,,36500 days
9,11929,2435,12711,0.289465,0.321162,0.288466,RxNorm,Drug,727820,10 ML sodium chloride 9 MG/ML Prefilled Syringe,,1 day
10,17867,1867,2437,0.055497,0.048435,0.05572,SNOMED,Observation,309971008,Cardiac surgery department,,1 day
