## Need to investigate lab-based labelers

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.utils import read_pkl

In [2]:
# Directory holding the model result files and `labeled_patients.pkl` read by the cells below.
PATH_RESULTS = "/hpf/projects/lsung/projects/lguo/femr-on-sk/data/sanity_checks/rand_glucose_prediction"

In [3]:
# os.listdir() returns entries in arbitrary order, so sort them: this keeps the
# key order of `results` -- and the value `file` is left holding once the loop
# finishes, which a later cell reads -- identical from run to run.
result_files = sorted(
    x for x in os.listdir(PATH_RESULTS)
    if "result" in x
)

# Map each results file (its name up to the first '.') to the AUROC stored in it.
results = {}
for file in result_files:
    model = file.split('.')[0]
    results[model] = read_pkl(os.path.join(PATH_RESULTS, file))['auroc']

results

{'results_clmbr_stanford': 0.751840751231437,
 'results_clmbr_sk': 0.7486331598708912,
 'results_count': 0.6844520562984433}

In [4]:
# Mean of the stored "labels" in whichever results file the loading loop
# visited last (`file` is the loop variable left over from that cell).
last_result = read_pkl(os.path.join(PATH_RESULTS, file))
last_result["labels"].mean()

0.8537284894837476

#### Investigate labels

Strangely, most glucose results are even, which does not match what appears in SQL Server

In [5]:
labeled_patients = read_pkl(os.path.join(PATH_RESULTS, "labeled_patients.pkl"))

# Mean value of each patient's first label, skipping patients without any label.
first_label_values = [
    labels[0].value
    for labels in labeled_patients.get_patients_to_labels().values()
    if labels
]
np.mean(first_label_values)

0.8617565395294462

If we take a look at raw glucose values, roughly 50% are even.

In [8]:
from femr.datasets import PatientDatabase
from femr.labelers.omop import map_omop_concept_codes_to_femr_codes

# Open the patient extract and fetch its ontology, which the code mapping
# in the following cell needs.
PATH_DATABASE = "/hpf/projects/lsung/data/lguo/omop_extract_v6"
database = PatientDatabase(PATH_DATABASE)
ontology = database.get_ontology()

In [9]:
# Resolve the glucose LOINC concepts to FEMR codes once, up front. The mapping
# depends only on the ontology, so rebuilding it for every event of every
# patient just repeats the same work; a set also makes the membership test cheap.
glucose_femr_codes = set(
    map_omop_concept_codes_to_femr_codes(
        ontology, ["LOINC/14749-6", "LOINC/15074-8"]
    )
)

# Collect the value of every glucose event across all patients in the database.
glucose_values = []
for p in database:
    patient = database.get(p)

    for e in patient.events:
        if e.code in glucose_femr_codes:
            glucose_values.append(e.value)

# (number of glucose events, share of truthy values whose integer part is even)
len(glucose_values), np.mean([int(float(x)) % 2 == 0 for x in glucose_values if x])

(574078, 0.5016253789066583)

OK, let's take a look at patients with a positive label and their `e.start` times.

In [10]:
# Keep only patients that have labels and whose first label is positive.
patients_with_pos = {
    pid: labels
    for pid, labels in labeled_patients.get_patients_to_labels().items()
    if labels and labels[0].value
}

# Take the first such patient: load their record and keep their first label.
for patient_id, positive_labels in patients_with_pos.items():
    patient = database.get(patient_id)
    label = positive_labels[0]
    break

In [11]:
# Find the "visit/ip" event that starts exactly at the label time and remember
# its start/end; if several match, the last one wins.
patient_glucose = []
start = None
end = None
for e in patient.events:
    # find visit
    if e.start == label.time and e.code.lower() == "visit/ip":
        start = e.start
        end = e.end

if start is None:
    raise ValueError("did not find Admission event")
if end is None:
    # Without an end time the window comparison below would fail with an
    # opaque TypeError, so report the real problem instead.
    raise ValueError("Admission event has no end time")

# NOTE(review): the all-patient scan in cell In [9] matches events against the
# output of map_omop_concept_codes_to_femr_codes(ontology, ...), whereas this
# cell matches the raw LOINC strings only -- confirm both select the same events.
raw_glucose_codes = ("LOINC/14749-6", "LOINC/15074-8")

# Collect glucose values recorded within the visit window (bounds inclusive).
for e in patient.events:
    if (
        e.code in raw_glucose_codes
        and e.start is not None
        and start <= e.start <= end
    ):
        patient_glucose.append(e.value)

In [12]:
# Share of this patient's in-visit glucose values whose integer part is even.
is_even = [int(float(value)) % 2 == 0 for value in patient_glucose]
np.mean(is_even)

0.631578947368421