In [1]:
import os
import glob
import pandas as pd

# os.chdir('Z:/OMOP/omop_etl/omop_etl/templates')
# print(os.getcwd())

from omop_etl.datastore import DataStore
from omop_etl.bo import bo_query
from omop_etl.config import ProjectConfig, ETLConfig
from omop_etl.utils import find, search

In [2]:
# Connection target for the OMOP database.
server = 'edw.shands.ufl.edu'
database = 'dws_omop'
config_file = os.path.join('Z:/OMOP/omop_etl/omop_etl/templates/', 'config.yml')

# Create the datastore from the project config, then override its connection
# string before the engine is first used.
omop = DataStore(config_file)
omop.connection_str = f'mssql+pyodbc://{server}/{database}?driver=SQL+Server'

# The connection is only held open while the 'omop' queries are fetched.
with omop.engine.connect() as con:
    bo_queries = bo_query('omop', con)

_config = ProjectConfig(config_file)
etl_config = ETLConfig()

# File names (directory part stripped) of every .sql file in the ETL sql folder.
sql_scripts = [
    os.path.basename(script_path)
    for script_path in glob.glob('Z:/OMOP/omop_etl/omop_etl/sql/*.sql')
]

In [3]:
# Load the data guide and forward-fill missing cells so each row inherits the
# nearest non-null value above it in the same column.
# `.ffill()` replaces `fillna(method='ffill', inplace=True)`: the `method=`
# argument is deprecated in recent pandas, and dropping `inplace` keeps the
# load a single expression that is safe to re-run.
data_guide = pd.read_csv('loincs_data_guide.csv').ffill()

In [4]:
def _first_configured_loinc(text):
    """Return the first key of ``_config.loinc`` for which ``search(key, text)`` is truthy.

    Keys are tried in the iteration order of ``_config.loinc.keys()``, so the
    result is the same code the list-then-``[0]`` form would have picked.

    Raises:
        IndexError: if no configured code matches ``text``. The message names
            the offending entry instead of a bare "list index out of range".
    """
    for code in _config.loinc.keys():
        if search(code, text):
            return code
    raise IndexError(f'no configured LOINC code found in data guide entry: {text!r}')


data_guide['loinc'] = data_guide['Possible values and units'].apply(_first_configured_loinc)

In [5]:
data_guide

Unnamed: 0,Data element,Possible values and units,loinc
0,AFP,���� Alpha-1-Fetoprotein [Mass/volume] in Seru...,1834-1
1,AFP,���� Alpha-1-Fetoprotein [Multiple of the medi...,23811-3
2,AFP,���� Alpha-1-Fetoprotein [Multiple of the medi...,20450-3
3,Albumin,���� Albumin [Mass/volume] in Serum or Plasma ...,1751-7
4,Albumin,���� Albumin [Mass/volume] in Serum or Plasma ...,61151-7
...,...,...,...
305,WBC,���� 12227-5: Leukocytes [#/volume] corrected ...,12227-5
306,WBC,���� 33256-9: Leukocytes [#/volume] corrected ...,33256-9
307,WBC,���� 49498-9: Leukocytes [#/volume] in Blood b...,49498-9
308,WBC,���� 804-5: Leukocytes [#/volume] in Blood by ...,804-5


In [6]:
# Split the configured LOINC codes into those that match at least one data
# guide entry (`found`) and those that match none (`not_found`).
# `find` is assumed to return one truthy/falsy flag per entry — TODO confirm.
found, not_found = {}, {}
guide_entries = data_guide['Possible values and units']
for loinc in _config.loinc.keys():
    target = found if any(find(loinc, guide_entries)) else not_found
    target[loinc] = _config.loinc[loinc]

In [7]:
# found

In [8]:
# One row per LOINC code that was not matched in the data guide:
# the code itself plus its configured description.
df = pd.DataFrame({
    'LOINC': list(not_found),
    'Desc': list(not_found.values()),
})

In [9]:
df

Unnamed: 0,LOINC,Desc
0,10331-7,Rh [Type] in Blood
1,10368-9,Lead [Mass/volume] in Capillary blood
2,10466-1,Anion gap 3 in Serum or Plasma
3,10912-4,Lead [Mass/volume] in Serum or Plasma
4,11053-6,Lactate dehydrogenase [Enzymatic activity/volu...
...,...,...
317,884-7,ABO and Rh group [Type] in Capillary blood
318,91556-1,Fibrin D-dimer DDU [Mass/volume] in Blood by I...
319,9422-7,Herpes simplex virus IgG Ab [Units/volume] in ...
320,9741-0,Toxoplasma gondii IgG Ab [Units/volume] in Cer...


In [10]:
# Rows whose description matches 'Lead ['. The pattern is a raw string because
# '\[' in a normal string literal is an invalid escape sequence (a warning on
# current Python versions); the runtime value is unchanged.
lead_mask = find(r'Lead \[', df.Desc)
lead = df.loc[lead_mask, 'LOINC'].to_list()
df[lead_mask]

Unnamed: 0,LOINC,Desc
1,10368-9,Lead [Mass/volume] in Capillary blood
3,10912-4,Lead [Mass/volume] in Serum or Plasma
234,5671-3,Lead [Mass/volume] in Blood
294,77307-7,Lead [Mass/volume] in Venous blood


In [11]:
# Tag the 'Lead [' rows. Raw string avoids the invalid '\[' escape sequence
# while keeping the same pattern text.
df.loc[find(r'Lead \[', df.Desc), 'Group'] = 'Lead'

In [12]:
# Collect and tag rows whose description matches 'Rh ['. Raw string avoids the
# invalid '\[' escape sequence; the pattern text is unchanged.
rh_mask = find(r'Rh \[', df.Desc)
rh = df.loc[rh_mask, 'LOINC'].to_list()
df.loc[rh_mask, 'Group'] = 'Rh antigen'

In [13]:
# Collect the LOINC codes of the 'Anion gap ' rows and tag them as group 'AG'.
anion_mask = find('Anion gap ', df['Desc'])
anion = df.loc[anion_mask, 'LOINC'].to_list()
df.loc[anion_mask, 'Group'] = 'AG'

In [14]:
# Collect the LOINC codes of the 'Lactate dehy' rows and tag them as group 'LDH'.
lactate_mask = find('Lactate dehy', df['Desc'])
lactate = df.loc[lactate_mask, 'LOINC'].to_list()
df.loc[lactate_mask, 'Group'] = 'LDH'

In [15]:
# Collect the LOINC codes of the 'pH ' rows and tag them as group 'pH'.
ph_mask = find('pH ', df['Desc'])
ph = df.loc[ph_mask, 'LOINC'].to_list()
df.loc[ph_mask, 'Group'] = 'pH'

In [16]:
# Collect the LOINC codes of the 'Thyrotropin ' rows and tag them as group 'TSH'.
thyro_mask = find('Thyrotropin ', df['Desc'])
thyro = df.loc[thyro_mask, 'LOINC'].to_list()
df.loc[thyro_mask, 'Group'] = 'TSH'

In [17]:
# Collect the LOINC codes of the 'Creatinine renal ' rows and display those rows.
# No Group is assigned in this cell.
creat_rows = df[find('Creatinine renal ', df['Desc'])]
creat = creat_rows['LOINC'].to_list()
creat_rows

Unnamed: 0,LOINC,Desc,Group


In [18]:
# Collect the LOINC codes of the 'D Ag ' rows and tag them as group 'D antigen'.
dag_mask = find('D Ag ', df['Desc'])
dag = df.loc[dag_mask, 'LOINC'].to_list()
df.loc[dag_mask, 'Group'] = 'D antigen'

In [19]:
# Collect the LOINC codes of the 'Cytomegalovirus ' rows and tag them as group 'CMV'.
cytom_mask = find('Cytomegalovirus ', df['Desc'])
cytom = df.loc[cytom_mask, 'LOINC'].to_list()
df.loc[cytom_mask, 'Group'] = 'CMV'

In [20]:
# Collect the LOINC codes of the 'Herpes ' rows and tag them as group 'HSV'.
herpes_mask = find('Herpes ', df['Desc'])
herpes = df.loc[herpes_mask, 'LOINC'].to_list()
df.loc[herpes_mask, 'Group'] = 'HSV'

In [21]:
# Collect the LOINC codes of the 'Rubella ' rows and tag them as group 'Rubella'.
rubella_mask = find('Rubella ', df['Desc'])
rubella = df.loc[rubella_mask, 'LOINC'].to_list()
df.loc[rubella_mask, 'Group'] = 'Rubella'

In [22]:
# Collect the LOINC codes of the 'Reticulocytes' rows and tag them as group 'Reticulocytes'.
retic_mask = find('Reticulocytes', df['Desc'])
retic = df.loc[retic_mask, 'LOINC'].to_list()
df.loc[retic_mask, 'Group'] = 'Reticulocytes'

In [23]:
# Collect the LOINC codes of the 'ABO group ' rows and tag them as group 'ABO antigens'.
abo_mask = find('ABO group ', df['Desc'])
abo = df.loc[abo_mask, 'LOINC'].to_list()
df.loc[abo_mask, 'Group'] = 'ABO antigens'

In [24]:
# Inspect the 'Bicarbonate ' rows; nothing is assigned here.
bicarbonate_rows = df.loc[find('Bicarbonate ', df['Desc'])]
bicarbonate_rows

Unnamed: 0,LOINC,Desc,Group
19,14627-4,Bicarbonate [Moles/volume] in Venous blood,
26,16461-6,Bicarbonate [Moles/volume] in Red Blood Cells,
41,19230-2,Bicarbonate [Moles/volume] standard in Arteria...,
42,19231-0,Bicarbonate [Moles/volume] standard in Capilla...,
43,19232-8,Bicarbonate [Moles/volume] standard in Venous ...,
44,19233-6,Bicarbonate [Moles/volume] standard in Mixed v...,
46,1959-6,Bicarbonate [Moles/volume] in Blood,
47,1960-4,Bicarbonate [Moles/volume] in Arterial blood,
48,1961-2,Bicarbonate [Moles/volume] in Capillary blood,
49,1962-0,Bicarbonate [Moles/volume] in Plasma,


In [25]:
# dict(zip(df[find('Creatinine', df.Desc)].LOINC,df[find('Creatinine', df.Desc)].Desc))

In [26]:
# dict(zip(df[find('Glucose ', df.Desc)].LOINC, df[find('Glucose ', df.Desc)].Desc))

In [27]:
# Collect the LOINC codes of the 'Triiodothyronine' rows and tag them as group 'T3'.
triio_mask = find('Triiodothyronine', df['Desc'])
triio = df.loc[triio_mask, 'LOINC'].to_list()
df.loc[triio_mask, 'Group'] = 'T3'

In [28]:
# Collect the LOINC codes of the 'Fibrin D-dimer' rows and tag them as group 'D-Dimer'.
d_dimer_mask = find('Fibrin D-dimer', df['Desc'])
fibrin = df.loc[d_dimer_mask, 'LOINC'].to_list()
df.loc[d_dimer_mask, 'Group'] = 'D-Dimer'

In [29]:
# Collect the LOINC codes of the 'Toxoplasma' rows and tag them as group 'Toxoplasma'.
toxo_mask = find('Toxoplasma', df['Desc'])
toxo = df.loc[toxo_mask, 'LOINC'].to_list()
df.loc[toxo_mask, 'Group'] = 'Toxoplasma'

In [30]:
# Collect the LOINC codes of the 'HIV ' rows and tag them as group 'HIV'.
hiv_mask = find('HIV ', df['Desc'])
hiv = df.loc[hiv_mask, 'LOINC'].to_list()
df.loc[hiv_mask, 'Group'] = 'HIV'

In [31]:
# Collect the LOINC codes of the 'Oxygen satura' rows and tag them as group 'Oxygen saturation'.
oxygen_mask = find('Oxygen satura', df['Desc'])
oxygen = df.loc[oxygen_mask, 'LOINC'].to_list()
df.loc[oxygen_mask, 'Group'] = 'Oxygen saturation'

In [32]:
# Collect the LOINC codes of the 'Carbon dioxide,' rows, tag them as group
# 'TCO2', then display the tagged rows.
co2_mask = find('Carbon dioxide,', df['Desc'])
co2 = df.loc[co2_mask, 'LOINC'].to_list()
df.loc[co2_mask, 'Group'] = 'TCO2'
df[co2_mask]

Unnamed: 0,LOINC,Desc,Group
52,2026-3,"Carbon dioxide, total [Moles/volume] in Arteri...",TCO2
53,2028-9,"Carbon dioxide, total [Moles/volume] in Serum ...",TCO2
58,20565-8,"Carbon dioxide, total [Moles/volume] in Blood",TCO2


In [33]:
# Collect and tag rows whose description matches 'Carbon dioxide ['. Raw string
# avoids the invalid '\[' escape sequence; the pattern text is unchanged.
pco2_mask = find(r'Carbon dioxide \[', df.Desc)
pco2 = df.loc[pco2_mask, 'LOINC'].to_list()
df.loc[pco2_mask, 'Group'] = 'PCO2'

In [34]:
# Collect the LOINC codes of the 'Bicarbonate' rows and tag them as group 'HCO3'.
bicar_mask = find('Bicarbonate', df['Desc'])
bicar = df.loc[bicar_mask, 'LOINC'].to_list()
df.loc[bicar_mask, 'Group'] = 'HCO3'

In [35]:
# Collect the LOINC codes of the 'Alanine amino' rows and display them as a
# {LOINC: description} mapping. No Group is assigned in this cell.
alanine_rows = df[find('Alanine amino', df['Desc'])]
alanine = alanine_rows['LOINC'].to_list()
dict(zip(alanine_rows['LOINC'], alanine_rows['Desc']))

{}

In [36]:
# Collect the LOINC codes of rows matching '^Fibrinogen ' and tag them as group 'Fibrinogen'.
fibrinog_mask = find('^Fibrinogen ', df['Desc'])
fibrinog = df.loc[fibrinog_mask, 'LOINC'].to_list()
df.loc[fibrinog_mask, 'Group'] = 'Fibrinogen'

In [37]:
# Collect the LOINC codes of rows matching '^Erythrocytes' and tag them as group 'Erythrocytes'.
erythro_mask = find('^Erythrocytes', df['Desc'])
erythro = df.loc[erythro_mask, 'LOINC'].to_list()
df.loc[erythro_mask, 'Group'] = 'Erythrocytes'

In [38]:
# Collect the LOINC codes of the 'Lymphocytes' rows and tag them as group
# 'Lymphocytes'.
# Fix: the tag is written to the 'Group' column only. The previous
# `df[mask] = 'Lymphocytes'` assigned the label to every column of the
# matching rows, wiping out their LOINC and Desc values.
lympho_mask = find('Lymphocytes', df.Desc)
lympho = df.loc[lympho_mask, 'LOINC'].to_list()
df.loc[lympho_mask, 'Group'] = 'Lymphocytes'

In [39]:
# Collect the LOINC codes of the 'Neutrophils' rows and tag them as group
# 'Neutrophils'.
# Fix: the tag is written to the 'Group' column only. The previous
# `df[mask] = 'Neutrophils'` assigned the label to every column of the
# matching rows, wiping out their LOINC and Desc values.
neutrop_mask = find('Neutrophils', df.Desc)
neutrop = df.loc[neutrop_mask, 'LOINC'].to_list()
df.loc[neutrop_mask, 'Group'] = 'Neutrophils'

In [40]:
# Collect and tag rows whose description matches the literal text 'CD3+CD4+'.
# Raw string avoids the invalid '\+' escape sequence; the pattern text is unchanged.
cd3_mask = find(r'CD3\+CD4\+', df.Desc)
cd3 = df.loc[cd3_mask, 'LOINC'].to_list()
df.loc[cd3_mask, 'Group'] = 'T4 cells'

In [41]:
# Collect the LOINC codes of the 'Basophils' rows and tag them as group 'Basophils'.
basoph_mask = find('Basophils', df['Desc'])
basoph = df.loc[basoph_mask, 'LOINC'].to_list()
df.loc[basoph_mask, 'Group'] = 'Basophils'

In [42]:
# Collect the LOINC codes of the 'Eosinophils' rows and tag them as group 'Eosinophils'.
eosino_mask = find('Eosinophils', df['Desc'])
eosino = df.loc[eosino_mask, 'LOINC'].to_list()
df.loc[eosino_mask, 'Group'] = 'Eosinophils'

In [43]:
# Collect the LOINC codes of the 'Monocytes' rows and tag them as group 'Monocytes'.
monoc_mask = find('Monocytes', df['Desc'])
monoc = df.loc[monoc_mask, 'LOINC'].to_list()
df.loc[monoc_mask, 'Group'] = 'Monocytes'

In [44]:
# Collect the LOINC codes of the 'Magnesium' rows and tag them as group 'Magnesium'.
magnes_mask = find('Magnesium', df['Desc'])
magnes = df.loc[magnes_mask, 'LOINC'].to_list()
df.loc[magnes_mask, 'Group'] = 'Magnesium'

In [45]:
# Collect the LOINC codes of the 'Myelocytes' rows and tag them as group 'Myelocytes'.
myeloc_mask = find('Myelocytes', df['Desc'])
myeloc = df.loc[myeloc_mask, 'LOINC'].to_list()
df.loc[myeloc_mask, 'Group'] = 'Myelocytes'

In [46]:
# Collect and tag rows whose description matches 'Oxygen ['. Raw string avoids
# the invalid '\[' escape sequence; the pattern text is unchanged.
oxygen_pp_mask = find(r'Oxygen \[', df.Desc)
oxygen_pp = df.loc[oxygen_pp_mask, 'LOINC'].to_list()
df.loc[oxygen_pp_mask, 'Group'] = 'PaO2'

In [47]:
# Collect the LOINC codes of the 'Interleukin' rows and tag them as group 'ILs'.
interleu_mask = find('Interleukin', df['Desc'])
interleu = df.loc[interleu_mask, 'LOINC'].to_list()
df.loc[interleu_mask, 'Group'] = 'ILs'

In [48]:
# Collect the LOINC codes of the 'MCH' rows and tag them as group 'MCH'.
mch_mask = find('MCH', df['Desc'])
mch = df.loc[mch_mask, 'LOINC'].to_list()
df.loc[mch_mask, 'Group'] = 'MCH'

In [49]:
# Rebind `fibrin` to the LOINC codes of every row that matches 'Fibrin' but
# does not match '^Fibrinogen'.
mentions_fibrin = find('Fibrin', df.Desc)
starts_with_fibrinogen = find('^Fibrinogen', df.Desc)
fibrin = df.loc[mentions_fibrin & ~starts_with_fibrinogen, 'LOINC'].to_list()

# Only rows containing the literal text 'Fibrin+' receive this Group, which is
# a narrower match than the list above.
# Raw string avoids the invalid '\+' escape sequence; the pattern text is unchanged.
df.loc[find(r'Fibrin\+', df.Desc), 'Group'] = 'Fibrin/Fibrinogen Fragments'

In [50]:
# Collect the LOINC codes of the 'Thyroxine ' rows and tag them as group 'Thyroxine'.
thyroxine_mask = find('Thyroxine ', df['Desc'])
thyroxine = df.loc[thyroxine_mask, 'LOINC'].to_list()
df.loc[thyroxine_mask, 'Group'] = 'Thyroxine'

In [51]:
# Show the 'Albumin ' rows as a {LOINC: description} mapping.
albumin_rows = df[find('Albumin ', df['Desc'])]
dict(zip(albumin_rows['LOINC'], albumin_rows['Desc']))

{}

In [52]:
# Show the 'Hemoglobin ' rows as a {LOINC: description} mapping.
hemoglobin_rows = df[find('Hemoglobin ', df['Desc'])]
dict(zip(hemoglobin_rows['LOINC'], hemoglobin_rows['Desc']))

{}

In [53]:
# Show the 'Troponin ' rows as a {LOINC: description} mapping.
troponin_rows = df[find('Troponin ', df['Desc'])]
dict(zip(troponin_rows['LOINC'], troponin_rows['Desc']))

{}

In [54]:
# Rebind `erythro` to the LOINC codes of every row matching '^Erythrocyte'.
# NOTE(review): the list pattern is '^Erythrocyte' but the Group assignment
# below uses '^Erythrocytes' — confirm the two patterns are meant to differ.
erythrocyte_rows = df[find('^Erythrocyte', df['Desc'])]
erythro = erythrocyte_rows['LOINC'].to_list()
df.loc[find('^Erythrocytes', df['Desc']), 'Group'] = 'Erythrocytes'

In [55]:
# Collect the LOINC codes of rows matching '^Erythrocyte dis' and tag them as group 'RDW'.
erythro_dis_mask = find('^Erythrocyte dis', df['Desc'])
erythro_dis = df.loc[erythro_dis_mask, 'LOINC'].to_list()
df.loc[erythro_dis_mask, 'Group'] = 'RDW'

In [56]:
# Collect the LOINC codes of rows matching '^Prothrombin' and tag them as group 'PT'.
prothr_mask = find('^Prothrombin', df['Desc'])
prothr = df.loc[prothr_mask, 'LOINC'].to_list()
df.loc[prothr_mask, 'Group'] = 'PT'

In [57]:
# Collect the LOINC codes of rows matching '^MCV' and tag them as group 'MCV'.
mcv_mask = find('^MCV', df['Desc'])
mcv = df.loc[mcv_mask, 'LOINC'].to_list()
df.loc[mcv_mask, 'Group'] = 'MCV'

In [58]:
# Rebind `abo` to the LOINC codes of rows matching '^ABO' and tag those rows
# as group 'ABO antigens'.
abo_prefix_mask = find('^ABO', df['Desc'])
abo = df.loc[abo_prefix_mask, 'LOINC'].to_list()
df.loc[abo_prefix_mask, 'Group'] = 'ABO antigens'

In [59]:
# Show the rows matching '^Platelet' as a {LOINC: description} mapping.
platelet_rows = df[find('^Platelet', df['Desc'])]
dict(zip(platelet_rows['LOINC'], platelet_rows['Desc']))

{}

In [60]:
# Collect the LOINC codes of rows matching '^Coagulation ' and tag them as group 'Factor X'.
coagul_mask = find('^Coagulation ', df['Desc'])
coagul = df.loc[coagul_mask, 'LOINC'].to_list()
df.loc[coagul_mask, 'Group'] = 'Factor X'

In [61]:
# Collect the LOINC codes of rows matching '^Procalcitonin ' and tag them as group 'PCT'.
procal_mask = find('^Procalcitonin ', df['Desc'])
procal = df.loc[procal_mask, 'LOINC'].to_list()
df.loc[procal_mask, 'Group'] = 'PCT'

In [62]:
# Collect the LOINC codes of rows matching '^Alpha-1-fetoprotein' and tag them as group 'AFP'.
afp_mask = find('^Alpha-1-fetoprotein', df['Desc'])
afp = df.loc[afp_mask, 'LOINC'].to_list()
df.loc[afp_mask, 'Group'] = 'AFP'

In [63]:
# Print every unmatched LOINC code that none of the per-group lists picked up.
# The union is built once as a set, so each membership test is a hash lookup
# instead of re-concatenating and scanning all the lists on every iteration.
# Names that appeared more than once in the concatenation (`fibrin`,
# `erythro`, `abo`) are listed once; that does not change the union.
grouped_loincs = set().union(
    lead, rh, anion, lactate, ph, thyro, creat, dag, cytom,
    herpes, rubella, retic, abo, triio, fibrin, toxo, hiv,
    oxygen, co2, bicar, alanine, fibrinog, erythro, lympho, neutrop,
    cd3, basoph, eosino, monoc, magnes, myeloc, oxygen_pp, interleu,
    mch, thyroxine, prothr, mcv, coagul, procal,
    erythro_dis, afp, pco2,
)

for loinc in not_found:
    if loinc not in grouped_loincs:
        print(loinc, ':', not_found[loinc])

In [64]:
# not_found

In [65]:
# Reorder the grouped rows and rename their columns to the data guide's
# column names so the two frames can be stacked.
df_copy = df[['Group', 'Desc', 'LOINC']].rename(
    columns={
        'Group': 'Data element',
        'Desc': 'Possible values and units',
        'LOINC': 'loinc',
    }
)

In [66]:
# Replace each free-text guide entry with the configured description of the
# LOINC code already resolved for that row in the `loinc` column.
# The earlier form searched the text column it was overwriting, so running the
# cell a second time searched the replaced descriptions rather than the guide
# text. Looking the description up from `loinc` selects the same code (that
# column was produced by the same first-match search) and can be re-run safely.
data_guide['Possible values and units'] = data_guide['loinc'].map(
    lambda code: _config.loinc[code]
)

In [67]:
# Stack the data guide rows on top of the newly grouped rows.
# Each frame keeps its own row labels, so labels repeat in the result.
data_guide_exp = pd.concat(
    [data_guide, df_copy],
    axis=0,
    ignore_index=False,
)

In [68]:
data_guide_exp

Unnamed: 0,Data element,Possible values and units,loinc
0,AFP,Alpha-1-Fetoprotein [Mass/volume] in Serum or ...,1834-1
1,AFP,Alpha-1-Fetoprotein [Multiple of the median] a...,23811-3
2,AFP,Alpha-1-Fetoprotein [Multiple of the median] i...,20450-3
3,Albumin,Albumin [Mass/volume] in Serum or Plasma,1751-7
4,Albumin,Albumin [Mass/volume] in Serum or Plasma by Br...,61151-7
...,...,...,...
317,ABO antigens,ABO and Rh group [Type] in Capillary blood,884-7
318,D-Dimer,Fibrin D-dimer DDU [Mass/volume] in Blood by I...,91556-1
319,HSV,Herpes simplex virus IgG Ab [Units/volume] in ...,9422-7
320,Toxoplasma,Toxoplasma gondii IgG Ab [Units/volume] in Cer...,9741-0


In [69]:
# Comma-separated, single-quoted list of every LOINC code in the expanded guide.
quoted_codes = [f"'{code}'" for code in data_guide_exp['loinc']]
loincs = ','.join(quoted_codes)

In [75]:
# Fetch the distinct (concept_id, loinc) pairs from dbo.measurement whose
# measurement_source_value is one of the codes in `loincs`.
# NOTE(review): `loincs` is interpolated straight into the SQL text. The codes
# appear to come from the project config and data guide; if they can ever
# contain a quote character, switch to bound parameters.
with omop.engine.connect() as con:
    meas = pd.read_sql(f"""select distinct measurement_concept_id as concept_id, measurement_source_value as loinc
                           from dbo.measurement 
                           where measurement_source_value in ({loincs})""", con)

In [76]:
meas

Unnamed: 0,concept_id,loinc
0,3019897,788-0
1,3008652,23869-1
2,3034868,38180-6
3,3015632,2028-9
4,3002809,31112-6
...,...,...
341,3026361,26453-1
342,3016723,2160-0
343,3025267,7905-3
344,3043706,32717-1


In [77]:
data_guide_exp

Unnamed: 0,Data element,Possible values and units,loinc
0,AFP,Alpha-1-Fetoprotein [Mass/volume] in Serum or ...,1834-1
1,AFP,Alpha-1-Fetoprotein [Multiple of the median] a...,23811-3
2,AFP,Alpha-1-Fetoprotein [Multiple of the median] i...,20450-3
3,Albumin,Albumin [Mass/volume] in Serum or Plasma,1751-7
4,Albumin,Albumin [Mass/volume] in Serum or Plasma by Br...,61151-7
...,...,...,...
317,ABO antigens,ABO and Rh group [Type] in Capillary blood,884-7
318,D-Dimer,Fibrin D-dimer DDU [Mass/volume] in Blood by I...,91556-1
319,HSV,Herpes simplex virus IgG Ab [Units/volume] in ...,9422-7
320,Toxoplasma,Toxoplasma gondii IgG Ab [Units/volume] in Cer...,9741-0


In [108]:
# Keep only the codes present in both frames (default inner join on 'loinc').
# A code listed more than once in the guide yields one output row per listing.
rel_loinc = pd.merge(meas, data_guide_exp, on='loinc')
rel_loinc

Unnamed: 0,concept_id,loinc,Data element,Possible values and units
0,3019897,788-0,RDW,Erythrocyte distribution width [Ratio] by Auto...
1,3008652,23869-1,Hepatitis B Virus,Hepatitis B virus DNA [Mass/volume] (viral loa...
2,3034868,38180-6,Hepatitis C Virus,Hepatitis C virus RNA [log units/volume] (vira...
3,3015632,2028-9,TCO2,"Carbon dioxide, total [Moles/volume] in Serum ..."
4,3002809,31112-6,Reticulocytes,Reticulocytes/100 erythrocytes in Blood by Manual
...,...,...,...,...
348,3026361,26453-1,Erythrocytes,Erythrocytes [#/volume] in Blood
349,3016723,2160-0,Creatinine,Creatinine [Mass/volume] in Serum or Plasma
350,3025267,7905-3,Hepatitis B Virus,Hepatitis B virus surface Ag [Presence] in Ser...
351,3043706,32717-1,Sodium,Sodium [Moles/volume] in Arterial blood


In [113]:
def _describe_with_codes(row):
    """Return the row's description followed by its LOINC code and concept id."""
    return f"{row['Possible values and units']} (LOINC: {row['loinc']}, Concept ID: {row['concept_id']})"


# Overwrites the description column in place, so a second run appends the
# suffix again.
rel_loinc['Possible values and units'] = rel_loinc.apply(_describe_with_codes, axis=1)

In [116]:
# rel_loinc[['Data element', 'Possible values and units']].to_csv('labs_table.csv', index=False)