In [2]:
# imports
from dataclasses import (dataclass, 
                         is_dataclass, 
                         field, 
                         asdict, 
                         fields, 
                         InitVar)
import os
import json

from pprint import pprint
import pandas as pd


In [3]:
# defining a nested_dataclass annotation (check https://www.geeksforgeeks.org/creating-nested-dataclass-objects-in-python/)

# decorator to wrap original __init__ 
def nested_dataclass(*args, **kwargs):
    """Class decorator: like ``@dataclass``, but nested dataclass fields accept dicts.

    The decorated class is first turned into a regular dataclass. Its generated
    ``__init__`` is then wrapped so that any keyword argument whose annotated
    type is a dataclass and whose value is a ``dict`` is replaced by an instance
    of that dataclass built from the dict (``FieldType(**value)``).

    Can be used bare (``@nested_dataclass``) or with dataclass options
    (``@nested_dataclass(frozen=True)``); the options are forwarded to
    ``dataclass``.

    Only keyword arguments are converted; positional arguments are passed
    through unchanged.
    """

    def wrapper(check_class):
        # Build the plain dataclass first, forwarding the decorator options.
        check_class = dataclass(check_class, **kwargs)
        o_init = check_class.__init__

        def __init__(self, *args, **kwargs):
            # Iterate over a snapshot because values are replaced in ``kwargs``.
            for name, value in list(kwargs.items()):
                # Annotated type of the field, or None for unknown names.
                ft = check_class.__annotations__.get(name, None)
                if is_dataclass(ft) and isinstance(value, dict):
                    kwargs[name] = ft(**value)
            # Call the generated __init__ exactly once, after all conversions.
            # (It used to be called inside the loop: once per keyword argument,
            # and not at all when no keyword argument was given.)
            o_init(self, *args, **kwargs)

        check_class.__init__ = __init__
        return check_class

    # Bare usage passes the class as the single positional argument.
    return wrapper(args[0]) if args else wrapper

In [4]:
# Lookup table: SQL column type token -> pandas dtype name.
# Keys are grouped by the dtype they share; the lookup is case-sensitive.
sql_to_pandas_type_mapping = {
    **dict.fromkeys(('integer', 'Integer'), 'int64'),
    **dict.fromkeys(('varchar', 'Varchar', 'TIMESTAMP', 'date'), 'string'),
    'NUMERIC': 'float64',
}

In [6]:
sql_dll_file = "./data/OMOP_PSQL_DDL/5.3/OMOPCDM_postgresql_5.3_ddl.sql"

# tables_structure maps a lower-cased table name to a dict with three entries:
#   'column_list' - column names in DDL order
#   'dtype_dict'  - column name -> pandas dtype name (None for unmapped SQL types)
#   'parse_dates' - names of the timestamp/date columns
tables_structure = {}
# NOTE(review): 'int64' is the NumPy integer dtype, which cannot hold missing
# values; generate_OMOP_table_structure further down uses the nullable 'Int64'.
# Kept as-is here because the recorded output of this notebook shows 'int64' -
# confirm which one is intended.
sql_to_pandas_type_mapping = {
    'integer': 'int64',
    'Integer': 'int64',
    'varchar': 'string',
    'Varchar': 'string',
    'TIMESTAMP': 'string',
    'date': 'string',
    'NUMERIC': 'float64',    
}

with open(sql_dll_file, 'r') as f:
    # Iterate the file lazily instead of materialising every line first.
    for line in f:
        # Skip SQL comment lines and blank lines.
        if line[:2] == "--" or line.strip() == "":
            continue
        word_list = line.split()
        if word_list[0]=='CREATE' and word_list[1]=='TABLE':
            # A new table starts: reset the per-table accumulators.
            column_list = []
            dtype_dict = {}
            parse_dates_column_list= []
            keyword = word_list[2].lower()
            # Drop a schema prefix ("schema.table" -> "table"), as
            # generate_OMOP_table_structure does, so both produce the same keys.
            if '.' in keyword:
                keyword = keyword.split('.')[1]
        else:
            # Column line: first token is the name, second the SQL type.
            col_name = word_list[0].strip('"')
            column_list.append(col_name)
            sql_type = word_list[1]
            # Collapse sized variants such as varchar(50) to plain 'varchar'.
            if 'varchar' in sql_type.lower():
                sql_type = 'varchar'
            if sql_type.lower() in ['timestamp', 'date']:
                parse_dates_column_list.append(col_name)
            dtype_dict.update({col_name:sql_to_pandas_type_mapping.get(sql_type)})
        # A trailing ');' token closes the table definition: register it.
        if word_list[-1]==');':
            tables_structure.update(
                {keyword: {
                    'column_list': column_list,
                    'dtype_dict': dtype_dict,
                    'parse_dates': parse_dates_column_list
                    }
                }
            )

In [7]:
#retrieve table structure (columns and dtypes) from DDL files

def generate_OMOP_table_structure(sql_dll_file):
    """Parse a SQL DDL file into a per-table description of columns and dtypes.

    The parser is line oriented. A ``CREATE TABLE <name>`` line opens a table;
    each following line is read as ``<column name> <sql type> ...`` until a
    line ending in ``);`` closes the table. Comment lines (``--``) and blank
    lines are ignored, as are lines outside any table definition.

    Args:
        sql_dll_file: Path of the DDL (``.sql``) file to read.

    Returns:
        dict mapping the lower-cased table name (schema prefix removed) to a
        dict with the keys:

        * ``'column_list'``: column names in file order,
        * ``'dtype_dict'``: column name -> pandas dtype name, or ``None`` when
          the SQL type has no entry in the mapping,
        * ``'parse_dates'``: names of the ``timestamp`` / ``date`` columns.
    """
    # Keys are lower case; SQL type tokens are lower-cased before the lookup so
    # that e.g. INTEGER, Integer and integer all resolve (date detection below
    # was already case-insensitive).
    sql_to_pandas_type_mapping = {
        'integer': 'Int64',
        'varchar': 'string',
        'timestamp': 'string',
        'date': 'string',
        'numeric': 'float64',
    }
    tables_structure = {}

    keyword = None
    column_list, dtype_dict, parse_dates_column_list = [], {}, []
    # True only between a CREATE TABLE line and its closing ');'. Prevents
    # stray lines from being appended to an already registered table and
    # avoids using the accumulators before any table was opened.
    inside_table = False

    with open(sql_dll_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith("--"):
                continue
            word_list = stripped.split()

            starts_table = (
                len(word_list) >= 3
                and [word.upper() for word in word_list[:2]] == ['CREATE', 'TABLE']
            )
            if starts_table:
                column_list = []
                dtype_dict = {}
                parse_dates_column_list = []
                keyword = word_list[2].lower()
                # "schema.table" -> "table"
                if '.' in keyword:
                    keyword = keyword.split('.')[1]
                inside_table = True
            elif inside_table and len(word_list) >= 2:
                # Column line: <name> <type> [...]. Lines with a single token
                # (e.g. a standalone ');') carry no column and are skipped.
                col_name = word_list[0].strip('"')
                column_list.append(col_name)
                sql_type = word_list[1].lower()
                # Collapse sized variants such as varchar(50).
                if 'varchar' in sql_type:
                    sql_type = 'varchar'
                if sql_type in ('timestamp', 'date'):
                    parse_dates_column_list.append(col_name)
                dtype_dict[col_name] = sql_to_pandas_type_mapping.get(sql_type)

            # ');' at the end of the line closes the current table.
            if inside_table and stripped.endswith(');'):
                tables_structure[keyword] = {
                    'column_list': column_list,
                    'dtype_dict': dtype_dict,
                    'parse_dates': parse_dates_column_list,
                }
                inside_table = False
    return tables_structure

In [8]:
sql_dll_file_list = [
    "data/OMOP_PSQL_DDL/5.3/OMOPCDM_postgresql_5.3_ddl.sql",
    "data/OMOP_PSQL_DDL/5.4.1/OMOPCDM_postgresql_5.4.1_ddl.sql"
    ]

output_folder_path = "data/OMOP_structure_and_types"
# One output file per DDL file above, paired by position.
output_filenames = ["OMOPCDM_5.3_structure.json", "OMOPCDM_5.4.1_structure.json"]

# Create the destination folder if needed so the cell works on a fresh checkout.
os.makedirs(output_folder_path, exist_ok=True)

# Loop variables are deliberately not called `input`/`output`: at cell level
# `input` would shadow the builtin for the rest of the notebook.
for ddl_path, output_filename in zip(sql_dll_file_list, output_filenames):
    structure_dict = generate_OMOP_table_structure(ddl_path)
    with open(os.path.join(output_folder_path, output_filename), 'w') as f:
        json.dump(structure_dict, f)
    

In [9]:
# Inspect the parsed entry for the 'metadata' table (columns, dtypes, date columns).
tables_structure['metadata']

{'column_list': ['metadata_concept_id',
  'metadata_type_concept_id',
  'name',
  'value_as_string',
  'value_as_concept_id',
  'metadata_date',
  'metadata_datetime'],
 'dtype_dict': {'metadata_concept_id': 'int64',
  'metadata_type_concept_id': 'int64',
  'name': 'string',
  'value_as_string': 'string',
  'value_as_concept_id': 'int64',
  'metadata_date': 'string',
  'metadata_datetime': 'string'},
 'parse_dates': ['metadata_date', 'metadata_datetime']}

In [10]:
# define a dataclass to hold the db in memory
# NOTE: this is solely for development purposes; the data should be handled by a DB to overcome memory limitations

@dataclass
class ClinicalTables:
    """Group of clinical tables, one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    The attribute names are also used as table names by the code that walks
    this class with ``dataclasses.fields``.
    """
    person: pd.DataFrame | None = None
    observation_period: pd.DataFrame | None = None
    death: pd.DataFrame | None = None
    visit_occurrence: pd.DataFrame | None = None
    visit_detail: pd.DataFrame | None = None
    condition_occurrence: pd.DataFrame | None = None
    drug_exposure: pd.DataFrame | None = None
    procedure_occurrence: pd.DataFrame | None = None
    device_exposure:pd.DataFrame | None = None
    measurement: pd.DataFrame | None = None
    observation: pd.DataFrame | None = None
    note: pd.DataFrame | None = None
    note_nlp: pd.DataFrame | None = None
    specimen: pd.DataFrame | None = None
    fact_relationship: pd.DataFrame | None = None
    
@dataclass 
class HealthSystemTables:
    """Group of health-system tables, one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    """
    provider: pd.DataFrame | None = None
    care_site: pd.DataFrame | None = None
    location: pd.DataFrame | None = None
    
@dataclass
class HealthEconomicsTabels:
    """Group of health-economics tables, one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    """
    # NOTE(review): the class name is misspelled ("Tabels"); it is kept because
    # other code refers to the class by this name.
    cost: pd.DataFrame | None = None
    payer_plan_period: pd.DataFrame | None = None
    
@dataclass
class StandartizedDerivedElementsTables:
    """Group of derived-element tables (eras, episodes, cohorts), one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    """
    # NOTE(review): "Standartized" is a misspelling of "Standardized"; the name
    # is kept because other code refers to the class by this name.
    condition_era: pd.DataFrame | None = None
    drug_era: pd.DataFrame | None = None
    dose_era: pd.DataFrame | None = None
    episode: pd.DataFrame | None = None
    episode_event: pd.DataFrame | None = None
    cohort: pd.DataFrame | None = None
    cohort_definition: pd.DataFrame | None = None

@dataclass
class MetadataTables:
    """Group of metadata tables, one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    """
    metadata: pd.DataFrame | None = None
    cdm_source: pd.DataFrame | None = None
    
@dataclass
class VocabularyTables:
    """Group of vocabulary tables, one attribute per table.

    Each attribute holds a ``pandas.DataFrame`` or ``None`` (the default).
    """
    concept: pd.DataFrame | None = None
    concept_class: pd.DataFrame | None = None
    vocabulary: pd.DataFrame | None = None
    source_to_concept_map: pd.DataFrame | None = None
    domain: pd.DataFrame | None = None
    concept_synonym: pd.DataFrame | None = None
    concept_relationship: pd.DataFrame | None = None
    relationship: pd.DataFrame | None = None
    drug_strength: pd.DataFrame | None = None

@dataclass
class OMOP_data:
    """In-memory holder for the table groups, optionally filled from CSV files.

    Every field is a table-group dataclass created empty by default. When
    ``csv_data_path`` is given, each table ``<name>`` of each group is read
    from ``<csv_data_path>/<name>.csv`` with ``pandas.read_csv`` and stored on
    the corresponding group attribute. Tables that cannot be loaded stay
    ``None``; progress and failures are reported with ``print``.

    Init-only arguments:
        csv_data_path: Folder containing the ``<table>.csv`` files. When
            ``None`` (default) nothing is loaded.
        tables_structure: Mapping table name -> dict with ``'column_list'``,
            ``'dtype_dict'`` and ``'parse_dates'`` entries, forwarded to
            ``read_csv`` as ``usecols`` / ``dtype`` / ``parse_dates``. When
            ``None`` (default) the module-level ``tables_structure`` variable
            is used, which is what this class relied on implicitly before.
    """
    clinical_tables: ClinicalTables = field(default_factory=ClinicalTables)
    health_system_tables: HealthSystemTables = field(default_factory=HealthSystemTables)
    health_economics_tables: HealthEconomicsTabels = field(default_factory=HealthEconomicsTabels)
    standartized_derived_elements_tables: StandartizedDerivedElementsTables = field(default_factory=StandartizedDerivedElementsTables)
    metadata_tables: MetadataTables = field(default_factory=MetadataTables)
    vocabulary_tables: VocabularyTables = field(default_factory=VocabularyTables)
    csv_data_path: InitVar[str | None] = None
    tables_structure: InitVar[dict | None] = None

    def __post_init__(self, csv_data_path=None, tables_structure=None):
        """Store the init-only arguments and load the CSV files if a path was given."""
        self.csv_data_path = csv_data_path
        if tables_structure is None:
            # Backward compatible fallback: the structure used to be read from
            # a global of the same name.
            tables_structure = globals().get('tables_structure') or {}
        self._tables_structure = tables_structure
        # Without a folder there is nothing to read (os.path.join(None, ...)
        # would raise), so leave every table as None.
        if csv_data_path is not None:
            self._import_csv_files()

    def _import_csv_files(self):
        """Read ``<table>.csv`` for every table of every group into a DataFrame."""
        # `group` / `table` are dataclasses.Field objects; `group.type` is the
        # table-group dataclass whose own fields name the tables.
        for group in fields(self):
            print ("Ingesting", group.name+":")
            for table in fields(group.type):
                print("Ingesting table", table.name+".")
                file_path = os.path.join(self.csv_data_path, table.name+'.csv')
                if not os.path.isfile(file_path):
                    print(f"Unable to ingest {group.name}.{table.name} as there is not corresponding {table.name}.csv file.")
                    continue

                structure = self._tables_structure.get(table.name)
                if structure is None:
                    # Previously this surfaced as a misleading "off standards" message.
                    print(f"Unable to ingest {group.name}.{table.name} as no structure is defined for table {table.name}.")
                    continue

                try:
                    df_table = pd.read_csv(file_path,
                                           usecols=structure.get('column_list'),
                                           dtype=structure.get('dtype_dict'),
                                           parse_dates=structure.get('parse_dates')
                                           )
                except Exception as exc:
                    # Best effort: keep going with the other tables, but report
                    # why this one failed instead of hiding the cause.
                    print(f"Unable to ingest {group.name}.{table.name} given {table.name}.csv file is off standards.")
                    print(f"  Reason: {type(exc).__name__}: {exc}")
                else:
                    setattr(getattr(self, group.name), table.name, df_table)
                    print('Ingesting file', file_path, "was successful.")

            print ("\n*****\n")

In [11]:
# Folder holding one <table>.csv file per table.
csv_data_path = './data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv'
# Constructing OMOP_data with a path triggers the CSV ingestion, which prints its progress.
data = OMOP_data(csv_data_path=csv_data_path)

Ingesting clinical_tables:
Ingesting table person.
Unable to ingest clinical_tables.person given person.csv file is off standards.
Ingesting table observation_period.
Ingesting file ./data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv/observation_period.csv was successful.
Ingesting table death.
Ingesting file ./data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv/death.csv was successful.
Ingesting table visit_occurrence.
Unable to ingest clinical_tables.visit_occurrence given visit_occurrence.csv file is off standards.
Ingesting table visit_detail.
Unable to ingest clinical_tables.visit_detail given visit_detail.csv file is off standards.
Ingesting table condition_occurrence.
Unable to ingest clinical_tables.condition_occurrence given condition_occurrence.csv file is off standards.
Ingesting table drug_exposure.
Unable to ingest clinical_tables.drug_exposure given drug_exposure.csv file is off standards.
Ingesting table procedure_occurrence

In [12]:
# NOTE(review): this import rebinds OMOP_data, shadowing the class of the same
# name defined earlier in this notebook. The imported version is called with an
# explicit tables_structure argument.
from src.inout.load_omop import OMOP_data
csv_data_path = './data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv'

# Uses the tables_structure dict built earlier in the notebook.
memory_db = OMOP_data(csv_data_path=csv_data_path,tables_structure=tables_structure)

Ingesting clinical_tables:
Ingesting table person.
Unable to ingest clinical_tables.person given person.csv file is off standards.
Ingesting table observation_period.
Ingesting file ./data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv/observation_period.csv was successful.
Ingesting table death.
Ingesting file ./data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv/death.csv was successful.
Ingesting table visit_occurrence.
Unable to ingest clinical_tables.visit_occurrence given visit_occurrence.csv file is off standards.
Ingesting table visit_detail.
Unable to ingest clinical_tables.visit_detail given visit_detail.csv file is off standards.
Ingesting table condition_occurrence.
Unable to ingest clinical_tables.condition_occurrence given condition_occurrence.csv file is off standards.
Ingesting table drug_exposure.
Unable to ingest clinical_tables.drug_exposure given drug_exposure.csv file is off standards.
Ingesting table procedure_occurrence

In [None]:
# Print the column dtypes of every clinical table, or "Not Loaded!" for tables
# that are still None.
# fields() + getattr is used instead of asdict(): asdict() deep-copies every
# value, i.e. it would duplicate each DataFrame just to iterate over them.
clinical_tables = memory_db.clinical_tables
for table_field in fields(clinical_tables):
    key = table_field.name
    dataframe = getattr(clinical_tables, key)
    if dataframe is not None:
        print("***", key,"***\n", dataframe.dtypes,"\n\n")
    else:
        print("***", key,"***\n", "Not Loaded!","\n\n")
        

*** person ***
 person_id                       Int64
gender_concept_id               Int64
year_of_birth                   Int64
month_of_birth                  Int64
day_of_birth                    Int64
birth_datetime                 object
race_concept_id                 Int64
ethnicity_concept_id            Int64
location_id                     Int64
provider_id                     Int64
care_site_id                    Int64
person_source_value            string
gender_source_value            string
gender_source_concept_id        Int64
race_source_value              string
race_source_concept_id          Int64
ethnicity_source_value         string
ethnicity_source_concept_id     Int64
dtype: object 


*** observation_period ***
 observation_period_id                     Int64
person_id                                 Int64
observation_period_start_date    datetime64[ns]
observation_period_end_date      datetime64[ns]
period_type_concept_id                    Int64
dtype: object 


In [None]:
# Display the visit_occurrence table; shows nothing if it is still None (not loaded).
memory_db.clinical_tables.visit_occurrence

In [None]:
# Class featurizer