In [None]:
# Start a local Dask cluster and obtain a client connected to it.
# Note: every run of this cell constructs another LocalCluster instance.
from dask.distributed import LocalCluster
client = LocalCluster().get_client()
# Bare last expression so the notebook displays the client.
client

In [None]:
# imports
from dataclasses import dataclass, field, fields, InitVar

import json
import os

import pandas as pd
import dask.dataframe as dd 


In [None]:

# Define dataclasses that hold the database tables in memory.
# NOTE: this is only for development purposes; the data should be handled by a DB to overcome memory limitations.

@dataclass
class ClinicalTables:
    """Container with one optional table slot per clinical table.

    Every field defaults to ``None``. ``OMOP_data._import_csv_files`` fills a
    field from ``<field name>.csv`` when that file exists and can be read, so
    the field names double as the expected CSV base names.

    NOTE(review): the loader assigns the result of ``dd.read_csv`` (a Dask
    DataFrame) even though the fields are annotated as ``pd.DataFrame``.
    """
    person: pd.DataFrame | None = None
    observation_period: pd.DataFrame | None = None
    death: pd.DataFrame | None = None
    visit_occurrence: pd.DataFrame | None = None
    visit_detail: pd.DataFrame | None = None
    condition_occurrence: pd.DataFrame | None = None
    drug_exposure: pd.DataFrame | None = None
    procedure_occurrence: pd.DataFrame | None = None
    device_exposure:pd.DataFrame | None = None
    measurement: pd.DataFrame | None = None
    observation: pd.DataFrame | None = None
    note: pd.DataFrame | None = None
    note_nlp: pd.DataFrame | None = None
    specimen: pd.DataFrame | None = None
    fact_relationship: pd.DataFrame | None = None
    
@dataclass
class HealthSystemTables:
    """Container with one optional table slot per health-system table.

    Every field defaults to ``None``; ``OMOP_data._import_csv_files`` fills a
    field from ``<field name>.csv`` when that file exists and can be read.
    """
    provider: pd.DataFrame | None = None
    care_site: pd.DataFrame | None = None
    location: pd.DataFrame | None = None
    
@dataclass
class HealthEconomicsTabels:
    cost: pd.DataFrame | None = None
    payer_plan_period: pd.DataFrame | None = None
    
@dataclass
class StandartizedDerivedElementsTables:
    condition_era: pd.DataFrame | None = None
    drug_era: pd.DataFrame | None = None
    dose_era: pd.DataFrame | None = None
    episode: pd.DataFrame | None = None
    episode_event: pd.DataFrame | None = None
    cohort: pd.DataFrame | None = None
    cohort_definition: pd.DataFrame | None = None

@dataclass
class MetadataTables:
    """Container with one optional table slot per metadata table.

    Every field defaults to ``None``; ``OMOP_data._import_csv_files`` fills a
    field from ``<field name>.csv`` when that file exists and can be read.
    """
    metadata: pd.DataFrame | None = None
    cdm_source: pd.DataFrame | None = None
    
@dataclass
class VocabularyTables:
    """Container with one optional table slot per vocabulary table.

    Every field defaults to ``None``; ``OMOP_data._import_csv_files`` fills a
    field from ``<field name>.csv`` when that file exists and can be read.
    """
    concept: pd.DataFrame | None = None
    concept_class: pd.DataFrame | None = None
    vocabulary: pd.DataFrame | None = None
    source_to_concept_map: pd.DataFrame | None = None
    domain: pd.DataFrame | None = None
    concept_synonym: pd.DataFrame | None = None
    concept_relationship: pd.DataFrame | None = None
    relationship: pd.DataFrame | None = None
    drug_strength: pd.DataFrame | None = None

@dataclass
class OMOP_data:
    """Holder for all table groups, optionally populated from a folder of CSV files.

    Init-only parameters:
        csv_data_path: Folder searched for one ``<table name>.csv`` file per
            table. When ``None`` (the default) nothing is ingested and every
            table keeps its ``None`` default.
        tables_structure: Mapping ``table name -> spec``. A spec may provide
            ``'column_list'``, ``'dtype_dict'`` and ``'parse_dates'``, which
            are forwarded to ``dd.read_csv`` as ``usecols``, ``dtype`` and
            ``parse_dates``. Tables without a spec are reported and skipped.

    Both values are also stored on the instance as plain attributes. The
    remaining fields are the table-group containers, created empty by default.
    """
    csv_data_path: InitVar[str | None] = None
    tables_structure: InitVar[dict | None] = None
    clinical_tables: ClinicalTables = field(default_factory=ClinicalTables)
    health_system_tables: HealthSystemTables = field(default_factory=HealthSystemTables)
    health_economics_tables: HealthEconomicsTabels = field(default_factory=HealthEconomicsTabels)
    standartized_derived_elements_tables: StandartizedDerivedElementsTables = field(default_factory=StandartizedDerivedElementsTables)
    metadata_tables: MetadataTables = field(default_factory=MetadataTables)
    vocabulary_tables: VocabularyTables = field(default_factory=VocabularyTables)

    def __post_init__(self, csv_data_path: str | None, tables_structure: dict | None):
        """Keep the init-only arguments and ingest the CSV files if a folder was given."""
        self.csv_data_path = csv_data_path
        self.tables_structure = tables_structure
        # All arguments are optional, so ``OMOP_data()`` has to yield an empty
        # container rather than fail inside ``os.path.join(None, ...)``.
        if csv_data_path is not None:
            self._import_csv_files(tables_structure)

    def _import_csv_files(self, tables_structure: dict | None):
        """Read ``<table>.csv`` for every table of every group and store the result.

        Progress and failures are printed. A table is left untouched when its
        CSV file is missing, when ``tables_structure`` has no entry for it, or
        when reading the file raises.

        Args:
            tables_structure: Mapping ``table name -> spec``; ``None`` is
                treated as an empty mapping.
        """
        specs = tables_structure or {}
        # ``group`` rather than ``field`` so the ``dataclasses.field`` import is not shadowed.
        for group in fields(self):
            print("Ingesting", group.name + ":")
            for table in fields(group.type):
                print("Ingesting table", table.name + ".")
                file_path = os.path.join(self.csv_data_path, table.name + '.csv')
                if not os.path.isfile(file_path):
                    print(f"Unable to ingest {group.name}.{table.name} as there is not corresponding {table.name}.csv file.")
                    continue

                spec = specs.get(table.name)
                if spec is None:
                    # Report the real cause instead of an AttributeError on
                    # ``None.get`` being mislabelled as a malformed CSV file.
                    print(f"Unable to ingest {group.name}.{table.name} as tables_structure has no entry for {table.name}.")
                    continue

                try:
                    df_table = dd.read_csv(
                        file_path,
                        usecols=spec.get('column_list'),
                        dtype=spec.get('dtype_dict'),
                        parse_dates=spec.get('parse_dates'),
                    )
                    setattr(getattr(self, group.name), table.name, df_table)
                    print('Ingesting file', file_path, "was successful.")
                except Exception as e:
                    # Deliberate best effort: report the problem and move on to the next table.
                    print(e)
                    print(f"Unable to ingest {group.name}.{table.name} given {table.name}.csv file is off standards.")

            print("\n*****\n")

In [None]:
# Both settings come from the environment; each one is None when its variable is unset.
CSV_FOLDER_PATH = os.environ.get('CSV_FOLDER_PATH')
TABLE_STRUCTURE_JSON = os.environ.get('TABLE_STRUCTURE_JSON')

In [None]:
def load_pandas_db(csv_folder_path: str | None = CSV_FOLDER_PATH, tables_structure_json: str | None = TABLE_STRUCTURE_JSON) -> OMOP_data:
    """Load the table-structure JSON file and build an ``OMOP_data`` from a CSV folder.

    The defaults are bound when this definition is executed, so they reflect
    ``CSV_FOLDER_PATH`` / ``TABLE_STRUCTURE_JSON`` as they were at that moment.

    Args:
        csv_folder_path: Folder holding the ``<table>.csv`` files.
        tables_structure_json: Path to the JSON file describing the tables
            (a path, not the parsed dictionary).

    Returns:
        The ``OMOP_data`` instance created from both inputs.

    Raises:
        ValueError: If either argument is ``None``, e.g. because the
            environment variable behind its default is not set.
    """
    if csv_folder_path is None:
        raise ValueError(
            "csv_folder_path is not set: pass it explicitly or define the "
            "CSV_FOLDER_PATH environment variable."
        )
    if tables_structure_json is None:
        raise ValueError(
            "tables_structure_json is not set: pass it explicitly or define the "
            "TABLE_STRUCTURE_JSON environment variable."
        )

    # Explicit UTF-8 so the file is decoded the same way on every platform.
    with open(tables_structure_json, 'r', encoding='utf-8') as f:
        tables_structure = json.load(f)
    return OMOP_data(csv_data_path=csv_folder_path, tables_structure=tables_structure)

In [None]:
# Build the table holder with the default paths (taken from the environment
# variables read above); ingestion progress is printed per table.
omop_db = load_pandas_db()

In [None]:
# Preview the first rows of visit_occurrence. The attribute is still None when
# the table could not be ingested, in which case this raises AttributeError.
omop_db.clinical_tables.visit_occurrence.head()

In [None]:
# Keep all parsed time fields of both tables: every condition row gets the
# dates of its visit, looked up through visit_occurrence_id.
visit_condition_df = omop_db.clinical_tables.condition_occurrence[
    [
        'condition_source_value',
        'visit_occurrence_id',
        'condition_end_date',
        'condition_end_datetime',
        'condition_start_date',
        'condition_start_datetime',
    ]
].join(
    omop_db.clinical_tables.visit_occurrence[
        [
            'visit_occurrence_id',
            'visit_start_date',
            'visit_end_date',
            'visit_end_datetime',
            'visit_start_datetime',
        ]
    ].set_index('visit_occurrence_id'),
    on='visit_occurrence_id',
    rsuffix='_visit',
)

In [None]:
# Inspect the task graph attached to the joined frame (nothing has been computed yet).
visit_condition_df.dask

In [None]:
# Machine-specific Graphviz install location (Windows); adjust for other setups.
GRAPHVIZ_BIN_DIR = "C:/Program Files/Graphviz/bin/"

# Append the directory only once so that re-running this cell does not keep
# growing PATH with duplicate entries.
_current_path = os.environ.get("PATH", "")
if f"{os.pathsep}{GRAPHVIZ_BIN_DIR}{os.pathsep}" not in f"{os.pathsep}{_current_path}{os.pathsep}":
    os.environ["PATH"] = _current_path + os.pathsep + GRAPHVIZ_BIN_DIR

In [None]:
# Draw the task graph of the join. Presumably relies on the Graphviz
# executables added to PATH in the previous cell — TODO confirm.
visit_condition_df.visualize()

In [None]:
# Load the table-structure description for interactive inspection.
# Explicit UTF-8 so the file is decoded the same way on every platform.
with open(TABLE_STRUCTURE_JSON, 'r', encoding='utf-8') as f:
    tables_structure = json.load(f)

In [None]:
# Display the whole parsed structure mapping (can be a long output).
tables_structure

In [None]:
# Show only the entry for the drug_exposure table.
tables_structure['drug_exposure']

In [None]:
# Drop the unneeded time columns while the frame is still lazy, so only the
# remaining columns are materialised by compute(); the resulting frame is the
# same as computing first and dropping afterwards.
vc_df = visit_condition_df.drop(
    columns=[
        'condition_end_date',
        'condition_end_datetime',
        'condition_start_date',
        'condition_start_datetime',
        'visit_end_date',
        'visit_end_datetime',
        'visit_start_datetime',
    ]
).compute()

In [None]:
# Check the column dtypes of the computed frame.
vc_df.dtypes