In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import os
#os.chdir('data/coherent-11-07-2022/csv/')

In [None]:
# Collect the name of every CSV file in the current working directory
csv_list = list(filter(lambda name: name.endswith('.csv'), os.listdir('.')))
csv_list

In [None]:
# Files to read in: comment a line out to skip that table, uncomment it to load it
df_csv = [
    # 'allergies.csv',
    # 'careplans.csv',
    'conditions.csv',
    'devices.csv',
    'encounters.csv',
    # 'imaging_studies.csv',
    # 'immunizations.csv',
    'medications.csv',
    'observations.csv',
    # 'organizations.csv',
    'patients.csv',
    # 'payers.csv',
    # 'payer_transitions.csv',
    'procedures.csv',
    'providers.csv',
    # 'supplies.csv',
]


In [None]:
# Load every selected CSV into a DataFrame, keyed by the file name up to its first '.'
d = {csv_name.split('.')[0]: pd.read_csv(csv_name) for csv_name in df_csv}

# Row count of each loaded table, for reference before joining
df_lengths = pd.DataFrame({
    'df': list(d),
    'df_length': [len(frame) for frame in d.values()],
})
df_lengths

In [None]:
# Expose the patient identifier as 'PATIENT', the key used to merge with encounters
d['patients'].rename({'Id': 'PATIENT'}, axis='columns', inplace=True)
d['patients'].columns

In [None]:
# Give the encounter columns table-specific names before joining:
# 'Id' becomes the 'ENCOUNTER' key, and the generic START/STOP/CODE/DESCRIPTION/REASON*
# columns get an 'Encounter_' prefix.
d['encounters'].rename(
    columns={
        'Id': 'ENCOUNTER',
        'START': 'Encounter_start',
        'STOP': 'Encounter_stop',
        'REASONDESCRIPTION': 'Encounter_reason',
        'REASONCODE': 'Encounter_reasoncode',
        'CODE': 'Encounter_classcode',
        'DESCRIPTION': 'Encounter_type',
    },
    inplace=True,
)
d['encounters'].columns

In [None]:
# Prefix the generic condition columns with 'CONDITION_' ahead of the join
d['conditions'].rename(
    columns={
        'START': 'CONDITION_START',
        'STOP': 'CONDITION_STOP',
        'CODE': 'CONDITION_CODE',
        'DESCRIPTION': 'CONDITION_DESCRIPTION',
    },
    inplace=True,
)

# Working copy without 'PATIENT'; d['conditions'] itself keeps that column
conditions = pd.DataFrame(d['conditions']).drop(columns=['PATIENT'])

In [None]:
# Outer-join 'patients' with 'encounters' on the shared 'PATIENT' column
large_df = pd.merge(d['patients'], d['encounters'], how='outer', on='PATIENT')
large_df.columns

In [None]:
# Remove the columns that are not needed downstream (modifies large_df in place)
large_df.drop(
    columns=[
        'SSN', 'DRIVERS', 'PASSPORT',
        'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN',
        'ADDRESS', 'COUNTY', 'ZIP', 'LAT', 'LON',
        'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE',
        'BASE_ENCOUNTER_COST', 'TOTAL_CLAIM_COST', 'PAYER_COVERAGE',
    ],
    inplace=True,
)
large_df.columns

In [None]:
# Outer-join the running frame with 'conditions' on the 'ENCOUNTER' column
large_df = pd.merge(large_df, conditions, how='outer', on='ENCOUNTER')

In [None]:
# Prefix the generic observation columns with 'OBSERVATION_' ahead of the join
d['observations'].rename(
    columns={
        'DATE': 'OBSERVATION_DATE',
        'CODE': 'OBSERVATION_CODE',
        'DESCRIPTION': 'OBSERVATION_DESCRIPTION',
    },
    inplace=True,
)

# Working copy without 'PATIENT'; d['observations'] itself keeps that column
observations = pd.DataFrame(d['observations']).drop(columns=['PATIENT'])

In [None]:
# Outer-join the running frame with 'observations' on the 'ENCOUNTER' column
large_df = pd.merge(large_df, observations, how='outer', on='ENCOUNTER')
large_df.columns

In [None]:
# Give the generic medication columns 'MED_'-style names ahead of the join
d['medications'].rename(
    columns={
        'START': 'MED_START',
        'STOP': 'MED_STOP',
        'CODE': 'MED_CODE',
        'DESCRIPTION': 'MEDICATION',
    },
    inplace=True,
)

# Working copy without the patient key and the payer/cost/reason columns;
# d['medications'] itself keeps them
medications = pd.DataFrame(d['medications']).drop(
    columns=[
        'PATIENT', 'PAYER', 'BASE_COST', 'PAYER_COVERAGE',
        'DISPENSES', 'TOTALCOST', 'REASONCODE', 'REASONDESCRIPTION',
    ]
)

In [None]:
# Outer-join the running frame with 'medications' on the 'ENCOUNTER' column
large_df = pd.merge(large_df, medications, how='outer', on='ENCOUNTER')
large_df.columns

In [None]:
# Date-based columns of large_df to convert to datetime.
# 'CONDITION_STOP' must match the name given to the conditions 'STOP' column when it
# was renamed; the previous spelling 'CONDIITON_STOP' raised a KeyError in the loop.
date_columns = ['BIRTHDATE', 'DEATHDATE', 'OBSERVATION_DATE', 'MED_START', 'MED_STOP',
                'Encounter_start', 'Encounter_stop', 'CONDITION_START', 'CONDITION_STOP']

# Parse each column, reduce it to its calendar date via .dt.date, then cast back to
# datetime64[ns] so the column keeps a datetime dtype with the time-of-day removed.
for col in date_columns:
    large_df[col] = pd.to_datetime(large_df[col]).dt.date.astype('datetime64[ns]')
