In [1]:
import pandas as pd
import numpy as np
import math
import os
from sklearn.preprocessing import MultiLabelBinarizer
import itertools

In [2]:
#FILEPATHS

# All relative paths below are resolved against this project directory.
os.chdir('/Volumes/Encrypted/Deep-Covariant-Structure-of-Comorbidities/')

# list of rules/key which determine how ICD-10 codes are processed
look_up_table_raw = pd.read_csv('Preprocessing/ICD10/ICD10 Preprocessing Exceptions.csv', engine='python')

# ICD codes that need to be processed
diagnosis_code_df_raw = pd.read_csv('Preprocessing/Data/patientsIn_Anon_diagnoses_only.csv')

# Optional cache: if the raw df was previously processed and its duplicates merged,
# load that result here and skip to section 2 - ICD-10 code conversion.
# The cache may not exist yet (e.g. on a first run), so a missing file is not an
# error: section 1 rebuilds diagnosis_code_df_merged from the raw data instead.
try:
    diagnosis_code_df_merged = pd.read_csv('Preprocessing/Data/patientsIn_Anon_diagnoses_only_duplicates_merged.csv', dtype='str') #processed version
except FileNotFoundError:
    print('No cached merged diagnoses file found; run section 1 to build diagnosis_code_df_merged.')

# Section 1 - tidying & merging patient ICD-10 codes

In [3]:
def rename_df_columns(df) -> pd.DataFrame:
    """Tidy the raw df of ICD-10 codes.

    Drops the 'Unnamed: 0' column when present, renames the diagnosis columns
    to the ints 0-11 ('primarydiagnosiscode1' -> 0, 'diagnosiscode2' ..
    'diagnosiscode12' -> 1 .. 11) and moves 'patient_id' into the index.

    Parameters
    ----------
    df : pd.DataFrame
        Raw diagnoses table; must contain a 'patient_id' column.

    Returns
    -------
    pd.DataFrame
        A new frame indexed by 'patient_id'; the input is not modified.

    Raises
    ------
    KeyError
        If 'patient_id' is not a column of ``df``.
    """
    # Keyword form: the positional ``axis`` argument of DataFrame.drop was
    # removed in pandas 2.0. errors='ignore' tolerates input that does not
    # carry the 'Unnamed: 0' column at all.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    column_names_conversion = {'primarydiagnosiscode1': 0}
    for i in range(2, 13):
        column_names_conversion['diagnosiscode' + str(i)] = i - 1

    return df.rename(columns=column_names_conversion).set_index('patient_id')

In [4]:
def merge_duplicate_patients(df) -> pd.DataFrame:
    """Merge rows that share a patient id into one row of unique ICD codes.

    Parameters
    ----------
    df : pd.DataFrame
        One row per record, indexed by patient id, with the ICD codes spread
        over the columns and missing values marking unused slots.

    Returns
    -------
    pd.DataFrame
        Rows whose id occurs once are kept unchanged and come first. Each
        duplicated id follows as a single row indexed under 'patient_id',
        holding its distinct non-missing codes in columns 0, 1, 2, ... in
        order of first appearance. A merged row may need more columns than
        the input has; other rows are padded with missing values.
        If no id is duplicated, a copy of ``df`` is returned.
    """
    duplicate_mask = df.index.duplicated(keep=False)
    if not duplicate_mask.any():
        # Nothing to merge; building an empty merged frame would fail on
        # set_index('patient_id') because it would have no such column.
        return df.copy()

    duplicated_df = df[duplicate_mask]
    not_duplicated_df = df[~duplicate_mask]

    merged_records = []
    # index.unique() keeps first-seen order, so the output row order is stable.
    for patient_id in duplicated_df.index.unique():
        flat_codes = duplicated_df.loc[[patient_id]].to_numpy().ravel()
        # pd.notna catches every missing marker (NaN, None, NA) regardless of
        # dtype; dict.fromkeys de-duplicates while preserving order.
        codes = dict.fromkeys(code for code in flat_codes if pd.notna(code))
        record = {'patient_id': patient_id}
        record.update(enumerate(codes))
        merged_records.append(record)

    merged_duplicates_df = pd.DataFrame(merged_records).set_index('patient_id')
    return pd.concat([not_duplicated_df, merged_duplicates_df])

In [11]:
# Section 1 pipeline: tidy the raw column layout, then collapse repeated patient ids.
diagnosis_code_df_tidy = diagnosis_code_df_raw.pipe(rename_df_columns)
diagnosis_code_df_merged = diagnosis_code_df_tidy.pipe(merge_duplicate_patients)
# diagnosis_code_df_merged.to_csv('Preprocessing/Data/patientsIn_Anon_diagnoses_only_duplicates_merged.csv')

1

# Section 2 - converting ICD-10 codes

In [5]:
# Promote 'patient_id' to the index only while it is still a column;
# when it has already been set as the index there is nothing to do.
if 'patient_id' in diagnosis_code_df_merged.columns:
    diagnosis_code_df_merged = diagnosis_code_df_merged.set_index('patient_id')

In [6]:
def tidy_look_up_df(df) -> pd.DataFrame:
    """Normalise the raw ICD-10 preprocessing look-up table.

    Steps:
      * drop rows that are entirely empty and renumber the rest from 0;
      * rename the verbose headings to 'input', 'output' and 'super';
      * strip full stops and surrounding whitespace from 'input' and
        'output', and upper-case 'input';
      * strip and lower-case 'Action'.

    Parameters
    ----------
    df : pd.DataFrame
        Raw table with the columns 'ICD10 Codes', 'Action',
        'If required, give new category name' and
        'further merging is possible: other and unspecified super categories'.

    Returns
    -------
    pd.DataFrame
        A new, tidied frame; the input is not modified.
    """
    rename_column_map = {"ICD10 Codes":"input",
                         "If required, give new category name": "output",
                         "further merging is possible: other and unspecified super categories": "super"}
    # reset_index(drop=True) discards the old row labels directly, so no
    # temporary 'index' column has to be removed afterwards.
    df = (
        df.dropna(axis=0, how='all')
        .reset_index(drop=True)
        .rename(columns=rename_column_map)
    )

    # regex=False: '.' must be removed as a literal full stop. Interpreted as
    # a regular expression it matches any character, and the default for
    # ``regex`` differs between pandas versions.
    df['output'] = df['output'].str.replace('.', '', regex=False).str.strip()
    df['input'] = df['input'].str.replace('.', '', regex=False).str.strip().str.upper()
    df['Action'] = df['Action'].str.strip().str.lower()
    return df

In [7]:
# Normalise headings, codes and actions of the raw exceptions table before it is handed to the converter.
look_up_table = tidy_look_up_df(look_up_table_raw)

In [8]:
# NOTE(review): project-local module; presumably found via the working directory
# set by os.chdir in the file-paths cell — confirm before moving this import.
from Preprocessing.ICD10.ICD10Converter import ICD10Converter

In [9]:
# Build the converter from the tidied look-up table.
converter = ICD10Converter(look_up_table)

In [10]:
# Run the converter over the merged patient diagnoses.
# NOTE(review): what convert() returns is defined in ICD10Converter, not visible here.
converted_df = converter.convert(diagnosis_code_df_merged)