In [2]:
import pandas as pd
import numpy as np
import math
import os
from sklearn.preprocessing import MultiLabelBinarizer

In [13]:
#FILEPATHS

# your list of rules/key which determine how ICD-10 codes are processed
rules_df = pd.read_csv('Preprocessing/ICD10/ICD10 Preprocessing Exceptions.csv', engine='python')

# ICD codes that need to be processed
diagnosis_code_df_raw = pd.read_csv('Preprocessing/Data/patientsIn_Anon_diagnoses_only.csv')

In [15]:
#Tidying df of ICD-10 codes. Replace column headings with ints 0-11
diagnosis_code_df_raw = diagnosis_code_df_raw.drop('Unnamed: 0', 1)
column_names_conversion = {'primarydiagnosiscode1': 0}
for i in range(2, 13):
    name = 'diagnosiscode'+str(i)
    column_names_conversion[name] = i-1
diagnosis_code_df_raw = diagnosis_code_df_raw.rename(columns=column_names_conversion)

Converting exceptions file into a machine readable 1:1 I/O file

In [613]:
rulesDf = rulesDf.dropna(axis=0, how='all').reset_index()

In [611]:
rulesDf

Unnamed: 0,index,ICD10 Codes,ICD10 Description,Action,"If required, give new category name",Comment (optional),further merging is possible: other and unspecified super categories
0,0,A04.0,,merge,Ecoli,,
1,1,A04.1,,merge,Ecoli,,
2,2,A04.2,,merge,Ecoli,,
3,3,A04.3,,merge,Ecoli,,
4,4,A04.4,,merge,Ecoli,,
5,5,A41.51,,merge,Ecoli,,
6,6,A04.7,,parent,Cdiff A04 7x,A04.7x should all me merged to the parent A04....,
7,7,A04,,merge,Bacterial intestinal oth unsp,,Bacterial super
8,8,A04.8,,merge,Bacterial intestinal oth unsp,,Bacterial super
9,9,A04.9,,merge,Bacterial intestinal oth unsp,,Bacterial super


In [78]:
# lookUpDF = pd.DataFrame(np.nan, index=range(100000), columns=['input', 'ouput', 'mode'])

In [614]:
renameColumnNames = {"ICD10 Codes":"input",
                     "If required, give new category name": "output",
                     "further merging is possible: other and unspecified super categories": "super"}

In [626]:
rulesDf = rulesDf.rename(columns=renameColumnNames)

In [628]:
keepsDf = rulesDf[rulesDf['Action'] == 'keep'][list(renameColumnNames.values())]
keepsDf['output'] = keepsDf['input']
keepsDf['mode'] = 'keep'

In [629]:
mergeDf = rulesDf[rulesDf['Action'] == 'merge'][list(renameColumnNames.values())]
# mergeDf = mergeDf.rename(columns=renameColumnNames)
mergeDf['mode'] = 'merge'

In [631]:
deleteDf = rulesDf[rulesDf['Action'] == 'delete'][list(renameColumnNames.values())]
# deleteDf = deleteDf.rename(columns=renameColumnNames)
deleteDf['output'] = 'delete'
deleteDf['mode'] = 'delete'

In [635]:
parentDfPreExpansion = rulesDf[rulesDf['Action'] == 'parent'][list(renameColumnNames.values())]
# parentDfPreExpansion = parentDf.rename(columns=renameColumnNames)
parentDfPreExpansion['mode'] = 'parent'
parentDf=pd.DataFrame(np.repeat(parentDfPreExpansion.values,10,axis=0), columns = ['output', 'label', 'super', 'mode'])
#using 10 repeats allows setting the input for all possible input values - eg for A04.7, we want A04.70, A04.71, A04.72 etc.

# Generate all possible inputs for a given output - eg for A04.7, we want A04.70, A04.71, A04.72 etc
# so that they can all be truncated to 4 digits (parent), instead of truncated to 3 digits as standard
allUntruncated = []
for i in range(0, parentDf.shape[0], 10):
    truncated = parentDf.iloc[i, 0]
    untruncated = [truncated+str(i) for i in range(10)]
    allUntruncated.append(untruncated)
allUntruncated = [item for sublist in allUntruncated for item in sublist]

parentDf['input'] = allUntruncated

In [636]:
parentDf = parentDf.drop('label', axis=1)

In [637]:
processedLookUpDF = pd.concat([keepsDf, mergeDf, deleteDf, parentDf], sort=False)

In [638]:
# Manually written instructions in the look up DF may contain full stops after the first three digits, eg A14.82. We strip these.

processedLookUpDF['output'] = processedLookUpDF['output'].str.replace('.','')
processedLookUpDF['input'] = processedLookUpDF['input'].str.replace('.','')

In [640]:
processedLookUpDF #finished look-up conversion table

Unnamed: 0,input,output,super,mode
10,A050,A050,,keep
11,A051,A051,,keep
12,A052,A052,,keep
13,A053,A053,,keep
14,A054,A054,,keep
15,A055,A055,,keep
91,A8101,A8101,,keep
135,B0082,B0082,,keep
139,B0112,B0112,,keep
179,B2684,B2684,,keep


In [648]:
def translateCode(inputCode, lookUpDF):
    """ Converts an ICD-10 code into an output specified by lookUpDF
    If ICD-10 code is mentioned in lookUpDF, apply that rule and return corresponding output value.
    If not found, return truncated - only first 3 digits"""
    # if not found in lookUpDF, return truncated
    lookUpResult = lookUpDF[lookUpDF['input'] == inputCode]
    if lookUpResult.empty:
        return inputCode[:3]
    else:
        outputCode = lookUpResult['output'].values[0]
        if outputCode == 'delete':
            outputCode = np.NaN
        print(inputCode, '->', outputCode, ' \n')
        return outputCode

In [556]:
# def codeRowToTranslatedSetOld(row, lookUpDF):
#     patientId = row['patient_id']
#     row = row[row.index != 'patient_id']
#     translatedCodes = set()
#     for inputCode in row:
#         try:
#             math.isnan(float(inputCode))
#         except ValueError:
#             translatedCodes.add(translateCode(inputCode, lookUpDF))
#             superCode = lookUpDF[lookUpDF['input'] == inputCode]['super']
#             if superCode.any():
#                 print(', super:', superCode)
#                 supersupercodes.append(superCode)
#                 translatedCodes.add(superCode.values[0])
#     return ([patientId] + list(translatedCodes)), 

In [649]:
def codeRowToTranslatedSet(row, lookUpDF):
#     patientId = row['patient_id']
    row = row[row.index != 'patient_id'].dropna()
    translatedCodes = set()
    for inputCode in row:
        translatedCodes.add(translateCode(inputCode, lookUpDF))
        superCode = lookUpDF[lookUpDF['input'] == inputCode]['super']
        if superCode.any():
#             print(', super:', superCode)
            supersupercodes.append(superCode)
            translatedCodes.add(superCode.values[0])
    return list(translatedCodes)

In [650]:
diagnosis_code_df_raw

Unnamed: 0,0,9,10,11,1,2,3,4,5,6,7,8,patient_id
0,K628,,,,K649,Z886,D649,,,,,,wi1m4nsWl75ej4DASC1tFQ
1,N46X,,,,E230,Z854,Z923,Z926,Z880,,,,P55Vaqw6G99yLO1Zxx3QYA
2,N311,Z880,Z874,,N301,R309,J459,D510,K219,K900,E669,Z864,p2shr2dOWU/43OZZUK6F6Q
3,N359,,,,I10X,K219,M109,M512,Z854,,,,jXX5/7YwazQyXcKmcDBKsA
4,K573,M545,Z888,Z634,R11X,R12X,K509,R634,E119,I10X,K908,K760,c2eRvigWOTdfLI6ztfAEfQ
5,M4186,,,,I489,I259,Z940,F419,M109,,,,kr+sHaLXQKOrCIEwx1GP3A
6,G249,,,,G253,F259,T810,Y836,Z930,,,,d9kVjyeK+K7yDySZHC1GJQ
7,R102,,,,Z854,Z923,I10X,,,,,,r90m1wa8FiFd2sGgHCEdmQ
8,M189,,,,I10X,J459,M512,K219,,,,,pCYfJxdwk/zNG48bGHBRNA
9,M161,,,,K590,I10X,K449,Z853,,,,,0v/+5w/6LpJxIGU4vczSeg


In [844]:
convertedDiagnsosiCodesSet = diagnosis_code_df_raw[0:100].apply(lambda row: codeRowToTranslatedSet(row, processedLookUpDF), axis=1)

B182 -> Hep C  

B941 -> B941  

A630 -> Viral warts  

B199 -> Viral Hep oth unsp  

B968 -> nan  

B181 -> Hep B  



In [589]:
convertedDiagnsosiCodesSet = diagnosis_code_df_raw[0:1000].apply(lambda row: codeRowToTranslatedSet(row, processedLookUpDF), axis=1)

, super: 8    Bacterial super
Name: super, dtype: object
, super: 62    Bacterial super
Name: super, dtype: object
, super: 62    Bacterial super
Name: super, dtype: object
, super: 69    Bacterial super
Name: super, dtype: object


In [853]:
convertedDiagnsosiCodeDf = pd.DataFrame(list(convertedDiagnsosiCodesSet),
                                        index = diagnosis_code_df_raw['patient_id'][:len(convertedDiagnsosiCodesSet)]).fillna(value=np.nan)

In [854]:
convertedDiagnsosiCodeDf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
wi1m4nsWl75ej4DASC1tFQ,D64,K64,K62,Z88,,,,,,,,
P55Vaqw6G99yLO1Zxx3QYA,N46,Z92,E23,Z85,Z88,,,,,,,
p2shr2dOWU/43OZZUK6F6Q,Z87,R30,J45,Z86,N30,E66,K21,D51,K90,N31,Z88,
jXX5/7YwazQyXcKmcDBKsA,I10,M51,K21,Z85,N35,M10,,,,,,
c2eRvigWOTdfLI6ztfAEfQ,I10,Z63,M54,K50,R11,K76,R63,K90,E11,K57,R12,Z88
kr+sHaLXQKOrCIEwx1GP3A,I48,I25,M41,Z94,F41,M10,,,,,,
d9kVjyeK+K7yDySZHC1GJQ,F25,T81,G25,G24,Y83,Z93,,,,,,
r90m1wa8FiFd2sGgHCEdmQ,I10,Z85,Z92,R10,,,,,,,,
pCYfJxdwk/zNG48bGHBRNA,I10,J45,M18,M51,K21,,,,,,,
0v/+5w/6LpJxIGU4vczSeg,I10,K44,M16,K59,Z85,,,,,,,


In [861]:
df = convertedDiagnsosiCodeDf

In [953]:
def mergeDuplicatePatients(df):
    duplicateIndex = df.index.duplicated(keep=False)
    duplicatedDf = df[duplicateIndex]
    notDuplicatedDf = df[~duplicateIndex]
    duplicatedIds = set(duplicatedDf.index)
    
    duplicatedPatientDicts = []
    for singleId in duplicatedIds:
        codes = set(duplicatedDf.loc[singleId].values.flatten())-set([np.nan])
        patientDict = {'patient_id': singleId}
        patientDict.update(enumerate(codes))
        duplicatedPatientDicts.append(patientDict)
        
    mergedDuplicatePatientDf = pd.DataFrame(duplicatedPatientDicts).set_index('patient_id')
    return pd.concat([notDuplicatedDf, mergedDuplicatePatientDf])

In [954]:
mergeDuplicatePatients(convertedDiagnsosiCodeDf)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
wi1m4nsWl75ej4DASC1tFQ,D64,K64,K62,Z88,,,,,,,,
P55Vaqw6G99yLO1Zxx3QYA,N46,Z92,E23,Z85,Z88,,,,,,,
p2shr2dOWU/43OZZUK6F6Q,Z87,R30,J45,Z86,N30,E66,K21,D51,K90,N31,Z88,
jXX5/7YwazQyXcKmcDBKsA,I10,M51,K21,Z85,N35,M10,,,,,,
c2eRvigWOTdfLI6ztfAEfQ,I10,Z63,M54,K50,R11,K76,R63,K90,E11,K57,R12,Z88
kr+sHaLXQKOrCIEwx1GP3A,I48,I25,M41,Z94,F41,M10,,,,,,
d9kVjyeK+K7yDySZHC1GJQ,F25,T81,G25,G24,Y83,Z93,,,,,,
r90m1wa8FiFd2sGgHCEdmQ,I10,Z85,Z92,R10,,,,,,,,
pCYfJxdwk/zNG48bGHBRNA,I10,J45,M18,M51,K21,,,,,,,
0v/+5w/6LpJxIGU4vczSeg,I10,K44,M16,K59,Z85,,,,,,,


{'JnHI6B/9LMsuM1XvPvs95g', 'Trk2lVta7aP2p4tAZOyHog', 'YVhB8QiRx9puEaYUeD2SuA'}

In [923]:
codes = set(duplicatedDf.loc['JnHI6B/9LMsuM1XvPvs95g'].values.flatten())
(('patient_id', singleId),) + tuple(enumerate(codes))

(('patient_id', 'JnHI6B/9LMsuM1XvPvs95g'),
 (0, 'I10'),
 (1, 'I07'),
 (2, 'D64'),
 (3, 'B01'),
 (4, 'D69'),
 (5, 'I25'),
 (6, 'I35'),
 (7, 'I50'),
 (8, 'J96'),
 (9, 'J90'),
 (10, 'E11'),
 (11, 'J17'))

In [945]:
mergedDuplicatePatientDf

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Trk2lVta7aP2p4tAZOyHog,I63,I10,Z87,F43,F32,G63,C90,M25,N95,,,
YVhB8QiRx9puEaYUeD2SuA,P27,Z38,Q77,H35,P07,P61,P60,P59,P22,P28,,
JnHI6B/9LMsuM1XvPvs95g,I10,I07,D64,B01,D69,I25,I35,I50,J96,J90,E11,J17


In [891]:
notDuplicatedDf.loc['JnHI6B/9LMsuM1XvPvs95g']

0     I10
1     I07
2     D64
3     B01
4     D69
5     I25
6     I35
7     I50
8     J96
9     J90
10    E11
11    J17
Name: JnHI6B/9LMsuM1XvPvs95g, dtype: object

TODO:
- check for duplicate patient IDs!!
- remove if no diagnsois ? 
- expansion of samples by removing one diagnosis code each time - amplification step

In [380]:
type(exampleRow['patient_id'])

str

In [598]:
diagnosis_code_df_raw.shape

(309174, 13)

In [371]:
translatedCodesAndPatientNumber = diagnosis_code_df_raw[100:200].apply(lambda row: codeRowToSet(row), axis=1)

In [379]:
translatedCodesAndPatientNumber

100    [a68K6Ftx8/v9lvR3DAxHeg, F81, K21, Z88, M41, F...
101    [1tI2iqXGfLJEaUtzDIpFNg, J45, K58, R10, R39, M...
102    [OxG0i0JbCZezkXJSSo06rA, Z86, M86, R10, E23, E...
103         [SJ6g5ywyQfEbxdVpfGYCnA, I10, G93, H91, G40]
104    [mCaoDeguFwPnSe9hLRGZrw, Z86, K85, C83, Y43, N...
105    [61Elg7qq5nAscu8z7dErzg, J45, M13, F32, M22, G...
106    [2qy7w1awBVbqyJME/q82qg, R10, K66, Y76, Z90, T83]
107         [DDnSwoaw+gl6/RBTOROcSA, M54, Z99, Z91, Z88]
108    [YVhB8QiRx9puEaYUeD2SuA, P27, Q77, H35, P07, P...
109         [O2Ck4XH+gQBp6+A3PicE5g, Z87, Z90, N32, Z93]
110    [vSTPfvxCOdlUyr/V6E1+OA, K66, K21, Z90, K80, K...
111    [CJ+dQLUXmHHVI4aJiSm3SA, I10, N40, I48, I25, I...
112         [ZBC3LWKXXTO0yzZ1ZIeloQ, F17, K25, G99, M50]
113    [oLH1zBkWAUcLvVCKlJt5QQ, Z87, Z46, Z90, N35, N32]
114         [mt8TnXFU5O3E6rnvK2esuQ, I10, Z86, K44, K22]
115    [LW5CPK1ig+/MDzEWGX1lOw, Z87, N39, N82, N89, E...
116         [zgzGxfkwCKh+2dv4vXB6nQ, N80, D25, Z88, H81]
117    [mmHeHvafYhWAt3UByvMyuA,

In [376]:
pd.DataFrame(translatedCodesAndPatientNumber[0])

KeyError: 0

In [None]:
check diagnosis 

In [286]:
diagnosis_code_df_raw[100:].apply(np.vectorize(lookUpCodes))

A05.5
A05.5


ValueError: ("could not convert string to float: 'H351'", 'occurred at index 9')

In [245]:
inputCode= 'A05.3'
processedLookUpDF[processedLookUpDF['input'] == inputCode]['output'].values[0]

'A05.3'

In [284]:
diagnosis_code_df_raw.iloc[200, 0] = 'A05.5'