### Parse the ICD-9 hierarchy obtained from https://www.hcup-us.ahrq.gov/toolssoftware/ccs/AppendixASingleDX.txt

In [1]:
import os, pickle
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from collections import Counter

## Parse ICD txt

In [2]:
# Read the whole CCS appendix (expected in the current working directory)
# into memory, one list entry per line, newlines included.
icd_txt = os.path.join(os.getcwd(), 'ICD9SingleDX.txt')
with open(icd_txt, 'r') as icd_fh:
    lines = list(icd_fh)

In [3]:
# Preview the first 20 raw lines to see the file layout before parsing.
lines[:20]

['Appendix A - Clinical Classification Software-DIAGNOSES (January 1980 through September 2015)\n',
 '\n',
 'Revised 03/24/2016\n',
 '\n',
 '1    Tuberculosis\n',
 '     01000 01001 01002 01003 01004 01005 01006 01010 01011 01012 01013 01014 01015 01016 01080 01081 01082 01083 01084 01085\n',
 '     01086 01090 01091 01092 01093 01094 01095 01096 01100 01101 01102 01103 01104 01105 01106 01110 01111 01112 01113 01114\n',
 '     01115 01116 01120 01121 01122 01123 01124 01125 01126 01130 01131 01132 01133 01134 01135 01136 01140 01141 01142 01143\n',
 '     01144 01145 01146 01150 01151 01152 01153 01154 01155 01156 01160 01161 01162 01163 01164 01165 01166 01170 01171 01172\n',
 '     01173 01174 01175 01176 01180 01181 01182 01183 01184 01185 01186 01190 01191 01192 01193 01194 01195 01196 01200 01201\n',
 '     01202 01203 01204 01205 01206 01210 01211 01212 01213 01214 01215 01216 01220 01221 01222 01223 01224 01225 01226 01230\n',
 '     01231 01232 01233 01234 01235 01236 01280 01

## Extract ICD classes

In [4]:
# Class header lines are the ones whose first character is a digit;
# keep those, with their newline characters removed.
icd_classes = [raw.replace('\n', '') for raw in lines if raw[0].isdigit()]
icd_classes[:20]

['1    Tuberculosis',
 '2    Septicemia (except in labor)',
 '3    Bacterial infection; unspecified site',
 '4    Mycoses',
 '5    HIV infection',
 '6    Hepatitis',
 '7    Viral infection',
 '8    Other infections; including parasitic',
 '9    Sexually transmitted infections (not HIV or hepatitis)',
 '10   Immunizations and screening for infectious disease',
 '11   Cancer of head and neck',
 '12   Cancer of esophagus',
 '13   Cancer of stomach',
 '14   Cancer of colon',
 '15   Cancer of rectum and anus',
 '16   Cancer of liver and intrahepatic bile duct',
 '17   Cancer of pancreas',
 '18   Cancer of other GI organs; peritoneum',
 '19   Cancer of bronchus; lung',
 '20   Cancer; other respiratory and intrathoracic']

In [5]:
# Line index (into `lines`) of every class header, in file order.
# enumerate() yields each header's own position in a single pass;
# lines.index() rescanned the list for every header and always returned the
# first matching line, so a repeated header would have received the wrong index.
icd_classes_idx = [idx for idx, line in enumerate(lines) if line[0].isdigit()]

## Map codes to extracted classes

In [6]:
# Map each class header to the set of codes listed between it and the next
# class header (or the end of the file for the last class).
class2code = {}
# End index of each class's block: the next header's index, or len(lines)
# for the final class.
block_ends = icd_classes_idx[1:] + [len(lines)]
for class_name, header_idx, block_end in zip(icd_classes, icd_classes_idx, block_ends):
    codes = set()
    # Take every line up to the next header. Stopping one line early would
    # silently drop a code line whenever no blank line separates two classes.
    for line in lines[header_idx + 1:block_end]:
        # str.split() with no argument splits on any run of whitespace and
        # discards newlines and blank lines, so no regex clean-up is needed.
        # Deleting runs of 2+ spaces beforehand would glue adjacent codes
        # together if they were ever separated by more than one space.
        codes.update(line.split())
    class2code[class_name] = codes

## Check whether any codes are mapped to multiple classes

In [7]:
# Count, for every code, how many classes list it. Each class contributes a
# set of codes, so a code is counted at most once per class.
icd_counter = Counter()
for class_codes in class2code.values():
    icd_counter.update(class_codes)

In [8]:
# most_common() orders entries from highest to lowest count, so if the top
# entries all have a count of 1, no code appears in more than one class.
icd_counter.most_common(5)

[('01336', 1), ('01303', 1), ('01203', 1), ('01183', 1), ('01383', 1)]

<strong>No ICD code is mapped to more than one class.</strong>

## Get code2class mapping 

In [9]:
# Invert class2code: every individual code points back to the class header
# it was listed under.
code2class = {
    code: class_name
    for class_name, class_codes in class2code.items()
    for code in class_codes
}

In [10]:
# Report how many distinct codes and classes were parsed.
codes_count, class_count = len(code2class), len(class2code)
summary_template = 'In total there are {} ICD9 codes one-one mapped to {} classes.'
print(summary_template.format(codes_count, class_count))

In total there are 15072 ICD9 codes one-one mapped to 283 classes.


In [11]:
# save the code2class mapping
fp = open(os.path.join(os.getcwd(), 'code2class.p'), 'wb')
pickle.dump(code2class, fp)
fp.close()

In [15]:
# Map class to an index
class2idx = {}
for i, icd in enumerate(icd_classes):
    class2idx[icd] = i
fp = open(os.path.join(os.getcwd(), 'class2idx.p'), 'wb')
pickle.dump(class2idx, fp)
fp.close()

In [18]:
# Map code to class_idx:
code2class_idx = {}
for k in code2class:
    code2class_idx[k] = class2idx[code2class[k]]
fp = open(os.path.join(os.getcwd(), 'code2class_idx.p'), 'wb')
pickle.dump(code2class_idx, fp)
fp.close()