In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#%store -r icd_grouped

In [2]:
# Load the merged diagnosis table from the current working directory.
# NOTE(review): `diag_merged` is not referenced by any other cell visible here.
diag_merged = pd.read_csv(filepath_or_buffer='diagnose_merged.csv')

In [3]:
# Load the ICD-9 diagnosis dictionary.
# ICD9_CODE is an identifier, not a number: values such as '01166' carry
# significant leading zeros and others start with a letter ('V3501').
# Pin the column to str so the later `.str` filters and prefix slicing do not
# depend on pandas' type inference.
icd_df = pd.read_csv('D_ICD_DIAGNOSES.csv', dtype={'ICD9_CODE': str})

In [4]:
# Preview the first ten rows of the ICD-9 dictionary.
icd_df.iloc[:10]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."
5,179,1174,TB pneumothorax-cult dx,"Tuberculous pneumothorax, tubercle bacilli not..."
6,180,1175,TB pneumothorax-histo dx,"Tuberculous pneumothorax, tubercle bacilli not..."
7,181,1176,TB pneumothorax-oth test,"Tuberculous pneumothorax, tubercle bacilli not..."
8,182,1180,Pulmonary TB NEC-unspec,"Other specified pulmonary tuberculosis, unspec..."
9,183,1181,Pulmonary TB NEC-no exam,"Other specified pulmonary tuberculosis, bacter..."


In [5]:
# Show the rows whose ICD9_CODE contains the substring 'V3'
# (matched anywhere in the code, not only at the start).
contains_v3 = icd_df['ICD9_CODE'].str.contains('V3')
icd_df[contains_v3]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
9501,11901,V3501,Oth mult sb-in hosp w cs,"Other multiple birth (three or more), mates al..."
9502,11902,V351,Oth mult sb-before adm,"Other multiple birth (three or more), mates al..."
9503,11903,V352,Oth multiple sb-nonhosp,"Other multiple birth (three or more), mates al..."
9504,11904,V3600,Mult lb/sb-in hos w/o cs,"Other multiple birth (three or more), mates li..."
9505,11905,V3601,Mult lb/sb-in hosp w cs,"Other multiple birth (three or more), mates li..."
9506,11906,V361,Mult nb/sb-before adm,"Other multiple birth (three or more), mates li..."
9507,11907,V362,Multiple nb/sb-nonhosp,"Other multiple birth (three or more), mates li..."
9508,11908,V3700,Mult brth NOS-hos w/o cs,"Other multiple birth (three or more), unspecif..."
9509,11909,V3701,Mult birth NOS-hosp w cs,"Other multiple birth (three or more), unspecif..."
9510,11910,V371,Mult brth NOS-before adm,"Other multiple birth (three or more), unspecif..."


In [6]:
# Work on an independent (deep) copy so `icd_df` stays untouched.
icd_grouped = icd_df.copy(deep=True)

In [7]:
# Coarse grouping key: the first two characters of each ICD-9 code.
# The vectorised `.str` slice replaces the list(map(lambda ...)) round trip and
# leaves missing codes as NaN instead of raising on them.
icd_grouped['group'] = icd_grouped['ICD9_CODE'].str[:2]

In [8]:
# Show the rows whose ICD9_CODE contains '116'. The match is a substring
# match, so codes such as '01166' and '1160' are both returned.
contains_116 = icd_df['ICD9_CODE'].str.contains('116')
icd_df[contains_116]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
254,139,01116,TB lung nodular-oth test,"Tuberculosis of lung, nodular, tubercle bacill..."
283,168,01160,TB pneumonia-unspec,"Tuberculous pneumonia [any form], unspecified"
284,169,01161,TB pneumonia-no exam,"Tuberculous pneumonia [any form], bacteriologi..."
285,170,01162,TB pneumonia-exam unkn,"Tuberculous pneumonia [any form], bacteriologi..."
286,171,01163,TB pneumonia-micro dx,"Tuberculous pneumonia [any form], tubercle bac..."
287,172,01164,TB pneumonia-cult dx,"Tuberculous pneumonia [any form], tubercle bac..."
288,173,01165,TB pneumonia-histo dx,"Tuberculous pneumonia [any form], tubercle bac..."
1065,1447,1160,Blastomycosis,Blastomycosis
1066,1448,1161,Paracoccidioidomycosis,Paracoccidioidomycosis


In [9]:
# Sanity check: three-character prefix of the second code in the table.
icd_grouped['ICD9_CODE'].iloc[1][:3]

'011'

In [10]:
import csv
import json
from collections import *


class Node(object):
    """A node of a code hierarchy tree.

    Each node stores its depth, its code, a description (falling back to the
    code when none is given), a reference to its parent and an ordered list of
    children.
    """

    def __init__(self, depth, code, descr=None):
        """Create a detached node (no parent, no children).

        Args:
            depth: depth value stored on the node.
            code: the node's code.
            descr: optional description; the code is used when it is falsy.
        """
        self.depth = depth
        self.descr = descr or code
        self.code = code
        self.parent = None
        self.children = []

    def add_child(self, child):
        """Append `child` to this node's children unless it is already there.

        Only the children list is touched; the caller sets `child.parent`.
        """
        if child not in self.children:
            self.children.append(child)

    def search(self, code):
        """Return every node in this subtree (self included) whose code equals `code`.

        The subtree is walked depth-first, parents before children.
        """
        if code == self.code:
            return [self]
        ret = []
        for child in self.children:
            ret.extend(child.search(code))
        return ret

    def find(self, code):
        """Return the first node found by `search(code)`, or None if there is none."""
        nodes = self.search(code)
        if nodes:
            return nodes[0]
        return None

    @property
    def root(self):
        """The topmost ancestor of this node (the node itself if it has no parent)."""
        return self.parents[0]

    @property
    def description(self):
        """The node's description."""
        return self.descr

    @property
    def codes(self):
        """List of the codes of all leaves below this node.

        A real list is returned rather than a lazy `map` object, so the result
        can be iterated more than once, indexed and measured with len().
        """
        return [leaf.code for leaf in self.leaves]

    @property
    def parents(self):
        """Path from the topmost ancestor down to this node, both inclusive."""
        n = self
        ret = []
        while n:
            ret.append(n)
            n = n.parent
        ret.reverse()
        return ret

    @property
    def leaves(self):
        """List of the childless nodes below this node, without duplicates.

        A childless node is its own single leaf. Leaves are returned in
        traversal order: the previous `list(set(...))` produced an order that
        changed from run to run because string hashes are randomised per
        process. De-duplication still relies on the same hash/equality as a
        set would.
        """
        if not self.children:
            return [self]
        ordered = {}
        for child in self.children:
            for leaf in child.leaves:
                # dict keys keep insertion order and drop repeated nodes
                ordered.setdefault(leaf, None)
        return list(ordered)

    # return all leaf nodes with a depth of @depth
    def leaves_at_depth(self, depth):
        """Return the list of leaves below this node whose depth equals `depth`."""
        return [leaf for leaf in self.leaves if leaf.depth == depth]

    @property
    def siblings(self):
        """Copy of the parent's children list (this node included); [] without a parent."""
        parent = self.parent
        if not parent:
            return []
        return list(parent.children)

    def __str__(self):
        return '%s\t%s' % (self.depth, self.code)

    def __hash__(self):
        # Hash on the "<depth>\t<code>" text; equality is not overridden here.
        return hash(str(self))


class ICD9(Node):
    """Root node ('ROOT', depth -1) of a code hierarchy loaded from a JSON file.

    The JSON document is iterated as a collection of hierarchies; each
    hierarchy is a sequence of mappings with a 'code' entry and an optional
    'descr' entry. Nodes are shared per (depth, code) pair through
    `depth2nodes`.
    """

    def __init__(self, codesfname):
        """Load the hierarchies stored in `codesfname` and build the tree.

        Args:
            codesfname: path of the JSON file to read.
        """
        # dictionary of depth -> dictionary of code->node
        self.depth2nodes = defaultdict(dict)
        super(ICD9, self).__init__(-1, 'ROOT')

        # Decode as UTF-8 explicitly instead of relying on the platform's
        # default encoding, and release the file before building the tree.
        with open(codesfname, 'r', encoding='utf-8') as f:
            allcodes = json.load(f)
        self.process(allcodes)

    def process(self, allcodes):
        """Add every hierarchy in `allcodes` to the tree."""
        for hierarchy in allcodes:
            self.add(hierarchy)

    def get_node(self, depth, code, descr):
        """Return the node registered for (depth, code), creating it if needed.

        `descr` is only used when the node is created; an existing node keeps
        the description it already has.
        """
        d = self.depth2nodes[depth]
        if code not in d:
            d[code] = Node(depth, code, descr)
        return d[code]

    def add(self, hierarchy):
        """Insert one root-to-leaf chain of links below this root.

        A link's depth is its position in `hierarchy`. Links with an empty
        code are skipped but still use up their position. Each node is attached
        under the previously processed node, and its `parent` is overwritten
        with that node every time the link is seen.
        """
        prev_node = self
        for depth, link in enumerate(hierarchy):
            code = link['code']
            if not code:
                continue

            # fall back to the code when the description is missing or empty
            descr = link.get('descr') or code
            node = self.get_node(depth, code, descr)
            node.parent = prev_node
            prev_node.add_child(node)
            prev_node = node

In [11]:
# Build the hierarchy from the JSON code file, then tally the string form
# ('<depth>\t<code>') of every leaf node.
icd9_tree = ICD9('codes_new.json')
counter = Counter(str(leaf) for leaf in icd9_tree.leaves)

In [12]:
# Spot check: description of code '091' and of the group one level above it.
node_091 = icd9_tree.find('091')
print(node_091.parent.description)
print(node_091.description)

SYPHILIS AND OTHER VENEREAL DISEASES 
Early syphilis, symptomatic


In [13]:
# Why error? The original call was missing its closing parenthesis (a SyntaxError); corrected below.
#type(icd9_tree.find('0911'))

In [None]:
# For every ICD-9 code, look up the tree node matching its first three
# characters and record that node (level 2) and its parent (level 1).
# Codes whose prefix is not in the tree get the 'Not Found' / '0000' markers.
# NOTE(review): the lookup key is always three characters long; confirm that
# this is the right key length for the letter-prefixed codes as well.
lv1_group = []
lv2_group = []
lv1_code = []
lv2_code = []
code_list = list(icd_grouped['ICD9_CODE'])

# find() walks the whole tree, so resolve each distinct prefix only once
# instead of four times per row.
prefix_to_node = {}
for full_code in code_list:
    this_code = full_code[0:3]
    if this_code not in prefix_to_node:
        prefix_to_node[this_code] = icd9_tree.find(this_code)
    node = prefix_to_node[this_code]
    parent = node.parent if node is not None else None

    # Explicit "not found" test instead of a bare `except:`, which would also
    # swallow KeyboardInterrupt and hide unrelated errors.
    if parent is None:
        lv1_group.append('Not Found')
        lv2_group.append('Not Found')
        lv1_code.append('0000')
        lv2_code.append('0000')
    else:
        lv1_group.append(parent.description)
        lv2_group.append(node.description)
        lv1_code.append(parent.code)
        lv2_code.append(node.code)

In [None]:
# Attach the lookup results to the frame as four new columns
# (codes first, then descriptions).
new_columns = {
    'lv.1_code': lv1_code,
    'lv.2_code': lv2_code,
    'lv.1_group': lv1_group,
    'lv.2_group': lv2_group,
}
for column_name, column_values in new_columns.items():
    icd_grouped[column_name] = column_values

In [None]:
# Optional cleanup, currently disabled: drop the two-character "group" column.
#icd_grouped = icd_grouped.drop(columns = ["group"])
# Display the frame to inspect its columns.
icd_grouped

In [25]:
# Restore `icd_grouped` from IPython's %store database.
# NOTE(review): the recorded output of this cell reports that no such variable
# was stored, so nothing was restored in that run. If a stored copy did exist,
# it would replace the `icd_grouped` built by the cells above. Confirm whether
# this cell is still wanted.
%store -r icd_grouped

no stored variable icd_grouped


In [None]:
# Template this cell was adapted from:
#yourdf.drop(['columnheading1', 'columnheading2'], axis=1, inplace=True)
# NOTE(review): 'column_name' looks like a leftover placeholder. None of the
# columns created in the visible cells has that name, in which case this
# raises KeyError. Confirm the intended column (an earlier commented-out line
# drops "group"). The cell also rebinds `icd_grouped` to its own result, so it
# is not safe to run twice.
icd_grouped = icd_grouped.drop(['column_name'], axis=1)

In [27]:
# Write the grouped ICD-9 table to CSV without the index column.
# NOTE(review): this is an absolute, user-specific path; the cell only works
# on a machine where that directory exists.
output_csv_path = r'/Users/kathy908000/github/Health-Care-Research/icd_grouped.csv'
icd_grouped.to_csv(output_csv_path, index=False)