In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from datetime import timezone, datetime

In [2]:
df1 = pd.read_csv('../data/PATIENTS.csv')

In [3]:
ll = list(df1['DOB'])
df1['TimeStamp'] = list(map(lambda x: datetime(int(x[0:4]), int(x[5:7]), int(x[8:10]))
                            .replace(tzinfo=timezone.utc).timestamp(), ll))

In [4]:
df1

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG,TimeStamp
0,234,249,F,2075-03-13 00:00:00,,,,0,3.319661e+09
1,235,250,F,2164-12-27 00:00:00,2188-11-22 00:00:00,2188-11-22 00:00:00,,1,6.153235e+09
2,236,251,M,2090-03-15 00:00:00,,,,0,3.793219e+09
3,237,252,M,2078-03-06 00:00:00,,,,0,3.413750e+09
4,238,253,F,2089-11-26 00:00:00,,,,0,3.783802e+09
5,239,255,M,2109-08-05 00:00:00,,,,0,4.405104e+09
6,240,256,M,2086-07-31 00:00:00,,,,0,3.678912e+09
7,241,257,F,2031-04-03 00:00:00,2121-07-08 00:00:00,2121-07-08 00:00:00,2121-07-08 00:00:00,1,1.932941e+09
8,242,258,F,2124-09-19 00:00:00,,,,0,4.882378e+09
9,243,260,F,2105-03-23 00:00:00,,,,0,4.267210e+09


In [8]:
#dt = datetime()
dt = datetime(int(ll[0][0:4]), int(ll[0][5:7]), int(ll[0][8:10]))
timestamp = dt.replace(tzinfo=timezone.utc).timestamp()
timestamp

3319660800.0

In [9]:
pd.Timestamp('2075-03-13 00:00:00')

Timestamp('2075-03-13 00:00:00')

In [5]:
diag_merged = pd.read_csv('../data/diagnose_merged.csv')
icd_df =  pd.read_csv('../data/D_ICD_DIAGNOSES.csv')
icd_df.head(10)
icd_grouped = icd_df.copy()

In [14]:
icd_df.loc[lambda df: df['ICD9_CODE'].str.contains('V3')]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
9501,11901,V3501,Oth mult sb-in hosp w cs,"Other multiple birth (three or more), mates al..."
9502,11902,V351,Oth mult sb-before adm,"Other multiple birth (three or more), mates al..."
9503,11903,V352,Oth multiple sb-nonhosp,"Other multiple birth (three or more), mates al..."
9504,11904,V3600,Mult lb/sb-in hos w/o cs,"Other multiple birth (three or more), mates li..."
9505,11905,V3601,Mult lb/sb-in hosp w cs,"Other multiple birth (three or more), mates li..."
9506,11906,V361,Mult nb/sb-before adm,"Other multiple birth (three or more), mates li..."
9507,11907,V362,Multiple nb/sb-nonhosp,"Other multiple birth (three or more), mates li..."
9508,11908,V3700,Mult brth NOS-hos w/o cs,"Other multiple birth (three or more), unspecif..."
9509,11909,V3701,Mult birth NOS-hosp w cs,"Other multiple birth (three or more), unspecif..."
9510,11910,V371,Mult brth NOS-before adm,"Other multiple birth (three or more), unspecif..."


### ICD9 python library from <https://github.com/sirrice/icd9>

In [15]:
import csv
import json
from collections import *


class Node(object):
    def __init__(self, depth, code, descr=None):
        self.depth = depth
        self.descr = descr or code
        self.code = code
        self.parent = None
        self.children = []

    def add_child(self, child):
        if child not in self.children:
            self.children.append(child)

    def search(self, code):
        if code == self.code:
            return [self]
        ret = []
        for child in self.children:
            ret.extend(child.search(code))
        return ret

    def find(self, code):
        nodes = self.search(code)
        if nodes:
            return nodes[0]
        return None

    @property
    def root(self):
        return self.parents[0]

    @property
    def description(self):
        return self.descr

    @property
    def codes(self):
        return map(lambda n: n.code, self.leaves)

    @property
    def parents(self):
        n = self
        ret = []
        while n:
            ret.append(n)
            n = n.parent
        ret.reverse()
        return ret

    @property
    def leaves(self):
        leaves = set()
        if not self.children:
            return [self]
        for child in self.children:
            leaves.update(child.leaves)
        return list(leaves)

    # return all leaf notes with a depth of @depth
    def leaves_at_depth(self, depth):
        return filter(lambda n: n.depth == depth, self.leaves)

    @property
    def siblings(self):
        parent = self.parent
        if not parent:
            return []
        return list(parent.children)

    def __str__(self):
        return '%s\t%s' % (self.depth, self.code)

    def __hash__(self):
        return hash(str(self))


class ICD9(Node):
    def __init__(self, codesfname):
        # dictionary of depth -> dictionary of code->node
        self.depth2nodes = defaultdict(dict)
        super(ICD9, self).__init__(-1, 'ROOT')

        with open(codesfname, 'r') as f:
            allcodes = json.loads(f.read())
            self.process(allcodes)

    def process(self, allcodes):
        for hierarchy in allcodes:
            self.add(hierarchy)

    def get_node(self, depth, code, descr):
        d = self.depth2nodes[depth]
        if code not in d:
            d[code] = Node(depth, code, descr)
        return d[code]

    def add(self, hierarchy):
        prev_node = self
        for depth, link in enumerate(hierarchy):
            if not link['code']:
                continue

            code = link['code']
            descr = 'descr' in link and link['descr'] or code
            node = self.get_node(depth, code, descr)
            node.parent = prev_node
            prev_node.add_child(node)
            prev_node = node

In [19]:
# Generate icd9_tree for classification
icd9_tree = ICD9('codes_new.json')
counter = Counter(map(str, icd9_tree.leaves))

In [36]:
icd9_tree.find('E997').parent.description

'INJURY RESULTING FROM OPERATIONS OF WAR '

In [10]:
icd9_tree.find('V73').parent.description

'PERSONS WITHOUT REPORTED DIAGNOSIS ENCOUNTERED DURING EXAMINATION AND INVESTIGATION OF INDIVIDUALS AND POPULATIONS '

In [11]:
icd9_tree.find('V73').parent.code

'V70-V82'

In [13]:
icd_grouped['ICD9_CODE']

0        01166
1        01170
2        01171
3        01172
4        01173
5        01174
6        01175
7        01176
8        01180
9        01181
10       01182
11       01183
12       01184
13       01185
14       01186
15       01190
16       01191
17       01192
18       01193
19       01194
20       01195
21       01196
22       01200
23       01201
24       01202
25       01203
26       01204
27       01205
28       01206
29       01210
         ...  
14537    E9949
14538    E9950
14539    E9951
14540    E9952
14541    E9953
14542    E9954
14543    E9958
14544    E9959
14545    E9960
14546    E9961
14547    E9962
14548    E9963
14549    E9968
14550    E9969
14551    E9970
14552    E9971
14553    E9972
14554    E9973
14555    E9978
14556    E9979
14557    E9980
14558    E9981
14559    V7388
14560    V7389
14561    V7398
14562    V7399
14563     V740
14564     V741
14565     V742
14566     V743
Name: ICD9_CODE, Length: 14567, dtype: object

In [22]:
# Generate 4 new columns representing the Level 1 & 2 Grouping of the codes
lv1_group = []
lv2_group = []
lv1_code = []
lv2_code = []
code_list = list(icd_grouped['ICD9_CODE'])
for i in range(len(icd_grouped)):
    this_code = code_list[i][0:3]
    try:
        this_lv2 = icd9_tree.find(this_code).parent.code
        this_lv1 = icd9_tree.find(this_lv2).parent.code
        lv1_code.append(this_lv1)
        lv2_code.append(this_lv2) 
        lv1_group.append(icd9_tree.find(this_lv1).description)  
        lv2_group.append(icd9_tree.find(this_lv2).description)
    except:
        lv1_group.append('Not Found')
        lv2_group.append('Not Found')
        lv1_code.append('0000')
        lv2_code.append('0000')    

In [24]:
icd_grouped['Lv1_code'] = lv1_code
icd_grouped['Lv2_code'] = lv2_code
icd_grouped['Lv1_group'] = lv1_group
icd_grouped['Lv2_group'] = lv2_group

In [37]:
icd9_tree.find('235').parent.description

AttributeError: 'NoneType' object has no attribute 'parent'

In [25]:
icd_grouped

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,Lv1_code,Lv2_code,Lv1_group,Lv2_group
0,174,01166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
1,175,01170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
2,176,01171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
3,177,01172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
4,178,01173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
5,179,01174,TB pneumothorax-cult dx,"Tuberculous pneumothorax, tubercle bacilli not...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
6,180,01175,TB pneumothorax-histo dx,"Tuberculous pneumothorax, tubercle bacilli not...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
7,181,01176,TB pneumothorax-oth test,"Tuberculous pneumothorax, tubercle bacilli not...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
8,182,01180,Pulmonary TB NEC-unspec,"Other specified pulmonary tuberculosis, unspec...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS
9,183,01181,Pulmonary TB NEC-no exam,"Other specified pulmonary tuberculosis, bacter...",001-139,010-018,INFECTIOUS AND PARASITIC DISEASES,TUBERCULOSIS


In [None]:
icd9_tree.find('235').parent.description

In [113]:
icd_grouped[icd_grouped['Lv1_code'] == '0000']

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,Lv1_code,Lv2_code,Lv1_group,Lv2_group
1714,1717,2355,Unc behav neo GI NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
1715,1718,2356,Unc behav neo larynx,Neoplasm of uncertain behavior of larynx,0000,0000,Not Found,Not Found
1716,1719,2357,Unc behav neo lung,"Neoplasm of uncertain behavior of trachea, bro...",0000,0000,Not Found,Not Found
1717,1720,2358,Unc behav neo pleura,"Neoplasm of uncertain behavior of pleura, thym...",0000,0000,Not Found,Not Found
1718,1721,2359,Unc behav neo resp NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
2208,2567,2350,Unc behav neo salivary,Neoplasm of uncertain behavior of major saliva...,0000,0000,Not Found,Not Found
2209,2568,2351,Unc behav neo oral/phar,"Neoplasm of uncertain behavior of lip, oral ca...",0000,0000,Not Found,Not Found
2210,2569,2352,Unc behav neo intestine,"Neoplasm of uncertain behavior of stomach, int...",0000,0000,Not Found,Not Found
2211,2570,2353,Unc behav neo liver,Neoplasm of uncertain behavior of liver and bi...,0000,0000,Not Found,Not Found
2212,2571,2354,Unc behav neo peritoneum,Neoplasm of uncertain behavior of retroperiton...,0000,0000,Not Found,Not Found


In [18]:
icd_grouped['Lv_2code']

KeyError: 'Lv_2code'

In [117]:
ungrouped = icd_grouped[icd_grouped['Lv2_code'] == '0000']

In [142]:
cd = icd9_tree.find('140-239').leaves
cd

[<__main__.Node at 0x1a1867c320>,
 <__main__.Node at 0x1a18678898>,
 <__main__.Node at 0x1a186929e8>,
 <__main__.Node at 0x1a18686f28>,
 <__main__.Node at 0x1a186a0a58>,
 <__main__.Node at 0x1a18689198>,
 <__main__.Node at 0x1a186a4898>,
 <__main__.Node at 0x1a18697320>,
 <__main__.Node at 0x1a1869b5c0>,
 <__main__.Node at 0x1a186a0b00>,
 <__main__.Node at 0x1a18692390>,
 <__main__.Node at 0x1a1867c390>,
 <__main__.Node at 0x1a18692c50>,
 <__main__.Node at 0x1a18692780>,
 <__main__.Node at 0x1a18692d68>,
 <__main__.Node at 0x1a186897f0>,
 <__main__.Node at 0x1a1868dfd0>,
 <__main__.Node at 0x1a186922e8>,
 <__main__.Node at 0x1a1868d2b0>,
 <__main__.Node at 0x1a18678160>,
 <__main__.Node at 0x1a1868dda0>,
 <__main__.Node at 0x1a1868d470>,
 <__main__.Node at 0x1a1869bd30>,
 <__main__.Node at 0x1a1869b240>,
 <__main__.Node at 0x1a18678278>,
 <__main__.Node at 0x1a18697f98>,
 <__main__.Node at 0x1a186976d8>,
 <__main__.Node at 0x1a18686160>,
 <__main__.Node at 0x1a186977f0>,
 <__main__.Nod

In [118]:
ungrouped

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,Lv1_code,Lv2_code,Lv1_group,Lv2_group
1714,1717,2355,Unc behav neo GI NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
1715,1718,2356,Unc behav neo larynx,Neoplasm of uncertain behavior of larynx,0000,0000,Not Found,Not Found
1716,1719,2357,Unc behav neo lung,"Neoplasm of uncertain behavior of trachea, bro...",0000,0000,Not Found,Not Found
1717,1720,2358,Unc behav neo pleura,"Neoplasm of uncertain behavior of pleura, thym...",0000,0000,Not Found,Not Found
1718,1721,2359,Unc behav neo resp NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
2208,2567,2350,Unc behav neo salivary,Neoplasm of uncertain behavior of major saliva...,0000,0000,Not Found,Not Found
2209,2568,2351,Unc behav neo oral/phar,"Neoplasm of uncertain behavior of lip, oral ca...",0000,0000,Not Found,Not Found
2210,2569,2352,Unc behav neo intestine,"Neoplasm of uncertain behavior of stomach, int...",0000,0000,Not Found,Not Found
2211,2570,2353,Unc behav neo liver,Neoplasm of uncertain behavior of liver and bi...,0000,0000,Not Found,Not Found
2212,2571,2354,Unc behav neo peritoneum,Neoplasm of uncertain behavior of retroperiton...,0000,0000,Not Found,Not Found


In [120]:
num_ungrouped = len(ungrouped)
num_ungrouped

1522

In [127]:
undefined = undefined.reset_index(drop = True)
undefined

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,Lv1_code,Lv2_code,Lv1_group,Lv2_group
0,1717,2355,Unc behav neo GI NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
1,1718,2356,Unc behav neo larynx,Neoplasm of uncertain behavior of larynx,0000,0000,Not Found,Not Found
2,1719,2357,Unc behav neo lung,"Neoplasm of uncertain behavior of trachea, bro...",0000,0000,Not Found,Not Found
3,1720,2358,Unc behav neo pleura,"Neoplasm of uncertain behavior of pleura, thym...",0000,0000,Not Found,Not Found
4,1721,2359,Unc behav neo resp NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
5,2567,2350,Unc behav neo salivary,Neoplasm of uncertain behavior of major saliva...,0000,0000,Not Found,Not Found
6,2568,2351,Unc behav neo oral/phar,"Neoplasm of uncertain behavior of lip, oral ca...",0000,0000,Not Found,Not Found
7,2569,2352,Unc behav neo intestine,"Neoplasm of uncertain behavior of stomach, int...",0000,0000,Not Found,Not Found
8,2570,2353,Unc behav neo liver,Neoplasm of uncertain behavior of liver and bi...,0000,0000,Not Found,Not Found
9,2571,2354,Unc behav neo peritoneum,Neoplasm of uncertain behavior of retroperiton...,0000,0000,Not Found,Not Found


In [138]:
un = undefined.copy()

In [140]:
un.iloc[0, 1]

'2355'

In [139]:
un

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,Lv1_code,Lv2_code,Lv1_group,Lv2_group
0,1717,2355,Unc behav neo GI NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
1,1718,2356,Unc behav neo larynx,Neoplasm of uncertain behavior of larynx,0000,0000,Not Found,Not Found
2,1719,2357,Unc behav neo lung,"Neoplasm of uncertain behavior of trachea, bro...",0000,0000,Not Found,Not Found
3,1720,2358,Unc behav neo pleura,"Neoplasm of uncertain behavior of pleura, thym...",0000,0000,Not Found,Not Found
4,1721,2359,Unc behav neo resp NEC,Neoplasm of uncertain behavior of other and un...,0000,0000,Not Found,Not Found
5,2567,2350,Unc behav neo salivary,Neoplasm of uncertain behavior of major saliva...,0000,0000,Not Found,Not Found
6,2568,2351,Unc behav neo oral/phar,"Neoplasm of uncertain behavior of lip, oral ca...",0000,0000,Not Found,Not Found
7,2569,2352,Unc behav neo intestine,"Neoplasm of uncertain behavior of stomach, int...",0000,0000,Not Found,Not Found
8,2570,2353,Unc behav neo liver,Neoplasm of uncertain behavior of liver and bi...,0000,0000,Not Found,Not Found
9,2571,2354,Unc behav neo peritoneum,Neoplasm of uncertain behavior of retroperiton...,0000,0000,Not Found,Not Found
