In [1]:
import pandas as pd
import numpy as np
from omop_etl.io import to_csv
from omop_etl.datastore import DataStore
from omop_etl.utils import timeitc, find, search

In [2]:
# Location of the YAML config handed to DataStore (network share path).
config_path = '//share.ahc.ufl.edu/share$/DSS/IDR_Projects/Cancer_Center/omop_cc_etl/config.yml'
ds = DataStore(config_path)

In [3]:
# Count distinct patients per (decimal diagnosis code, ICD type) in stage.CONDITION
# and keep only the combinations seen in fewer than 11 patients.
low_count_icd_sql = '''
        with icd_counts as (
        select diag_cd_decml as diag_cd
              ,icd_type
              ,count(distinct patient_key) rc
        from stage.CONDITION
        group by diag_cd_decml, icd_type
        )
        select * from icd_counts
        where rc < 11
        --and diag_cd like 'T21%'
    '''

with ds.engine.connect() as con:
    icd = pd.read_sql(low_count_icd_sql, con)

In [4]:
# Dimensions of the low-count code table loaded above, as (rows, columns).
icd.shape

(24424, 3)

In [5]:
# icd['agg1'] = icd.diag_cd.apply(lambda s: s[:-1] if search('\.', s) else s)
# icd['agg1'] = icd.agg1.apply(lambda s: s.replace('.','') if search('\.$', s) else s)

# agg1 = icd.groupby(['agg1','icd_type']).sum().reset_index()
# agg1 = agg1[(agg1.rc < 11) & find('\.', agg1.agg1)].copy()

# agg1['agg2'] = agg1.agg1.apply(lambda s: s[:-1] if search('\.', s) else s)
# agg1['agg2'] = agg1.agg2.apply(lambda s: s.replace('.','') if search('\.$', s) else s)

# icd['agg2'] = icd.merge(agg1, on='agg1', how='left')['agg2']

# icd

# agg2 = agg1.groupby(['agg2','icd_type']).sum().reset_index()
# agg2 = agg2[(agg2.rc < 11) & find('\.', agg2.agg2)].copy()

# agg2['agg3'] = agg2.agg2.apply(lambda s: s[:-1] if search('\.', s) else s)
# agg2['agg3'] = agg2.agg3.apply(lambda s: s.replace('.','') if search('\.$', s) else s)

In [6]:
def aggregate_icd(df, columns):
    '''
    Roll every diagnosis code up one level and total the counts per rolled-up code.

    Requires one column with diagnosis codes and one column with patient counts per ICD code.

    A code that contains a decimal point loses its last character; if that leaves a
    trailing '.', the point is removed as well (e.g. '003.29' -> '003.2', '003.1' -> '003').
    Codes without a decimal point are already at the top level and are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the two columns named in `columns`. It is not modified.
    columns : list of str
        Exactly two column names, in the order [code column, count column].

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns
        `prev_icd` (input code), `rc_prev` (input count), `icd` (rolled-up code) and
        `rc` (sum of the input counts over all rows sharing the same rolled-up code).
    '''
    prev_cd, count_col, new_cd = 'prev_icd', 'rc', 'icd'

    df = df[columns].copy()
    df.columns = [prev_cd, count_col]

    def roll_up(code):
        # Non-string values (e.g. a NULL code from the database) are passed through untouched.
        if not isinstance(code, str) or '.' not in code:
            return code
        shorter = code[:-1]
        # Dropping the only decimal digit leaves a dangling point: strip it.
        return shorter.replace('.', '') if shorter.endswith('.') else shorter

    df[new_cd] = df[prev_cd].apply(roll_up)

    # Sum only the count column. A bare .sum() would also "sum" the string column
    # prev_icd (string concatenation in pandas >= 2.0) and corrupt the merged result.
    agg_icd = df.groupby(new_cd, as_index=False)[count_col].sum()

    return df.merge(agg_icd, on=new_cd, how='left', suffixes=('_prev', ''))

In [7]:
# Smallest aggregated patient count a code may have and still be kept as-is.
MIN_COUNT = 11


def remap_codes(codes, mapping):
    '''Return `codes` with every value that is a key of `mapping` swapped for its target.'''
    if not mapping:
        return codes
    # Plain dict lookup: each value is translated at most once (no chained replacement).
    return codes.map(lambda cd: mapping.get(cd, cd))


def codes_below_min(frame):
    '''Return (totals per new_cd, list of new_cd values whose summed rc is below MIN_COUNT).'''
    # Only rc is summed; the remaining columns of `frame` are strings.
    totals = frame.groupby('new_cd', as_index=False)['rc'].sum()
    return totals, totals.loc[totals.rc < MIN_COUNT, 'new_cd'].to_list()


# NOTE(review): codes are matched on their text alone, so an ICD9 and an ICD10 code that
# share the same string are pooled together -- confirm that this is intended.

# Start from the original code and move every code one level up.
icd['new_cd'] = icd.diag_cd
agg = aggregate_icd(icd, columns=['new_cd', 'rc'])
icd['new_cd'] = remap_codes(icd['new_cd'], dict(zip(agg.prev_icd, agg.icd)))
to_group, icd_list = codes_below_min(icd)

# Move all codes one level up until aggregated counts are >= MIN_COUNT
while not agg.empty:
    agg = aggregate_icd(icd[icd.new_cd.isin(icd_list)], columns=['new_cd', 'rc'])

    # If some parents reach the threshold, move only the codes below those parents;
    # if none does, move every remaining small code up one level.
    reached = agg[agg.rc >= MIN_COUNT]
    moved = agg if reached.empty else reached
    icd['new_cd'] = remap_codes(icd['new_cd'], dict(zip(moved.prev_icd, moved.icd)))

    to_group, icd_list = codes_below_min(icd)

    # break loop if all codes reached max level or if icd_list is empty.
    if not any('.' in cd for cd in icd_list):
        break

# Codes that are still below the threshold at the top level are collapsed into '000'.
icd.loc[icd.new_cd.isin(icd_list), 'new_cd'] = '000'

In [9]:
# Write the diag_cd / icd_type / new_cd lookup (patient counts removed) to
# xref.deid_diag_cd, replacing the table if it already exists.
with ds.engine.connect() as con:
    # drop(columns=...) instead of the positional axis argument, which pandas >= 2.0 rejects.
    icd.drop(columns='rc').to_sql('deid_diag_cd', con, schema='xref', index=False, if_exists='replace')

In [8]:
# g1.groupby('new_cd').sum().head(50)
# g1[g1.new_cd == 'S82.0']
# lev1

In [9]:
# Code prefix used by the spot-check cells that follow.
# S60.945  (NOTE(review): presumably another code that was inspected earlier -- confirm)
code = '003'

In [10]:
# Total patient count per rolled-up code for the original codes that start with `code`.
# Only rc is summed; a bare .sum() would also try to add up the string columns.
# NOTE(review): assumes find() returns a boolean row mask of regex matches -- confirm.
icd[find('^' + code, icd.diag_cd)].groupby('new_cd')[['rc']].sum()

Unnamed: 0_level_0,rc
new_cd,Unnamed: 1_level_1
3,13


In [11]:
# icd.lev1.unique()
# Rows whose original diag_cd matches the `code` prefix, ordered by rolled-up code.
prefix_rows = icd[find('^'+code, icd.diag_cd)]
prefix_rows.sort_values(by='new_cd')

Unnamed: 0,diag_cd,icd_type,rc,new_cd
761,3.29,ICD9,1,3
4903,3.24,ICD9,1,3
5609,3.2,ICD9,1,3
11099,3.1,ICD9,7,3
12408,3.8,ICD9,1,3
20196,3.23,ICD9,2,3


In [12]:
# Rows currently assigned the rolled-up code `code` (append .rc.sum() for their total count).
assigned_to_code = icd['new_cd'] == code
icd.loc[assigned_to_code]

Unnamed: 0,diag_cd,icd_type,rc,new_cd
761,3.29,ICD9,1,3
4903,3.24,ICD9,1,3
5609,3.2,ICD9,1,3
11099,3.1,ICD9,7,3
12408,3.8,ICD9,1,3
20196,3.23,ICD9,2,3


In [13]:
# Preview a single roll-up step for the rows assigned to `code`, starting from their original codes.
rows_for_code = icd.loc[icd['new_cd'] == code]
next_level = aggregate_icd(rows_for_code, columns=['diag_cd', 'rc'])
next_level.sort_values(by='icd')

Unnamed: 0,prev_icd,rc_prev,icd,rc
3,3.1,7,3.0,8
4,3.8,1,3.0,8
0,3.29,1,3.2,5
1,3.24,1,3.2,5
2,3.2,1,3.2,5
5,3.23,2,3.2,5


In [8]:
# Original codes that ended up under the rolled-up code 'S25.5'.
icd.loc[icd['new_cd'].eq('S25.5')]

Unnamed: 0,diag_cd,icd_type,rc,new_cd
357,S25.509A,ICD10,3,S25.5
794,S25.592A,ICD10,3,S25.5
4212,S25.512A,ICD10,2,S25.5
8735,S25.502A,ICD10,1,S25.5
14286,S25.599A,ICD10,1,S25.5
16695,S25.591A,ICD10,1,S25.5
24078,S25.501A,ICD10,1,S25.5
