In [1]:
# Load data from crawler
import pandas as pd
# Index the crawled table by its 'code' column and tag every row with its origin.
codes = (
    pd.read_json('codes.json')
    .set_index('code')
    .assign(source='icdcode.info')
)

In [2]:
# Read the DEIS spreadsheet; the statements below split it into 3- and 4-character codes
deis_codes = pd.read_excel(io='cie10.xls')
# Fill COD_3 rows
# Most recent non-null COD_3 value seen by extend_cod3 (module-level state shared across calls)
cache = ''
def extend_cod3(row):
    """Fill a missing COD_3 in `row` with the last non-null COD_3 seen.

    Meant to be called on rows in order: a row that carries its own COD_3
    updates the remembered value, a row whose COD_3 is null receives the
    remembered value (the empty string if none has been seen yet).

    Returns the same `row` object that was passed in.
    """
    global cache
    if not pd.isnull(row['COD_3']):
        # Row has its own category code: remember it for the rows that follow
        cache = row['COD_3']
        return row
    row['COD_3'] = cache
    return row
# Three-character categories: drop the 4-character columns, then keep only the
# rows where both remaining columns (code and description) are non-null.
# NOTE: 'DESRIPCION' is the spelling this notebook uses for the column name;
# presumably it mirrors the spreadsheet header -- do not "correct" it here.
deis_codes_3 = (
    deis_codes
    .drop(columns=['COD_4', 'DESCRIPCION CODIGOS DE CUATRO CARACTERES'])
    .dropna()
    .rename(columns={'COD_3': 'code',
                     'DESRIPCION CATEGORIAS DE TRES CARACTERES': 'description'})
)
deis_codes_3['description'] = deis_codes_3['description'].str.capitalize()

# Four-character codes: propagate COD_3 down to every row (it becomes the
# parent code), then drop the 3-character description column.
deis_codes_4 = (
    deis_codes
    .apply(extend_cod3, axis=1)
    .drop(columns='DESRIPCION CATEGORIAS DE TRES CARACTERES')
    .rename(columns={'COD_3': 'parent_code',
                     'COD_4': 'code',
                     'DESCRIPCION CODIGOS DE CUATRO CARACTERES': 'description'})
)
deis_codes_4['description'] = deis_codes_4['description'].str.capitalize()

In [7]:
# Add codes from deis which don't exist in crawled db
# and report ignored ones
import sys
from tqdm.auto import tqdm
def remove_last_char_crap(string):
    """Strip a single trailing marker ('†' or '*') from a code string.

    Only the last character is examined, and at most one character is
    removed; markers elsewhere in the string are left untouched.

    Args:
        string: the code to clean. An empty string is returned unchanged.

    Returns:
        `string` without its final character when that character is '†' or
        '*', otherwise `string` itself.
    """
    crap = ['†', '*']
    # Slice instead of index: string[-1:] is '' for an empty string, whereas
    # string[-1] raised IndexError.
    if string[-1:] in crap:
        return string[:-1]
    return string
    
# Merge result; stays an alias of `codes` unless DEIS contributes new rows below.
codesm = codes

# 3-character categories missing from the crawled table are only reported,
# never added.
print('Checking 3d codes:')
for index, row in deis_codes_3.iterrows():
    if remove_last_char_crap(row.code) not in codes.index:
        print('Ignored: missing {}: {}'.format(row.code, row.description))

# 4-character codes missing from the crawled table are added when their parent
# is known. New rows are collected and concatenated once at the end: growing
# the frame row by row is quadratic, and DataFrame.append no longer exists in
# pandas >= 2.0.
print('Checking 4d codes:')
missing4c = 0
new_codes = []  # index labels (cleaned codes) of the rows to add
new_rows = []   # one dict of column values per row to add
for index, row in tqdm(deis_codes_4.iterrows(), unit='codes', total=len(deis_codes_4)):
    code = remove_last_char_crap(row.code)
    if code in codes.index:
        continue
    missing4c += 1
    # NOTE(review): the parent code is looked up verbatim (trailing markers
    # are not stripped), so a parent such as 'G73*' never matches.
    if row.parent_code not in codes.index:
        print('Ignored: missing parent of {}: {} => {}'.format(row.code,
                                                          row.description,
                                                          row.parent_code))
        continue

    parent = codes.loc[row.parent_code]
    level = parent['level'] + 1
    new_row = {'level': level}
    # Grandfather and beyond: inherit the ancestor chain from the parent row
    for p_level in range(0, level - 1):
        tmp_key = 'code_{}'.format(p_level)
        new_row[tmp_key] = parent[tmp_key]
    # Parent
    new_row['code_{}'.format(level - 1)] = row.parent_code
    new_row['description'] = row.description
    new_row['source'] = 'deis.cl'

    new_codes.append(code)
    new_rows.append(new_row)

if new_rows:
    # Keep the index name of `codes` so that reset_index() still yields the
    # same column name after the merge.
    additions = pd.DataFrame(new_rows,
                             index=pd.Index(new_codes, name=codes.index.name))
    codesm = pd.concat([codes, additions], sort=False)

print('Missing {} 4d codes'.format(missing4c))

Checking 3d codes:
Ignored: missing G07*A2492: Absceso y granuloma intracraneal e intrarraquideo en enfermedades clasificadas en otra parte
Ignored: missing G32*A2561: Otros trastornos degenerativos del sistema nervioso en enfermedades clasificados en otra parte
Ignored: missing G73*: Trastornos del musculo y de la union neuromuscular en enfermedades clasificadas en otra parte
Checking 4d codes:


HBox(children=(IntProgress(value=0, max=12423), HTML(value='')))

Ignored: missing parent of G730*: Sindromes miastenicos en enfermedades endocrinas => G73*
Ignored: missing parent of G731*: Sindrome de eaton-lambert (c80†) => G73*
Ignored: missing parent of G732*: Otros sindromes miastenicos en enfermedad neoplasica (c00-d48†) => G73*
Ignored: missing parent of G733*: Sindromes miastenicos en otras enfermedades clasificadas en otra parte => G73*
Ignored: missing parent of G734*: Miopatia en enfermedades infecciosas y parasitarias clasificadas en otra parte => G73*
Ignored: missing parent of G735*: Miopatia en enfermedades endocrinas => G73*
Ignored: missing parent of G736*: Miopatia en enfermedades metabolicas => G73*
Ignored: missing parent of G737*: Miopatia en otras enfermedades clasificadas en otra parte => G73*
Ignored: missing parent of O93: Muerte materna de causa basica especificada en otro capitulo relacionada con el embarazo => O93
Missing 3239 4d codes


In [10]:
# Sanity-check the merge: number of rows contributed by each source
codesm['source'].value_counts()

icdcode.info    11268
deis.cl          3230
Name: source, dtype: int64

In [12]:
# Write the merged table to CSV, with the index turned back into a regular column
codesm.reset_index().to_csv(path_or_buf='cie-10.csv', index=False)