Steven Emrick - steve.emrick@nih.gov
usage: python crosswalk.py -k <your-api-key>
You can specify a specific UMLS version with the -v argument, but it is not required
This reads a file with codes from the Human Phenotype Ontology and maps them to the US Edition of SNOMED CT through UMLS CUIs

In [2]:
from __future__ import print_function
from Authentication import *
import requests
import json
import argparse
import collections
import sys
import os
import pandas as pd
import numpy as np
import timeit

# Cleaning the data

In [3]:
# Read the source data.
# The identifier and code columns are read as strings: the cleaning cells
# slice every code by character position, so an all-digit column must not be
# inferred as int64 (which would also strip leading zeros from the codes).
_code_columns = ['patient_id',
                 'diagnosis1', 'diagnosis2', 'diagnosis3', 'diagnosis4', 'diagnosis5',
                 'hcpcs',
                 'procedure1', 'procedure2', 'procedure3', 'procedure4', 'procedure5']
df = pd.read_csv("source_data.csv", dtype={name: str for name in _code_columns})

In [4]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,41071,51881,99672,78551,5856,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,41401,5849,4139,4148,V4582,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,41011,42821,5185,4271,9971,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,2848,42731,486,11284,5781,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,9961,25040,40391,5856,41400,85014,3927,3895,3995,9904,3895


In [5]:
df.dtypes

patient_id        object
age                int64
gender            object
race              object
admission_date    object
diagnosis1        object
diagnosis2        object
diagnosis3        object
diagnosis4        object
diagnosis5        object
hcpcs             object
procedure1        object
procedure2        object
procedure3        object
procedure4        object
procedure5        object
dtype: object

## Add a decimal point to the diagnosis columns after the 3rd character (after the 4th character for codes that start with a letter other than V)

In [6]:
def _format_icd9_diagnosis(code):
    """Insert the decimal point into an undotted ICD-9-CM diagnosis code.

    * Codes starting with ``V`` or with a digit get the point after the 3rd
      character (``V4582`` -> ``V45.82``, ``41071`` -> ``410.71``).
    * Codes starting with any other letter get it after the 4th character
      (``E8490`` -> ``E849.0``).
    * Codes too short to have anything after the point are returned unchanged.
    * Missing values (NaN/None) and empty strings are returned unchanged
      instead of raising on ``code[0]``.
    """
    if not isinstance(code, str) or not code:
        return code
    first = code[0]
    # Only non-V letter prefixes carry four characters before the point.
    split_at = 4 if (first.isalpha() and first != 'V') else 3
    if len(code) <= split_at:
        return code
    return code[:split_at] + '.' + code[split_at:]


columns = ['diagnosis1','diagnosis2','diagnosis3','diagnosis4','diagnosis5']
for i in columns:
    df[i] = df[i].apply(_format_icd9_diagnosis)

In [6]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,3927,3895,3995,9904,3895


## Add a decimal point to the procedure columns after the 2nd digit when the code is longer than 2 characters

In [7]:
def _format_icd9_procedure(code):
    """Insert the decimal point into an undotted ICD-9-CM procedure code.

    The point goes after the 2nd character (``3607`` -> ``36.07``). Codes of
    two characters or fewer are returned unchanged, and so are missing values
    (NaN/None), which would otherwise raise on ``len(code)``.
    """
    if not isinstance(code, str) or len(code) <= 2:
        return code
    return code[:2] + "." + code[2:]


columns = ["procedure1", "procedure2", "procedure3", "procedure4", "procedure5"]
for i in columns:
    df[i] = df[i].apply(_format_icd9_procedure)

In [8]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66.0,36.07,45.0,40.0,37.22
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66.0,36.07,37.22,88.53,45.0
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66.0,36.06,37.22,88.53,45.0
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,41.31,38.93,45.13,99.04,88.71
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,39.27,38.95,39.95,99.04,38.95


# creating ICD9 code dataframe

In [9]:
# Collect the codes of every diagnosis and procedure column:
# ICD9_Code holds one list per column, in the order given below.
columns = ['diagnosis1','diagnosis2','diagnosis3','diagnosis4','diagnosis5',
           'procedure1','procedure2','procedure3','procedure4','procedure5']
ICD9_Code = [df[column_name].tolist() for column_name in columns]

In [10]:
# Flatten the per-column lists into one flat list of codes (column order kept).
Code = []
for column_codes in ICD9_Code:
    Code.extend(column_codes)

In [11]:
# Keep each distinct code once. np.unique returns the unique values sorted;
# the codes are strings here, so the order is lexicographic
# (e.g. '110.1' comes right after '11.73').
x = np.array(Code) 
Code = np.unique(x) 

In [12]:
# Lookup table with one row per distinct code, held in the 'ICD9_Code' column.
ICD9_CUI_mapping = pd.DataFrame(Code, columns = ['ICD9_Code'])

In [13]:
ICD9_CUI_mapping['ICD9_Code']

0        10.42
1        10.49
2         10.5
3         10.9
4        10.91
5        11.00
6        11.39
7        11.49
8        11.52
9        11.63
10       11.64
11       11.73
12       110.1
13       112.0
14       112.5
15      112.84
16       114.6
17      115.90
18       117.5
19       12.14
20       12.32
21       12.33
22        12.4
23       12.41
24       12.42
25       12.64
26       12.69
27       12.89
28       12.91
29       12.92
         ...  
1762    V46.11
1763    V49.72
1764    V49.73
1765    V49.76
1766    V53.31
1767    V53.32
1768     V54.8
1769     V55.0
1770     V57.1
1771     V58.0
1772     V58.1
1773     V58.3
1774    V58.49
1775    V58.61
1776    V58.69
1777    V58.73
1778    V58.81
1779    V62.84
1780     V65.2
1781    V67.09
1782     V70.7
1783     V72.6
1784    V72.81
1785    V72.82
1786    V72.83
1787    V72.84
1788    V76.41
1789    V76.44
1790    V76.49
1791     V81.5
Name: ICD9_Code, Length: 1792, dtype: object

# API connection Code

In [18]:
# UMLS REST API settings.
# The API key is a personal credential, so it is read from the UMLS_API_KEY
# environment variable rather than being stored in the notebook source.
apikey = os.environ.get("UMLS_API_KEY", "")
if not apikey:
    print("UMLS_API_KEY is not set; requests to the UMLS API will not authenticate.",
          file=sys.stderr)
version = "2019AB"              # UMLS release used in the content path
source = "ICD9CM"               # source vocabulary of the codes being looked up
serch_type = "exact"            # (sic) spelling kept, other cells use this name
base_uri = "https://uts-ws.nlm.nih.gov"

In [19]:
# Connect to the server (authentication part).
# Python 2 only: switch the interpreter's default string encoding to UTF-8.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Build the authentication client from the API key and fetch the `tgt` value
# that is passed to AuthClient.getst() whenever a request needs a ticket.
AuthClient = Authentication(apikey)
tgt = AuthClient.gettgt()

In [20]:
def request_code(path, query):
    """Send a GET request to ``base_uri + path`` and return the decoded JSON body.

    Args:
        path: URL path appended to ``base_uri``.
        query: dict of query-string parameters passed to ``requests.get``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        ValueError: if the response body is not valid JSON.
    """
    response = requests.get(base_uri + path, params=query)
    # Decode the body as UTF-8 regardless of what the response headers declare.
    response.encoding = 'utf-8'
    return json.loads(response.text)

There are 2 ways to get the CUIs:
as a list:
- result: has the list of CUIs
- com


In [23]:
# Look up every ICD-9-CM code in two steps:
#   1. fetch the code's name from /rest/content/<version>/source/<source>/<code>
#   2. search for that name (searchType = serch_type) and take the 'ui' of the
#      first hit as the CUI
# `result` receives exactly one entry per row: the CUI, or "No result" when the
# lookup failed. CUI and name are also written into ICD9_CUI_mapping right
# away, so an interrupted run keeps everything resolved so far.
start = timeit.default_timer()
result = []
cui_path = "/rest/search/current"

for index, row in ICD9_CUI_mapping.iterrows():
    code = row['ICD9_Code']
    name_path =  "/rest/content/"+str(version)+"/source/"+str(source)+"/"+code

    print (index)
    try:
        # request data based on the ICD9CM code; a new service ticket is
        # obtained for each request
        name_query = {'ticket': AuthClient.getst(tgt)}
        results = request_code(name_path, name_query)
        print(code)
        # name of the code
        string = results["result"]["name"]
        print(string)

        # request the CUI based on the name
        cui_query = {'string':string,'searchType':serch_type,
                     'ticket':AuthClient.getst(tgt)}
        results = request_code(cui_path, cui_query)
        jsonData = results["result"]["results"][0]['ui']
        print(jsonData)
    except (ValueError, KeyError, IndexError):
        # ValueError: the response was not JSON.
        # KeyError / IndexError: the JSON lacked the expected fields or hits.
        # Other errors (e.g. network failures) still propagate.
        result.append("No result")
    else:
        result.append(jsonData)
        # DataFrame.set_value was removed in pandas 1.0; .loc creates the
        # 'CUI' / 'name' columns on first use just like set_value did.
        ICD9_CUI_mapping.loc[index, 'CUI'] = jsonData
        ICD9_CUI_mapping.loc[index, 'name'] = string
    print("-------------------------------------------------------------------")

stop = timeit.default_timer()
print('Time: ', stop - start)

0
10.42
Reconstruction of conjunctival cul-de-sac with free graft
-------------------------------------------------------------------




1
10.49
Other conjunctivoplasty
-------------------------------------------------------------------
2
10.5
Lysis of adhesions of conjunctiva and eyelid
-------------------------------------------------------------------
3
10.9
Other operations on conjunctiva
-------------------------------------------------------------------
4
10.91
Subconjunctival injection
-------------------------------------------------------------------
5
-------------------------------------------------------------------
6
11.39
Other excision of pterygium
-------------------------------------------------------------------
7
11.49
Other removal or destruction of corneal lesion
-------------------------------------------------------------------
8
11.52
Repair of postoperative wound dehiscence of cornea
-------------------------------------------------------------------
9


KeyboardInterrupt: 

In [24]:
ICD9_CUI_mapping

Unnamed: 0,ICD9_Code,CUI,name
0,10.42,C0176199,Reconstruction of conjunctival cul-de-sac with...
1,10.49,C0176202,Other conjunctivoplasty
2,10.5,C0176203,Lysis of adhesions of conjunctiva and eyelid
3,10.9,C0175568,Other operations on conjunctiva
4,10.91,C0197180,Subconjunctival injection
5,11.00,,
6,11.39,C0176209,Other excision of pterygium
7,11.49,C0176212,Other removal or destruction of corneal lesion
8,11.52,C0197453,Repair of postoperative wound dehiscence of co...
9,11.63,,


Add column to the ICD9_CUI_mapping dataframe

In [None]:
# Overwrite the 'CUI' column with the `result` list.
# NOTE(review): pandas needs len(result) to equal the number of rows, so this
# presumably only works after the lookup loop has run to completion - confirm.
ICD9_CUI_mapping['CUI'] = result

Save dataframe into csv file

In [None]:
# Write the mapping table to a CSV file.
# NOTE(review): the target is a hard-coded, user-specific Windows path, and
# because the literal is a raw string every "\\" stays as two backslashes in
# the path - confirm this is intended.
ICD9_CUI_mapping.to_csv(r'C:\\Users\\naren\\OneDrive\\Desktop\\Result.csv')