Steven Emrick - steve.emrick@nih.gov
usage: python crosswalk.py -k <your-api-key>
You can specify a specific UMLS version with the -v argument, but it is not required
This reads a file with codes from the Human Phenotype Ontology and maps them to the US Edition of SNOMED CT through UMLS CUIs

In [1]:
from __future__ import print_function
from Authentication import *
import requests
import json
import argparse
import collections
import sys
import os
import pandas as pd

# Cleaning the data

In [6]:
df = pd.read_csv("source_data.csv")

In [7]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,41071,51881,99672,78551,5856,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,41401,5849,4139,4148,V4582,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,41011,42821,5185,4271,9971,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,2848,42731,486,11284,5781,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,9961,25040,40391,5856,41400,85014,3927,3895,3995,9904,3895


In [10]:
df.dtypes

patient_id        object
age                int64
gender            object
race              object
admission_date    object
diagnosis1        object
diagnosis2        object
diagnosis3        object
diagnosis4        object
diagnosis5        object
hcpcs             object
procedure1        object
procedure2        object
procedure3        object
procedure4        object
procedure5        object
dtype: object

# Prepare the CUI table

In [108]:
df2= pd.DataFrame(columns = ['IDC9', 'CUI'])

In [110]:
df2.head()


Unnamed: 0,IDC9,CUI


In [115]:
columns = ["diagnosis1","diagnosis2","diagnosis3","diagnosis4","diagnosis5", "procedure1", "procedure2", "procedure3", "procedure4", "procedure5"]
for i in columns:
    print(pd.concat([df2['IDC9'], df[i]], ignore_index = True))

0       410.71
1       414.01
2       410.11
3        284.8
4        996.1
5        883.0
6       414.01
7        512.0
8       414.01
9        473.2
10      414.01
11       473.8
12      414.01
13      414.01
14      414.01
15       592.1
16       458.9
17       384.3
18      444.22
19      375.56
20      414.00
21       424.0
22       173.3
23      414.01
24      786.50
25        470.
26      370.00
27      716.90
28      374.30
29       873.8
         ...  
1970    403.91
1971    536.42
1972    414.01
1973    V72.81
1974     441.3
1975     523.1
1976     432.1
1977    414.01
1978     428.0
1979     707.1
1980     473.0
1981     473.9
1982    414.01
1983    786.50
1984     781.2
1985    414.01
1986     473.0
1987     428.0
1988    414.01
1989    812.01
1990     998.2
1991    414.01
1992    414.02
1993     473.2
1994    414.01
1995      470.
1996     425.4
1997    996.72
1998     413.9
1999    410.41
Length: 2000, dtype: object
0       410.71
1       414.01
2       410.11
3        284

In [114]:
len(df2['IDC9'])

1000

In [107]:
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
s3 = pd.Series(['e', 'f'])
pd.concat([s1, s2, s3])

0    a
1    b
0    c
1    d
0    e
1    f
dtype: object

## add a decimal for diagnosis colomes after the 3rd digit 

In [14]:
for i in ["diagnosis1","diagnosis2","diagnosis3","diagnosis4","diagnosis5"]:
    df[i] = df[i].str[:3] + '.' + df[i].str[3:]

In [15]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,3927,3895,3995,9904,3895


## add a decimal for procedure colomes after the 2rd digit if the procedure length greater than 2

In [17]:
for i in ["procedure1", "procedure2", "procedure3", "procedure4", "procedure5"]:
    df[i] = df[i].apply(lambda x: x[:2] + "." + x[2:] if len(x) > 2 else x)

In [18]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66.0,36.07,45.0,40.0,37.22
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66.0,36.07,37.22,88.53,45.0
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66.0,36.06,37.22,88.53,45.0
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,41.31,38.93,45.13,99.04,88.71
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,39.27,38.95,39.95,99.04,38.95


# API connection Code

In [98]:
if sys.version_info < (3, 0):
    reload(sys)
    sys.setdefaultencoding('utf-8')


apikey =  "b693c885-4a5f-4cb5-a58c-1c80f7d025ee"
version = "2019AB"
source = "ICD9CM"
serch_type = 'exact'
AuthClient = Authentication(apikey)

In [26]:
###################################
#get TGT for our session
###################################

In [51]:
tgt = AuthClient.gettgt()
base_uri = "https://uts-ws.nlm.nih.gov"
crosswalk_endpoint = "/rest/crosswalk/"+version+"/source/ICD9CM"
content_endpoint = "/rest/content/"+str(version)+"/source/"+str(source)

In [52]:
def crosswalk_code(path):
    query = {'ticket': AuthClient.getst(tgt),'targetSource': 'SNOMEDCT_US'}
    r = requests.get(base_uri + path, params=query)
    #print(r.url + "\n")
    items = json.loads(r.text)
    return items

In [53]:
def content_code(path, query = {'ticket': AuthClient.getst(tgt)}):
    #query = {'ticket': AuthClient.getst(tgt)}
    r = requests.get(base_uri + path, params=query)
    #print(r.url + "\n")
    items = json.loads(r.text)
    return items

In [103]:
pageNumber=0
for index, row in df.iterrows():
    code = row['diagnosis1']
    path =  content_endpoint+"/"+code
    try:
        results = content_code(path)
        print(code)
        string = results["result"]["name"]
        
        print(string)
        pageNumber += 1
        query_2 = {'searchType' :serch_type, 'string':string,'ticket':AuthClient.getst(tgt)}
        
        content_endpoint_2 = "/rest/search/"+version
        r = requests.get(uri+content_endpoint_2,params=query_2)
        r.encoding = 'utf-8'
        items  = json.loads(r.text)
        jsonData = items["result"]["results"][0]['ui']
       
        print(jsonData)
        
    
    except ValueError:
        print("No result found for "+code)
        pass

410.71
Subendocardial infarction, initial episode of care
C0155657
414.01
Coronary atherosclerosis of native coronary artery
C0837134
410.11
Acute myocardial infarction of other anterior wall, initial episode of care
C0155633
284.8
Other specified aplastic anemias
C0029745
996.1
Mechanical complication of other vascular device, implant, and graft
C0161763
883.0
Open wound of finger(s), without mention of complication
C0273365
414.01
Coronary atherosclerosis of native coronary artery
C0837134
512.0
Spontaneous tension pneumothorax
C0155907
414.01
Coronary atherosclerosis of native coronary artery
C0837134
473.2
Chronic ethmoidal sinusitis
C0008681
414.01
Coronary atherosclerosis of native coronary artery
C0837134
473.8
Other chronic sinusitis
C0395986
414.01
Coronary atherosclerosis of native coronary artery
C0837134
414.01
Coronary atherosclerosis of native coronary artery
C0837134
414.01
Coronary atherosclerosis of native coronary artery
C0837134
592.1
Calculus of ureter
C0041952
458.

KeyboardInterrupt: 

In [92]:
results

{'pageSize': 25,
 'pageNumber': 1,
 'pageCount': 1,
 'result': {'classType': 'SourceAtomCluster',
  'ui': '414.01',
  'suppressible': False,
  'obsolete': False,
  'rootSource': 'ICD9CM',
  'atomCount': 2,
  'cVMemberCount': 0,
  'attributes': 'NONE',
  'atoms': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/ICD9CM/414.01/atoms',
  'descendants': 'NONE',
  'ancestors': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/ICD9CM/414.01/ancestors',
  'parents': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/ICD9CM/414.01/parents',
  'children': 'NONE',
  'relations': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/ICD9CM/414.01/relations',
  'definitions': 'NONE',
  'concepts': 'https://uts-ws.nlm.nih.gov/rest/search/2019AB?string=414.01&sabs=ICD9CM&searchType=exact&inputType=sourceUi',
  'defaultPreferredAtom': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/ICD9CM/414.01/atoms/preferred',
  'subsetMemberships': [],
  'contentViewMemberships': [],
  'name

In [24]:
with open('dr_min_data.csv','r') as f:
     for line in f:
         ##get rid of newlines
         code = line.strip()
         path =  crosswalk_endpoint+"/"+code
         try:
             results = crosswalk_code(path)
             for sourceAtomCluster in results["result"]:
                 print('ICD9CM Code - ' + code+ '\t' + 'SNOMEDCT concept -- ' + sourceAtomCluster["ui"] + ': ' + sourceAtomCluster["name"])

         except ValueError:
             print("No result found for "+code)
             pass

f.close()

NameError: name 'crosswalk_endpoint' is not defined

In [7]:
sourceAtomCluster


{'classType': 'SourceAtomCluster',
 'ui': '126660000',
 'suppressible': False,
 'obsolete': False,
 'rootSource': 'SNOMEDCT_US',
 'atomCount': 2,
 'cVMemberCount': 0,
 'attributes': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/attributes',
 'atoms': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/atoms',
 'descendants': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/descendants',
 'ancestors': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/ancestors',
 'parents': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/parents',
 'children': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/children',
 'relations': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/relations',
 'definitions': 'NONE',
 'concepts': 'https://uts-ws.nlm.nih.gov/rest/search/2019AB?string=126660000&sabs=SNOMEDCT_US&se

In [8]:

path =  crosswalk_endpoint+"/"+"470"
results = crosswalk_code(path)

In [9]:
results

{'pageSize': 25,
 'pageNumber': 1,
 'pageCount': 1,
 'result': [{'classType': 'SourceAtomCluster',
   'ui': '126660000',
   'suppressible': False,
   'obsolete': False,
   'rootSource': 'SNOMEDCT_US',
   'atomCount': 2,
   'cVMemberCount': 0,
   'attributes': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/attributes',
   'atoms': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/atoms',
   'descendants': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/descendants',
   'ancestors': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/ancestors',
   'parents': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/parents',
   'children': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/children',
   'relations': 'https://uts-ws.nlm.nih.gov/rest/content/2019AB/source/SNOMEDCT_US/126660000/relations',
   'definitions': 'NONE',
   '

In [46]:
from __future__ import print_function
from Authentication import *
import requests
import json
import argparse

data_source =[]


#source = "ICD9CM"
#username = args.username
#password = args.password
apikey = "b693c885-4a5f-4cb5-a58c-1c80f7d025ee"
version = "2019AB"
string = "458.9"
source = "ICD9CM"
uri = "https://uts-ws.nlm.nih.gov"
content_endpoint = "/rest/search/"+version
##get at ticket granting ticket for the session
AuthClient = Authentication(apikey)
tgt = AuthClient.gettgt()
pageNumber=0

while True:
    ##generate a new service ticket for each page if needed
    ticket = AuthClient.getst(tgt)
    pageNumber += 1
    query = {'string':string,'ticket':ticket, 'pageNumber':pageNumber}
    #query['includeObsolete'] = 'true'
    #query['includeSuppressible'] = 'true'
    #query['returnIdType'] = "sourceConcept"
    query['sabs'] = "ICD9CM"
    r = requests.get(uri+content_endpoint,params=query)
    r.encoding = 'utf-8'
    items  = json.loads(r.text)
    jsonData = items["result"]
    #print (json.dumps(items, indent = 4))

    print("Results for page " + str(pageNumber)+"\n")
    
    for result in jsonData["results"]:
      #data_source.append(result["rootSource"])
      print(result)
      try:
        print("ui: " + result["ui"])
      except:
        NameError
      try:
        print("uri: " + result["uri"])
      except:
        NameError
      try:
        print("name: " + result["name"])
      except:
        NameError
      try:
        print("Source Vocabulary: " + result["rootSource"])
      except:
        NameError
      
      print("\n")
        
    
    ##Either our search returned nothing, or we're at the end
    if jsonData["results"][0]["ui"] == "NONE":
        break
    print("*********")

Results for page 1

{'ui': 'NONE', 'name': 'NO RESULTS'}
ui: NONE
name: NO RESULTS




In [41]:
set(data_source)

{'HCPCS',
 'LNC',
 'MEDCIN',
 'MMSL',
 'MMX',
 'MSH',
 'MTH',
 'MTHSPL',
 'NCBI',
 'NCI',
 'NDDF',
 'OMIM',
 'RCD',
 'RXNORM',
 'SNOMEDCT_US',
 'VANDF'}