Steven Emrick - steve.emrick@nih.gov
usage: python crosswalk.py -k <your-api-key>
You can specify a specific UMLS version with the -v argument, but it is not required
This reads a file with codes from the Human Phenotype Ontology and maps them to the US Edition of SNOMED CT through UMLS CUIs

In [1]:
from __future__ import print_function
from Authentication import *
import requests
import json
import argparse
import collections
import sys
import os
import pandas as pd

# Cleaning the data

In [15]:
df = pd.read_csv("source_data.csv")

In [16]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,41071,51881,99672,78551,5856,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,41401,5849,4139,4148,V4582,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,41011,42821,5185,4271,9971,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,2848,42731,486,11284,5781,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,9961,25040,40391,5856,41400,85014,3927,3895,3995,9904,3895


In [17]:
df.dtypes

patient_id        object
age                int64
gender            object
race              object
admission_date    object
diagnosis1        object
diagnosis2        object
diagnosis3        object
diagnosis4        object
diagnosis5        object
hcpcs             object
procedure1        object
procedure2        object
procedure3        object
procedure4        object
procedure5        object
dtype: object

# Prepare the CUI table

In [18]:
df2= pd.DataFrame(columns = ['IDC9', 'CUI'])

In [19]:
df2.head()


Unnamed: 0,IDC9,CUI


In [20]:
columns = ["diagnosis1","diagnosis2","diagnosis3","diagnosis4","diagnosis5", "procedure1", "procedure2", "procedure3", "procedure4", "procedure5"]
for i in columns:
    print(pd.concat([df2['IDC9'], df[i]], ignore_index = True))

0      41071
1      41401
2      41011
3       2848
4       9961
5       8830
6      41401
7       5120
8      41401
9       4732
10     41401
11      4738
12     41401
13     41401
14     41401
15      5921
16      4589
17      3843
18     44422
19     37556
20     41400
21      4240
22      1733
23     41401
24     78650
25       470
26     37000
27     71690
28     37430
29      8738
       ...  
970    40391
971    53642
972    41401
973    V7281
974     4413
975     5231
976     4321
977    41401
978     4280
979     7071
980     4730
981     4739
982    41401
983    78650
984     7812
985    41401
986     4730
987     4280
988    41401
989    81201
990     9982
991    41401
992    41402
993     4732
994    41401
995      470
996     4254
997    99672
998     4139
999    41041
Length: 1000, dtype: object
0      51881
1       5849
2      42821
3      42731
4      25040
5      V1051
6       4139
7       4928
8       4401
9       4730
10      4111
11       496
12      4292
13      40

In [21]:
len(df2['IDC9'])

0

In [22]:
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
s3 = pd.Series(['e', 'f'])
pd.concat([s1, s2, s3])

0    a
1    b
0    c
1    d
0    e
1    f
dtype: object

## add a decimal for diagnosis colomes after the 3rd digit 

In [23]:
for i in ["diagnosis1","diagnosis2","diagnosis3","diagnosis4","diagnosis5"]:
    df[i] = df[i].str[:3] + '.' + df[i].str[3:]

In [24]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66,3607,45,40,3722
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66,3607,3722,8853,45
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66,3606,3722,8853,45
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,4131,3893,4513,9904,8871
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,3927,3895,3995,9904,3895


## add a decimal for procedure colomes after the 2rd digit if the procedure length greater than 2

In [25]:
for i in ["procedure1", "procedure2", "procedure3", "procedure4", "procedure5"]:
    df[i] = df[i].apply(lambda x: x[:2] + "." + x[2:] if len(x) > 2 else x)

In [26]:
df.head()

Unnamed: 0,patient_id,age,gender,race,admission_date,diagnosis1,diagnosis2,diagnosis3,diagnosis4,diagnosis5,hcpcs,procedure1,procedure2,procedure3,procedure4,procedure5
0,P302,70,F,Unknown,12/14/2030,410.71,518.81,996.72,785.51,585.6,81001,66.0,36.07,45.0,40.0,37.22
1,P430,62,F,Unknown,4/28/2021,414.01,584.9,413.9,414.8,V45.82,82550,66.0,36.07,37.22,88.53,45.0
2,P492,78,F,Unknown,7/7/2048,410.11,428.21,518.5,427.1,997.1,82948,66.0,36.06,37.22,88.53,45.0
3,P651,80,F,Unknown,2/18/2016,284.8,427.31,486.0,112.84,578.1,86901,41.31,38.93,45.13,99.04,88.71
4,P679,68,F,Unknown,10/21/2013,996.1,250.4,403.91,585.6,414.00,85014,39.27,38.95,39.95,99.04,38.95


# API connection Code

In [65]:
if sys.version_info < (3, 0):
    reload(sys)
    sys.setdefaultencoding('utf-8')


apikey =  "b693c885-4a5f-4cb5-a58c-1c80f7d025ee"
version = "2019AB"
source = "ICD9CM"
serch_type = "exact"
AuthClient = Authentication(apikey)

In [66]:
###################################
#get TGT for our session
###################################
#https://utslogin.nlm.nih.gov/rest/search/current
#https://uts-ws.nlm.nih.gov/rest/search/current/2019AB?string=Reconstruction of conjunctival cul-de-sac with free graft&searchType=exact&ticket=ST-6534140-cE65gegbW4i77pLEE2s3-cas

In [67]:
tgt = AuthClient.gettgt()
base_uri = "https://uts-ws.nlm.nih.gov"
crosswalk_endpoint = "/rest/crosswalk/"+version+"/source/ICD9CM"
content_endpoint = "/rest/content/"+str(version)+"/source/"+str(source)

In [68]:
def crosswalk_code(path):
    query = {'ticket': AuthClient.getst(tgt),'targetSource': 'SNOMEDCT_US'}
    r = requests.get(base_uri + path, params=query)
    #print(r.url + "\n")
    items = json.loads(r.text)
    return items

In [69]:
def content_code(path):
    query = {'ticket': AuthClient.getst(tgt)}
    r = requests.get(base_uri + path, params=query)
    #print(r.url + "\n")
    items = json.loads(r.text)
    return items

In [78]:
pageNumber=0

for index, row in data1.iterrows():
    code = row['ICD9_Code']
    path =  content_endpoint+"/"+code
    try:
        results = content_code(path)
        print(code)
        string = results["result"]["name"]
        
        print(string)
        pageNumber += 1
        query_2 = {'string':string,'searchType':serch_type, 'ticket':AuthClient.getst(tgt)}
        
        content_endpoint_2 = "/rest/search/current"
        r = requests.get(uri+content_endpoint_2,params=query_2)
        print(uri+content_endpoint_2)
        r.encoding = 'utf-8'
        items  = json.loads(r.text)
        jsonData = items["result"]
        #jsonData = items["result"]["results"][0]['ui']
       
        print(jsonData)
        
    
    except ValueError:
        print("No result found for "+code)
        pass

10.42
Reconstruction of conjunctival cul-de-sac with free graft
https://utslogin.nlm.nih.gov/rest/search/current
No result found for 10.42
10.49
Other conjunctivoplasty
https://utslogin.nlm.nih.gov/rest/search/current
No result found for 10.49
10.5
Lysis of adhesions of conjunctiva and eyelid


KeyboardInterrupt: 

In [None]:
results

In [None]:
with open('dr_min_data.csv','r') as f:
     for line in f:
         ##get rid of newlines
         code = line.strip()
         path =  crosswalk_endpoint+"/"+code
         try:
             results = crosswalk_code(path)
             for sourceAtomCluster in results["result"]:
                 print('ICD9CM Code - ' + code+ '\t' + 'SNOMEDCT concept -- ' + sourceAtomCluster["ui"] + ': ' + sourceAtomCluster["name"])

         except ValueError:
             print("No result found for "+code)
             pass

f.close()

In [None]:
sourceAtomCluster


In [None]:

path =  crosswalk_endpoint+"/"+"470"
results = crosswalk_code(path)

In [None]:
results

In [None]:
from __future__ import print_function
from Authentication import *
import requests
import json
import argparse

data_source =[]


#source = "ICD9CM"
#username = args.username
#password = args.password
apikey = "b693c885-4a5f-4cb5-a58c-1c80f7d025ee"
version = "2019AB"
string = "458.9"
source = "ICD9CM"
uri = "https://uts-ws.nlm.nih.gov"
content_endpoint = "/rest/search/"+version
##get at ticket granting ticket for the session
AuthClient = Authentication(apikey)
tgt = AuthClient.gettgt()
pageNumber=0

while True:
    ##generate a new service ticket for each page if needed
    ticket = AuthClient.getst(tgt)
    pageNumber += 1
    query = {'string':string,'ticket':ticket, 'pageNumber':pageNumber}
    #query['includeObsolete'] = 'true'
    #query['includeSuppressible'] = 'true'
    #query['returnIdType'] = "sourceConcept"
    query['sabs'] = "ICD9CM"
    r = requests.get(uri+content_endpoint,params=query)
    r.encoding = 'utf-8'
    items  = json.loads(r.text)
    jsonData = items["result"]
    #print (json.dumps(items, indent = 4))

    print("Results for page " + str(pageNumber)+"\n")
    
    for result in jsonData["results"]:
      #data_source.append(result["rootSource"])
      print(result)
      try:
        print("ui: " + result["ui"])
      except:
        NameError
      try:
        print("uri: " + result["uri"])
      except:
        NameError
      try:
        print("name: " + result["name"])
      except:
        NameError
      try:
        print("Source Vocabulary: " + result["rootSource"])
      except:
        NameError
      
      print("\n")
        
    
    ##Either our search returned nothing, or we're at the end
    if jsonData["results"][0]["ui"] == "NONE":
        break
    print("*********")

In [None]:
set(data_source)

# creating ICD9 code dataframe


In [27]:
ICD9_Code = []
for i in ['diagnosis1','diagnosis2','diagnosis3','diagnosis4','diagnosis5', 'procedure1','procedure2','procedure3','procedure4','procedure5']:
    #print(i)
    #data1['ICD9_Code'] = data1['ICD9_Code'].concat(data[i], ignore_index = True)
    
    ICD9_Code.append(df[i].tolist())
    
    

In [28]:
Code = [item for sublist in ICD9_Code for item in sublist]

In [29]:
import numpy as np
x = np.array(Code) 
Code = np.unique(x) 

In [30]:
data1 = pd.DataFrame(Code, columns = ['ICD9_Code'])

In [31]:
data1['ICD9_Code']

0        10.42
1        10.49
2         10.5
3         10.9
4        10.91
5        11.00
6        11.39
7        11.49
8        11.52
9        11.63
10       11.64
11       11.73
12       110.1
13       112.0
14       112.5
15      112.84
16       114.6
17      115.90
18       117.5
19       12.14
20       12.32
21       12.33
22        12.4
23       12.41
24       12.42
25       12.64
26       12.69
27       12.89
28       12.91
29       12.92
         ...  
1763    V46.11
1764    V49.72
1765    V49.73
1766    V49.76
1767    V53.31
1768    V53.32
1769     V54.8
1770     V55.0
1771     V57.1
1772     V58.0
1773     V58.1
1774     V58.3
1775    V58.49
1776    V58.61
1777    V58.69
1778    V58.73
1779    V58.81
1780    V62.84
1781     V65.2
1782    V67.09
1783     V70.7
1784     V72.6
1785    V72.81
1786    V72.82
1787    V72.83
1788    V72.84
1789    V76.41
1790    V76.44
1791    V76.49
1792     V81.5
Name: ICD9_Code, Length: 1793, dtype: object