In [1]:
#Locations of the input data (RxNorm, FDA) and of the generated triples.
#NOTE(review): these are absolute, machine-specific Windows paths - adjust before running elsewhere.
rxnorm_input_path = r"C:\Users\visha\Documents\novartis\implementation_withoutapi\data\rxnorm"
fda_input_path = r"C:\Users\visha\Documents\novartis\implementation_withoutapi\data\fda"
triples_output_path = r"C:\Users\visha\Documents\novartis\implementation_withoutapi\results"

In [2]:
import csv
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import sys
import os

In [3]:
#Function: Write results to tsv file
def write_tsv(file_name, data_lists):
    """Write triples to a tab-separated file, replacing any existing file.

    The first line written is always the fixed header ``node1<TAB>label<TAB>node2``.

    Parameters:
        file_name: path of the file to create (opened in "w" mode, UTF-8).
        data_lists: iterable of rows; each row is an iterable of strings that
            is joined with tabs and terminated with a newline.

    Raises:
        TypeError: if a row contains a non-string element (from ``str.join``).
    """
    header = "node1\tlabel\tnode2\n"
    with open(file_name, "w", encoding="utf-8") as out:
        out.write(header)
        #One output line per row, in the order given
        out.writelines("\t".join(fields) + "\n" for fields in data_lists)

In [4]:
#Accumulator for every triple produced by the cells below.
#Each entry is a list of three strings: [subject, predicate, object].
triples_all = list()

In [5]:
#Create a dictionary to hold all the Term types for an RXNorm
rxcui_tty_dict={}

#RXNCONSO table: gives rxcui, name(label),tty(description), synonym(alias), language(ENG) and suppress(Y/N)
#RXNCONSO table: gives related identifiers e.g. MSH, DRUGBANK, SNOMEDCT
#Column positions used below (see the printed header): 0=RXCUI, 1=LAT, 11=SAB, 12=TTY, 13=CODE, 14=STR, 16=SUPPRESS
filename= os.path.join(rxnorm_input_path, 'rxnconso.csv')

with open(filename, encoding="utf-8") as f:
    reader = csv.reader(f)

    #First line of the file is the header
    header_list = next(reader)
    print("Header=", header_list)

    #Term types treated as synonyms: Synonym, Tall Man Lettering Synonym, Prescribable Name
    synonym_ttys = ('SY', 'TMSY', 'PSN')

    for row in reader:
        source = row[11]

        #Rows from another vocabulary (MSH, SNOMED, DRUGBANK, ...) give one identifier triple:
        #subject is the rxcui, predicate is the source name, object is the source's code
        if source != 'RXNORM':
            triples_all.append(['"' + row[0] + '"', source, '"' + row[13] + '"'])
            continue

        subject = '"' + row[0] + '"'
        term_type = row[12]

        #RXNORM synonym rows only contribute a synonym triple
        if term_type in synonym_ttys:
            triples_all.append([subject, "synonym", '"' + row[14] + '"'])
            continue

        #Remaining RXNORM rows describe the concept itself:
        #remember its term type and emit language, rxcui, tty, name and suppress (in that order)
        rxcui_tty_dict[row[0]] = term_type
        triples_all.extend([
            [subject, "language", '"' + row[1] + '"'],
            [subject, "rxcui", '"' + row[13] + '"'],
            [subject, "tty", '"' + term_type + '"'],
            [subject, "name", '"' + row[14] + '"'],
            [subject, "suppress", '"' + row[16] + '"'],
        ])

Header= ['RXCUI', 'LAT', 'TS', 'LUI', 'STT', 'SUI', 'ISPREF', 'RXAUI', 'SAUI', 'SCUI', 'SDUI', 'SAB', 'TTY', 'CODE', 'STR', 'SRL', 'SUPPRESS', 'CVF']


In [6]:
#Persist everything collected so far (RXNorm information and identifiers) to rxnorm_triples.tsv.
#The file is rewritten from scratch with the full contents of triples_all.
filename_output = os.path.join(triples_output_path, "rxnorm_triples.tsv")
write_tsv(file_name=filename_output, data_lists=triples_all)

In [7]:
#Create a dictionary to hold all relations for a subject
#Maps subject rxcui -> list of [relation, object rxcui] pairs
rxcui_rel_dict={}

#RXNREL table: Gives us all the related RXNORMs for an RXNORMID
#Column positions used below (see the printed header): 0=RXCUI1, 2=STYPE1, 4=RXCUI2, 6=STYPE2, 7=RELA
filename= os.path.join(rxnorm_input_path, 'rxnrel.csv')

with open(filename, encoding="utf-8") as f:
    reader = csv.reader(f)

    #Get the list of headers from the file
    header_list = next(reader)
    print("Header=", header_list)

    for row in reader:
        #We only need the relations between the concepts (CUI) and not atoms (AUI).
        #Atom-level rows are skipped one by one; the previous code stopped reading the
        #file at the first AUI row, which dropped every concept-level row after it.
        if row[2] != 'CUI' or row[6] != 'CUI':
            continue

        #Subject and object are stored in the form '1656341.0': keep the part before the dot
        subject = row[4].split(".", 1)[0]
        obj = row[0].split(".", 1)[0]
        relation = row[7]

        #Record the relation for the subject, creating its list on first use
        rxcui_rel_dict.setdefault(subject, []).append([relation, obj])

        #Append the triple for this RXNORM relation to the list of all triples
        triples_all.append(['"' + subject + '"', relation, '"' + obj + '"'])
           

Header= ['RXCUI1', 'RXAUI1', 'STYPE1', 'REL', 'RXCUI2', 'RXAUI2', 'STYPE2', 'RELA', 'RUI', 'SRUI', 'SAB', 'SL', 'RG', 'DIR', 'SUPPRESS', 'CVF']


In [8]:
#Rewrite rxnorm_triples.tsv so that it now also contains the RXNorm relation triples.
#triples_all is cumulative, so the earlier contents are written again as well.
filename_output = os.path.join(triples_output_path, "rxnorm_triples.tsv")
write_tsv(file_name=filename_output, data_lists=triples_all)

In [9]:
#print("Tty dict:",rxcui_tty_dict)

In [10]:
#print("Tty rel:",rxcui_rel_dict)

In [11]:
#RXNSAT: Gives us information about the Source Atoms such as NDC Codes and UMLSCUI:
filename = os.path.join(rxnorm_input_path, 'rxnsat.csv')
with open(filename, encoding="utf-8") as f:
    reader = csv.reader(f)

    #Consume the header line so that only data rows are iterated
    header_list = next(reader)

    #Attribute names that are turned into triples
    wanted_attributes = ('NDC', 'UMLSCUI')

    for row in reader:
        #Keep only rows whose source (column 9) is RXNORM and whose attribute name (column 8) is wanted
        if row[9] != 'RXNORM' or row[8] not in wanted_attributes:
            continue

        #Triple: quoted subject (column 0), attribute name as predicate, quoted attribute value (column 10)
        triples_all.append(['"' + row[0] + '"', row[8], '"' + row[10] + '"'])

In [12]:
#Rewrite rxnorm_triples.tsv once more, now including the source identifier triples (NDC, UMLSCUI).
#After this cell the file holds everything accumulated in triples_all.
filename_output = os.path.join(triples_output_path, "rxnorm_triples.tsv")
write_tsv(file_name=filename_output, data_lists=triples_all)

In [13]:
#Create a list to hold all known identifiers
#A predicate found in this list is treated as an external identifier further down:
#its value is wrapped in double quotes and, when it has no Wikidata property,
#it gets a PRX_ID_ property node.
#CVX and MTHCMSFRF are included because they occur as source names in the data
#(they show up in the predicate listing printed by this notebook); without them
#they were handled as plain PRX_ string properties, unlike every other source.
identifier_source_list=[
    "USP",
    "GS",
    "SNOMEDCT_US",
    "VANDF",
    "MTHSPL",
    "NDDF",
    "ATC",
    "MMSL",
    "MSH",
    "DRUGBANK",
    "MMX",
    "CVX",
    "MTHCMSFRF",
]

In [14]:
#Create a list to hold all known relationship types
#A predicate found in this list is treated as a relation between two RXNorm concepts,
#so its object is replaced by a QNode / QRXNode id further down.
rela_types_list = [
    "consists_of",
    "constitutes",
    "contained_in",
    "contains",
    "dose_form_of",
    "form_of",
    "has_dose_form",
    "doseformgroup_of",
    "has_form",
    "has_ingredient",
    "has_ingredients",
    "has_part",
    "has_precise_ingredient",
    "has_quantified_form",
    "has_tradename",
    "has_doseformgroup",
    "ingredient_of",
    "ingredients_of",
    "inverse_isa",
    "isa",
    "part_of",
    "precise_ingredient_of",
    "quantified_form_of",
    "reformulated_to",
    "reformulation_of",
    "tradename_of",
]

In [15]:
#Create a dictionary to hold all Term Types and their meanings
#Key: term type code; value: readable name with underscores
#(the underscores are replaced by spaces when the name is used as a description)
tty_dict = {
    "BN": "brand_name",
    "BPCK": "branded_pack",
    "DF": "dose_form",
    "DFG": "dose_form_group",
    "ET": "dose_form_entry_term",
    "GPCK": "generic_pack",
    "IN": "ingredient",
    "MIN": "multiple_ingredients",
    "PIN": "precise_ingredient",
    "SBD": "branded_drug",
    "SBDC": "branded_drug_component",
    "SBDF": "branded_dose_form",
    "SBDG": "branded_dose_form_group",
    "SCD": "clinical_drug",
    "SCDC": "clinical_drug_component",
    "SCDF": "clinical_dose_form",
    "SCDG": "clinical_dose_form_group",
}

In [16]:
#Create a dictionary to hold all predicates and their corresponding Wikidata PNodes
#Key: predicate name as used in this notebook; value: Wikidata property id
pred_wikidata_dict = {
    "instanceOf": "P31",
    "rxcui": "P3345",
    "UMLSCUI": "P2892",
    "SNOMEDCT_US": "P5806",
    "MSH": "P486",
    "DRUGBANK": "P715",
    "NDC": "P3640",
}

In [17]:
#Predicates that have no Wikidata property: predicate name -> generated PRX property id
pred_notinwikidata_dict = dict()

#Output triples of QRXNodes whose predicate IS in Wikidata
output_rows_pred_wiki = list()

#Output triples of QRXNodes whose predicate is NOT in Wikidata
output_rows_pred_notwiki = list()

#Output triples describing the PRXNodes themselves (all of them) ...
output_rows_prxnode = list()

#... and the same triples split into edges (label/description) and data types
output_rows_prxnode_edges = list()
output_rows_prxnode_datatype = list()

In [18]:
#Function: Generate a SPARQL query given an identifier
def get_query(identifier):
    """Build a SPARQL query selecting every item that has the given property.

    Parameters:
        identifier: a key of pred_wikidata_dict (e.g. "rxcui"); it is translated
            to the Wikidata property id that is inserted after ``wdt:``.

    Returns:
        The query text. It selects ?item, ?itemLabel, ?value and ?valueLabel.

    Raises:
        KeyError: if identifier is not a key of pred_wikidata_dict.
    """
    property_id = pred_wikidata_dict[identifier]

    #Everything up to and including the "wdt:" prefix
    query_head = """#All items with a property
    # Sample to query all values of a property
    # Property talk pages on Wikidata include basic queries adapted to each property
    SELECT
      ?item ?itemLabel
      ?value ?valueLabel
    # valueLabel is only useful for properties with item-datatype
    WHERE 
    {
      ?item wdt:"""

    #Everything after the property id
    query_tail = """ ?value
              
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    # remove or change limit for more results
    """

    return query_head + property_id + query_tail

#Function: Get the results from Wikidata SPARQL endpoint given a query
def get_results(endpoint_url, query):
    """Run a SPARQL query against an endpoint and return the converted JSON result.

    Parameters:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever SPARQLWrapper's ``query().convert()`` yields for the JSON return format.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

In [19]:
#Endpoint of the Wikidata query service and the identifier whose property is looked up
endpoint_url = "https://query.wikidata.org/sparql"
identifier = "rxcui"

#QNodes in Wikidata which have RXNORMIDs: RXNorm id -> QNode (filled in by the next cell)
qnode_dict_inwiki = dict()

#Build the query text for that identifier
query = get_query(identifier)
#print(query)

In [20]:
#Run the query against the endpoint
results = get_results(endpoint_url, query)

#From the results, get the RXNorm IDs as the Key and QNodes as the Value
for binding in results["results"]["bindings"]:

    #Only bindings whose item is a URI are used
    if result_type_is_uri := (binding['item']['type'] == 'uri'):
        item_uri = binding['item']['value']
        identifier_value = binding['value']['value']

        #The QNode id is the part of the URI after "entity/"
        qnode_id = item_uri.split("entity/", 1)[1]

        if identifier == 'rxcui':
            qnode_dict_inwiki[identifier_value] = qnode_id



In [21]:
#print("IN Wiki=",qnode_dict_inwiki)

In [22]:
#Triples for the RXNorm QNodes already in Wikidata:
#subject is the QNode, predicate is P3345 and the object is the RXNormID in double quotes
triples_qnode_all = [
    [qnode, 'P3345', '"' + rxnorm_id + '"']
    for rxnorm_id, qnode in qnode_dict_inwiki.items()
]

In [23]:
#Save the triples of the RXNorm QNodes found in Wikidata to qnode_pred_wiki.tsv
filename_output = os.path.join(triples_output_path, "qnode_pred_wiki.tsv")
write_tsv(file_name=filename_output, data_lists=triples_qnode_all)

In [24]:
#Load the rxnorm_triples file which contains all RXNorm Information, Identifiers, Relations and Source Identifiers
#and give every subject that has no QNode in Wikidata a generated QRXNode id
rxnorm_triples_file = os.path.join(triples_output_path, 'rxnorm_triples.tsv')

with open(rxnorm_triples_file, encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')

    #Consume and show the header line
    header_list = next(reader)
    print("Header=", header_list)

    #QRXNodes for the RXNormIDs: RXNorm id -> 'QRX' + id
    qnode_dict_notinwiki = {}

    #Label and description start empty (not needed since we are using QRXNode method)
    label = ''
    desc = ''

    for row in reader:
        output_row = []

        #Track the latest label and description
        #Alt-Method: label+description can also work- Not used here
        predicate = row[1]
        if predicate == 'name':
            label = str(row[2])

        if predicate == 'tty':
            desc = str(tty_dict[row[2]].replace("_", " "))

        #A subject gets a QRXNode only the first time it is seen and only if Wikidata has no QNode for it
        subject = row[0]
        already_has_node = subject in qnode_dict_notinwiki or subject in qnode_dict_inwiki
        if not already_has_node:
            qnode_dict_notinwiki[subject] = 'QRX' + str(subject)

            #Alt-Method: label+description can also work- Not used here
            #label_desc_dict[row[0]]=label+'-'+desc

Header= ['node1', 'label', 'node2']


In [25]:
#print(qnode_dict_notinwiki)

In [26]:
#Give every QRXNode an "instance of" (P31) triple.
#The object is Pharmaceutical Product/ Q28885102 for all
output_rows_pred_wiki.extend(
    [qrx_node, pred_wikidata_dict['instanceOf'], "Q28885102"]
    for qrx_node in qnode_dict_notinwiki.values()
)

In [27]:
#Save the P31 triples of the QRXNodes to qrxnode_pred_wiki.tsv
filename_output = os.path.join(triples_output_path, "qrxnode_pred_wiki.tsv")
write_tsv(file_name=filename_output, data_lists=output_rows_pred_wiki)

In [28]:
#Load the rxnorm_triples file which contains all RXNorm Information, Identifiers, Relations and Source Identifiers
#and convert every triple whose subject has a QRXNode into an output triple.
#Triples whose predicate has a Wikidata counterpart go to output_rows_pred_wiki,
#the others go to output_rows_pred_notwiki under a generated PRX_ predicate
#(recorded in pred_notinwikidata_dict).
rxnorm_triples_file= os.path.join(triples_output_path, 'rxnorm_triples.tsv')

with open(rxnorm_triples_file, encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')

    #Get all the headers from the file
    header_list = next(reader)
    print("Header=", header_list)

    #Check every row for QRXNode triples
    for row in reader:
        output_row=[]

        #Get the object
        obj=str(row[2])

        #Get the predicate
        pred=str(row[1])

        #If predicate is name, change it to label
        if pred=="name":
            pred="label"
        #If predicate is tty, change it to description, remove the underscore(_) from the object
        elif pred=="tty":
            pred="description"
            obj=tty_dict[obj].replace("_"," ")
        #If predicate is synonym, change it to alias
        elif pred=="synonym":
            pred="alias"
        #If predicate is present in Term Type dictionary, change it to meaningful context using the dictionary
        elif pred in tty_dict:
            pred=tty_dict[pred]

            #Check if object has QRXNode then make object as QRXNode
            if obj in qnode_dict_notinwiki:
                obj=qnode_dict_notinwiki[obj]
            #Check if object has QNode then make object as QNode
            elif obj in qnode_dict_inwiki:
                obj=qnode_dict_inwiki[obj]
            #Else just a check but this is not possible
            else:
                print("Not in Both!")
        #If predicate is present in RXNorm Relations list
        elif pred in rela_types_list:
            #Check if object has QRXNode then make object as QRXNode
            if obj in qnode_dict_notinwiki:
                obj=qnode_dict_notinwiki[obj]
            #Check if object has QNode then make object as QNode
            elif obj in qnode_dict_inwiki:
                obj=qnode_dict_inwiki[obj]
            #Object was never seen as a subject: fall back to a generated QRX id
            else:
                obj="QRX"+obj #temp

        #If predicate is label, description, alias or suppress, the object becomes a language-tagged string.
        #Single quotes inside the text are removed (not escaped) so they cannot clash with the quoting below.
        if pred in ("alias", "label", "description", "suppress"):
            obj=obj.replace("'","")

            #Change the object to have Quotes followed by @ and then the language en as default since all are english
            if obj != '':
                obj="'"+obj+"'"+"@en"

        #If predicate is RXCUI or UMLSCUI or in any of the identifiers then change the object to have quotes
        if pred in ( "rxcui", "UMLSCUI") or pred in identifier_source_list or pred == 'NDC':
            obj="\""+obj+"\""

        #If predicate is language, change the object to en as default since all are english
        elif pred =="language":
            obj="en"

        #Check if predicate is an RXNorm information such as label, alias, description
        #Check if predicate is an identifier- SNOMED, MSH, DRUGBANK, NDC which has Wikidata PNodes
        #Check if object is not empty
        if pred in ("alias","label","description", "rxcui", "UMLSCUI", "SNOMEDCT_US", "MSH", "DRUGBANK", "NDC") and str(obj) != '':
            #Check if the predicate is an identifier, then change the predicate to the corresponding PNodes
            if pred in ("rxcui", "UMLSCUI","SNOMEDCT_US", "MSH", "DRUGBANK", "NDC"):
                pred=pred_wikidata_dict[pred]

            #If the subject is Not In Wikidata (has a QRXNode), then append the triples for that QRXNode
            if row[0] in qnode_dict_notinwiki:
                output_row.append(str(qnode_dict_notinwiki[row[0]]))
                output_row.append(str(pred))
                output_row.append(str(obj))

                #Append the triples to the QRXNode Predicates in Wikidata file
                output_rows_pred_wiki.append(output_row)

        #Everything else except language, empty predicates and empty objects.
        #The comparison with "language" is an exact one: the previous `pred not in("language")`
        #was a substring test on the string "language" and also dropped predicates such as "age".
        #Empty predicates stay excluded, as they were before.
        elif pred and pred != "language" and str(obj) != '':
            #Add key, values pair to the predicates NOT in Wikidata dictionary

            #Check if predicate is in Term Type Dictionary Values, then append PRX_TTY_
            if pred in tty_dict.values():
                pred_notinwikidata_dict[pred]="PRX_TTY_"+pred
            #Check if predicate is in Identifier Values, then append PRX_ID_
            elif pred in identifier_source_list:
                pred_notinwikidata_dict[pred]="PRX_ID_"+pred
            #Check if predicate is in RXnorm Relations Values, then append PRX_REL_
            elif pred in rela_types_list:
                pred_notinwikidata_dict[pred]="PRX_REL_"+pred
            #Else just append PRX_ to denote it is not in Wikidata
            else:
                pred_notinwikidata_dict[pred]="PRX_"+pred
            pred=pred_notinwikidata_dict[pred]

            #If the subject is Not In Wikidata (has a QRXNode), then append the triples for that QRXNode
            if row[0] in qnode_dict_notinwiki:
                output_row.append(str(qnode_dict_notinwiki[row[0]]))
                output_row.append(str(pred))
                output_row.append(str(obj))

                #Append the triples to the QRXNode Predicates NOT in Wikidata file
                output_rows_pred_notwiki.append(output_row)

Header= ['node1', 'label', 'node2']


In [29]:
#Write the QRXNode triples: predicates in Wikidata first, then predicates NOT in Wikidata
for tsv_name, tsv_rows in (
    ('qrxnode_pred_wiki.tsv', output_rows_pred_wiki),
    ('qrxnode_pred_notwiki.tsv', output_rows_pred_notwiki),
):
    filename_output = os.path.join(triples_output_path, tsv_name)
    write_tsv(filename_output, tsv_rows)

In [30]:
#Describe every predicate that is NOT in Wikidata: label, description and data type of its PRX node
for pred_name, prx_node in pred_notinwikidata_dict.items():

    print(pred_name)

    #Language-tagged text used for both label and description (for now they are the same)
    tagged_text = "'" + pred_name + "'" + "@en"

    #Label triple goes to both the PRXNode file and the PRXNode Edges file
    label_row = [prx_node, "label", tagged_text]
    output_rows_prxnode.append(label_row)
    output_rows_prxnode_edges.append(label_row)

    #Description triple goes to the same two files
    description_row = [prx_node, "description", tagged_text]
    output_rows_prxnode.append(description_row)
    output_rows_prxnode_edges.append(description_row)

    #Data type triple: the type is derived from the prefix found in the PRX node id
    datatype_row = [prx_node, "data_type"]
    if any(marker in prx_node for marker in ("PRX_REL_", "PRX_TTY_")):
        #Relations and Term Types point to items
        datatype_row.append('"item"')
    elif "PRX_ID_" in prx_node:
        #Identifiers are external identifiers
        datatype_row.append('"external-identifier"')
    elif "PRX_" in prx_node:
        #Anything else is simply a string
        datatype_row.append('"string"')

    #Data type triple goes to both the PRXNode file and the PRXNode DataType file
    output_rows_prxnode.append(datatype_row)
    output_rows_prxnode_datatype.append(datatype_row)

MMSL
suppress
USP
GS
VANDF
MTHSPL
NDDF
ATC
CVX
MMX
MTHCMSFRF
has_precise_ingredient
has_form
has_part
has_ingredients
part_of
precise_ingredient_of
has_ingredient
has_tradename
reformulated_to
reformulation_of
has_dose_form
inverse_isa
tradename_of
form_of
dose_form_of
contained_in
ingredient_of
consists_of
isa
constitutes
quantified_form_of
contains
has_quantified_form
ingredients_of
has_doseformgroup
doseformgroup_of


In [31]:
#Write the PRXNode results: the complete file, then the Edges file, then the Data-Type file
for tsv_name, tsv_rows in (
    ('prxnode_notinwiki.tsv', output_rows_prxnode),
    ('prxnode_notinwiki_edges.tsv', output_rows_prxnode_edges),
    ('prxnode_notinwiki_datatype.tsv', output_rows_prxnode_datatype),
):
    filename_output = os.path.join(triples_output_path, tsv_name)
    write_tsv(filename_output, tsv_rows)

In [32]:
#Final status message, printed when execution reaches this last cell
print("Triples Generated")

Triples Generated
