In [1]:
!pip install kgtk

Collecting kgtk
[?25l  Downloading https://files.pythonhosted.org/packages/c8/6d/ba377d07a6d0e4cd1399cde1dee79a0ef9e6d67710af4d47467aa60229f6/kgtk-0.5.0-py3-none-any.whl (432kB)
[K     |████████████████████████████████| 440kB 13.2MB/s 
[?25hCollecting rfc3986
  Downloading https://files.pythonhosted.org/packages/78/be/7b8b99fd74ff5684225f50dd0e865393d2265656ef3b4ba9eaaaffe622b8/rfc3986-1.4.0-py2.py3-none-any.whl
Collecting torchbiggraph
[?25l  Downloading https://files.pythonhosted.org/packages/c3/db/925f84ea4eccc12945749015d81b9f0cd9e09d2ed15ca2a91bef69509a4a/torchbiggraph-1.0.0-py3-none-any.whl (99kB)
[K     |████████████████████████████████| 102kB 9.3MB/s 
[?25hCollecting mgzip>=0.2.1
  Downloading https://files.pythonhosted.org/packages/80/31/0f83d46a92aae1a39d6b78c22def34c6791de2a300f019695d6aee3e4e5a/mgzip-0.2.1.tar.gz
Collecting pycountry
[?25l  Downloading https://files.pythonhosted.org/packages/76/73/6f1a412f14f68c273feea29a6ea9b9f1e268177d32e0e69ad6790d306312/pycountry

# **Step 0: Download RXNORM and FDA Data**
**RXNORM**:
1. Use this DATABASE creation automation script from RXNORM Technical Documentation- https://www.nlm.nih.gov/research/umls/rxnorm/docs/techdoc.html#s13_0
2. Convert the Resultant SQL Files to get required CSV Files 

**FDA**:
1. Use this JSON from OPENFDA Documentation to get required - https://api.fda.gov/download.json
2. Get the Required FDA files from: fda_json["results"]["drug"]["ndc"], fda_json["results"]["drug"]["label"], fda_json["results"]["drug"]["drugsfda"] & fda_json["results"]["drug"]["enforcement"]






In [2]:
cd sample_data/

/content/sample_data


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd ../drive/MyDrive/Colab\ Notebooks/rxnorm_fda_kg/notebooks

/content/drive/MyDrive/Colab Notebooks/rxnorm_fda_kg/notebooks


In [5]:
import csv
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import sys
import re

In [6]:
#Function: Write results to tsv file
def write_tsv(file_name, data_lists):
	with open(file_name, "w", encoding="utf-8") as f:
		f.write("node1\tlabel\tnode2\n")
		for data in data_lists:
			line = "\t".join(data)
			line += "\n"
			f.write(line)

In [7]:
#subject_list= ['1656341', '317541', '69749', '1656328', '1656339', '1656346', '1656351', '1656356', '1656342', '1656350', '1656355', '1656343', '1656340', '1656349', '1656354', '1656334', '1656335', '1656347', '1656348', '1656352', '1656353', '1656338', '1656336', '1656337', '1656344', '1656345', '1151131', '1151133']

# **Step 1: Generate Intermediate RXNORM Triples**

# **1A. Generate Intermediate RXNORM Triples from RXNCONSO table**
RXNCONSO table provides the following information:<br>
*   RXNorm General Information: rxcui (RXNormID), name (label), tty (description), language (lat) and suppress (Y/N)
*   RXNorm Synonym Information: synonym (alias)
*   RXNorm Related Identifier Information: such as MSH, DRUGBANK and SNOMEDCT etc.

**RXNCONSO Headers for Reference**:<br>
RXCUI,LAT,TS,LUI,STT,SUI,ISPREF,RXAUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF

**RXNorm General Information**:<br>
Example for Entresto sample record:<br>
1656341,ENG,,,,,,7249807,7249807.0,1656341,,RXNORM,BN,1656341,Entresto,,N,4096.0<br>
Subject: RXCUI (1656341) <br>
Predicates & Object: [Read as Predicate: Source Header (Object Value)]<br>
*   label: STR (Entresto)<br>
*   description: TTY (BN)<br>
*   language: LAT (ENG)<br>
*   suppress: SUPPRESS (N)<br>
*   rxcui: CODE (1656341) <br>

**RXNorm Synonym Information**:<br>
Example for Entresto sample record:<br>
3 entries- One Regular Information for Term Type SBD followed by 2 synonyms- PSN (Prescribable Name) and SY (Synonym)<br>
Regular Record:<br>
1656346,ENG,,,,,,7249812,7249812.0,1656346,,RXNORM,SBD,1656346,sacubitril 24 MG / valsartan 26 MG Oral Tablet [Entresto],,N,4096.0<br>
2 Synonym Records:<br>
1656346,ENG,,,,,,7249813,7249813.0,1656346,,RXNORM,PSN,1656346,Entresto 24 MG / 26 MG Oral Tablet,,N,4096.0
1656346,ENG,,,,,,7249814,7249814.0,1656346,,RXNORM,SY,1656346,Entresto (sacubitril 24 MG / valsartan 26 MG) Oral Tablet,,N,4096.0

Subject: RXCUI (1656346) <br>
Predicates & object:<br>
*   alias: STR (Entresto 24 MG / 26 MG Oral Tablet)<br>
*   alias: STR (Entresto (sacubitril 24 MG / valsartan 26 MG) Oral Tablet)<br>

**RXNorm Identifier Information**:<br>
Example for Entresto sample record:<br>
2 entries- One for MMSL and other for MSH:<br>
1656341,ENG,,,,,,7255921,,,,MMSL,BN,234762,Entresto,,N,
1656341,ENG,,,,,,8138471,,M000614616,C549068,MSH,PCE,C549068,entresto,,N,

Subject: RXCUI (1656346) <br>
Predicates & Object:<br>
*   MMSL: CODE (234762)<br>
*   MSH: CODE (C549068)<br>





In [8]:
#Declare list to hold all RXNorm triples
triples_all=[]

In [9]:
#Create a dictionary to hold all the Term types for an RXNorm
rxcui_tty_dict={}

#Create a dictionary to hold the Identifier Label as a qualifier
rxcui_identifier_qual_dict={}

#RXNCONSO table: gives rxcui, name(label),tty(description), synonym(alias), language(ENG) and suppress(Y/N)
#RXNCONSO table: gives related identifiers e.g. MSH, DRUGBANK, SNOMEDCT
with open('../data/rxnorm/rxnconso.csv', encoding="utf-8") as f:
    reader = csv.reader(f)
    
    #Get the list of headers from the file
    header_list = next(reader)
    #print("Header=", header_list)
    
    
    for row in reader:
        triples=[]
        
        #Check the identifier is not RXNORM i.e MSH, SNOMED, DRUGBANK etc.
        #since RXNORM is the subject and we already get the rxcui from the name
        if row[11] != 'RXNORM':
            #print("Subject Identifier=",row[0])
            #print(row[0], row[11], row[13])
            
            #Get the triples related to identifiers- MSH, SNOMED etc.
            triples.append("\""+row[0]+"\"")
            triples.append(row[11])
            triples.append("\""+row[13]+"\"")

            #Construct the dictionary for Identifier Label related Qualifier
            rxcui_identifier_qual_dict_key=row[0]+"_"+row[11]
            if rxcui_identifier_qual_dict_key not in rxcui_identifier_qual_dict:
              rxcui_identifier_qual_dict[rxcui_identifier_qual_dict_key]=[]
              rxcui_identifier_qual_dict[rxcui_identifier_qual_dict_key].append(str(row[14]))
            else:
              rxcui_identifier_qual_dict[rxcui_identifier_qual_dict_key].append(str(row[14]))

            #Append triples related to identifiers to list of all triples
            triples_all.append(triples)
        
        #Declare list of triples for lang, rxcui, tty, name, synonym, suppress
        triples_lang=[]
        triples_rxcui=[]
        triples_tty=[]
        triples_name=[]
        triples_suppress=[]
        triples_synonym=[]
        
        #Check identifier is RXNORM and is not a Synonym/Tall Man Lettering Synonym/Prescribable Name
        #This gives us all the required RXNORM triples other than synonym
        if row[11] == 'RXNORM' and row[12] not in ('SY', 'TMSY', 'PSN'):
            #print("Subject Triples=",row[0])
            
            #Get the triples for language
            triples_lang.append("\""+row[0]+"\"")
            triples_lang.append("language")
            triples_lang.append("\""+row[1]+"\"")
            
            #Get the triples for rxcui
            triples_rxcui.append("\""+row[0]+"\"")
            triples_rxcui.append("rxcui")
            triples_rxcui.append("\""+row[13]+"\"")
            
            #Get the triples for tty (description)
            triples_tty.append("\""+row[0]+"\"")
            triples_tty.append("tty")
            triples_tty.append("\""+row[12]+"\"")

            #Add the tty in the dictionary for the subject
            rxcui_tty_dict[row[0]]=row[12]
            
            #Get the triples for name (label)
            triples_name.append("\""+row[0]+"\"")
            triples_name.append("name")
            triples_name.append("\""+row[14]+"\"")
            
            #Get the triples for suppress
            triples_suppress.append("\""+row[0]+"\"")
            triples_suppress.append("suppress")
            triples_suppress.append("\""+row[16]+"\"")
            
            #Append triples for lang, rxcui, tty, name, synonym, suppress
            if triples_lang != []:
                triples_all.append(triples_lang)
            if triples_rxcui != []:
                triples_all.append(triples_rxcui)
            if triples_tty != []:
                triples_all.append(triples_tty)
            if triples_name != []:
                triples_all.append(triples_name)
            if triples_suppress !=[]:
                triples_all.append(triples_suppress)
        
        #Check identifier is RXNORM and is a Synonym/Tall Man Lettering Synonym/Prescribable Name
        #This gives us all the required triples for synonym
        if row[11] == 'RXNORM' and row[12] in ('SY', 'TMSY', 'PSN'):
            #print("Subject Triples=",row[0])
            
            #Get all triples for synonym
            triples_synonym.append("\""+row[0]+"\"")
            triples_synonym.append("synonym")
            triples_synonym.append("\""+row[14]+"\"")

            #Append triples for synonym
            if triples_synonym !=[]:
                triples_all.append(triples_synonym)

#print("Added Triples from RXNCONSO")

In [10]:
#Write the required triples for RXNorm information and Identifiers to rxnorm_triples.tsv file
write_tsv('../results/rxnorm/intermediate_triples/rxnorm_triples.tsv', triples_all)

# **1B. Generate Intermediate RXNORM Triples from RXNREL table**
RXNREL table provides the following information:<br>
*   RXNorm Relation Information: has_tradename, ingredient_of etc.<br>


**RXNREL Headers for Reference**:<br>
RXCUI1,RXAUI1,STYPE1,REL,RXCUI2,RXAUI2,STYPE2,RELA,RUI,SRUI,SAB,SL,RG,DIR,SUPPRESS,CVF<br>
Note: Relationship is what RXCUI2 HAS TO RXCUI1<br>

**RXNorm Relationship Information**:<br>
Example for Entresto sample record:<br>
1656355.0,,CUI,RO,1656341.0,,CUI,ingredient_of,86154613.0,,RXNORM,,,,,4096.0<br>
1656346.0,,CUI,RO,1656341.0,,CUI,ingredient_of,86154533.0,,RXNORM,,,,,4096.0<br>
1656328.0,,CUI,RN,1656341.0,,CUI,tradename_of,86154502.0,,RXNORM,,,,,4096.0<br>

Subject: RXCUI2 (1656341) <br>
Predicates & Object: [Read as Predicate: Source Header (Object Value)]<br>
*   ingredient_of: RELA (1656355)<br>
*   ingredient_of: RELA (1656346)<br>
*   tradename_of: RELA (1656328)<br>








In [11]:
#Create a dictionary to hold all relations for a subject
rxcui_rel_dict={}

#RXNREL table: Gives us all the related RXNORMs for an RXNORMID
with open('../data/rxnorm/rxnrel.csv', encoding="utf-8") as f:
    reader = csv.reader(f)
    
    #Get the list of headers from the file
    header_list = next(reader)
    #print("Header=", header_list)
    
    
    for row in reader:
        triples=[]
        
        #We only need the relations between the concepts (CUI) and not atoms (AUI)
        if row[2]=='AUI':
            break
        
        #For the subject we need to split the RXNORM which is present in this form '1656341.0'
        x=row[4].split(".",1)[0]
    
        #We only need the relation between the concepts
        if row[2]=='CUI' and row[6]=='CUI':
            subject=x
            #print("Subject Rel=",x)
            # print(row[4], row[7], row[0])
            
            #Get the triples for RXNORM Relations
            triples.append("\""+subject+"\"")
            triples.append(row[7])

            #Split the object since object is also of the form '1656341.0'
            obj=row[0].split(".",1)[0]
            triples.append("\""+obj+"\"")

            #Check if the subject is in the Relation dictionary=> Then Add the relation for the subject
            if subject not in rxcui_rel_dict:
                 rxcui_rel_dict[subject]=[]
                 rxcui_rel_dict[subject].append([row[7],obj])
            else:
                rxcui_rel_dict[subject].append([row[7],obj])
           
            #Append the triples for RXNORM relations to the list of all triples
            triples_all.append(triples)
           
#print("Added Triples from RXNREL")

In [12]:
#Write the required triples for RXNorm Relations to rxnorm_triples.tsv file
write_tsv('../results/rxnorm/intermediate_triples/rxnorm_triples.tsv', triples_all)

# **1C. Generate Intermediate RXNORM Triples from RXNSAT table**
RXNSAT table provides the following information:<br>
*   RXNorm Strength Information: RXN_STRENGTH, RXN_AVAILABLE STRENGTH
*   NDC Code Information: NDC11 Code, NDC 2 Segment, NDC 3 Segment, SPL_SET_ID, DrugsFDA Application Number
*   UMLS Code Information: UMLSCUI, UMLSAUI

**Note: NDC Code Information Identifiers provide the Link to FDA**

**RXNSAT Headers for Reference**:<br>
RXCUI,LUI,SUI,RXAUI,STYPE,CODE,ATUI,SATUI,ATN,SAB,ATV,SUPPRESS,CVF

**RXNorm Strength Information**:<br>
Example for Entresto sample record:<br>
1656340,,,7249806,AUI,1656340,,,RXN_AVAILABLE_STRENGTH,RXNORM,24 MG / 26 MG,N,4096.0<br>
Subject: RXCUI (1656340) <br>
Predicates & Object: [Read as Predicate: Source Header (Object Value)]<br>
*   RXN_AVAILABLE_STRENGTH: ATV (24 MG / 26 MG)<br>

**NDC Code Information**:<br>
Example for NDC Code:<br>
1305100,,,12332251,AUI,1305100,,,NDC,RXNORM,75142000109,N,4096.0<br>
1305100,,,12332251,AUI,1305100,,,NDC,RXNORM,52687000201,N,4096.0<br>
1305100,,,12387798,AUI,50563-195,,,NDC,MTHSPL,50563-195-08,N,4096.0
1305100,,,12374233,AUI,75556-001,,,NDC,MTHSPL,75556-001-05,N,4096.0

Subject: RXCUI (1305100) <br>
Predicates & object:<br>
*   NDC11: ATV (75142000109)<br>
*   NDC11: ATV (52687000201)<br>
*   NDC 3 Segment: ATV (50563-195-08)<br>
*   NDC 3 Segment: ATV (75556-001-05)<br>
*   NDC 2 Segment: ATV (50563-195)<br>
*   NDC 2 Segment: ATV (75556-001)<br>

Example for SPL_SET_ID Code:<br>
1305100,,,12388790,AUI,76861-001,,,SPL_SET_ID,MTHSPL,a4f6c932-fe40-7226-e053-2a95a90a2205,N,4096.0<br>

Subject: RXCUI (1305100) <br>
Predicates & object:<br>
*   SPL_SET_ID: ATV (a4f6c932-fe40-7226-e053-2a95a90a2205)<br>

Example for Application Number:<br>
995253,,,12387879,AUI,55700-860,,,ANDA,MTHSPL,ANDA040156,N,4096.0<br>

Subject: RXCUI (995253) <br>
Predicates & object:<br>
*   ANDA: ATV (ANDA040156)<br>

**UMLS Code Information**:<br>
Example for Entresto sample record:<br>
2 entries- One for UMLSCUI and other for UMLSAUI:<br>
1656341,,,7249807,AUI,1656341,,,UMLSCUI,RXNORM,C4033616,,4096.0
1656341,,,7255921,AUI,234762,,,UMLSAUI,RXNORM,A24842892,,

Subject: RXCUI (1656341) <br>
Predicates & Object:<br>
*   UMLSCUI: ATV (C4033616)<br>
*   UMLSAUI: ATV (A24842892)<br>





In [13]:
#Create dictionaries to hold RXNormIDs for NDC11 Code, NDC 2 Segment, NDC 3 Segment, Application No and Spl Set ID
ndc11_dict={}
ndc2seg_dict={}
ndc3seg_dict={}
spl_setid_dict={}
application_dict={}

In [14]:
#RXNSAT: Gives us information about the Source Atoms such as RXNorm Strength Attribtues, NDC Codes and UMLSCUI:
with open('../data/rxnorm/rxnsat.csv', encoding="utf-8") as f:
    reader = csv.reader(f)
    
    #Get the list of headers from the file
    header_list = next(reader)
    #print("Header=", header_list)
    
    
    for row in reader:
      
        #Check if the identifier source is RXNORM or the related fields are NDC, Application No (NDA/ANDA), Spl Set ID or UMLSCUI
        if ((row[9]== 'RXNORM') or row[8] in ('NDC', 'SPL_SET_ID', 'NDA', 'ANDA')):
            #print("Subject SAT=",row[0])
            
            #Count the frequency of '-' for NDC Code
            counter = row[10].count('-') 
            if row[8] =='NDC':
                #If counter is 0, then it is NDC11 Code
                if counter==0:
                    #Construct the dict for NDC11
                    ndc11_dict[row[10]]=row[0]

                    #Get all triples for NDC11
                    triples=[]
                    triples.append("\""+row[0]+"\"")
                    triples.append(row[8])
                    triples.append("\""+row[10]+"\"")
                    #Append NDC11 triples to all triples
                    triples_all.append(triples)
                
                #If counter is 1, then it is NDC 2 Segment Code  
                elif counter==1:
                    #Construct the dict for NDC 2 Segment
                    ndc2seg_dict[row[10]]=row[0]

                    #Get all triples for NDC 2 Segment
                    triples=[]
                    triples.append("\""+row[0]+"\"")
                    triples.append("NDC 2 Segment")
                    triples.append("\""+row[10]+"\"")
                    #Append NDC 2 Segment triples to all triples
                    triples_all.append(triples)
                
                #If counter is 2, then it is NDC 3 Segment Code  
                elif counter==2:
                    #Construct the dict for NDC 3 Segment
                    ndc3seg_dict[row[10]]=row[0]

                    #Get all triples for NDC 3 Segment
                    triples=[]
                    triples.append("\""+row[0]+"\"")
                    triples.append("NDC 3 Segment")
                    triples.append("\""+row[10]+"\"")
                    #Append NDC 3 Segment Triples to all triples
                    triples_all.append(triples)
                    
                    #Get the NDC 2 Segment Code from 3 Segment
                    ndc2seg='-'.join(row[10].split('-')[0:2])
                    ndc2seg_dict[ndc2seg]=row[0]

                    #Get all triples for NDC 2 Segment
                    triples=[]
                    triples.append("\""+row[0]+"\"")
                    triples.append("NDC 2 Segment")
                    triples.append("\""+ndc2seg+"\"")
                    #Append NDC 2 Segment Triples to all triples
                    triples_all.append(triples)

            #Check if predicate is Application Number- NDA/ANDA
            elif row[8] in ('NDA', 'ANDA'):
                #Construct Dict for Application Number
                application_dict[row[10]]=row[0]

                #Get all triples for Application Number
                triples=[]
                triples.append("\""+row[0]+"\"")
                triples.append("DrugsFDA Application Number")
                triples.append("\""+row[10]+"\"")

                #Append Application Number Triples to all triples
                triples_all.append(triples)
            
            #Check if predicate is Spl Set ID
            elif row[8] in ('SPL_SET_ID'):
                #Construct Dict for SPL Set ID
                spl_setid_dict[row[10]]=row[0]

                #Get all triples for Spl Set ID
                triples=[]
                triples.append("\""+row[0]+"\"")
                triples.append(row[8])
                triples.append("\""+row[10]+"\"")

                #Append Spl Set ID triples to all triples
                triples_all.append(triples)
            
            #Else predicates are RXNorm Strength Related or UMLS Codes
            else:
                #Get all triples for RXNorm Strength and UMLS Codes
                triples=[]
                triples.append("\""+row[0]+"\"")
                triples.append(row[8])
                triples.append("\""+row[10]+"\"")

                #Append triples to all triples
                triples_all.append(triples)
#print("Added Triples from RXNSAT")

In [15]:
#Write the required triples for RXNorm  Strength and Source Identifiers- NDC, UMLSCUI to rxnorm intermediate folder for rxnorm_triples.tsv file
write_tsv('../results/rxnorm/intermediate_triples/rxnorm_triples.tsv', triples_all)

# **1D. Create List/Dict to hold Known Information**
1.   Identifier Source List
2.   RXNORM Relationship Types
3.   RXNorm Term Type Dictionary
4.   Predicates in Wikidata Dictionary



In [16]:
#Create a list to hold all known identifiers
identifier_source_list=[
"USP"
,"GS"
,"SNOMEDCT_US"
,"VANDF"
,"MTHSPL"
,"NDDF"
,"ATC"
,"MMSL"
,"MSH"
,"DRUGBANK"
,"MMX"
,"NDC"
,"NDC 3 Segment"
,"NDC 2 Segment"
,"SPL_SET_ID"
,"DrugsFDA Application Number"
,"spl_id"
,"product_id"
,"unii"
,"event_id"
,"upc"]

In [17]:
#Create a list to hold all known RXNORM relationship types
rela_types_list=[
'consists_of',
'constitutes',
'contained_in',
'contains',
'dose_form_of',
'form_of',
'has_dose_form',
'doseformgroup_of',
'has_form',
'has_ingredient',
'has_ingredients',
'has_part',
'has_precise_ingredient',
'has_quantified_form',
'has_tradename',
'has_doseformgroup',
'ingredient_of',
'ingredients_of',
'inverse_isa',
'isa',
'part_of',
'precise_ingredient_of',
'quantified_form_of',
'reformulated_to',
'reformulation_of',
'tradename_of'
]

In [18]:
#Create a dictionary to hold all RXNORM Term Types and their meanings 
tty_dict={
   'BN': 'brand_name'	    
  ,'BPCK': 'branded_pack'	    
  ,'DF': 'dose_form'	    
  ,'DFG': 'dose_form_group'	
  ,'ET': 'dose_form_entry_term'    
  ,'GPCK': 'generic_pack'	    
  ,'IN': 'ingredient'	    
  ,'MIN': 'multiple_ingredients'	    
  ,'PIN': 'precise_ingredient'	    
  ,'SBD': 'branded_drug'
  ,'SBDC': 'branded_drug_component'
  ,'SBDF': 'branded_dose_form'
  ,'SBDG': 'branded_dose_form_group'
  ,'SCD': 'clinical_drug'
  ,'SCDC': 'clinical_drug_component'
  ,'SCDF': 'clinical_dose_form'
  ,'SCDG': 'clinical_dose_form_group'}

In [19]:
#Create a dictionary to hold all predicates and their corresponding Wikidata PNodes
pred_wikidata_dict={
        'instanceOf':'P31'
        ,'rxcui':'P3345'
        ,'UMLSCUI':'P2892'
        ,'SNOMEDCT_US':'P5806'
        ,'MSH':'P486'
        ,'DRUGBANK':'P715'
        ,'NDC': 'P3640'
        ,'ATC': 'P267'
        ,'unii':'P652'
        ,'manufacturer_name':'P176'
        ,'sponsor_name':'P859'
        ,'postal_code':'P281'
        }

# **Step 2. Find RXNormID Coverage in WIKIDATA**




# **2A. Query Wikidata SPARQL Endpoint**

Create Functions to:<br>
Generate Query for RXNorm Identifier<br>
Generate Results from the Query<br>

Note: Wikidata SPARQL Endpoint is used to get Full Coverage. To Remove API Dependency, Wikidata Dump must be used <br>



In [20]:
#Create a dictionary to hold the predicates not in Wikidata
pred_notinwikidata_dict_rxnorm={}

In [21]:
#Create a list to hold output of triples of RXNorm QNodes with predicates in Wikidata
output_rows_qnode_pnode_rxnorm=[]
#Create a list to hold output of triples of RXNorm QNodes with predicates NOT in Wikidata
output_rows_qnode_prxnode_rxnorm=[]

#Create a list to hold output of triples of RXNorm QRXNodes with predicates in Wikidata
output_rows_qrxnode_pnode_rxnorm=[]
#Create a list to hold output of triples of RXNorm QNodes with predicates  NOT in Wikidata
output_rows_qrxnode_prxnode_rxnorm=[]

#Create a list to hold output of triples of RXNorm PRXNOdes
output_rows_prxnode_rxnorm=[]

#Create a list to hold output of Edges and DataType triples of RXNorm PRXNOdes
output_rows_prxnode_edges_rxnorm=[]
output_rows_prxnode_datatype_rxnorm=[]

In [22]:
#Function: Generate a SPARQL query given an identifier
def get_query(identifier):
    query = """#All items with a property
    # Sample to query all values of a property
    # Property talk pages on Wikidata include basic queries adapted to each property
    SELECT
      ?item ?itemLabel
      ?value ?valueLabel
    # valueLabel is only useful for properties with item-datatype
    WHERE 
    {
      ?item wdt:"""+pred_wikidata_dict[identifier]+""" ?value
              
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    # remove or change limit for more results
    """
    return query

In [23]:
#Function: Get the results from Wikidata SPARQL endpoint given a query
def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

# **2B. Create Dictionary for RXNormIDs with Wikidata QNode**
1.   Get the results from function created
2.   Create Dictionary qnode_dict_inwiki
3.   Write the results for P3345 [RXNORMID] to RXNorm QRXNode_PNode file




In [24]:
#Create a dict to hold all QNodes in Wikidata which have RXNORMIDs
qnode_dict_inwiki={}

#Specify the endpoint for the url and the identifier name
endpoint_url = "https://query.wikidata.org/sparql"
identifier="rxcui"

#Generate the query
query=get_query(identifier)

In [25]:
#Get the results from the Query
results = get_results(endpoint_url, query)
   
#From the results, get the RXNorm IDs as the Key and QNodes as the Value   
for result in results["results"]["bindings"]:
    
    if result['item']['type']=='uri':
        qnode=result['item']['value']
        identifier_value=result['value']['value']
        qnode=qnode.split("entity/",1)[1]
        if identifier=='rxcui':
            qnode_dict_inwiki[identifier_value]=qnode

In [26]:
#Create a list to hold triples for RXNorm Qnodes
triples_qnode_all=[]
#Create triples for subject QNode, Predicate- P3345 and value as the RXNormID
for x in qnode_dict_inwiki:
    triples_qnode=[]
    triples_qnode.append(qnode_dict_inwiki[x])
    triples_qnode.append('P3345')
    triples_qnode.append("\""+x+"\"")
    #Append the result to the list of all Qnode triples
    triples_qnode_all.append(triples_qnode)

In [27]:
#Write the list of triples for RXNorm QNodes to Qnode Predicates in Wiki file
write_tsv("../results/rxnorm/kgtk_triples/nodes/qnode_pnode_rxnorm.tsv", triples_qnode_all)

# **Step 3. Find RXNORM NOT in Wikidata coverage using Intermediate Triples**





# **3A. Create Dictionary for RXNormIDs NOT in Wikidata**
1.   Get the RXNORMIDs from Intermediate Triples [using results from Step 1]
2.   Create Dictionary qnode_dict_notinwiki by checking if RXNORMID is in Wikidata or not [using results from Step 2]
3.   Assign QRXNode to RXNormIDs NOT in Wikidata




In [28]:
#Load the rxnorm_triples file which contains all RXNorm Information, Identifiers, Relations and Source Identifiers
with open('../results/rxnorm/intermediate_triples/rxnorm_triples.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get the list of all headers in the file
    header_list = next(reader)
    #print("Header=", header_list)
    
    #Create a dictionary to hold QRXNodes for the RXNormIDs
    qnode_dict_notinwiki={}
    #Create label and description as '' (Not needed since we are using QRXNode method)
    label=''
    desc=''
    
    #Read every row from the rxnorm_triples file
    for row in reader:
        output_row=[]
        
       
        #Get the label and description 
        #Alt-Method: label+description can also work- Not used here
        if row[1]=='name':
            label=str(row[2])
            
        if row[1]=='tty':
            desc=str(tty_dict[row[2]].replace("_"," "))
        
        #If subject is not in the dictionary and not in Wikidata, then add the QRXNode in the value
        if row[0] not in qnode_dict_notinwiki and row[0] not in qnode_dict_inwiki:
            qnode_dict_notinwiki[row[0]]='QRX'+str(row[0])

            #Alt-Method: label+description can also work- Not used here
            #label_desc_dict[row[0]]=label+'-'+desc
        
#print(qnode_dict_notinwiki)

# **3B. Add InstanceOf Predicate for RXNormIDs NOT in Wikidata**
1.   Add P31 as 'Pharmaceutical Product' for this QRXNode. 
2.   Write the results to QRXNode_PNode file




In [29]:
#Get all the QRXNodes in qnode_dict_notinwiki and add P31 predicate for all
#The object is Pharmaceutical Product/ Q28885102 for all
for x in qnode_dict_notinwiki.keys():
   
    output_row=[]  
   
    #Get all triples for P31
    output_row.append(qnode_dict_notinwiki[x])
    output_row.append(pred_wikidata_dict['instanceOf'])    
    output_row.append("Q28885102")
    #print(output_row)

    #Append triples to the Output of Predicates in Wikidata
    output_rows_qrxnode_pnode_rxnorm.append(output_row)

In [30]:
#Write the output to QRXNode Predicates in Wikidata file
write_tsv("../results/rxnorm/kgtk_triples/nodes/qrxnode_pnode_rxnorm.tsv", output_rows_qrxnode_pnode_rxnorm)

# **Step 4: Generate Intermediate Triples for FDA using NDC Code Identifiers**
Uses NDC Code information from RXNORM intermediate Triples generated in Step 1C:
1. NDC 2 Segment: Used to link Predicates in Drug-NDC source
2. NDC 3 Segment: Used to link Predicates in Drug-NDC and Drug-Enforcement Source
3. SPL_SET_ID: Used to link Predicates in Drug-NDC and Drug-Label source
4. Application Number: Used to link Predicates in Drug-Drugs@FDA source




# **Step 4A: Generate Intermediate Triples from Drug-NDC Source**
1. Load the data present in JSON Format. There is 1 file for this source<br> 
2. For Drug-NDC, we get information at 3 levels: NDC 2 Segment, NDC 3 Segment and SPL_SET_ID
For FDA-NDC, we get predicates for Product NDC Code [NDC 2 segment] E.g. marketing_start_date, product_type, marketing_category etc.<br>
For FDA-NDC, we also get predicates for OpenFDA attributes which uses [SPL_SET_ID] E.g. is_original_packager, manufacturer_name, unii etc.<br>
For FDA-NDC, we also get predicates for Packaging attributes which uses Package_NDC_Code [NDC 3 Segment] E.g. marketing_start_date, sample, description etc.<br>
3. For Active Ingredients, we also get **Qualifiers** for Strength
4. Write the results to 3 Intermediate Triple Files:
fda_triples_product_ndc, fda_triples_package_ndc and fda_triples_spl_ndc

In [31]:
#Create a dict to hold active ingredient strength as a qualifier
active_ingredients_dict={}
#FDA DRUG-NDC: Source provides information for 3 different identifiers- Product NDC (2 Segment), Package NDC (3 Segment) AND Spl Set ID
with open('../data/fda/drug-ndc-0001-of-0001.json') as ndc_json:
    data=json.load(ndc_json)
    #Check meta-data
    #print(data['meta'])

    #Create list to gold triples for Product NDC, Package NDC and Spl Set ID
    triples_product_ndc=[]
    triples_spl_ndc=[]
    triples_package_ndc=[]
    for entry in data['results']:
        
        #Get the Product NDC Code 
        product_ndc=entry['product_ndc']
        #print(product_ndc)
        #Get list of all predicates for Drug-NDC Source
        klist=entry.keys()

        #Check if Product NDC is in NDC 2 Segment Dicitonary
        if product_ndc in ndc2seg_dict:
        
            #Iterate for all predicates
            for k in klist:
                
                
                #Check if predicate is NOT Product NDC itself
                #Check if predicate is NOT Package NDC related (packaging), Spl_Set_ID related (openfda) 
                #Check if predicate is NOT List(route/pharm class) or Dict (Active Ingredients)
                if k not in ('product_ndc', 'openfda', 'packaging', 'route', 'pharm_class', 'active_ingredients'):
                    #Get triples for all other predicates
                    triple=[]
                    triple.append(product_ndc)
                    triple.append(str(k))
                    triple.append(str(entry[k]))

                    #Append triples to all product_ndc triples
                    triples_product_ndc.append(triple)

                #Check if predicate is List(route/pharm class)
                elif k in ('route', 'pharm_class'):
                    for field in entry[k]:
                        #Get triples for all entries in list
                        triple=[]
                        triple.append(product_ndc)
                        triple.append(str(k))
                        triple.append(str(field))

                        #Append triples to all product_ndc triples
                        triples_product_ndc.append(triple)
                #Check if predicate is active ingredient
                elif k in ('active_ingredients'):
                    for field in entry[k]:
                        #Get the triples for active ingredient name
                        
                        triple=[]
                        triple.append(product_ndc)
                        triple.append(str(k))
                        triple.append(str(field['name']))

                        #Check if strength is present for active ingredient and Construct Dict for Active_ingredients strength qualifiers
                        if 'strength' in field:
                          active_ingredients_dict_key=ndc2seg_dict[product_ndc]+"_"+str(field['name'])
                          active_ingredients_dict[active_ingredients_dict_key]=str(field['strength'])

                        #Append triples to all product_ndc triples
                        triples_product_ndc.append(triple)
                    
         
      
        #Check if spl_set_id is in Key list of openfda Predicate of ProductNDC [This contains data at Spl_set_id level]
        if 'spl_set_id' in entry['openfda'].keys():
            #Get the spl_set_id
            spl_set_id=str(entry['openfda']['spl_set_id']).replace("['","").replace("']","")
            #print(spl_set_id)
            
            #Check if spl_set_id is in spl_set_id dictionary:
            if spl_set_id in spl_setid_dict:
                #Get the keys in openfda predicate
                openfda=entry['openfda']
                slist=openfda.keys()
                
                #Check all keys in openfda
                for p in slist:
                    triple=[]
                    
                    if p != 'spl_set_id':
                        #All the objects are lists in openfda, so iterate over all elements
                        for field in openfda[p]:
                            #Get all triples for openfda predicates
                            triple=[]
                            triple.append(str(spl_set_id))
                            triple.append(str(p))
                            triple.append(str(field))
                            #Append triples to list of all Spl_ndc triples
                            triples_spl_ndc.append(triple)
        
        #Check if packaging is present in predicates of Product NDC
        if 'packaging' in klist:
            for package in entry['packaging']:
                #Get the package NDC Code
                package_ndc=package['package_ndc']
                #print(package_ndc)
                
                #Check if package_ndc is in NDC 3 Segment dictionary
                if package_ndc in ndc3seg_dict:
                    #print(package_ndc)
                    
                    #Get list of all keys for packaging predicate
                    plist=package.keys()

                    #Iterate over list of all keys
                    for p in plist:
                        triple=[]
                        #Get all predicates for package_ndc related 
                        if p != 'package_ndc':
                            triple.append(str(package_ndc))
                            triple.append(str(p))
                            triple.append(str(package[p]))
                            #Append predicates to list of all package_ndc triples
                            triples_package_ndc.append(triple)
#print("Added triples from FDA NDC")

In [32]:
#Write the results to intermediate_triples folder for product_ndc, spl_ndc and package_ndc
write_tsv('../results/fda/intermediate_triples/fda_triples_product_ndc.tsv', triples_product_ndc)
write_tsv('../results/fda/intermediate_triples/fda_triples_spl_ndc.tsv', triples_spl_ndc)
write_tsv('../results/fda/intermediate_triples/fda_triples_package_ndc.tsv', triples_package_ndc)

# **Step 4B: Generate Intermediate Triples from Drug-Label Source**
1. Load the data present in JSON Format. There are 9 file for this source<br> 
2. For Drug-Label, we get information at 1 levels: SPL_SET_ID
For FDA-Labeling, we get predicates at SPL_SET_ID level E.G package_label_principal_display_panel, pregnancy, pharmacokinetics, drug_interactions etc.<br>
4. Write the results to 1 Intermediate Triple Files:
fda_triples_spl_label

In [33]:
#Create a list to hold all triples for FDA-Labeling
triples=[]
#FDA- Drug-Label: This gives data for labeling information at Spl_Set_ID level
#Create a counter to read from all 9 files for FDA Labeling
for counter in range(1,2):
    #print(counter)
    filename='../data/fda/drug-label-000'+str(counter)+'-of-0009.json'
    #print(filename)
    
    with open(filename) as ndc_json: 
        #Load data from json
        data=json.load(ndc_json)
        #Check meta-data
        #print(data['meta'])
        
        for entry in data['results']:
            #Get the Spl_set_id from keys
            set_id=entry['set_id']
            #print(set_id)
            
            #Get all key list
            klist=entry.keys()
            #print(klist)
            
            #Check if spl_set_id is in spl setid dictionary
            if set_id in spl_setid_dict:
   
                #Iterate all keys in the Labeling keylist:
                for k in klist:
                    #Check if key is NOT openfda
                    if k not in ('openfda'):
                        #Check if object type is a list
                        if type(entry[k])== list:
                            #Iterate over all entries in the list
                            for field in entry[k]:
                                triple=[]
                                #Get all triples for each entry in the list
                                triple.append(str(set_id))
                                triple.append(str(k))
                                triple.append(str(field))
                                #Append triples to list of all triples
                                triples.append(triple)
                        else:
                            #Get all triples
                            triple=[]
                            triple.append(str(set_id))
                            triple.append(str(k))
                            triple.append(str(entry[k]))
                            #Append triples to list of all triples
                            triples.append(triple)
                    #Check if key is not openfda
                    elif k== 'openfda':
                        #Get all keys in openfda
                        openfda=entry['openfda']
                        slist=openfda.keys()
                        
                        #Iterate over all keys
                        for p in slist:
                            #Check if type is list
                            if type(openfda[p]) == list:
                                for field in openfda[p]:
                                    #Get triples for all entries in list
                                    triple=[]
                                    triple.append(str(set_id))
                                    triple.append(str(p))
                                    triple.append(str(field))
                                    #Append to list of all triples
                                    triples.append(triple)
                            else:
                                #Get all non-list triples
                                triple=[]
                                triple.append(str(set_id))
                                triple.append(str(p))
                                triple.append(str(openfda[p]))
                                #Append to list of all triples
                                triples.append(triple)
                
#print("Added triples from FDA labeling")

In [34]:
#Write the results to intermediate_triples folder for spl_labeling
write_tsv('../results/fda/intermediate_triples/fda_triples_spl_labeling.tsv', triples)

# **Step 4C: Generate Intermediate Triples from Drug-Drugs@FDA Source**
1. Load the data present in JSON Format. There is 1 file for this source<br> 
2. For Drug-Drugs@FDA, we get information at 1 level: Application Number
For FDA-Drugs@FDA, we get predicates for Application Number: Openfda related predicates, sponsor_name, products information and submissions information<br>
3. For Products Information and Submissions Information, we also get a set of **Related Qualifiers**
4. Write the results to 1 Intermediate Triple Files:
fda_triples_application_drugsfda

In [35]:
#Create a list to hold triples for FDA- Drugs@FDA
triples=[]
#Create 2 dictionaries to hold Qualifiers related to Products and Submissions in Drugs@FDA
products_dict={}
submisions_dict={}

#FDA- Drugs-Drugs@FDA: This contains data at Application Number level
with open('../data/fda/drug-drugsfda-0001-of-0001.json') as drugsfda_json:
    #Load data from json file
    data=json.load(drugsfda_json)
    
    #Get the data from json results
    for entry in data['results']:
        #Get list of all keys
        
        klist=entry.keys()
        #print(klist)

        #Check if application number is in key list
        if 'application_number' in klist:
            #Get the application number
            application_no=entry['application_number']
            
            #Check if application number is in application dictionary
            if application_no in application_dict:
                #Iterate over all keys in key list
                for k in klist:
                    #Check if key is openfda
                    if k=='openfda':
                        #Get all list of openfda keys
                        openfda=entry['openfda']
                        slist=openfda.keys()
                        
                        #Iterate over all keys in openfda
                        for p in slist:
                            #if object type is list
                            if type(openfda[p]) == list:
                                #Iterate over all entries in the list
                                for field in openfda[p]:
                                    #Get all triples for entries in list
                                    triple=[]
                                    triple.append(str(application_no))
                                    triple.append(str(p))
                                    triple.append(str(field))
                                    #Append triples to list of all triples
                                    triples.append(triple)
                            else:
                                #Get all non-list triples
                                triple=[]
                                triple.append(str(application_no))
                                triple.append(str(p))
                                triple.append(str(openfda[p]))
                                #Append to list to all triples
                                triples.append(triple)
                    #Check if key is sponsor name or application number
                    elif k  in ('sponsor_name', 'application_number'):
                        #Get all triples
                        triple=[]
                        triple.append(str(application_no))
                        triple.append(str(k))
                        triple.append(str(entry[k]))
                        #Append to list of triples
                        triples.append(triple)
                    #Check if key is products, get the triple for product number
                    elif k in ('products'):
                        #Get the object for products
                        products=entry[k]
                        #Iterate over all keys in product
                        for prod in products:
                            #Get list of all keys in the keylist of each entry
                            slist=prod.keys()
                            #Get the product number
                            prod_no=prod['product_number']

                            #Iterate over each key 
                            for p in slist:
                                
                                #Check if obect type is a field
                                if type(prod[p]) == list:
                                    for field in prod[p]:
                                        #Get triples for all entries in list
                                        triple=[]
                                        triple.append(str(application_no))
                                        triple.append(str(p))
                                        triple.append(str(field))
                                        #triples.append(triple)
                                else:
                                    #Get triples for all non-list triples
                                    triple=[]
                                    triple.append(str(application_no))
                                    triple.append(str(p))
                                    triple.append(str(prod[p]))

                                #Append to triples if predicate is product number 
                                #This triple will be linked to all other qualifier triples
                                if p=='product_number':
                                    triples.append(triple)
                                #Otherwise add the triples as a value in the products dictionary Qualifier
                                else:
                                    #Generate the product key as concatenation of application no and product number
                                    product_dict_key=str(application_dict[application_no])+"_"+str(prod_no)
                                    #Construct the products dictionary for Qualifiers
                                    if product_dict_key not in products_dict:
                                         products_dict[product_dict_key]=[]
                                         products_dict[product_dict_key].append(triple)
                                    else:
                                        products_dict[product_dict_key].append(triple)
                    
                    #Check if key is submissions, get the triple for submission date
                    elif k == 'submissions':
                        #Get the object for submissions
                        submissions=entry[k]
                        #Iterate over all keys in submissions
                        for sub in submissions:
                            #Get list of all keys
                            slist=sub.keys()
                            #print(slist)

                            #Get the submission status date
                            sub_date=sub['submission_status_date']

                            #Iterate over all keys
                            for p in slist:
                                #Check if object type is a list
                                if type(sub[p]) == list:
                                    #Iterate over entries in the list
                                    for field in sub[p]:
                                        #Get all triples for all entries in list
                                        triple=[]
                                        triple.append(str(application_no))
                                        triple.append(str(p))
                                        triple.append(str(field))
                                        #triples.append(triple)
                                #Else get triples for non-list objects
                                else:
                                    triple=[]
                                    triple.append(str(application_no))
                                    triple.append(str(p))
                                    triple.append(str(sub[p]))
                                
                                #if predicate is submission status date, append to list of all triples
                                #This triple will be linked to all other qualifier triples 
                                if p=='submission_status_date':
                                    triples.append(triple)
                                #Otherwise add the triples as a value in the products dictionary Qualifier
                                else:
                                    #Generate the submissions key as concatenation of application no and submissions status date
                                    submisions_dict_key=str(application_dict[application_no])+"_"+str(sub_date)
                                    #Construct the submissions dictionary for Qualifiers
                                    if submisions_dict_key not in submisions_dict:
                                         submisions_dict[submisions_dict_key]=[]
                                         submisions_dict[submisions_dict_key].append(triple)
                                    else:
                                        submisions_dict[submisions_dict_key].append(triple)
#print("Added triples from FDA DrugsFDA")

In [36]:
#Write the results to intermediate_triples folder for application_drugsfda
write_tsv('../results/fda/intermediate_triples/fda_triples_application_drugsfda.tsv', triples)   

# **Step 4D: Generate Intermediate Triples from Drug-Enforcement Source**
1. Load the data present in JSON Format. There is 1 file for this source<br> 
2. For Drug-Drugs@FDA, we get information at 1 level: Package-NDC Code which is extracted from the Product Description
For FDA-Drugs@FDA, we get predicates for Package-NDC: Recall, Location, Reason for Recall, Event_id<br>
3. Write the results to 1 Intermediate Triple Files:
fda_triples_package_enforcement

In [37]:
#FDA- Drug-Enforcement: Contains drug recall info, Need to extract Package Ndc Code from Product Description
with open('../data/fda/drug-enforcement-0001-of-0001.json') as ndc_json:
    #Load data from json
    data=json.load(ndc_json)
    #Check meta-data
    #print(data['meta'])

    #Create list to hold all Enforcement triples
    triples=[]
    for entry in data['results']:
        #Check if NDC is in product description
        if "NDC" in entry['product_description']:

            #Get the Package NDC Code using regex function
            code = re.search(r'(\d+-?){1,3}$', entry['product_description'])
            
            #If we get a code
            if code != None:
                
            
                #print(code.group())
                #print(product_ndc)

                #Get the subject
                subj=code.group()
                #If the Package NDC code actually exists and present in NDC 3 Segment Dictionary
                if (len(str(code.group()))>10) and subj in ndc3seg_dict:
                    
                    #Get all key list
                    klist=entry.keys()
                    #print(klist)
                    
                    for k in klist:
                        #Get triples for Enforcement predicates
                        triple=[]
                        
                        triple.append(subj)
                        triple.append(str(k))
                        triple.append(str(entry[k]))

                        #Add to list of all triples
                        triples.append(triple)
#print("Added triples from FDA Enforcement")

In [38]:
#Write the results to intermediate_triples folder for package-enforcement
write_tsv('../results/fda/intermediate_triples/fda_triples_package_enforcement.tsv', triples)

# **Step 5: Generate KGTK Triples for RXNORM using RXNORM-Intermediate Triples**
Uses the RXNORM Intermediate Triples File generated in Step 1<br>
Uses the 2 dictionaries created in Step 2 and Step 3:<br>
qnode_dict_inwiki: RXNormIDs with QNODE in Wikidata<br>
qnode_dict_notinwiki: RXNormIDs with QRXNODE NOT in Wikidata<br>



# **Step 5A: Generate KGTK Triples for RXNORM NODE Edge Files**
1. Get the data in required KGTK Format
2. Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_RXNORM
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_RXNORM
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_RXNORM
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_RXNORM
3. Create the Predicates NOT in Wikidata dictionary
4. Write the results to these 4 KGTK Triples Files






In [39]:
#Load the rxnorm_triples file which contains all RXNorm Information, Identifiers, Relations and Source Identifiers
with open('../results/rxnorm/intermediate_triples/rxnorm_triples.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get all the headers from the file
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
    #Check every row for QRXNode triples
    for row in reader:
        output_row=[]
    
        #Get the object
        obj=str(row[2])
        #Get the predicate
        pred=str(row[1])
        
        #If predicate is name, change it to label
        if pred=="name":
            pred="label"
        #If predicate is tty, change it to description, remove the underscore(_) from the object
        elif pred=="tty":
            pred="description"
            obj=tty_dict[obj].replace("_"," ")
        #If predicate is synonym, change it to alias
        elif pred=="synonym":
            pred="alias"
        #If predicate is present in Term Type dictionary, change it to meaniningful context using the dictionary
        elif pred in tty_dict.keys():
            pred=tty_dict[pred]
            #Check if object has QRXNode then make object as QRXNode
            if obj in qnode_dict_notinwiki.keys():
                obj=qnode_dict_notinwiki[obj]
            #Check if object has QNode then make object as QNode
            elif obj in qnode_dict_inwiki.keys():
                obj=qnode_dict_inwiki[obj]
            #Else just a check but this is not possible
            else:
                print("Not in Both!")
        #If predicate is present in RXNorm Relations list
        elif pred in rela_types_list:
             #Check if object has QRXNode then make object as QRXNode
             if obj in qnode_dict_notinwiki.keys():
                obj=qnode_dict_notinwiki[obj]
             #Check if object has QNode then make object as QNode
             elif obj in qnode_dict_inwiki.keys():
                obj=qnode_dict_inwiki[obj]
             #Else just a check but this is not possible
             else:
                obj="QRX"+obj #temp
                
        #If predicate is label, description, alias or suppress, use escape sequence for Quotes inside (')
        if "'" in obj:
            obj=obj.replace("\'","\\'")
        
        
        if pred in ("alias", "label", "description", "suppress"):
            if "'" in obj:
                obj=obj.replace("\'","")
            #Change the object to have Quotes followed by @ and then the language en as default since all are english
            if obj != '':
                obj="\'"+obj+"\'"+"@en"   #This is hardcoded for now
        #If predicate is RXCUI or UMLSCUI or in any of the identifiers then change the object to have quotes
        if pred in ( "rxcui", "UMLSCUI", "UMLSAUI") or pred in identifier_source_list or pred == 'NDC' or "RX" in pred or "ORIG_" in pred:
            obj="\""+obj+"\""
        #If predicate is language, change the object to en as default since all are english
        elif pred =="language":
            obj="en"
        
        
        
        #Check if predicate is an RXNorm information such as label, alias, description 
        #Check if predicate is an identifier- SNOMED, MSH, DRUGBANK, NDC which has Wikidata PNodes
        #Check if object is not empty
        if pred in ("alias","label","description", "rxcui", "UMLSCUI", "SNOMEDCT_US", "MSH", "DRUGBANK", "NDC", "ATC") and str(obj) != '':
            #Check if the predicate is an identifier, then change the predicate to the corresponding PNodes
            if pred in ("rxcui", "UMLSCUI","SNOMEDCT_US", "MSH", "DRUGBANK", "NDC", "ATC"):
                pred=pred_wikidata_dict[pred]
            #If the subject is Not In Wikidata (has a QRXNode), then append the triples for that QRXNode
            if row[0] in qnode_dict_notinwiki:
                output_row.append(str(qnode_dict_notinwiki[row[0]]))
                output_row.append(str(pred))
                output_row.append(str(obj))

                #Append the triples to the QRXNode Predicates in Wikidata file
                output_rows_qrxnode_pnode_rxnorm.append(output_row)
            
            #Else If the subject is In Wikidata (has a QNode), then append the triples for that QNode
            else:
                output_row.append(str(qnode_dict_inwiki[row[0]]))
                output_row.append(str(pred))
                output_row.append(str(obj))

                #Append the triples to the QNode Predicates in Wikidata file
                output_rows_qnode_pnode_rxnorm.append(output_row)
        else:
            #Check if the predicate is not language and object is not empty
            if pred not in("language") and str(obj) != '':
                #Add key, values pair to the predicates NOT in Wikidata dictionary
                
                #Check if predicate is in Term Type Dictionary Values, then append PRX_TTY_
                if pred in tty_dict.values():
                    pred_notinwikidata_dict_rxnorm[pred]="PRX_TTY_"+pred
                #Check if predicate is in Identifier Values, then append PRX_ID_
                elif pred in identifier_source_list:
                    pred_notinwikidata_dict_rxnorm[pred]="PRX_ID_"+pred
                #Check if predicate is in RXnorm Relations Values, then append PRX_REL_
                elif pred in rela_types_list:
                    pred_notinwikidata_dict_rxnorm[pred]="PRX_REL_"+pred
                #Else just append PRX_ to denote it is not in Wikidata
                else:
                     pred_notinwikidata_dict_rxnorm[pred]="PRX_"+pred
                pred=pred_notinwikidata_dict_rxnorm[pred]
                
                #If the subject is Not In Wikidata (has a QRXNode), then append the triples for that QRXNode
                if row[0] in qnode_dict_notinwiki:
                    output_row.append(str(qnode_dict_notinwiki[row[0]]))
                    
                    output_row.append(str(pred))
                    output_row.append(str(obj))
                    output_rows_qrxnode_prxnode_rxnorm.append(output_row)
                #Else If the subject is In Wikidata (has a QNode), then append the triples for that QNode
                else:
                    output_row.append(str(qnode_dict_inwiki[row[0]]))
                    output_row.append(str(pred))
                    output_row.append(str(obj))
                
                    output_rows_qnode_prxnode_rxnorm.append(output_row)

#print(output_rows_qrxnode_pnode_rxnorm)

In [40]:
#Write the results to the QRXNode Predicate in Wikidata RXNORM file and QRXNode Predicate NOT in Wikidata RXNORM file
write_tsv("../results/rxnorm/kgtk_triples/nodes/qrxnode_pnode_rxnorm.tsv", output_rows_qrxnode_pnode_rxnorm)
write_tsv("../results/rxnorm/kgtk_triples/nodes/qrxnode_prxnode_rxnorm.tsv", output_rows_qrxnode_prxnode_rxnorm)

#Write the results to the QNode Predicate in Wikidata RXNORM file and QNode Predicate NOT in Wikidata RXNORM file
write_tsv("../results/rxnorm/kgtk_triples/nodes/qnode_pnode_rxnorm.tsv", output_rows_qnode_pnode_rxnorm)
write_tsv("../results/rxnorm/kgtk_triples/nodes/qnode_prxnode_rxnorm.tsv", output_rows_qnode_prxnode_rxnorm)

# **Step 5B: Generate KGTK Triples for RXNORM PROPERTIES Edge & DataType Files**
1. Segregate and Get the data in required KGTK Format for Edges and DataType using the Predicates NOT in Wikidata dictionary
2. Dump the Output in 3 files [Naming convention is as follows]:
*   Predicates NOT in Wikidata: PRXNODE_RXNORM [For Reference Only]
*   Predicates NOT in Wikidata Edges: PRXNODE_Edges_RXNORM
*   Predicates NOT in Wikidata DataType: PRXNODE_DataType_RXNORM
3. Write the results to these 3 KGTK Triples Files






In [41]:
#Get all the keys from the Predicate NOT in Wikidata Dictionary
for x in pred_notinwikidata_dict_rxnorm.keys():
    
    #print(x)
    
    output_row=[]
    pred_value=pred_notinwikidata_dict_rxnorm[x]

    #Get the triple for label
    output_row.append(pred_value)
    output_row.append("label")
    output_row.append("\'"+x+"\'"+"@en")

    #Append the label triple to both PRXNode file and PRXNode Edges file
    output_rows_prxnode_rxnorm.append(output_row)
    output_rows_prxnode_edges_rxnorm.append(output_row)
    
    output_row=[]
    #Get the triple for description- For now desciption and label are same
    output_row.append(pred_value)
    output_row.append("description")
    output_row.append("\'"+x+"\'"+"@en")

    #Append the description triple to both PRXNode file and PRXNode Edges file
    output_rows_prxnode_rxnorm.append(output_row)
    output_rows_prxnode_edges_rxnorm.append(output_row)
    
    output_row=[]
    #Get the triple for Data Type
    output_row.append(pred_value)
    output_row.append("data_type")

    #Check if predicate is Relation or Term Type, then data type is an item
    if ("PRX_REL_" in pred_value or "PRX_TTY_" in pred_value):
        output_row.append("\""+"item"+"\"")
    #Check if predicate is Identifier, then data type is an external-identifier
    elif ("PRX_ID_" in pred_value):
        output_row.append("\""+"external-identifier"+"\"")
    #Else predicate is simply a string
    elif ("PRX_" in pred_value):
        output_row.append("\""+"string"+"\"")
        
    #Append the data-type triple to both PRXNode file and PRXNode DataType file
    output_rows_prxnode_rxnorm.append(output_row)
    output_rows_prxnode_datatype_rxnorm.append(output_row)

In [42]:
#Write the results to PRXNode RXNORM file
write_tsv("../results/rxnorm/kgtk_triples/properties/prxnode_rxnorm.tsv", output_rows_prxnode_rxnorm)

#Write the results to PRXNode Edges RXNORM file
write_tsv("../results/rxnorm/kgtk_triples/properties/prxnode_edges_rxnorm.tsv", output_rows_prxnode_edges_rxnorm)

#Write the results to PRXNode Data-Type RXNORM file
write_tsv("../results/rxnorm/kgtk_triples/properties/prxnode_datatype_rxnorm.tsv", output_rows_prxnode_datatype_rxnorm)

# **Step 6: Generate KGTK Triples for FDA using FDA-Intermediate Triples**
Uses the FDA Intermediate Triple files generated in Step 4
Uses the 2 dictionaries created in Step 2 and Step 3:<br>
qnode_dict_inwiki: RXNormIDs with QNODE in Wikidata<br>
qnode_dict_notinwiki: RXNormIDs with QRXNODE NOT in Wikidata<br>



In [43]:
#Create a list to hold output of triples of FDA QNodes with predicates in Wikidata
output_rows_qnode_pnode_fda=[]
#Create a list to hold output of triples of FDA QNodes with predicates NOT in Wikidata
output_rows_qnode_prxnode_fda=[]

#Create a list to hold output of triples of FDA QRXNodes with predicates in Wikidata
output_rows_qrxnode_pnode_fda=[]
#Create a list to hold output of triples of FDA QRXNodes with predicates NOT in Wikidata
output_rows_qrxnode_prxnode_fda=[]

#Create a list to hold output of triples of FDA PRXNodes 
output_rows_prxnode_fda=[]
output_rows_prxnode_edges_fda=[]
output_rows_prxnode_datatype_fda=[]

In [44]:
#Create a dictionary for FDA Predicates NOT in Wikidata
pred_notinwikidata_dict_fda={}

# **Step 6A: Generate KGTK Triples for FDA NODES Edge File- from PRODUCT-NDC Intermediate File**
1. Uses the fda_product_ndc Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files



In [45]:
#Load the intermediate triples file for product_ndc
with open('../results/fda/intermediate_triples/fda_triples_product_ndc.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get all headers
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
   
    for row in reader:
        output_row=[]
        
        #Get the object
        obj=str(row[2])
        if "'" in obj:
            obj=obj.replace("\'","\\'")

        #print(obj[:1])
        #Get the predicate
        pred=str(row[1])
        if pred== "package_ndc":
            pred="NDC 3 Segment"
        elif pred== "spl_set_id":
            pred="SPL_SET_ID"
        elif pred=="rxcui":
            pred="Related RXNorm"
        
        #Get the subject from ndc2Seg dictionary
        subj=ndc2seg_dict[str(row[0])]
        
        #Conver dates to kgtk format
        if "date" in pred and pred != "voluntary_mandated":
            obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
        #Check if predicates are identifers
        elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id"):
            obj="\""+obj+"\""
        #Else check for finished predicate as its boolean
        elif pred not in ("finished"):
            obj="\'"+obj+"\'"+"@en" 
        
        #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))

            #Append to list of all QRXNode-PNode triples
            output_rows_qrxnode_pnode_fda.append(output_row)
        
        #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))

            #Check if predicate is identifier or not
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))

            #Append to list of all QRXNode-PRXNode triples
            output_rows_qrxnode_prxnode_fda.append(output_row)
        
        #Case 3: Subject in Wikidata, Predicate in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))

            #Append to list of all QNode-PNode triples
            output_rows_qnode_pnode_fda.append(output_row)
        
        #Case 4: Subject in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))

            #Check if predicate is identifier or not
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))

            #Append to list of all QNode-PRXNode triples
            output_rows_qnode_prxnode_fda.append(output_row)

# **Step 6B: Generate KGTK Triples for FDA NODES Edge File- from PACKAGE-NDC Intermediate File**
1. Uses the fda_package_ndc Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files


In [46]:
#Load the intermediate triples file for package_ndc
with open('../results/fda/intermediate_triples/fda_triples_package_ndc.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get all headers
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
   
    for row in reader:
        output_row=[]
        
        #Get object
        obj=str(row[2])
        if "'" in obj:
            obj=obj.replace("\'","\\'")
        
        #Get predicate
        pred=str(row[1])
        if pred== "package_ndc":
            pred="NDC 3 Segment"
        elif pred== "spl_set_id":
            pred="SPL_SET_ID"
        elif pred=="rxcui":
            pred="Related RXNorm"
        
        #Get subject from NDC 3Segment Dict
        subj=ndc3seg_dict[str(row[0])]
        
        #Convert date to KGTK format
        if "date" in pred and pred != "voluntary_mandated":
            obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
        #Check if predicate is identifer
        elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id"):
            obj="\""+obj+"\""
        #Check for Boolean sample predicate
        elif pred not in ("sample"):
            obj="\'"+obj+"\'"+"@en" 
        
        #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qrxnode_pnode_fda.append(output_row)
        
        #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qrxnode_prxnode_fda.append(output_row)
        
        #Case 3: Subject in Wikidata, Predicate in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qnode_pnode_fda.append(output_row)
        
        #Case 4: Subject in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qnode_prxnode_fda.append(output_row)

# **Step 6C: Generate KGTK Triples for FDA NODES Edge File- from SPL-NDC Intermediate File**
1. Uses the fda_spl_ndc Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files


In [47]:
#Load the intermediate triples file for spl_ndc
with open('../results/fda/intermediate_triples/fda_triples_spl_ndc.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get headers
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
   
    for row in reader:
        output_row=[]
        
        #Get object
        obj=str(row[2])
        if "'" in obj:
            obj=obj.replace("\'","\\'")
        #print(obj[:1])

        #Get predicate
        pred=str(row[1])
        if pred== "package_ndc":
            pred="NDC 3 Segment"
        elif pred== "spl_set_id":
            pred="SPL_SET_ID"
        elif pred=="rxcui":
            pred="Related RXNorm"
        
        #Get subject from Spl_setid dict
        subj=spl_setid_dict[str(row[0])]

        #Convert date to KGTK format
        if "date" in pred and pred != "voluntary_mandated":
            obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
        #Check if predicate in identifier
        elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id", "Related RXNorm", "unii", "upc"):
            obj="\""+obj+"\""
        else:
            obj="\'"+obj+"\'"+"@en" 

        #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qrxnode_pnode_fda.append(output_row)
        
        #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qrxnode_prxnode_fda.append(output_row)
        
        #Case 3: Subject in Wikidata, Predicate in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qnode_pnode_fda.append(output_row)

        #Case 4: Subject in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qnode_prxnode_fda.append(output_row)

# **Step 6D: Generate KGTK Triples for FDA NODES Edge File- from SPL-LABELING Intermediate File**
1. Uses the fda_spl_labeling Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files


In [48]:
#Load the intermediate triples file for spl_labeling
with open('../results/fda/intermediate_triples/fda_triples_spl_labeling.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get the header
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
    for row in reader:
        if len(row) == 3:
          output_row=[]

          #Get object
          obj=str(row[2])
          if "'" in obj:
              obj=obj.replace("\'","\\'")
          #print(obj[:1])

          #Get predicate
          pred=str(row[1])
          if pred== "package_ndc":
              pred="NDC 3 Segment"
          elif pred== "spl_set_id":
              pred="SPL_SET_ID"
          elif pred=="rxcui":
              pred="Related RXNorm"
          
          #Get subject from SPL_SETid dict
          subj=spl_setid_dict[str(row[0])]

          #Convert date to KGTK format
          if "date" in pred and pred != "voluntary_mandated":
              obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
          #Check if pred is identifier
          elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id", "Related RXNorm", "unii", "upc"):
              obj="\""+obj+"\""
          else:
              obj="\'"+obj+"\'"+"@en" 

          #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
          if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
              subj=qnode_dict_notinwiki[subj]
              output_row.append(str(subj))
              output_row.append(str(pred_wikidata_dict[pred]))
              output_row.append(str(obj))
          
              output_rows_qrxnode_pnode_fda.append(output_row)
          
          #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
          elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
              subj=qnode_dict_notinwiki[subj]
              output_row.append(str(subj))
              if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
              else:
                  pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
              pred=pred_notinwikidata_dict_fda[pred]
              output_row.append(str(pred))
              output_row.append(str(obj))
          
              output_rows_qrxnode_prxnode_fda.append(output_row)
          
          #Case 3: Subject in Wikidata, Predicate in Wikidata- Get triples
          elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
              subj=qnode_dict_inwiki[subj]
              output_row.append(str(subj))
              output_row.append(str(pred_wikidata_dict[pred]))
              output_row.append(str(obj))
          
              output_rows_qnode_pnode_fda.append(output_row)

          #Case 4: Subject in Wikidata, Predicate NOT in Wikidata- Get triples
          elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
              subj=qnode_dict_inwiki[subj]
              output_row.append(str(subj))
              if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
              else:
                  pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
              pred=pred_notinwikidata_dict_fda[pred]
              output_row.append(str(pred))
              output_row.append(str(obj))
          
              output_rows_qnode_prxnode_fda.append(output_row)

# **Step 6E: Generate KGTK Triples for FDA NODES Edge File- from APPLICATION-DRUGSFDA Intermediate File**
1. Uses the fda_application_drugsfda Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files


In [49]:
#Load the intermediate triples file for application_drugsfda
with open('../results/fda/intermediate_triples/fda_triples_application_drugsfda.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
   
    for row in reader:
        output_row=[]
        
        obj=str(row[2])
        if "'" in obj:
            obj=obj.replace("\'","\\'")
        #print(obj[:1])
        pred=str(row[1])
        if pred== "package_ndc":
            pred="NDC 3 Segment"
        elif pred== "spl_set_id":
            pred="SPL_SET_ID"
        elif pred=="rxcui":
            pred="Related RXNorm"
        
        subj=application_dict[str(row[0])]
        if "date" in pred and pred != "voluntary_mandated":
            obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
        elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id", "Related RXNorm", "unii", "upc"):
            obj="\""+obj+"\""
        else:
            obj="\'"+obj+"\'"+"@en" 


        #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qrxnode_pnode_fda.append(output_row)
        #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qrxnode_prxnode_fda.append(output_row)
        #Case 3: Subject in Wikidata, Predicate in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qnode_pnode_fda.append(output_row)
        #Case 4: Subject in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qnode_prxnode_fda.append(output_row)

# **Step 6F: Generate KGTK Triples for FDA NODES Edge File- from PACKAGE-ENFORCEMENT Intermediate File**
1. Uses the fda_package_enforcment Intermediate Triple files generated in Step 4
2. Get the data in required KGTK Format
3. Handle the 4 cases to Dump the Output in 4 files [Naming convention is as follows]:
*   Subject in Wikidata, Predicate in Wikidata: QNODE_PNODE_FDA
*   Subject in Wikidata, Predicate NOT in Wikidata: QNODE_PRXNODE_FDA
*   Subject NOT in Wikidata, Predicate in Wikidata: QRXNODE_PNODE_FDA
*   Subject NOT in Wikidata, Predicate NOT in Wikidata: QRXNODE_PRXNODE_FDA
4. Create the Predicates NOT in Wikidata dictionary
5. Write the results to these 4 KGTK Triples Files


In [50]:
#Load the intermediate triples file for package_enforcement
with open('../results/fda/intermediate_triples/fda_triples_package_enforcement.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    
   
    for row in reader:
        output_row=[]
        
        obj=str(row[2])
        if "'" in obj:
            obj=obj.replace("\'","\\'")
        #print(obj[:1])
        pred=str(row[1])
        if pred== "package_ndc":
            pred="NDC 3 Segment"
        elif pred== "spl_set_id":
            pred="SPL_SET_ID"
        elif pred=="rxcui":
            pred="Related RXNorm"
        
        subj=ndc3seg_dict[str(row[0])]
        if "date" in pred and pred != "voluntary_mandated":
            obj="^"+'-'.join([obj[:4], obj[4:6], obj[6:]])
        elif pred in ("NDC 3 Segment", "SPL_SET_ID", "spl_id", "product_id", "Related RXNorm", "unii", "upc", "event_id"):
            obj="\""+obj+"\""
        else:
            obj="\'"+obj+"\'"+"@en" 

        #Case 1: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        if subj in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qrxnode_pnode_fda.append(output_row)
        
        #Case 2: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_notinwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qrxnode_prxnode_fda.append(output_row)
        
        #Case 3: Subject NOT in Wikidata, Predicate in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            output_row.append(str(pred_wikidata_dict[pred]))
            output_row.append(str(obj))
        
            output_rows_qnode_pnode_fda.append(output_row)

        #Case 4: Subject NOT in Wikidata, Predicate NOT in Wikidata- Get triples
        elif subj not in qnode_dict_notinwiki and pred not in pred_wikidata_dict:
            subj=qnode_dict_inwiki[subj]
            output_row.append(str(subj))
            if pred in identifier_source_list:
                pred_notinwikidata_dict_fda[pred]="PRX_ID_"+pred
            else:
                pred_notinwikidata_dict_fda[pred]="PRX_FDA_"+pred
            pred=pred_notinwikidata_dict_fda[pred]
            output_row.append(str(pred))
            output_row.append(str(obj))
        
            output_rows_qnode_prxnode_fda.append(output_row)

In [51]:
#Write the results to the QRXNode Predicate in Wikidata FDA file and QRXNode Predicate NOT in Wikidata FDA file
write_tsv('../results/fda/kgtk_triples/nodes/qrxnode_prxnode_fda.tsv', output_rows_qrxnode_prxnode_fda)
write_tsv('../results/fda/kgtk_triples/nodes/qnode_prxnode_fda.tsv', output_rows_qnode_prxnode_fda)

#Write the results to the QNode Predicate in Wikidata FDA file and QNode Predicate NOT in Wikidata FDA file
write_tsv('../results/fda/kgtk_triples/nodes/qrxnode_pnode_fda.tsv', output_rows_qrxnode_pnode_fda)
write_tsv('../results/fda/kgtk_triples/nodes/qnode_pnode_fda.tsv', output_rows_qnode_pnode_fda)

# **Step 6G: Generate KGTK Triples for FDA PROPERTIES Edge & DataType Files**
1. Segregate and Get the data in required KGTK Format for Edges and DataType using the Predicates NOT in Wikidata dictionary
2. Dump the Output in 3 files [Naming convention is as follows]:
*   Predicates NOT in Wikidata: PRXNODE_FDA [For Reference Only]
*   Predicates NOT in Wikidata Edges: PRXNODE_Edges_FDA
*   Predicates NOT in Wikidata DataType: PRXNODE_DataType_FDA
3. Write the results to these 3 KGTK Triples Files






In [52]:
#Get keys of all Predicates NOT in wikidata from FDA
for x in pred_notinwikidata_dict_fda.keys():
    
    #print(x)
    
    output_row=[]
    pred_value=pred_notinwikidata_dict_fda[x]

    #Get label for predicates
    output_row.append(pred_value)
    output_row.append("label")
    output_row.append("\'"+x+"\'"+"@en")

    #Append to Predicate and Predicate Edges
    output_rows_prxnode_fda.append(output_row)
    output_rows_prxnode_edges_fda.append(output_row)
    
    output_row=[]
    #Get description for predicates
    output_row.append(pred_value)
    output_row.append("description")
    output_row.append("\'"+x+"\'"+"@en")

    #Append to Predicate and Predicate Edges
    output_rows_prxnode_fda.append(output_row)
    output_rows_prxnode_edges_fda.append(output_row)
    
    output_row=[]
    #Get datatype for predicates
    output_row.append(pred_value)
    output_row.append("data_type")
    if ("PRX_ID_" in pred_value):
        output_row.append("\""+"external-identifier"+"\"")
    elif ("PRX_FDA_" in pred_value):
        output_row.append("\""+"string"+"\"")
    #output_row.append("\"String\"")

    #Append to Predicate and Predicate DataType files
    output_rows_prxnode_fda.append(output_row)
    output_rows_prxnode_datatype_fda.append(output_row)

In [53]:
#Write the results to the PRXNode Predicate NOT in Wikidata FDA file,  Edges file and DataType file 
write_tsv("../results/fda/kgtk_triples/properties/prxnode_fda.tsv", output_rows_prxnode_fda)
write_tsv("../results/fda/kgtk_triples/properties/prxnode_edges_fda.tsv", output_rows_prxnode_edges_fda)
write_tsv("../results/fda/kgtk_triples/properties/prxnode_datatype_fda.tsv", output_rows_prxnode_datatype_fda)

# **Step 7: Perform KGTK Transformations and Validation on NODES and PROPERTIES Edge files**







# **Step 7A: Perform KGTK Compact Transformation**
1. Perform KGTK Compact Transformation for RXNorm KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
2. Perform KGTK Compact Transformation for FDA KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
3. Perform KGTK Compact Transformation for RXNorm KGTK triples PROPERTIES:<br>
      1 File- PRXNode
4. Perform KGTK Compact Transformation for FDA KGTK triples PROPERTIES:<br>
      1 File- PRXNode






In [54]:
#Perform KGTK Compact Transformation for RXNorm KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk compact -i ../results/rxnorm/kgtk_triples/nodes/qrxnode_pnode_rxnorm.tsv -o ../results/rxnorm/kgtk_triples_compact/nodes/qrxnode_pnode_rxnorm_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/nodes/qrxnode_prxnode_rxnorm.tsv -o ../results/rxnorm/kgtk_triples_compact/nodes/qrxnode_prxnode_rxnorm_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/nodes/qnode_pnode_rxnorm.tsv -o ../results/rxnorm/kgtk_triples_compact/nodes/qnode_pnode_rxnorm_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/nodes/qnode_prxnode_rxnorm.tsv -o ../results/rxnorm/kgtk_triples_compact/nodes/qnode_prxnode_rxnorm_compact.tsv

In [55]:
#Perform KGTK Compact Transformation for FDA KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk compact -i ../results/fda/kgtk_triples/nodes/qrxnode_pnode_fda.tsv -o ../results/fda/kgtk_triples_compact/nodes/qrxnode_pnode_fda_compact.tsv
!kgtk compact -i ../results/fda/kgtk_triples/nodes/qrxnode_prxnode_fda.tsv -o ../results/fda/kgtk_triples_compact/nodes/qrxnode_prxnode_fda_compact.tsv
!kgtk compact -i ../results/fda/kgtk_triples/nodes/qnode_pnode_fda.tsv -o ../results/fda/kgtk_triples_compact/nodes/qnode_pnode_fda_compact.tsv
!kgtk compact -i ../results/fda/kgtk_triples/nodes/qnode_prxnode_fda.tsv -o ../results/fda/kgtk_triples_compact/nodes/qnode_prxnode_fda_compact.tsv

In [56]:
#Perform KGTK Compact Transformation for RXNorm KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk compact -i ../results/rxnorm/kgtk_triples/properties/prxnode_edges_rxnorm.tsv -o ../results/rxnorm/kgtk_triples_compact/properties/prxnode_edges_rxnorm_compact.tsv

In [57]:
#Perform KGTK Compact Transformation for FDA KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk compact -i ../results/fda/kgtk_triples/properties/prxnode_edges_fda.tsv -o ../results/fda/kgtk_triples_compact/properties/prxnode_edges_fda_compact.tsv

# **Step 7B: Perform KGTK ADD-ID Transformation**
1. Perform KGTK ADD-ID Transformation for RXNorm KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
2. Perform KGTK ADD-ID Transformation for FDA KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
3. Perform KGTK ADD-ID Transformation for RXNorm KGTK triples PROPERTIES:<br>
      1 File- PRXNode
4. Perform KGTK ADD-ID Transformation for FDA KGTK triples PROPERTIES:<br>
      1 File- PRXNode


In [58]:
#Perform KGTK Add-ID Transformation for RXNorm KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/nodes/qrxnode_pnode_rxnorm_compact.tsv -o ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/nodes/qrxnode_prxnode_rxnorm_compact.tsv -o ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/nodes/qnode_pnode_rxnorm_compact.tsv -o ../results/rxnorm/kgtk_triples_id/nodes/qnode_pnode_rxnorm_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/nodes/qnode_prxnode_rxnorm_compact.tsv -o ../results/rxnorm/kgtk_triples_id/nodes/qnode_prxnode_rxnorm_id.tsv --id-style node1-label-node2-num

In [59]:
#Perform KGTK Add-ID Transformation for FDA KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk add-id -i ../results/fda/kgtk_triples_compact/nodes/qrxnode_pnode_fda_compact.tsv -o ../results/fda/kgtk_triples_id/nodes/qrxnode_pnode_fda_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/fda/kgtk_triples_compact/nodes/qrxnode_prxnode_fda_compact.tsv -o ../results/fda/kgtk_triples_id/nodes/qrxnode_prxnode_fda_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/fda/kgtk_triples_compact/nodes/qnode_pnode_fda_compact.tsv -o ../results/fda/kgtk_triples_id/nodes/qnode_pnode_fda_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/fda/kgtk_triples_compact/nodes/qnode_prxnode_fda_compact.tsv -o ../results/fda/kgtk_triples_id/nodes/qnode_prxnode_fda_id.tsv --id-style node1-label-node2-num

In [60]:
#Perform KGTK Add-ID Transformation for RXNorm KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/properties/prxnode_edges_rxnorm_compact.tsv -o ../results/rxnorm/kgtk_triples_id/properties/prxnode_edges_rxnorm_id.tsv --id-style node1-label-node2-num

In [61]:
#Perform KGTK Compact Transformation for FDA KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk add-id -i ../results/fda/kgtk_triples_compact/properties/prxnode_edges_fda_compact.tsv -o ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_id.tsv --id-style node1-label-node2-num

# **Step 7C: Perform KGTK Validate**
1. Perform KGTK Validate for RXNorm KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
2. Perform KGTK Validate for FDA KGTK triples NODES:<br>
      4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
3. Perform KGTK Validate for RXNorm KGTK triples PROPERTIES:<br>
      1 File- PRXNode
4. Perform KGTK Validate for FDA KGTK triples PROPERTIES:<br>
      1 File- PRXNode


In [62]:
#Perform KGTK Validate Operation for RXNorm KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/nodes/qnode_pnode_rxnorm_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/nodes/qnode_prxnode_rxnorm_id.tsv -v


Validating '../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Validated 2164023 data lines

Validating '../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Validated 4863938 data lines

Validating '../results/rxnorm/kgtk_triples_id/nodes/qnode_pnode_rxnorm_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/rxnorm/kgtk_triples_id/nodes/qnode_

In [63]:
#Perform KGTK Validate Operation for FDA KGTK triples NODES:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk validate -i ../results/fda/kgtk_triples_id/nodes/qrxnode_pnode_fda_id.tsv -v
!kgtk validate -i ../results/fda/kgtk_triples_id/nodes/qrxnode_prxnode_fda_id.tsv -v
!kgtk validate -i ../results/fda/kgtk_triples_id/nodes/qnode_pnode_fda_id.tsv -v
!kgtk validate -i ../results/fda/kgtk_triples_id/nodes/qnode_prxnode_fda_id.tsv -v

Output hidden; open in https://colab.research.google.com to view.

In [64]:
#Perform KGTK Validate Operation for RXNorm KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/properties/prxnode_edges_rxnorm_id.tsv -v


Validating '../results/rxnorm/kgtk_triples_id/properties/prxnode_edges_rxnorm_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/rxnorm/kgtk_triples_id/properties/prxnode_edges_rxnorm_id.tsv
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Validated 124 data lines


In [65]:
#Perform KGTK Validate Operation for FDA KGTK triples PROPERTIES:
#1 Files- PRXNode
!kgtk validate -i ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_id.tsv -v


Validating '../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_id.tsv
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Validated 380 data lines


# **Step 8: Generate KGTK Triples for FDA and RXNORM Qualifiers**
Uses the FDA and RXNORM KGTK Triples with IDs [created in Step 7] to generate Qualifier Edges


# **Step 8A: Generate KGTK Triples for FDA Qualifiers**
1. Generate KGTK Triples for Qualifiers Related to Active Ingredients in Drug-NDC, Products information in Drugs@FDA and Submissions Information in Drugs@FDA
2. Get the qualifiers for 2 files:
      QNODE_PRXNODE_QUALIFIER and QRXNODE_PRXNODE_QUALIFIER
3. Write the results to these 2 files


In [66]:
#Function: Get key from given value and dictionary name
def get_key(val, input_dict):
    for key, value in input_dict.items():
         if val == value:
             return key

In [67]:
#Create a list for output rows of QRXNode PRXNode FDA Qualifiers
output_rows_qrxnode_prxnode_fda_qualifier=[]
#Create a list for output rows of QNode PRXNode FDA Qualifiers
output_rows_qnode_prxnode_fda_qualifier=[]

#Create a dict to hold Predicates NOT in Wikidata resulting from Qualifiers
pred_notinwikidata_dict_fda_qualifier={}

#Get qualifiers from QRXNode PRXNode FDA file
with open('../results/fda/kgtk_triples_id/nodes/qrxnode_prxnode_fda_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    #Get header
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(submisions_dict)
    #print(products_dict)
    #print(active_ingredients_dict)
   
    for row in reader:
        #Check if predicate is Product Number (products_dict qualifiers from Drugs@FDA)
        #Check if predicate is Submission Status Date (submission date qualifiers from Drugs@FDA)
        #Check if predicate is Active Ingredient (active ingredient qualifiers from Product-NDC)
        if row[1] in ('PRX_FDA_product_number','PRX_FDA_submission_status_date', 'PRX_FDA_active_ingredients'):
          #print(row[0], row[2])

          #Get the RXNormID
          rxnode=row[0].split('QRX',1)[1]
          
          if row[1]=='PRX_FDA_product_number':
            #Get the Product Number to get Product Dict Key
            prodno=row[2].split('@',1)[0].replace("'","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)

            #Check if key is in Product Dict Key
            if rxnode_key in products_dict:
              #Get associated triples for qualifiers
              product_associated_triples=products_dict[rxnode_key]
              for val in product_associated_triples:
                output_row=[]
                #Get triples
                output_row.append(row[3])
                
                pred=val[1]
                if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
                else:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
                
                pred=pred_notinwikidata_dict_fda_qualifier[pred]
                output_row.append(pred)
                output_row.append("\""+val[2]+"\"")

                #Append to list of all qualifier triples
                output_rows_qrxnode_prxnode_fda_qualifier.append(output_row)
          elif row[1]=='PRX_FDA_submission_status_date':
            #Get the submission date for Submissions dict key 
            prodno=row[2].split('^',1)[1].replace("-","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)

            #Check if key is in Submissions Dictionary
            if rxnode_key in submisions_dict:
              #Get associated triples for qualifiers
              sub_associated_triples=submisions_dict[rxnode_key]
              for val in product_associated_triples:
                output_row=[]
                #Get triples
                output_row.append(row[3])
                pred=val[1]
                if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
                else:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
                
                pred=pred_notinwikidata_dict_fda_qualifier[pred]
                output_row.append(pred)
                output_row.append("\""+val[2]+"\"")
                #Append to list to all qualifier triples
                output_rows_qrxnode_prxnode_fda_qualifier.append(output_row)
          else:
            #Get the active ingredient name for key
            prodno=row[2].split('@',1)[0].replace("'","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)

            #Check if key is in active ingredients dict
            if rxnode_key in active_ingredients_dict:
              val=active_ingredients_dict[rxnode_key]
              
              output_row=[]
              output_row.append(row[3])
              pred='strength'
              if pred in identifier_source_list:
                pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
              else:
                pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
              
              pred=pred_notinwikidata_dict_fda_qualifier[pred]
              output_row.append(pred)
              output_row.append("\""+val+"\"")
              output_rows_qrxnode_prxnode_fda_qualifier.append(output_row)

#print(output_rows_qrxnode_prxnode_fda_qualifier)

#Get qualifiers from QNode PRXNode FDA file
with open('../results/fda/kgtk_triples_id/nodes/qnode_prxnode_fda_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(submisions_dict)
    #print(products_dict)
   
    for row in reader:
        
        if row[1] in ('PRX_FDA_product_number','PRX_FDA_submission_status_date', 'PRX_FDA_active_ingredients'):
          #print(row[0], row[2])
          rxnode=get_key(row[0],qnode_dict_inwiki)
          if row[1]=='PRX_FDA_product_number':
            prodno=row[2].split('@',1)[0].replace("'","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)
            if rxnode_key in products_dict:
              product_associated_triples=products_dict[rxnode_key]
              for val in product_associated_triples:
                output_row=[]
                output_row.append(row[3])
                pred=val[1]
                if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
                else:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
                
                pred=pred_notinwikidata_dict_fda_qualifier[pred]
                output_row.append(pred)
                output_row.append("\""+val[2]+"\"")
                output_rows_qnode_prxnode_fda_qualifier.append(output_row)
          elif row[1]=='PRX_FDA_submission_status_date':
            prodno=row[2].split('^',1)[1].replace("-","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)
            if rxnode_key in submisions_dict:
              sub_associated_triples=submisions_dict[rxnode_key]
              for val in product_associated_triples:
                output_row=[]
                output_row.append(row[3])
                pred=val[1]
                if pred in identifier_source_list:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
                else:
                  pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
                
                pred=pred_notinwikidata_dict_fda_qualifier[pred]
                output_row.append(pred)
                output_row.append("\""+val[2]+"\"")
                output_rows_qnode_prxnode_fda_qualifier.append(output_row)
          else:
            prodno=row[2].split('@',1)[0].replace("'","")
            rxnode_key=rxnode+"_"+prodno
            #print(rxnode_key)
            if rxnode_key in active_ingredients_dict:
              val=active_ingredients_dict[rxnode_key]
              
              output_row=[]
              output_row.append(row[3])
              pred='strength'
              if pred in identifier_source_list:
                pred_notinwikidata_dict_fda_qualifier[pred]="PRX_ID_"+pred
              else:
                pred_notinwikidata_dict_fda_qualifier[pred]="PRX_FDA_"+pred 
              
              pred=pred_notinwikidata_dict_fda_qualifier[pred]
              output_row.append(pred)
              output_row.append("\""+val+"\"")
              output_rows_qnode_prxnode_fda_qualifier.append(output_row)

#print(output_rows_qnode_prxnode_fda_qualifier)


In [68]:
#Write the results to FDA- QRXNode_PRXNode Qualifier file and QNode_PRXNode Qualifier File
write_tsv("../results/fda/kgtk_triples/qualifiers/qrxnode_prxnode_fda_qualifier.tsv", output_rows_qrxnode_prxnode_fda_qualifier)
write_tsv("../results/fda/kgtk_triples/qualifiers/qnode_prxnode_fda_qualifier.tsv", output_rows_qnode_prxnode_fda_qualifier)

# **Step 8B: Generate KGTK Triples for RXNORM Qualifiers**
1. Generate KGTK Triples for Qualifiers Related to RXNORM Identifiers
2. Get the qualifiers for 4 files:
      QNODE_PRXNODE_QUALIFIER, QRXNODE_PRXNODE_QUALIFIER, QNODE_PNODE_QUALIFIER, QRXNODE_PNODE_QUALIFIER
3. Write the results to these 4 files


In [69]:
#Create a list for output rows of QRXNode PRXNode RXNorm Qualifiers
output_rows_qrxnode_prxnode_rxnorm_qualifier=[]
#Create a list for output rows of QRXNode PNode RXNorm Qualifiers
output_rows_qrxnode_pnode_rxnorm_qualifier=[]

#Create a list for output rows of QNode PRXNode RXNORM Qualifiers
output_rows_qnode_prxnode_rxnorm_qualifier=[]
#Create a list for output rows of QNode PNode RXNORM Qualifiers
output_rows_qnode_pnode_rxnorm_qualifier=[]

#Get Identifier related Qualifiers from QRXNode_PRXNode RXNorm file
with open('../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(rxcui_identifier_qual_dict)
   
    for row in reader:
      if row[1] in ("PRX_ID_NDDF", "PRX_ID_MMSL", "PRX_ID_VANDF", "PRX_ID_MMSX", "PRX_ID_MTHSPL", "PRX_ID_GS"):
        #print(row[0], row[1])
        rxnode=row[0].split('QRX',1)[1]
        identifier=row[1].split('PRX_ID_',1)[1]
        rxnode_key=rxnode+"_"+identifier
        if rxnode_key in rxcui_identifier_qual_dict:
          alias_list=rxcui_identifier_qual_dict[rxnode_key]
          for val in alias_list:
            output_row=[]
            output_row.append(row[3])
            output_row.append('alias')
            output_row.append(val)
            output_rows_qrxnode_prxnode_rxnorm_qualifier.append(output_row)

#print(output_rows_qrxnode_prxnode_rxnorm_qualifier)   

#Get Identifier related Qualifiers from QRXNode_PNode RXNorm file
with open('../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(rxcui_identifier_qual_dict)
   
    for row in reader:
      if row[1] in ("P5806", "P486", "P715", "P267"):
        #print(row[0], row[1])
        
        rxnode=row[0].split('QRX',1)[1]
        identifier=get_key(row[1], pred_wikidata_dict)
        rxnode_key=rxnode+"_"+identifier
        #print(rxnode_key)
        if rxnode_key in rxcui_identifier_qual_dict:
          alias_list=rxcui_identifier_qual_dict[rxnode_key]
          for val in alias_list:
            output_row=[]
            output_row.append(row[3])
            output_row.append('alias')
            output_row.append(val)
            output_rows_qrxnode_pnode_rxnorm_qualifier.append(output_row)
            

#print(output_rows_qrxnode_pnode_rxnorm_qualifier)   

#Get Identifier related Qualifiers from QNode_PRXNode RXNorm file
with open('../results/rxnorm/kgtk_triples_id/nodes/qnode_prxnode_rxnorm_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(rxcui_identifier_qual_dict)
   
    for row in reader:
      if row[1] in ("PRX_ID_NDDF", "PRX_ID_MMSL", "PRX_ID_VANDF", "PRX_ID_MMSX", "PRX_ID_MTHSPL", "PRX_ID_GS"):
        #print(row[0], row[1])
        rxnode=get_key(row[0],qnode_dict_inwiki)
        identifier=row[1].split('PRX_ID_',1)[1]
        rxnode_key=rxnode+"_"+identifier
        if rxnode_key in rxcui_identifier_qual_dict:
          alias_list=rxcui_identifier_qual_dict[rxnode_key]
          for val in alias_list:
            output_row=[]
            output_row.append(row[3])
            output_row.append('alias')
            output_row.append(val)
            output_rows_qnode_prxnode_rxnorm_qualifier.append(output_row)

#print(output_rows_qnode_prxnode_rxnorm_qualifier) 

#Get Identifier related Qualifiers from QNode_PNode RXNorm file
with open('../results/rxnorm/kgtk_triples_id/nodes/qnode_pnode_rxnorm_id.tsv', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    
    header_list = next(reader)
    #print("Header=", header_list)
    
    #umlscui_dict={}
    #print(rxcui_identifier_qual_dict)
   
    for row in reader:
      if row[1] in ("P5806", "P486", "P715", "P267"):
        #print(row[0], row[1])
        
        rxnode=get_key(row[0],qnode_dict_inwiki)
        identifier=get_key(row[1], pred_wikidata_dict)
        rxnode_key=rxnode+"_"+identifier
        #print(rxnode_key)
        if rxnode_key in rxcui_identifier_qual_dict:
          alias_list=rxcui_identifier_qual_dict[rxnode_key]
          for val in alias_list:
            output_row=[]
            output_row.append(row[3])
            output_row.append('alias')
            output_row.append(val)
            output_rows_qnode_pnode_rxnorm_qualifier.append(output_row)
            

#print(output_rows_qnode_pnode_rxnorm_qualifier)

In [70]:
#Write the results to RXNORM- QRXNode_PRXNode Qualifier file and QNode_PRXNode Qualifier File
write_tsv("../results/rxnorm/kgtk_triples/qualifiers/qrxnode_prxnode_rxnorm_qualifier.tsv", output_rows_qrxnode_prxnode_rxnorm_qualifier)
write_tsv("../results/rxnorm/kgtk_triples/qualifiers/qnode_prxnode_rxnorm_qualifier.tsv", output_rows_qnode_prxnode_rxnorm_qualifier)

#Write the results to RXNORM- QRXNode_PNode Qualifier file and QNode_PNode Qualifier File
write_tsv("../results/rxnorm/kgtk_triples/qualifiers/qrxnode_pnode_rxnorm_qualifier.tsv", output_rows_qrxnode_pnode_rxnorm_qualifier)
write_tsv("../results/rxnorm/kgtk_triples/qualifiers/qnode_pnode_rxnorm_qualifier.tsv", output_rows_qnode_pnode_rxnorm_qualifier)

# **Step 8C: Generate KGTK Triples for FDA Additional PROPERTIES from Qualifiers**
1. Generate KGTK Triples for Properties Related to FDA Properties
2. Get the properties:<br>
      PRXNODE_QUALIFIER_FDA<br>
      PRXNODE_edges_QUALIFIER_FDA<br>
      PRXNODE_datatype_QUALIFIER_FDA<br>
3. Write the results to these 3 files


In [71]:
#Create a list for output rows of properties related to qualifiers from FDA, properties edges and data types
output_rows_prxnode_fda_qualifier=[]
output_rows_prxnode_edges_fda_qualifier=[]
output_rows_prxnode_datatype_fda_qualifier=[]
for x in pred_notinwikidata_dict_fda_qualifier.keys():
    
    #print(x)
    
    output_row=[]
    pred_value=pred_notinwikidata_dict_fda_qualifier[x]

    #Get label
    output_row.append(pred_value)
    output_row.append("label")
    output_row.append("\'"+x+"\'"+"@en")
    output_rows_prxnode_fda_qualifier.append(output_row)
    output_rows_prxnode_edges_fda_qualifier.append(output_row)
    
    output_row=[]
    output_row.append(pred_value)
    #Get description
    output_row.append("description")
    output_row.append("\'"+x+"\'"+"@en")
    output_rows_prxnode_fda_qualifier.append(output_row)
    output_rows_prxnode_edges_fda_qualifier.append(output_row)
    
    output_row=[]
    #Get datatype
    output_row.append(pred_value)
    output_row.append("data_type")
    if ("PRX_ID_" in pred_value):
        output_row.append("\""+"external-identifier"+"\"")
    elif ("PRX_FDA_" in pred_value):
        output_row.append("\""+"string"+"\"")
    #output_row.append("\"String\"")
    output_rows_prxnode_fda_qualifier.append(output_row)
    output_rows_prxnode_datatype_fda_qualifier.append(output_row)

#print(output_rows_prxnode_fda_qualifier)

In [72]:
#Write the results to FDA- PRXNode Qualifier File, Edges and Datatype
write_tsv("../results/fda/kgtk_triples/properties/prxnode_fda_qualifier.tsv", output_rows_prxnode_fda_qualifier)
write_tsv("../results/fda/kgtk_triples/properties/prxnode_edges_fda_qualifier.tsv", output_rows_prxnode_edges_fda_qualifier)
write_tsv("../results/fda/kgtk_triples/properties/prxnode_datatype_fda_qualifier.tsv", output_rows_prxnode_datatype_fda_qualifier)

# **Step 9: Perform KGTK Transformations and Validation for RXNORM AND FDA QUALIFIERS**



# **Step 9A: Perform KGTK Compact Transformation**
1. Perform KGTK Compact Transformation for RXNORM Qualifiers
2. Perform KGTK Compact Transformation for FDA Qualifiers
3. Perform KGTK Compact Transformation for FDA Properties related to Qualifiers


In [73]:
#Perform KGTK Compact Transformation for RXNorm KGTK triples QUALIFIERS:
#4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk compact -i ../results/rxnorm/kgtk_triples/qualifiers/qrxnode_pnode_rxnorm_qualifier.tsv -o ../results/rxnorm/kgtk_triples_compact/qualifiers/qrxnode_pnode_rxnorm_qualifier_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/qualifiers/qrxnode_prxnode_rxnorm_qualifier.tsv -o ../results/rxnorm/kgtk_triples_compact/qualifiers/qrxnode_prxnode_rxnorm_qualifier_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/qualifiers/qnode_pnode_rxnorm_qualifier.tsv -o ../results/rxnorm/kgtk_triples_compact/qualifiers/qnode_pnode_rxnorm_qualifier_compact.tsv
!kgtk compact -i ../results/rxnorm/kgtk_triples/qualifiers/qnode_prxnode_rxnorm_qualifier.tsv -o ../results/rxnorm/kgtk_triples_compact/qualifiers/qnode_prxnode_rxnorm_qualifier_compact.tsv

^C


In [74]:
#Perform KGTK Compact Transformation for FDA KGTK triples QUALIFIERS:
#2 Files- QRXNode_PRXNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk compact -i ../results/fda/kgtk_triples/qualifiers/qrxnode_prxnode_fda_qualifier.tsv -o ../results/fda/kgtk_triples_compact/qualifiers/qrxnode_prxnode_fda_qualifier_compact.tsv
!kgtk compact -i ../results/fda/kgtk_triples/qualifiers/qnode_prxnode_fda_qualifier.tsv -o ../results/fda/kgtk_triples_compact/qualifiers/qnode_prxnode_fda_qualifier_compact.tsv

In [None]:
#Perform KGTK Compact Transformation for FDA KGTK triples PROPERTIES qualifier related:
#1 Files- PRXNode_edges Qualifier
!kgtk compact -i ../results/fda/kgtk_triples/properties/prxnode_edges_fda_qualifier.tsv -o ../results/fda/kgtk_triples_compact/properties/prxnode_edges_fda_qualifier_compact.tsv

# **Step 9B: Perform KGTK ADD-ID Transformation**
1. Perform KGTK ADD-ID Transformation for RXNORM Qualifiers
2. Perform KGTK ADD-ID Transformation for FDA Qualifiers
3. Perform KGTK ADD-ID Transformation for FDA Properties related to Qualifiers


In [76]:
#Perform KGTK Add-id Transformation for RXNorm KGTK triples QUALIFIERS:
#4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/qualifiers/qrxnode_pnode_rxnorm_qualifier_compact.tsv -o ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_pnode_rxnorm_qualifier_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/qualifiers/qrxnode_prxnode_rxnorm_qualifier_compact.tsv -o ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_prxnode_rxnorm_qualifier_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/qualifiers/qnode_pnode_rxnorm_qualifier_compact.tsv -o ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_pnode_rxnorm_qualifier_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/rxnorm/kgtk_triples_compact/qualifiers/qnode_prxnode_rxnorm_qualifier_compact.tsv -o ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_prxnode_rxnorm_qualifier_id.tsv --id-style node1-label-node2-num

No header line in file


In [77]:
#Perform KGTK Add-id Transformation for FDA KGTK triples QUALIFIERS:
#2 Files- QRXNode_PRXNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk add-id -i ../results/fda/kgtk_triples_compact/qualifiers/qrxnode_prxnode_fda_qualifier_compact.tsv -o ../results/fda/kgtk_triples_id/qualifiers/qrxnode_prxnode_fda_qualifier_id.tsv --id-style node1-label-node2-num
!kgtk add-id -i ../results/fda/kgtk_triples_compact/qualifiers/qnode_prxnode_fda_qualifier_compact.tsv -o ../results/fda/kgtk_triples_id/qualifiers/qnode_prxnode_fda_qualifier_id.tsv --id-style node1-label-node2-num

In [None]:
#Perform KGTK Add-id Transformation for FDA KGTK triples PROPERTIES qualifier related:
#1 Files- PRXNode_edges Qualifier
!kgtk add-id -i ../results/fda/kgtk_triples_compact/properties/prxnode_edges_fda_qualifier_compact.tsv -o ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_qualifier_id.tsv --id-style node1-label-node2-num

# **Step 9C: Perform KGTK Validate**
1. Perform KGTK Validate for RXNORM Qualifiers
2. Perform KGTK Validate Transformation for FDA Qualifiers
3. Perform KGTK Validate Transformation for FDA Properties related to Qualifiers


In [None]:
#Perform KGTK Validate Operation for RXNorm KGTK triples QUALIFIERS:
#4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_pnode_rxnorm_qualifier_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_prxnode_rxnorm_qualifier_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_pnode_rxnorm_qualifier_id.tsv -v
!kgtk validate -i ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_prxnode_rxnorm_qualifier_id.tsv -v

In [80]:
#Perform KGTK Validate operation for FDA KGTK triples QUALIFIERS:
#2 Files-  QRXNode_PRXNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk validate -i ../results/fda/kgtk_triples_id/qualifiers/qrxnode_prxnode_fda_qualifier_id.tsv -v
!kgtk validate -i ../results/fda/kgtk_triples_id/qualifiers/qnode_prxnode_fda_qualifier_id.tsv -v


Validating '../results/fda/kgtk_triples_id/qualifiers/qrxnode_prxnode_fda_qualifier_id.tsv'
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file ../results/fda/kgtk_triples_id/qualifiers/qrxnode_prxnode_fda_qualifier_id.tsv
header: node1	label	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
KgtkValue.strict_string_re.match failed for "{'strength': '0.146%', 'name': "RIBOFLAVIN 5'-PHOSPHATE SODIUM"}"
Data line 314780:
QRX2002505-PRX_FDA_product_number-'001'@en-0000	PRX_FDA_active_ingredients	"{'strength': '0.146%', 'name': "RIBOFLAVIN 5'-PHOSPHATE SODIUM"}"	QRX2002505-PRX_FDA_product_number-'001'@en-0000-PRX_FDA_active_ingredients-"{'strength': '0.146%', 'name': "RIBOFLAVIN 5'-PHOSPHATE SODIUM"}"-0000
col 2 (node2) value '"{\'strength\': \'0.146%\', \'name\': "RIBOFLAVIN 5\'-PHOSPHATE SODIUM"}"': 
col 2 (node2) value '"{\'strength\': \'0.146%\', \'name\': "RIBOFLAVIN 5\'-PHOSPHATE S

In [None]:
#Perform KGTK Validate Operation for FDA KGTK triples PROPERTIES qualifiers:
#1 Files- PRXNode_Qualifier
!kgtk validate -i ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_qualifier_id.tsv -v

# **Step 10: Generate Triples for Ingestion**


# **Step 10A: Merge RXNORM KGTK Triples for Edges and Property-DataType**
1. Perform KGTK Concatenate Transformation for RXNorm KGTK triples- NODES edges:<br>
4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
2. Perform KGTK Concatenate Transformation for RXNorm KGTK triples- QUALIFIERS edges:<br>
4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
3. Perform KGTK Concatenate Transformation for RXNorm KGTK triples- PROPERTIES edges:<br>
1 Files- PRXNode_edges
4. Perform KGTK Concatenate Transformation for RXNorm KGTK triples- PPROPERTIES datatype:<br>
1 Files- PRXNode_datatype


In [82]:
#Perform KGTK Concatenate Transformation for RXNorm KGTK triples- NODES edges:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk cat -i ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_pnode_rxnorm_id.tsv ../results/rxnorm/kgtk_triples_id/nodes/qrxnode_prxnode_rxnorm_id.tsv ../results/rxnorm/kgtk_triples_id/nodes/qnode_pnode_rxnorm_id.tsv ../results/rxnorm/kgtk_triples_id/nodes/qnode_prxnode_rxnorm_id.tsv -o ../results/rxnorm/kgtk_merged/nodes_edges_merged_rxnorm.tsv

In [83]:
#Perform KGTK Concatenate Transformation for RXNorm KGTK triples- QUALIFIERS edges:
#4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk cat -i ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_pnode_rxnorm_qualifier_id.tsv ../results/rxnorm/kgtk_triples_id/qualifiers/qrxnode_prxnode_rxnorm_qualifier_id.tsv ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_pnode_rxnorm_qualifier_id.tsv ../results/rxnorm/kgtk_triples_id/qualifiers/qnode_prxnode_rxnorm_qualifier_id.tsv -o ../results/rxnorm/kgtk_merged/qualifiers_edges_merged_rxnorm.tsv

In [84]:
#Perform KGTK Concatenate Transformation for RXNorm KGTK triples- PROPERTIES edges and PROPERTIES datatype:
#1 Files- PRXNode
!kgtk cat -i ../results/rxnorm/kgtk_triples_id/properties/prxnode_edges_rxnorm_id.tsv -o ../results/rxnorm/kgtk_merged/properties_edges_merged_rxnorm.tsv
!kgtk cat -i ../results/rxnorm/kgtk_triples/properties/prxnode_datatype_rxnorm.tsv -o ../results/rxnorm/kgtk_merged/properties_datatype_merged_rxnorm.tsv

# **Step 10B: Merge FDA KGTK Triples for Edges and Property-DataType**
1. Perform KGTK Concatenate Transformation for FDA KGTK triples- NODES edges:<br>
4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
2. Perform KGTK Concatenate Transformation for FDA KGTK triples- QUALIFIERS edges:<br>
4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
3. Perform KGTK Concatenate Transformation for FDA KGTK triples- PROPERTIES edges:<br>
2 Files- PRXNode_edges, PRXNode_edges_Qualifier
4. Perform KGTK Concatenate Transformation for FDA KGTK triples- PPROPERTIES datatype:<br>
2 Files- PRXNode_datatype, PRXNode_datatype_Qualifier

In [85]:
#Perform KGTK Concatenate Transformation for FDA KGTK triples- NODES edges:
#4 Files- QRXNode_PNode, QRXNode_PRXNode, QNode_PNode, QNode_PRXNode
!kgtk cat -i ../results/fda/kgtk_triples_id/nodes/qrxnode_pnode_fda_id.tsv ../results/fda/kgtk_triples_id/nodes/qrxnode_prxnode_fda_id.tsv ../results/fda/kgtk_triples_id/nodes/qnode_pnode_fda_id.tsv ../results/fda/kgtk_triples_id/nodes/qnode_prxnode_fda_id.tsv -o ../results/fda/kgtk_merged/nodes_edges_merged_fda.tsv

In [86]:
#Perform KGTK Concatenate Transformation for FDA KGTK triples- QUALIFIERS edges:
#4 Files- QRXNode_PNode_Qualifier, QRXNode_PRXNode_Qualifier, QNode_PNode_Qualifier, QNode_PRXNode_Qualifier
!kgtk cat -i ../results/fda/kgtk_triples_id/qualifiers/qrxnode_prxnode_fda_qualifier_id.tsv ../results/fda/kgtk_triples_id/qualifiers/qnode_prxnode_fda_qualifier_id.tsv -o ../results/fda/kgtk_merged/qualifiers_edges_merged_fda.tsv

In [87]:
#Perform KGTK Concatenate Transformation for FDA KGTK triples- PROPERTIES edges and PROPERTIES datatype:
#2 Files- PRXNode, PRXNode_Qualifier
!kgtk cat -i ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_id.tsv ../results/fda/kgtk_triples_id/properties/prxnode_edges_fda_qualifier_id.tsv -o ../results/fda/kgtk_merged/properties_edges_merged_fda.tsv
!kgtk cat -i ../results/fda/kgtk_triples/properties/prxnode_datatype_fda.tsv ../results/fda/kgtk_triples/properties/prxnode_datatype_fda_qualifier.tsv -o ../results/fda/kgtk_merged/properties_datatype_merged_fda.tsv

# **Step 10C: Merge RXNORM All Edges and FDA All Edges [Nodes+Properties+Qualifiers]**
1. Perform KGTK Concatenate Transformation for RXNROM KGTK triples- ALL edges:<br>
3 Files- NODES_EDGES, PROPERTIES_EDGES and QUALFIERS_EDGES
2. Perform KGTK Concatenate Transformation for FDA KGTK triples- ALL edges:<br>
3 Files- NODES_EDGES, PROPERTIES_EDGES and QUALFIERS_EDGES

In [88]:
#Perform KGTK Concatenate Transformation for RXNorm KGTK triples- NODES edges, PROPERTIES edges and QUALIFIERS edges:
#3 Files- Node_Edges_Merged, Properties_Edges_Merged, Qualifiers_Edges_Merged
!kgtk cat -i ../results/rxnorm/kgtk_merged/nodes_edges_merged_rxnorm.tsv ../results/rxnorm/kgtk_merged/properties_edges_merged_rxnorm.tsv ../results/rxnorm/kgtk_merged/qualifiers_edges_merged_rxnorm.tsv -o ../results/merged_ingestion/intermediate/nodes_properties_qualifiers_alledges_rxnorm.tsv

In [89]:
#Perform KGTK Concatenate Transformation for FDA KGTK triples- NODES edges, PROPERTIES edges and QUALIFIERS edges:
#3 Files- Node_Edges_Merged, Properties_Edges_Merged, Qualifiers_Edges_Merged
!kgtk cat -i ../results/fda/kgtk_merged/nodes_edges_merged_fda.tsv ../results/fda/kgtk_merged/properties_edges_merged_fda.tsv ../results/fda/kgtk_merged/qualifiers_edges_merged_fda.tsv -o ../results/merged_ingestion/intermediate/nodes_properties_qualifiers_alledges_fda.tsv

# **Step 10D: FINAL Merge for RXNORM+FDA Combined- ALL Edges AND Property DataType**
1. Perform KGTK Concatenate Transformation for ALL edges:<br>
2 Files- ALLEDGES_RXNORM, ALLEDGES_FDA
2. Perform KGTK Concatenate Transformation for ALL Property DataType:<br>
2 Files- PRXNODE_DATATYPE_RXNORM, PRXNODE_DATATYPE_FDA

In [90]:
#Perform FINAL KGTK Concatenate Transformation for Merged KGTK triples- RXNORM ALL Datatypes and FDA ALL Datatypes:
#2 Files- RXNorm_Properties_Merged, FDA_Properties_Merged
!kgtk cat -i ../results/rxnorm/kgtk_merged/properties_datatype_merged_rxnorm.tsv ../results/fda/kgtk_merged/properties_datatype_merged_fda.tsv -o ../results/merged_ingestion/final/properties_datatype_merged.tsv

In [91]:
#Perform FINAL KGTK Concatenate Transformation for Merged KGTK triples- RXNORM ALL Edges and FDA ALL Edges:
#2 Files- RXNorm_AllEdges_Merged, FDA_AllEdges_Merged
!kgtk cat -i ../results/merged_ingestion/intermediate/nodes_properties_qualifiers_alledges_rxnorm.tsv ../results/merged_ingestion/intermediate/nodes_properties_qualifiers_alledges_fda.tsv -o ../results/merged_ingestion/final/nodes_properties_qualifiers_alledges_merged.tsv