In [5]:
import requests
import json
import pandas as pd

# Gene symbols to query; get_variant_data sends one request per entry.
# Each symbol is listed exactly once: a repeated symbol would be requested
# twice and its rows would appear twice in the combined table.
genes = ['CYP2D6', 'ESR1', 'COMT',
         'CYP3A5', 'ABCB1', 'CYP2C19',
         'UGT2B7', 'SLC22A16', 'TPMT',
         'NR1I2', 'NAT2', 'CYP1A2',
         'CYP2C9', 'SULT1A1', 'UGT1A4',
         'UGT1A8', 'UGT1A9', 'UGT2B15',
         'DPYD', 'TYMS', 'RRM1']

# variant annotation API
def get_variant_data(genes, url):
    """Download variant annotations for several genes and combine them.

    One GET request is sent to ``url`` per gene, filtering on
    ``location.genes.symbol`` with ``view=base``. The ``"data"`` entry of each
    JSON reply is flattened with ``pd.json_normalize`` and the per-gene tables
    are stacked into a single DataFrame.

    Parameters
    ----------
    genes : iterable of str
        Gene symbols to query, one request each.
    url : str
        Endpoint that receives the requests.

    Returns
    -------
    pandas.DataFrame
        Rows of every gene that was fetched successfully, in request order.
        Row labels restart at 0 for each gene. An empty DataFrame is returned
        when no gene could be fetched (or ``genes`` is empty).

    Notes
    -----
    The function is best-effort: a gene whose request fails, returns an HTTP
    error status, or yields a body without a usable ``"data"`` entry is
    reported on stdout and skipped; the remaining genes are still processed.
    """
    # Per-gene tables are collected here and concatenated once at the end.
    # (DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop copies all previous rows on every iteration.)
    frames = []

    for gene in genes:
        # Set the parameters for the API request
        params = {
            "location.genes.symbol": gene,
            "view": "base"
        }

        try:
            # The timeout keeps one stalled connection from blocking the loop
            response = requests.get(url, params=params, timeout=30)

            # Treat 4xx/5xx replies as failures instead of parsing an error body
            response.raise_for_status()

            # Parse the JSON response and flatten its records into columns
            gene_data = pd.json_normalize(response.json()["data"])
        except (requests.RequestException, ValueError, KeyError, TypeError) as err:
            # Only request/parse problems are handled here; anything else is
            # a programming error and is allowed to propagate.
            print(f"Error: API request failed for gene {gene}: {err!r}")
            continue

        #print dimensions of the DataFrame
        print(gene, ":", gene_data.shape)

        frames.append(gene_data)

    if not frames:
        # pd.concat rejects an empty list, so return an empty table explicitly
        return pd.DataFrame()

    return pd.concat(frames)

# Local CSV cache of the variant annotations: the API is only queried when
# the file does not exist yet, so the cell also runs on a fresh checkout.
VARIANT_CSV = 'pharmagkb_variant.csv'

try:
    # Load the cached CSV file into a pandas DataFrame
    all_data = pd.read_csv(VARIANT_CSV)
except FileNotFoundError:
    # No cache yet: download the annotations and write them to the CSV file ...
    get_variant_data(
        genes, "https://api.pharmgkb.org/v1/data/variantAnnotation"
    ).to_csv(VARIANT_CSV, index=False)
    # ... then read the file back, so all_data holds CSV-parsed values
    # whether or not the cache existed beforehand
    all_data = pd.read_csv(VARIANT_CSV)

#print dimensions of the DataFrame
print(all_data.shape)

print(",".join(all_data.columns))

all_data.head()


(8502, 131)
id,accessionId,description,formatVersion,history,isAssociated,isPlural,objCls,pediatric,phenotypeCategories,populationPhenotypes,relatedChemicals,score,scoreDetails,sentence,studyParameters,useForCaScoring,connWords1.id,connWords1.resource,connWords1.term,connWords1.termId,drugsConnWord.id,drugsConnWord.resource,drugsConnWord.term,drugsConnWord.termId,literature.id,literature.title,literature._sameAs,literature.authors,literature.crossReferences,literature.journal,literature.month,literature.objCls,literature.page,literature.pediatric,literature.pgkbPublication,literature.pubDate,literature.terms,literature.volume,literature.year,location.id,location.copyNumber,location.diplotypes,location.displayName,location.genePhenotype.id,location.genePhenotype.resource,location.genePhenotype.term,location.genePhenotype.termId,location.genes,location.gpPosition,location.haplotypes,location.linkedObjects,location.refSeqPosition,location.tagGene,location.type,location.variant.objCls,loca

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,accessionId,description,formatVersion,history,isAssociated,isPlural,objCls,pediatric,phenotypeCategories,...,sequenceLocation.variantHgvs,scoringRationale,location.refSeqCrossReference.name,sequenceLocation.altHgvs.c,sequenceLocation.altHgvs.g,sequenceLocation.altHgvs.name,sequenceLocation.altHgvs.p,alleleGroupingDescription,sequenceLocation.altHgvs.p.desc,sequenceLocation.notes
0,1183684657,PA166145182,"Median (+)R,R-tramadol area under the curve wa...",1,"[{'id': 1450367381, 'date': '2019-01-29T09:19:...",True,False,Variant Drug Annotation,False,"[{'id': 981344088, 'resource': 'Phenotype Cate...",...,,,,,,,,,,
1,1183684661,PA166144530,"Median (+)R,R-tramadol area under the curve wa...",1,"[{'id': 1450367377, 'date': '2019-01-29T09:19:...",True,False,Variant Drug Annotation,False,"[{'id': 981344088, 'resource': 'Phenotype Cate...",...,,,,,,,,,,
2,1450042177,PA166181190,Intrinsic clearance was 11.8% that of CYP2D6*1...,1,"[{'id': 1450042184, 'date': '2018-11-14T11:30:...",True,False,Variant Functional Assay Annotation,False,"[{'id': 981344088, 'resource': 'Phenotype Cate...",...,,,,,,,,,,
3,982047744,PA166143228,A 2 year old boy with an ultrarapid-metabolize...,1,[],True,False,Variant Drug Annotation,False,"[{'id': 981344088, 'resource': 'Phenotype Cate...",...,,,,,,,,,,
4,1451152848,PA166210941,Healthy volunteers were previously phenotyped ...,2,[],True,False,Variant Drug Annotation,False,"[{'id': 981344088, 'resource': 'Phenotype Cate...",...,,,,,,,,,,


In [6]:
# Columns kept for the reduced variant table, in display order
cols_of_interest = [
    "id", "accessionId",
    "alleleGenotype", "geneSymbol",
    "drugsConnWord.term", "phenotypes",
    "location.variant.symbol", "sequenceLocation.variantHgvs",
]

# Select only those columns; every row is retained
all_data_variant = all_data[cols_of_interest]

# Preview the first rows of the reduced table
all_data_variant.head()

Unnamed: 0,id,accessionId,alleleGenotype,geneSymbol,drugsConnWord.term,phenotypes,location.variant.symbol,sequenceLocation.variantHgvs
0,1183684657,PA166145182,,,and,,,
1,1183684661,PA166144530,,,and,,,
2,1450042177,PA166181190,*10,,or,,,
3,982047744,PA166143228,*1xN,,and,,,
4,1451152848,PA166210941,,,,,,


In [7]:
# drug label annotation API
def get_drug_label_data(url, chemical="tamoxifen"):
    """Download the drug label annotations related to one chemical.

    A single GET request is sent to ``url``, filtering on
    ``relatedChemicals.name`` with ``view=base``. The ``"data"`` entry of the
    JSON reply is flattened with ``pd.json_normalize``.

    Parameters
    ----------
    url : str
        Endpoint that receives the request. It is echoed on stdout before
        the request is made.
    chemical : str, optional
        Chemical name used as the filter value. Defaults to ``"tamoxifen"``.

    Returns
    -------
    pandas.DataFrame
        One row per record in the reply. An empty DataFrame is returned when
        the request fails, returns an HTTP error status, or yields a body
        without a usable ``"data"`` entry; the failure is reported on stdout.
    """
    print(url)

    # Set the parameters for the API request
    params = {
        "relatedChemicals.name": chemical,
        "view": "base"
    }

    try:
        # The timeout keeps a stalled connection from hanging the notebook
        response = requests.get(url, params=params, timeout=30)

        # Treat 4xx/5xx replies as failures instead of parsing an error body
        response.raise_for_status()

        # Parse the JSON response and flatten its records into columns
        return pd.json_normalize(response.json()["data"])
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Name the chemical that was queried and the reason for the failure
        print(f"Error: API request failed for chemical {chemical}: {err!r}")
        return pd.DataFrame()

# Fetch the drug label annotations into a DataFrame
all_data_drug_label = get_drug_label_data(url="https://api.pharmgkb.org/v1/data/label/")

# Save the table as CSV, leaving out the row index
all_data_drug_label.to_csv(path_or_buf="pharmagkb_drug_label.csv", index=False)

# Report the table size as (rows, columns)
print(all_data_drug_label.shape)

# Preview the first rows
all_data_drug_label.head()

https://api.pharmgkb.org/v1/data/label/
(5, 28)


Unnamed: 0,objCls,id,name,alternateDrugAvailable,biomarkerStatus,cancerGenome,crossReferences,dosingInformation,history,labelApplications,...,prescribingMarkdown.html,summaryMarkdown.id,summaryMarkdown.html,testing.id,testing.resource,testing.term,testing.termId,textMarkdown.id,textMarkdown.html,fdaPgxAssociation
0,Label Annotation,PA166104819,Annotation of FDA Label for tamoxifen and ESR1...,True,On FDA Biomarker List,True,[],False,"[{'id': 1183681483, 'date': '2013-10-25T11:41:...","[{'id': 1449814265, 'onOpenFda': True, 'pgxPre...",...,<p>Excerpt from the tamoxifen (SOLTAMOX) drug ...,1447981618,<p>Tamoxifen (SOLTAMOX) is an anti-estrogen us...,1183672041,Genetic Testing Level,Testing required,geneTestLevel:1183672041,1447981617,<p>Excerpts from the tamoxifen (SOLTAMOX) drug...,
1,Label Annotation,PA166184392,Annotation of Swissmedic Label for tamoxifen a...,False,,False,[],False,[],[],...,,1450815403,<p>The Swiss drug label for tamoxifen (Nolvade...,1183672111,Genetic Testing Level,Actionable PGx,geneTestLevel:1183672111,1450815404,<p><strong>Please note that the information co...,
2,Label Annotation,PA166170936,"Annotation of FDA Label for tamoxifen and F2, F5",False,On FDA Biomarker List,False,[],False,[],"[{'id': 1449814158, 'onOpenFda': True, 'pgxPre...",...,,1449052557,<p>Tamoxifen (SOLTAMOX) is indicated for the t...,1183672141,Genetic Testing Level,Informative PGx,geneTestLevel:1183672141,1449052556,<p>Excerpt from the tamoxifen (SOLTAMOX) drug ...,
3,Label Annotation,PA166181925,Annotation of FDA Label for tamoxifen and CYP2D6,False,On FDA Biomarker List,False,[],False,"[{'id': 1450821494, 'date': '2019-10-09T14:37:...","[{'id': 1450369807, 'onOpenFda': True, 'pgxPre...",...,,1450369804,<p>The FDA-approved drug label for tamoxifen (...,1183672111,Genetic Testing Level,Actionable PGx,geneTestLevel:1183672111,1450369805,<p>Excerpt from the tamoxifen (SOLTAMOX) drug ...,PK
4,Label Annotation,PA166127718,Annotation of HCSC Label for tamoxifen and CYP...,False,,False,[],False,[],[],...,,1447982367,<p>The product monograph for tamoxifen states ...,1183672041,Genetic Testing Level,Testing required,geneTestLevel:1183672041,1447982366,<p>Excerpts from the tamoxifen product monogra...,


In [11]:
# NOTE(review): `response` exists at top level only after the cell that assigns
# it from requests.get(url, params=params) has run; the `response` variables
# inside the fetch functions are local to them. On a fresh kernel run from top
# to bottom, this line therefore raises NameError.
response.text

NameError: name 'response' is not defined

In [5]:
# List the column labels of the drug label annotation table
all_data_drug_label.columns

Index(['objCls', 'id', 'name', 'alternateDrugAvailable', 'biomarkerStatus',
       'cancerGenome', 'crossReferences', 'dosingInformation',
       'fdaPgxAssociation', 'history', 'labelApplications', 'literature',
       'pediatric', 'prescribingGenes', 'relatedAlleles', 'relatedChemicals',
       'relatedGenes', 'source', 'prescribingMarkdown.id',
       'prescribingMarkdown.html', 'summaryMarkdown.id',
       'summaryMarkdown.html', 'testing.id', 'testing.resource',
       'testing.term', 'testing.termId', 'textMarkdown.id',
       'textMarkdown.html', 'pediatricMarkdown.id', 'pediatricMarkdown.html',
       'retirementReason.id', 'retirementReason.resource',
       'retirementReason.term', 'retirementReason.termId', 'errors'],
      dtype='object')

In [24]:
# Set the API endpoint URL
# NOTE(review): confirm this endpoint -- the request below filters on level of
# evidence and the result is stored under clinical-annotation names (df_clin,
# pharmagkb_clin.csv), while the URL addresses the label resource.
url = "https://api.pharmgkb.org/v1/data/label"

# Set the parameters for the API request
params = {
    "levelOfEvidence.term": "1A",
    "view": "base"
}

# Make the API request; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, params=params, timeout=30)

# Stop on an HTTP error status instead of failing later on a body without "data"
response.raise_for_status()

# Parse the JSON response
data = response.json()

# Normalize the JSON data into a Pandas DataFrame
df_clin = pd.json_normalize(data["data"])

# Write the DataFrame to a CSV file
clin_csv = "pharmagkb_clin.csv"
df_clin.to_csv(clin_csv, index=False)

# Print a confirmation message that names the file actually written
print("DataFrame written to " + clin_csv)

#print dimensions of the DataFrame
print(df_clin.shape)

df_clin.head()

DataFrame written to output.csv
(289, 46)


Unnamed: 0,id,accessionId,allelePhenotypes,conflictingVariantAnnotationIds,history,name,objCls,overrideLevel,pediatric,relatedChemicals,...,location.refSeqCrossReference._url,location.refSeqPosition,location.rsid,location.tagGene,location.type,location.variant.objCls,location.variant.id,location.variant.symbol,location.variant.name,overrideLevelDescription
0,1445117525,PA166136351,"[{'id': 1451333059, 'allele': 'CC', 'limitedEv...",[],"[{'id': 1445117526, 'date': '2015-07-23T14:19:...",Clinical Annotation for rs193922876 (RYR1); de...,Clinical Annotation,False,True,"[{'objCls': 'Chemical', 'id': 'PA164749136', '...",...,https://www.ncbi.nlm.nih.gov/nuccore/NC_000019.10,38580114,rs193922876,False,SNP,Variant,PA166155575,rs193922876,rs193922876,
1,1183705788,PA166135464,"[{'id': 1451332933, 'allele': 'CC', 'limitedEv...",[],"[{'id': 1447992478, 'date': '2016-05-27T16:43:...",Clinical Annotation for rs118192170 (RYR1); de...,Clinical Annotation,False,False,"[{'objCls': 'Chemical', 'id': 'PA164749136', '...",...,https://www.ncbi.nlm.nih.gov/nuccore/NC_000019.10,38584989,rs118192170,False,SNP,Variant,PA166155536,rs118192170,rs118192170,
2,1445400186,PA166135876,"[{'id': 1451333191, 'allele': 'AA', 'limitedEv...",[],"[{'id': 1445400187, 'date': '2015-07-30T16:02:...",Clinical Annotation for rs1801086 (RYR1); desf...,Clinical Annotation,False,True,"[{'objCls': 'Chemical', 'id': 'PA164749136', '...",...,https://www.ncbi.nlm.nih.gov/nuccore/NC_000019.10,38446710,rs1801086,False,SNP,Variant,PA166155375,rs1801086,rs1801086,
3,1451434920,PA166242001,"[{'id': 1451461447, 'allele': 'A', 'limitedEvi...",[],"[{'id': 1451434924, 'date': '2021-05-10T17:03:...",Clinical Annotation for rs267606617 (MT-RNR1);...,Clinical Annotation,False,True,"[{'objCls': 'Chemical', 'id': 'PA164744372', '...",...,https://www.ncbi.nlm.nih.gov/nuccore/NC_012920.1,1555,rs267606617,False,SNP,Variant,PA166159181,rs267606617,rs267606617,
4,1445400222,PA166136355,"[{'id': 1451333198, 'allele': 'GG', 'limitedEv...",[],"[{'id': 1445400223, 'date': '2015-07-30T16:36:...",Clinical Annotation for rs193922772 (RYR1); de...,Clinical Annotation,False,True,"[{'objCls': 'Chemical', 'id': 'PA164749136', '...",...,https://www.ncbi.nlm.nih.gov/nuccore/NC_000019.10,38457546,rs193922772,False,SNP,Variant,PA166155568,rs193922772,rs193922772,


In [25]:
# List the column labels of the level-of-evidence query result
df_clin.columns

Index(['id', 'accessionId', 'allelePhenotypes',
       'conflictingVariantAnnotationIds', 'history', 'name', 'objCls',
       'overrideLevel', 'pediatric', 'relatedChemicals',
       'relatedChemicalsLogic', 'relatedDiseases', 'relatedGuidelines',
       'relatedLabels', 'score', 'scoreDetails', 'types', 'levelOfEvidence.id',
       'levelOfEvidence.resource', 'levelOfEvidence.term',
       'levelOfEvidence.termId', 'location.id', 'location.buildVersion',
       'location.chromosomeId', 'location.chromosomeName',
       'location.copyNumber', 'location.diplotypes', 'location.displayName',
       'location.genes', 'location.gpPosition', 'location.haplotypes',
       'location.linkedObjects', 'location.refSeqCrossReference.id',
       'location.refSeqCrossReference.name',
       'location.refSeqCrossReference.resource',
       'location.refSeqCrossReference.resourceId',
       'location.refSeqCrossReference._url', 'location.refSeqPosition',
       'location.rsid', 'location.tagGene', 'lo