# Example Python script to GET and process genomic coordinates

## GET request and return of JSON

This example shows a basic way to handle paginated results from the API. Not all Proteins API endpoints return paginated data.

One __important__ method is _checkResponse_, as you need to check that the service is up and returning usable data. Response code __200__ means that the service is up and data has been returned. Other response codes can be used to check for different responses.

The _jsonRequest_ method is allowed to have up to 10 tries before failing completely. There's no time delay but this could be easily added.

In [None]:
import requests, sys
# Following imports are not required
import pandas as pd
import json


# Checks the server response code is correct 
def checkResponse(resp):
    """Return True when the response's status code is 200, otherwise False."""
    return resp.status_code == 200


# Carries out the REST get request and returns a combined list of all the results    
def jsonRequest(geneName: str, taxId: int, pageSize: int) -> list:
    """Fetch every page of coordinate results for a gene and combine them.

    Sends GET requests to the Proteins API ``coordinates`` end-point, asking
    for ``pageSize`` results at a time, until the offset reaches the total
    reported in the ``x-pagination-totalrecords`` response header (or that
    header is absent).

    Args:
        geneName: Value sent as the ``gene`` query parameter.
        taxId: Value sent as the ``taxid`` query parameter. It is converted
            with ``str()``, so both ``9606`` and ``"9606"`` are accepted.
        pageSize: Number of results requested per page (``size`` parameter).

    Returns:
        A list holding the JSON entries of every page that was fetched
        successfully. If 10 consecutive requests fail, the entries collected
        so far are returned (possibly an empty list).
    """
    coordinatesURL = "https://www.ebi.ac.uk/proteins/api/coordinates"
    totalHeader = "x-pagination-totalrecords"
    maxFails = 10

    entries = []
    resultsCount = 0      # offset of the next page to request
    responseFail = 0      # consecutive failed responses
    totalRecords = None   # header value from the last successful response

    while True:
        # Let requests build and URL-encode the query string.
        response = requests.get(
            coordinatesURL,
            params={
                "gene": geneName,
                "taxid": str(taxId),
                "from": resultsCount,
                "size": pageSize,
                "format": "json",
            },
        )
        try:
            succeeded = checkResponse(response)
            if succeeded:
                entries.extend(response.json())
                # .get() so a missing header ends paging instead of raising.
                totalRecords = response.headers.get(totalHeader)
        finally:
            # Close every response, not only the last one.
            response.close()

        if not succeeded:
            responseFail += 1
            # Stop requesting from the server after 10 consecutive failures.
            if responseFail >= maxFails:
                break
            continue

        responseFail = 0
        resultsCount += pageSize

        # Check if we need to do another page.
        if totalRecords is None or resultsCount >= int(totalRecords):
            break

    # Only report the total when a successful response supplied it.
    if totalRecords is not None:
        print("total results count = " + totalRecords)
    return entries


## Extracting and Transforming Genomic Coordinates

The next block of code illustrates how to parse the JSON to extract and do a basic transformation of the genomic coordinate data available.

Three methods are used but _parseGenomeJson_ does the majority of the work. This method loops through the collection of coordinate elements to extract out specific values such as 'chromosome' from the _genomicLocation_ sub-element.

_processGene_ simply finds the primary gene symbol by looping through the gene element.

Within _parseGenomeJson_ a call is made to _processExon_ to extract out both the genomic location and equivalent protein location for each exon.

In [None]:
def processGene(gene: []) -> str:
    """Return the value of the first gene element typed "primary".

    Falls back to "No_Primary" when no element has that type.
    """
    primaryValues = (item["value"] for item in gene if item["type"] == "primary")
    return next(primaryValues, "No_Primary")
    
    
    
def processExon(exon: dict) -> list:
    """Extract the id and the genome/protein positions of one exon.

    Args:
        exon: One exon element (a dict) with an ``id`` plus
            ``genomeLocation`` and ``proteinLocation`` sub-elements, each
            holding ``begin``/``end`` entries with a ``position``.

    Returns:
        ``[id, genome begin, genome end, protein begin, protein end]``.
    """
    return [
        exon["id"],
        exon["genomeLocation"]["begin"]["position"],
        exon["genomeLocation"]["end"]["position"],
        exon["proteinLocation"]["begin"]["position"],
        exon["proteinLocation"]["end"]["position"],
    ]
    
    
    
      
def parseGenomeJson(coordinates: []) -> []:
    """Flatten coordinate entries into one row per exon.

    Only the first ``gnCoordinate`` element of each entry is read. Every row
    holds: accession, primary gene name, taxid, Ensembl gene / transcript /
    translation ids, chromosome, start, end, strand ("-" when
    ``reverseStrand`` is truthy, otherwise "+"), the number of exons, and
    then the values returned by ``processExon`` for that exon.
    """
    rows = []

    for entry in coordinates:
        accession = entry["accession"]
        primaryGene = processGene(entry["gene"])
        taxonomy = entry["taxid"]

        mapping = entry["gnCoordinate"][0]
        location = mapping["genomicLocation"]

        chromosome = location["chromosome"]
        start = location["start"]
        end = location["end"]
        geneId = mapping["ensemblGeneId"]
        transcriptId = mapping["ensemblTranscriptId"]
        translationId = mapping["ensemblTranslationId"]
        strand = "-" if location["reverseStrand"] else "+"

        exonList = location["exon"]
        # Columns shared by every exon row of this entry.
        shared = [
            accession,
            primaryGene,
            taxonomy,
            geneId,
            transcriptId,
            translationId,
            chromosome,
            start,
            end,
            strand,
            len(exonList),
        ]
        for exon in exonList:
            rows.append(shared + processExon(exon))

    return rows

## Finding and transforming UniProt Sequence Annotations (Features)

We use two methods here to extract and transform UniProt sequence annotations (features).  

The second in this code block, _parseFeatureJson_, takes the JSON and loops through the collection of 
coordinates to extract the basic data about the protein, the gene and its genomic coordinates.

A sub-loop then takes each feature element and extracts general UniProt annotation data related to the genomic coordinates of the annotation, as in _parseGenomeJson_, and also calls _processFeature_ to extract the specific genomic coordinates and protein coordinates for the UniProt annotation.

In [None]:
def processFeature(feat: dict) -> list:
    """Extract genome/protein positions and merged evidence for one feature.

    Args:
        feat: One feature element (a dict) with ``genomeLocation`` and
            ``location`` sub-elements, each holding ``begin``/``end``
            entries with a ``position``, and optionally an ``evidence`` list.

    Returns:
        ``[genome begin, genome end, protein begin, protein end, evidence]``
        where ``evidence`` is a list of strings: ``"<type> <id> <code>"``
        for evidence items carrying a ``dbReference``, otherwise just the
        evidence ``code``.
    """
    featDat = [
        feat["genomeLocation"]["begin"]["position"],
        feat["genomeLocation"]["end"]["position"],
        feat["location"]["begin"]["position"],
        feat["location"]["end"]["position"],
    ]

    mergedEvid = []
    # A feature with no "evidence" entry yields an empty evidence list
    # instead of raising KeyError.
    for e in feat.get("evidence") or []:
        if "dbReference" in e:
            dbRef = e["dbReference"]
            evidVal = " ".join([dbRef["type"], dbRef["id"], e["code"]])
        else:
            evidVal = e["code"]
        mergedEvid.append(evidVal)

    featDat.append(mergedEvid)
    return featDat
    

def parseFeatureJson(coordinates: list, featureType: str) -> list:
    """Flatten coordinate entries into one row per feature of a given type.

    Only the first ``gnCoordinate`` element of each entry is read.

    Args:
        coordinates: Coordinate entries (dicts) as returned by the API.
        featureType: Only features whose ``type`` equals this value are kept.

    Returns:
        A list of rows. Each row holds: accession, primary gene name, taxid,
        Ensembl gene / transcript / translation ids, chromosome, start, end,
        strand ("-" when ``reverseStrand`` is truthy, otherwise "+"),
        ``featureType``, a ``"feature #<n>"`` label counting the matching
        features of that entry from 1, and then the values returned by
        ``processFeature`` for the feature.
    """
    fData = []

    for row in coordinates:
        accession = row["accession"]
        # Module-level function: called directly, there is no ``self`` here.
        geneName = processGene(row["gene"])
        taxId = row["taxid"]

        gnCoordinate = row["gnCoordinate"][0]
        genomicLocation = gnCoordinate["genomicLocation"]

        chromosome = genomicLocation["chromosome"]
        chrStart = genomicLocation["start"]
        chrEnd = genomicLocation["end"]
        ensg = gnCoordinate["ensemblGeneId"]
        enst = gnCoordinate["ensemblTranscriptId"]
        ensp = gnCoordinate["ensemblTranslationId"]
        strand = "-" if genomicLocation["reverseStrand"] else "+"

        # Columns shared by every feature row of this entry.
        rowPrefix = [
            accession,
            geneName,
            taxId,
            ensg,
            enst,
            ensp,
            chromosome,
            chrStart,
            chrEnd,
            strand,
            featureType,
        ]

        fcnt = 0
        for f in gnCoordinate["feature"]:
            # Only want to get the specified featureType (annotation type).
            if f["type"] != featureType:
                continue
            # The counter numbers matching features only, starting at 1.
            fcnt += 1
            fData.append(rowPrefix + ["feature #" + str(fcnt)] + processFeature(f))

    return fData

## Main method

The main method actually has a main class, that isn't defined in this example.  The class could take the main 
variables as shown but the methods above are called as if not defined in a class.

You would have to take all the separate code blocks here and put into a single class to have the python script correctly run (with some minor changes).

In [None]:
if __name__ == '__main__':
    # Example run: fetch the coordinates for one gene, then print the exon
    # mapping rows followed by the rows for one feature (annotation) type.

    geneName = "fgfr2"
    taxId = "9606"
    pageSize = 100

    # NOTE: no ParseCoordinatesJson class is defined in this example, so it
    # is not instantiated here (doing so raised NameError); the module-level
    # functions are called directly instead.
    fgfr2Data = jsonRequest(geneName, taxId, pageSize)

    genomeMapping = parseGenomeJson(fgfr2Data)

    for gm in genomeMapping:
        print(gm)

    feature = "disulfide bond"
    featureMapping = parseFeatureJson(fgfr2Data, feature)

    for fm in featureMapping:
        print(fm)