### Programming for Biomedical Informatics
#### Week 6 Assignment - Searching NCBI GEO for RNA-seq data

In this weekly mini assignment you will practice using eUtils to query NCBI GEO.

- if you want to use the Bio.Entrez module, make sure that you have installed Biopython
- you should already have a free NCBI account so that you can get an API key, but if not please register for an NCBI account
- you have the option of using either Bio.Entrez or the ```requests``` API approach.

In [1]:
# imports and NCBI credentials used for the eUtils requests to GEO
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# Read the NCBI API key from its text file; surrounding whitespace
# (such as a trailing newline) is stripped before the value is used.
with open('../../api_keys/ncbi.txt', 'r') as file:
    key_text = file.read()
api_key = key_text.strip()

# Read the e-mail address that accompanies the API key in the same way.
with open('../../api_keys/ncbi_email.txt', 'r') as file:
    email_text = file.read()
email = email_text.strip()

In [55]:
def search_GEO(query):
    """Run an eSearch against the GEO DataSets database (``db=gds``).

    The request is sent with ``usehistory='y'`` so that the response carries
    the WebEnv and QueryKey values needed for follow-up requests. The number
    of matching records is printed.

    Parameters
    ----------
    query : str
        Entrez search term, passed unchanged as the ``term`` parameter.

    Returns
    -------
    tuple
        ``(esearch_xml, webenv, query_key)`` where ``esearch_xml`` is the
        parsed root element of the eSearch response and ``webenv`` /
        ``query_key`` are the text of its WebEnv and QueryKey elements.

    Raises
    ------
    RuntimeError
        If the response lacks a Count, WebEnv or QueryKey element.
    """
    # Define the parameters for the eSearch request
    esearch_params = {
        'db': 'gds',
        'term': query,
        'api_key': api_key,
        'email': email,
        'usehistory': 'y',
    }

    # encode the parameters so they can be passed to the API
    encoded_data = urllib.parse.urlencode(esearch_params).encode('utf-8')

    # the base request url for eSearch
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    # Supplying `data` makes this a POST request; the with-block guarantees
    # the connection is closed once the body has been read.
    request = urllib.request.Request(url, data=encoded_data)
    with urllib.request.urlopen(request) as response:
        esearch_data_XML = ET.fromstring(response.read())

    # findtext returns None for a missing element instead of raising on `.text`
    count = esearch_data_XML.findtext('Count')
    if count is None:
        error_text = esearch_data_XML.findtext('.//ERROR') or 'no error message in response'
        raise RuntimeError(f"eSearch response has no Count element: {error_text}")

    print(f"Found {count} records with query: {query}")

    # Extract WebEnv and QueryKey
    webenv = esearch_data_XML.findtext('WebEnv')
    query_key = esearch_data_XML.findtext('QueryKey')
    if webenv is None or query_key is None:
        raise RuntimeError("eSearch response has no WebEnv/QueryKey element")

    return (esearch_data_XML, webenv, query_key)

def get_FTPlinks(webenv, query_key):
    """Collect the FTP links from the eSummary records of a stored GEO search.

    Parameters
    ----------
    webenv : str
        WebEnv value returned by a previous eSearch request.
    query_key : str
        QueryKey value returned by the same eSearch request.

    Returns
    -------
    list of str
        Text of every ``<Item Name="FTPLink">`` element in the response, with
        a leading ``ftp://`` removed. Items without text are skipped.
    """
    # Define the parameters for the eSummary request
    esummary_params = {
        'db': 'gds',
        'query_key': query_key,
        'WebEnv': webenv,
        'api_key': api_key,
        'email': email,
    }

    # encode the parameters so they can be passed to the API
    encoded_data = urllib.parse.urlencode(esummary_params).encode('utf-8')

    # the base request url for eSummary
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

    # Supplying `data` makes this a POST request; the with-block guarantees
    # the connection is closed once the body has been read.
    request = urllib.request.Request(url, data=encoded_data)
    with urllib.request.urlopen(request) as response:
        esummary_data_XML = ET.fromstring(response.read())

    # extract the ftp links; an empty <Item> has text None, so skip those
    # rather than failing on them below
    ftp_elements = esummary_data_XML.findall('.//Item[@Name="FTPLink"]')
    ftp_links = [element.text for element in ftp_elements if element.text]

    # remove the ftp:// prefix only when it really is a prefix
    prefix = 'ftp://'
    ftp_links = [
        link[len(prefix):] if link.startswith(prefix) else link
        for link in ftp_links
    ]

    return ftp_links

In [None]:
# Query GEO for records that carry RNA-seq counts and are indexed under
# the "Autism Spectrum Disorder" MeSH term.
asd_rnaseq_query = '("rnaseq counts"[Filter] AND "Autism Spectrum Disorder"[MeSH Terms])'
result, webenv, query_key = search_GEO(asd_rnaseq_query)

In [None]:
# Look up the FTP links for the records found by the search above.
ftp_links = get_FTPlinks(webenv, query_key)

# Write each link on its own line.
for ftp_link in ftp_links:
    print(ftp_link)