In [54]:
# Basic Python Notebook showing how you can quickly download and start working with a datafile and make simple plots.
# Bioinformatics 1 (2022-23) - Week 6 - Working with Biological Databases
# ian.simpson@ed.ac.uk

# Activity 3 - Gene Expression Data

#load in modules
import pandas as pd

In [None]:
# Install Biopython into the running kernel's environment; it provides the Bio.Entrez module imported in the next cell.
%pip install biopython

In [None]:
# There is a great guide to programmatic access to GEO here - https://www.ncbi.nlm.nih.gov/geo/info/geo_paccess.html

# Once again we can use the BioPython Entrez library to query and retrieve information just as we did for nucleotides earlier in the course
from genericpath import exists
from Bio import Entrez

Entrez.email = "A.N.Other@example.com" # You should replace this with your e-mail address

# Entrez.esearch runs the query and returns a handle to the results.

# search for ASD microarray gene expression datasets
handle = Entrez.esearch(db='gds',term='"Autism Spectrum Disorder"[MH] AND "Expression profiling by array"[Filter]',retmax=1000)
try:
    record = Entrez.read(handle)
finally:
    # always release the connection, even if parsing the response fails
    handle.close()

# How many are there?
print("There are "+str(record['Count'])+" microarray experiments in GEO for ASD.")

# Extract the list of matching GEO DataSets IDs
# NOTE: the search above asks for at most retmax=1000 IDs, so if 'Count' is larger
# the per-year tallies below only cover the IDs that were actually returned.
idList = record['IdList']

# Fetch the summary for each experiment and tally the publication year,
# taken from the first four characters of the publication date field [PDAT].

# year -> number of experiments published in that year
asd_array_counts = {}

# this might take a little while (one request per experiment)
for gds_id in idList:
    print("Fetching details for experiment: "+str(gds_id))
    summary_handle = Entrez.esummary(db='gds',id=gds_id)
    try:
        summaries = Entrez.read(summary_handle)
    finally:
        # close every summary handle so connections are not leaked across the loop
        summary_handle.close()
    for entry in summaries:
        year = entry['PDAT'][0:4]
        asd_array_counts[year] = asd_array_counts.get(year, 0) + 1

# convert the dict into a pandas dataframe (index = year, single column 'counts')
microarray_experiments_by_year = pd.DataFrame.from_dict(asd_array_counts,orient='index',columns=['counts'])

In [None]:
# search for ASD RNA-seq gene expression datasets
handle = Entrez.esearch(db='gds',term='"Autism Spectrum Disorder"[MH] AND "Expression profiling by high throughput sequencing"[Filter]',retmax=1000)
try:
    record = Entrez.read(handle)
finally:
    # always release the connection, even if parsing the response fails
    handle.close()

# How many are there?
print("There are "+str(record['Count'])+" RNA-seq experiments in GEO for ASD.")

# Extract the list of matching GEO DataSets IDs
# NOTE: the search above asks for at most retmax=1000 IDs, so if 'Count' is larger
# the per-year tallies below only cover the IDs that were actually returned.
idList = record['IdList']

# Fetch the summary for each experiment and tally the publication year,
# taken from the first four characters of the publication date field [PDAT].

# year -> number of experiments published in that year
asd_seqexp_counts = {}

# this might take a little while (one request per experiment)
for gds_id in idList:
    print("Fetching details for experiment: "+str(gds_id))
    summary_handle = Entrez.esummary(db='gds',id=gds_id)
    try:
        summaries = Entrez.read(summary_handle)
    finally:
        # close every summary handle so connections are not leaked across the loop
        summary_handle.close()
    for entry in summaries:
        year = entry['PDAT'][0:4]
        asd_seqexp_counts[year] = asd_seqexp_counts.get(year, 0) + 1

# convert the dict into a pandas dataframe (index = year, single column 'counts')
rnaseq_experiments_by_year = pd.DataFrame.from_dict(asd_seqexp_counts,orient='index',columns=['counts'])

In [None]:
# Plot both data sets together.
# Name each 'counts' column before merging so the labels do not depend on the
# order of the merge suffixes, and use an outer join so that years present in
# only one of the two tables (e.g. years with microarray studies but no RNA-seq
# studies) are kept rather than silently dropped.
combined = pd.merge(
    rnaseq_experiments_by_year.rename(columns={'counts': 'RNA-seq'}),
    microarray_experiments_by_year.rename(columns={'counts': 'microarray'}),
    left_index=True,
    right_index=True,
    how='outer',
)

# A year missing from one table means zero experiments of that type that year.
combined = combined.fillna(0).astype(int)

# Sort by year for a neater plot. The axis is not passed positionally because
# recent pandas releases only accept sort_index arguments as keywords.
combined = combined.sort_index()

#plot
combined.plot.line(xlabel='year',ylabel='experiment count',title='ASD gene expression experiments in GEO by year')

#what is this graph telling us?

In [None]:
# You now have all the code you need (above) to adapt in order to complete the rest of the Gene Expression section of the computing lab worksheet. Good luck!
# You could think about creating a stacked bar chart that shows the array and RNA-seq data together.