In [None]:
# Basic Python Notebook showing how you can quickly download and start working with a datafile and make simple plots.
# Bioinformatics 1 (2022-23) - Week 6 - Working with Biological Databases
# ian.simpson@ed.ac.uk

# Activity 4 - Working with the Gene Ontology

#load in modules
import urllib as ul
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
#load the gene ids saved in the previous section (dop_geneids.txt)
#the file is read without a header row, so its single column is labelled 'GeneID' here
dop_gene_ids = (
    pd.read_csv('dop_geneids.txt', header=None)
    .set_axis(['GeneID'], axis='columns')
)

#preview the first few rows
dop_gene_ids.head()

In [None]:
#retrieve the gene2go file, but only if it is not already in the working directory,
#so re-running this cell (or the whole notebook) never downloads it a second time.
#to force a fresh download, delete gene2go.gz and re-run this cell.
gene2go_url = 'https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz'
gene2go_path = Path('gene2go.gz')

if not gene2go_path.exists():
    #download under a temporary name and rename on success, so an interrupted
    #download is never mistaken for a complete file on the next run
    partial_path = gene2go_path.with_name(gene2go_path.name + '.part')
    ul.request.urlretrieve(gene2go_url, partial_path)
    partial_path.replace(gene2go_path)

#the file is a gzipped, tab-separated table whose first line holds the column names
gene2go = pd.read_csv(gene2go_path, compression='gzip', header=0, sep='\t')

gene2go.head()

In [None]:
#keep only the rows annotated to human (tax_id 9606)
HUMAN_TAX_ID = 9606
human_gene2go = gene2go.loc[gene2go['#tax_id'].eq(HUMAN_TAX_ID)]

In [None]:
#preview the first five rows of the human-only annotations
human_gene2go.head(5)

In [None]:
#merge the dataframes on the geneid (pd.merge defaults to an inner join, so only
#annotation rows whose GeneID appears in our list are kept)
dop_gos = pd.merge(dop_gene_ids, human_gene2go, on='GeneID')

#fail here with a clear message rather than with an IndexError further down
assert not dop_gos.empty, 'no GeneID in dop_gene_ids has a human GO annotation'

#number of annotation rows per GO_ID in our list, most frequent first
dop_go_counts = dop_gos.groupby('GO_ID').size().sort_values(ascending=False)
print(dop_go_counts)
print('\n')

#find the top annotated GO term: the first label of the sorted counts
#(reuses dop_go_counts instead of grouping and sorting a second time)
top_goid = dop_go_counts.index[0]
print(top_goid)

#create a unique lookup for GO_ID term descriptions
unique_dop_gos = dop_gos[['GO_ID','GO_term']].drop_duplicates()

#top GO_ID with its description
unique_dop_gos[unique_dop_gos['GO_ID'] == top_goid]

In [None]:
#how many human genes are there in our human gene2GO set?
num_human_genes_ingo = len(human_gene2go['GeneID'].drop_duplicates())

#how many genes are annotated with the top GO_ID (GO:0005515) in our human gene2GO set?
num_human_genes_withtop = len(human_gene2go[human_gene2go['GO_ID'] == top_goid]['GeneID'].drop_duplicates())

#what is the size of our gene list?
num_human_genes_inlist = len(dop_gene_ids['GeneID'].drop_duplicates())

#how many genes would we expect to be annotated with the top GO_ID? (77.25)
#i.e. the fraction of GO-annotated human genes carrying the term, scaled to the size of our list
expectation = num_human_genes_withtop/num_human_genes_ingo * num_human_genes_inlist

print('We would expect to see this '+str(round(expectation,2))+' times')

#how many genes in our list are annotated with the top GO_ID? (118)
#look the count up by its GO_ID label: dop_go_counts is indexed by GO_ID strings, and
#an integer key such as dop_go_counts[0] only works through a positional fallback that
#pandas has deprecated
#NOTE(review): dop_go_counts counts annotation rows, whereas the expectation above counts
#unique genes - if a gene can have several rows for the same GO_ID the two are not
#strictly comparable; confirm against the gene2go data
observation = dop_go_counts.loc[top_goid]
print('We actually see this '+str(round(observation,2))+' times')

#what's the enrichment?
print('So, the top GO term is found '+str(round(observation/expectation,2))+' times more frequently than we would expect by chance')

#why do we want to know this?