### Week 5 - Biological Databases - Gene Ontology
- October 2023
- [https://github.com/tisimpson/bioinformatics1](https://github.com/tisimpson/bioinformatics1)
- [ian.simpson@ed.ac.uk](mailto:ian.simpson@ed.ac.uk)

In [None]:
import pandas as pd
import urllib as ul
import numpy as np

In [None]:
# Load the gene IDs saved in the previous section; the file is read without a header row.
# NOTE(review): the variable is named for the dopaminergic list (dop_geneids.txt) but the
# path below points at cams_geneids.txt - confirm which gene list is intended.
gene_id_file = '../data/pathways/cams_geneids.txt'
dop_gene_ids = pd.read_csv(gene_id_file, header=None)

# header=None leaves the single column unnamed, so label it explicitly
dop_gene_ids.columns = ['GeneID']

# preview the first few rows
dop_gene_ids.head()

In [None]:
# The GeneOntology consortium's mapping of genes to GO terms is stored in the gene2go
# file at the NCBI. To fetch a fresh copy, uncomment the line below.
# NOTE: `import urllib as ul` alone does not guarantee that `ul.request` is loaded;
# add `import urllib.request` if the call raises AttributeError.
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz','../data/ontology/gene2go.gz')

# Read the gzipped, tab-separated file into a pandas dataframe;
# header=0 takes the column names from the first line of the file
gene2go_file = '../data/ontology/gene2go.gz'
gene2go = pd.read_csv(gene2go_file, sep='\t', header=0, compression='gzip')

# Preview the first few rows of the dataframe
gene2go.head()

In [None]:
# Keep only the human annotations (tax_id 9606)
is_human = gene2go['#tax_id'].eq(9606)
human_gene2go = gene2go[is_human]

In [None]:
# Preview the first five rows of the human-only annotations
human_gene2go.head(n=5)

In [None]:
# Join our gene list to the human annotations on the shared GeneID key (pandas merges
# are inner joins by default), so only GO annotations of genes in the list remain
dop_gos = dop_gene_ids.merge(human_gene2go, left_on='GeneID', right_on='GeneID')

# Which GO term is annotated most often to the genes in the list?
# Count the annotation rows for each GO_ID, then order the counts from largest to smallest
rows_per_go = dop_gos.groupby('GO_ID').size()
dop_go_counts = rows_per_go.sort_values(ascending=False)

# Show the ten most frequent terms as a text table
from prettytable import PrettyTable
top10 = dop_go_counts.head(10)
t = PrettyTable(['GO_ID', 'Count'])
for go_id, n_rows in zip(top10.index, top10.values):
    t.add_row([go_id, n_rows])
print(t)

In [None]:
# The table is more useful if each GO_ID is shown with its term description

# Build a de-duplicated GO_ID -> GO_term lookup from the merged annotation dataframe
go_term_columns = ['GO_ID', 'GO_term']
unique_dop_gos = dop_gos[go_term_columns].drop_duplicates()

# Print the top 10 GO terms again, this time with the description as a third column
t = PrettyTable(['GO_ID', 'Count', 'GO_term'])
for go_id, n_rows in zip(top10.index, top10.values):
    # take the first description recorded for this GO_ID
    matching_terms = unique_dop_gos.loc[unique_dop_gos['GO_ID'] == go_id, 'GO_term']
    t.add_row([go_id, n_rows, matching_terms.iloc[0]])
print(t)

In [None]:
# We can use this dataframe to ask whether a GO term is over-represented in our gene list.
# Every quantity below counts DISTINCT genes so that the observed and expected values
# are measured in the same units.

# how many human genes are there in our human gene2GO set? (the background)
num_human_genes_ingo = len(human_gene2go['GeneID'].drop_duplicates())
print(f'There are {num_human_genes_ingo} human genes in our human gene2GO set')

# how many genes are annotated with the top GO term from our list in our human gene2GO set?
# dop_go_counts is sorted in descending order, so its first index entry is the top term
top_goid = dop_go_counts.index[0]
num_human_genes_withtop = len(human_gene2go[human_gene2go['GO_ID'] == top_goid]['GeneID'].drop_duplicates())
print(f'There are {num_human_genes_withtop} human genes annotated with {top_goid} in our human gene2GO set')

# what is the size of our gene list?
num_human_genes_inlist = len(dop_gene_ids['GeneID'].drop_duplicates())
print(f'There are {num_human_genes_inlist} genes in our gene list')

# how many genes would we expect to be annotated with the top GO_ID?
# (background fraction of genes carrying the term, scaled to the size of our list)
expectation = num_human_genes_withtop/num_human_genes_ingo * num_human_genes_inlist
print(f'We would expect to see this {round(expectation, 2)} times')

# how many genes in our list are annotated with the top GO_ID?
# dop_go_counts counts annotation ROWS, and gene2go can hold several rows for the same
# gene/GO_ID pair (e.g. different evidence codes), so reading the value from
# dop_go_counts would over-count. Count distinct GeneIDs instead, exactly as is done
# for the background above.
observation = len(dop_gos[dop_gos['GO_ID'] == top_goid]['GeneID'].drop_duplicates())
print(f'We actually see this {observation} times')

# what's the enrichment?
print(f'So, the top GO term is found {round(observation/expectation, 2)} times more frequently than we would expect by chance')

# why might we want to know this?