In [None]:
# Basic Python Notebook showing how you can quickly download and start working with a datafile and make simple plots.
# Bioinformatics 1 (2022-23) - Week 6 - Working with Biological Databases
# ian.simpson@ed.ac.uk

# Activity 1 - Genomic information for various species

## EACH DATA FILE BELOW IS DOWNLOADED ONLY IF IT IS NOT ALREADY PRESENT IN THE WORKING DIRECTORY, SO RE-RUNNING THIS CELL DOES NOT DOWNLOAD THE FILES AGAIN. THIS IS TO PROTECT NCBI FROM LARGE NUMBERS OF DOWNLOADS OF THE DATA. TO FORCE A FRESH DOWNLOAD OF A FILE, DELETE THE LOCAL COPY AND RE-RUN THE CELL ##

#load in modules
import os
import urllib as ul
import urllib.request  #"import urllib" on its own does not load the request submodule, so ul.request could fail without this line

import numpy as np
import pandas as pd


def download_once(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    The data is first written to ``filename + '.part'`` and only renamed to
    ``filename`` once the download has finished, so an interrupted download
    is not mistaken for a complete file on the next run.

    Returns True if a download was performed, False if the local copy was kept.
    """
    if os.path.exists(filename):
        return False
    partial = filename + '.part'
    ul.request.urlretrieve(url, partial)
    os.replace(partial, filename)
    return True


#pull the gene_info files directly from the NCBI server (files that are already present locally are skipped)

#human
download_once('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz','Homo_sapiens.gene_info.gz')

#mouse
download_once('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz','Mus_musculus.gene_info.gz')

#rat
download_once('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Rattus_norvegicus.gene_info.gz','Rattus_norvegicus.gene_info.gz')

#fruitfly
download_once('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Invertebrates/Drosophila_melanogaster.gene_info.gz','Drosophila_melanogaster.gene_info.gz')

#yeast
download_once('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Fungi/Saccharomyces_cerevisiae.gene_info.gz','Saccharomyces_cerevisiae.gene_info.gz')


In [None]:
#read the gene_info files into Pandas data frames

def read_gene_info(path):
    """Read one gzip-compressed, tab-separated gene_info file into a DataFrame.

    The first line of the file is used as the header. The 'chromosome' column
    is read as text: chromosome identifiers are labels rather than numbers
    (e.g. X and Y in human), and leaving the type to pandas' chunk-by-chunk
    inference can produce a column that mixes the integer 1 with the string
    '1', which would then be counted as two different chromosomes.
    """
    return pd.read_csv(path, compression='gzip', header=0, sep='\t', dtype={'chromosome': str})


human_df = read_gene_info('Homo_sapiens.gene_info.gz')
mouse_df = read_gene_info('Mus_musculus.gene_info.gz')
rat_df = read_gene_info('Rattus_norvegicus.gene_info.gz')
fruitfly_df = read_gene_info('Drosophila_melanogaster.gene_info.gz')
yeast_df = read_gene_info('Saccharomyces_cerevisiae.gene_info.gz')

In [None]:
#preview the top five rows of the human gene table
human_df.head(5)

In [None]:
#counts of genes per chromosome

#count directly on the column and name the result explicitly. From pandas 2.0
#value_counts() returns a Series that is already named 'count', so wrapping it
#in pd.DataFrame(..., columns=['gene_number']) would look for a column that
#does not exist and no longer pick up the counts.
human_chromosomes = (
    human_df['chromosome']
    .value_counts()            #most frequent chromosome first
    .rename('gene_number')     #column name used in the resulting dataframe
    .rename_axis(None)         #plain, unnamed index of chromosome labels
    .to_frame()
)

#show the first few lines
human_chromosomes.head()

In [None]:
#bar chart: one bar per chromosome, bar height = number of genes
chromosome_bar_options = dict(xlabel='chromosome', ylabel='gene count', legend=False)
human_chromosomes.plot.bar(**chromosome_bar_options)

In [None]:
#counts by type_of_gene

#count directly on the column and name the result explicitly. From pandas 2.0
#value_counts() returns a Series that is already named 'count', so wrapping it
#in pd.DataFrame(..., columns=['gene_number']) would look for a column that
#does not exist and no longer pick up the counts.
human_df_genetype = (
    human_df['type_of_gene']
    .value_counts()            #most frequent gene type first
    .rename('gene_number')     #column name used in the resulting dataframe
    .rename_axis(None)         #plain, unnamed index of gene-type labels
    .to_frame()
)

#show the first few lines of the dataframe
human_df_genetype.head()

In [None]:
#bar chart: one bar per gene type, bar height = number of genes
genetype_bar_options = dict(ylabel='gene count', legend=False)
human_df_genetype.plot.bar(**genetype_bar_options)

In [None]:
#report the size of the human table as (rows, columns)
n_rows, n_columns = human_df.shape
print((n_rows, n_columns))

In [None]:
#genes per species

#pair each species label with its gene_info table
species_tables = {
    'human': human_df,
    'mouse': mouse_df,
    'rat': rat_df,
    'fruitfly': fruitfly_df,
    'yeast': yeast_df,
}

#one row per species: the number of rows in that species' table
data = {'counts': [len(table) for table in species_tables.values()]}
df = pd.DataFrame(data, index=list(species_tables))

#print the table of counts
print(df.head())

#bar chart of the counts, one bar per species
df.plot.bar(xlabel='species', ylabel='gene count', legend=False)

In [None]:
#find genome sizes

#download the NCBI genome overview report only if there is no local copy yet,
#so re-running this cell does not hit the NCBI server again
import os
import urllib.request

if not os.path.exists('genomes.txt'):
    #write to a temporary name first so an interrupted download is not mistaken for a complete file
    urllib.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/overview.txt','genomes.txt.part')
    os.replace('genomes.txt.part','genomes.txt')

#build the genomes report dataframe
genomes_df = pd.read_csv('genomes.txt',header=0,sep='\t')


def genome_size_mb(report, organism):
    """Return the 'Size (Mb)' entry for ``organism`` in ``report`` as a float.

    ``organism`` is compared for exact equality with the values in the first
    column of ``report``. If several rows match, the first one is used.

    Raises IndexError (the same exception type the bare ``.values[0]`` lookup
    raised, but now with an explanatory message) if no row matches.
    """
    #step 1: the organism names are in the first column of the report
    names = report[report.columns[0]]
    #step 2: keep the 'Size (Mb)' entries of the rows whose name matches exactly
    #(plain equality also copes with missing names, which a .str match does not)
    sizes = report.loc[names == organism, 'Size (Mb)']
    if sizes.empty:
        raise IndexError('no row for organism ' + repr(organism) + ' in the genome report')
    #step 3: convert the first matching entry to a number
    return float(sizes.iloc[0])


#scientific name of each species, keyed by the short label used as the index
species_names = {
    'human': 'Homo sapiens',
    'mouse': 'Mus musculus',
    'rat': 'Rattus norvegicus',
    'fruitfly': 'Drosophila melanogaster',
    'yeast': 'Saccharomyces cerevisiae',
}

#select the genome sizes from the dataframe
genome_sizes = {'size': [genome_size_mb(genomes_df, name) for name in species_names.values()]}

#build the genome size dataframe
genomes = pd.DataFrame(genome_sizes)

#add the species index
genomes.index = list(species_names)

#normalise gene number by genome size and plot

#combine the two dfs - genomes and df
final = pd.merge(genomes,df,left_index=True,right_index=True)

#normalise the gene counts (genes per unit of the 'Size (Mb)' column)
final['norm'] = final['counts']/final['size']

#view the final df
print(final)

#plot the normalised gene counts
final.plot.bar(ylabel='normalised gene count',xlabel='species',legend=False,y='norm')

#brief conclusion - both fly and yeast have compressed genomes compared to the mammals

In [None]:
#unique transcripts

#This is the URL of the human RefSeq RNA file, which contains the information displayed on the RefSeq website when you restrict it to Homo sapiens. You can find the number of unique transcripts from this file by following a similar approach to the one above. I will not do this here; it is something you can develop as practice. NB: the file is >300 MB!

#https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/current/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_rna.gbff.gz