### Week 5 - Biological Databases - Genomic Data
- October 2023
- [https://github.com/tisimpson/bioinformatics1](https://github.com/tisimpson/bioinformatics1)
- [ian.simpson@ed.ac.uk](mailto:ian.simpson@ed.ac.uk)

In [None]:
# Genomic information for various species

# I've commented out the "ul.request.urlretrieve...." lines to prevent them being downloaded many times as they are large.
# If you want to download them again, just remove the # symbol and run the code again.

# We are going to download genome scale annotation data directly from NCBI for 5 model organisms.

import pandas as pd
import urllib as ul
import numpy as np

# #human
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz','../data/genomes/Homo_sapiens.gene_info.gz')

# #mouse
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz','../data/genomes/Mus_musculus.gene_info.gz')

# #rat
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Rattus_norvegicus.gene_info.gz','../data/genomes/Rattus_norvegicus.gene_info.gz')

# #fruitfly
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Invertebrates/Drosophila_melanogaster.gene_info.gz','../data/genomes/Drosophila_melanogaster.gene_info.gz')

# #yeast
# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Fungi/Saccharomyces_cerevisiae.gene_info.gz','../data/genomes/Saccharomyces_cerevisiae.gene_info.gz')

In [None]:
# Read the gene_info files into pandas data frames.
# Every file is gzip-compressed and tab-separated, with the column names on the first row,
# so the same parser settings are shared across all five species.
gene_info_format = dict(compression='gzip', header=0, sep='\t')

human_df = pd.read_csv('../data/genomes/Homo_sapiens.gene_info.gz', **gene_info_format)
mouse_df = pd.read_csv('../data/genomes/Mus_musculus.gene_info.gz', **gene_info_format)
rat_df = pd.read_csv('../data/genomes/Rattus_norvegicus.gene_info.gz', **gene_info_format)
fruitfly_df = pd.read_csv('../data/genomes/Drosophila_melanogaster.gene_info.gz', **gene_info_format)
yeast_df = pd.read_csv('../data/genomes/Saccharomyces_cerevisiae.gene_info.gz', **gene_info_format)

In [None]:
# preview the first five rows of the human gene_info dataframe
human_df.head(n=5)

In [None]:
# Tally how many gene records carry each value of the 'chromosome' column.
chromosome_tally = human_df['chromosome'].value_counts()

# Turn the tally into a two-column dataframe: 'chromosome' and 'gene_count'.
human_chromosomes = chromosome_tally.rename_axis('chromosome').reset_index(name='gene_count')

# preview the first five rows
human_chromosomes.head(n=5)

In [None]:
# report the dataframe dimensions as a (rows, columns) tuple
row_count, column_count = human_df.shape
print((row_count, column_count))

In [None]:
# Bar chart of gene counts per chromosome, tallest bar first.
sorted_counts = human_chromosomes.sort_values(by='gene_count', ascending=False)
sorted_counts.plot(
    kind='bar',
    x='chromosome',
    y='gene_count',
    xlabel='chromosome',
    ylabel='gene count',
    legend=False,
)

In [None]:
# Genes per species

# Count the rows of each species' gene_info dataframe and label them by species.
species_labels = ['human', 'mouse', 'rat', 'fruitfly', 'yeast']
species_frames = [human_df, mouse_df, rat_df, fruitfly_df, yeast_df]
data = {'counts': [len(frame) for frame in species_frames]}
df = pd.DataFrame(data, index=species_labels)

# show the first few lines of the dataframe
print(df.head())

# bar chart with one bar per species
df.plot(kind='bar', xlabel='species', ylabel='gene count', legend=False)

In [None]:
# Genome sizes, and gene counts normalised by genome size

# ul.request.urlretrieve('https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/overview.txt','../data/genomes/genomes.txt')

# Load the genomes overview report (tab-separated, column names on the first row).
genomes_df = pd.read_csv('../data/genomes/genomes.txt', sep='\t', header=0, low_memory=False)

# The first column of the report is matched against the full species name.
organism_column = genomes_df[genomes_df.columns[0]]
species_names = [
    'Homo sapiens',
    'Mus musculus',
    'Rattus norvegicus',
    'Drosophila melanogaster',
    'Saccharomyces cerevisiae',
]

# For each species take 'Size (Mb)' from the first row whose name matches exactly.
genome_sizes = {
    'size': [
        genomes_df.loc[organism_column.str.fullmatch(name), 'Size (Mb)'].astype('float').values[0]
        for name in species_names
    ]
}

# Genome size dataframe, indexed by the same species labels as the gene-count dataframe `df`.
genomes = pd.DataFrame(genome_sizes, index=['human', 'mouse', 'rat', 'fruitfly', 'yeast'])

# Join genome sizes and gene counts on their shared species index.
final = genomes.merge(df, left_index=True, right_index=True)

# Genes per Mb: gene count divided by genome size.
final['norm'] = final['counts'] / final['size']

# view the final df
print(final)

# plot the normalised gene counts
final.plot(kind='bar', y='norm', xlabel='species', ylabel='normalised gene count', legend=False)

# brief conclusion - both fly and yeast have compressed genomes compared to the mammals

In [None]:
# Exploring unique transcripts from genome annotation

# The URL below points to the human RefSeq RNA file, which contains the information displayed on the RefSeq website when you restrict it to Homo sapiens. You can find the number of unique transcripts from it by following a similar approach to the one above.
# I will not do this here; it is something you can develop as practice. NB: the file is >300 MB!

# https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/current/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_rna.gbff.gz

# You might like to explore the genomes FTP data site - https://ftp.ncbi.nlm.nih.gov/genomes

In [None]:
import pandas as pd
import urllib as ul
import numpy as np
from prettytable import PrettyTable

In [None]:
# Retrieve the human genome feature annotations from the NCBI FTP site.
import os
import urllib.request  # load the submodule explicitly; a bare 'import urllib' does not guarantee it is available

features_url = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/annotation_releases/current/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_feature_table.txt.gz'

# Same lower-case '../data/genomes' directory that every other cell reads from and writes to;
# the previous '../DATA/...' spelling points elsewhere on case-sensitive filesystems.
features_path = '../data/genomes/features.txt.gz'

# Make sure the target directory exists before writing into it.
os.makedirs(os.path.dirname(features_path), exist_ok=True)

# The file is large, so only download it when it is not already on disk.
# Delete the local copy to force a fresh download.
if not os.path.exists(features_path):
    urllib.request.urlretrieve(features_url, features_path)

In [None]:
# Load the human genome feature table (gzip-compressed, tab-separated, header on the first row).
# Note: this rebinds human_df, replacing the gene_info dataframe loaded earlier in the notebook.
human_df = pd.read_csv(
    '../data/genomes/features.txt.gz',
    sep='\t',
    header=0,
    compression='gzip',
    low_memory=False,
)

In [None]:
# preview the first five rows of the feature table
human_df.head(n=5)

In [None]:
# Summarise the feature data: one table row per feature type with its number of occurrences.

# Count the occurrences of each value in the '# feature' column; the index holds the feature type.
featureCounts = (
    human_df['# feature']
    .value_counts()
    .rename_axis('feature type')
    .to_frame(name='counts')
)

# Build the prettytable with its two column headings.
table = PrettyTable(["Feature Type", "Counts"])

# One table row per feature type, in the order produced by value_counts().
for feature_type, count in zip(featureCounts.index, featureCounts['counts'].to_numpy()):
    table.add_row([feature_type, count])

# print the table
print(table)

In [None]:
# Keep only the rows of the feature table whose feature type is 'mRNA'.
is_mrna = human_df['# feature'].eq('mRNA')
mRNAs = human_df.loc[is_mrna]

# preview the first five mRNA rows
mRNAs.head(n=5)

In [None]:
# Summarise by gene: number of mRNA rows recorded for each gene symbol.
# value_counts() sorts in descending order, so the genes with the most transcripts come first.
mRNA_GeneFrequencies = mRNAs['symbol'].value_counts().to_frame()

# how many genes to list in the table
top_n = 10

# print the genes with the highest number of transcripts in a prettytable
table = PrettyTable()

# add the columns to the table
table.field_names = ["Gene", "Number of Transcripts"]

# head(top_n) yields exactly top_n rows (or fewer if there are fewer genes);
# the earlier manual counter with `i <= 10` let 11 rows through.
top_genes = mRNA_GeneFrequencies.head(top_n)

# add the rows to the table; the single column holds the transcript count
for gene, transcript_count in zip(top_genes.index, top_genes.iloc[:, 0]):
    table.add_row([gene, transcript_count])

# print the table
print(table)

In [None]:
# Histogram of the number of mRNAs per gene

# plotting library
import matplotlib.pyplot as plt

# Draw on an explicit figure/axes pair: 100 bins, with the counts axis on a log scale.
fig, ax = plt.subplots()
ax.hist(mRNA_GeneFrequencies, bins=100, log=True)
ax.set_title('Histogram of mRNAs per Gene')
ax.set_xlabel('mRNAs per Gene')
ax.set_ylabel('Number of Genes (log scale)')
plt.show()

In [None]:
# Distribution of mRNA transcript lengths, taken from the 'product_length' column.
mRNA_TranscriptSizes = mRNAs['product_length']

plt.hist(mRNA_TranscriptSizes, bins=100, log=True)
plt.ylabel('Number of Transcripts (log scale)')
plt.xlabel('Size of Transcript (bp)')
# the title matches what is plotted here: transcript sizes, not mRNAs per gene
plt.title('Histogram of mRNA Transcript Sizes')
plt.show()

# the mean transcript length, rounded to the nearest whole nucleotide
mean_length = int(np.round(mRNA_TranscriptSizes.mean()))
print("The mean transcript length in the human genome is", mean_length, "nucleotides")

# idxmax/idxmin return the row label of the first occurrence of the extreme value,
# so each extreme row is looked up once instead of re-scanning the column repeatedly
longest = mRNAs.loc[mRNA_TranscriptSizes.idxmax()]
shortest = mRNAs.loc[mRNA_TranscriptSizes.idxmin()]

# print the longest transcript
print("The longest transcript is", longest['symbol'], "at", int(longest['product_length']), "nucleotides")

# print the shortest transcript
print("The shortest transcript is", shortest['symbol'], "at", int(shortest['product_length']), "nucleotides")
