# Data science project in Python

For the four species GFF files stored in the `data/` folder (mouse: `GRCm38.gff3`, human: `GRCh38.gff3`, zebrafish: `GRCz11.gff3` and panda: `AilMel.gff3`), load these files into a DataFrame, filter the exons, and calculate their length, their GC content as well as their molecular weight. Store these calculations in new columns in their respective DataFrame. Plot these three values for the four species on three graphs using boxplots.

Start by working with the mouse data for exploratory analysis, working through the problem step by step. Then, create a reusable function to apply the analysis to all datasets. Load all four datasets and apply the newly created function to calculate the new values. Once all DataFrames have the three new columns, visualise the data on three boxplot graphs.

Present the results in a Jupyter notebook, using Pandas, Matplotlib and Biopython. Write reusable and modular code as much as possible using functions.

In [None]:
import pandas
from Bio import SeqIO
from Bio import Entrez
from Bio.SeqUtils import GC, molecular_weight

In [None]:
def get_gc_and_mw_from_gbid(id='NM_177676.6'):
    """Fetch one GenBank record from NCBI and return its GC content and molecular weight.

    Args:
        id: Identifier passed to ``Entrez.efetch`` for the ``nucleotide``
            database. The name shadows the ``id`` builtin but is kept so that
            existing keyword callers keep working.

    Returns:
        A ``(gc, mw)`` tuple holding ``GC(seq)`` and ``molecular_weight(seq)``
        for the sequence of the fetched record.

    Raises:
        Whatever ``Entrez.efetch`` or ``SeqIO.read`` raise (network errors,
        unparsable or empty responses); the handle is closed in either case.
    """
    Entrez.email = 'A.N.Other@example.com'  # Always tell NCBI who you are
    # Ask explicitly for the plain-text GenBank flat file that the "gb" parser reads.
    handle = Entrez.efetch(db="nucleotide", id=id, rettype="gb", retmode="text")
    try:
        seq_record = SeqIO.read(handle, "gb")
    finally:
        # Release the connection even when parsing the response fails.
        handle.close()
    sequence = seq_record.seq
    # NOTE(review): newer Biopython releases replace GC with gc_fraction, which
    # returns a fraction rather than a percentage - confirm the installed version.
    return GC(sequence), molecular_weight(sequence)

In [None]:
# Smoke-test the lookup on a single accession before using it on whole tables.
print(get_gc_and_mw_from_gbid(id='NM_177676.6'))

In [None]:
def get_gc_and_mw_from_gbids(df_ids):
    """Look up GC content and molecular weight for every identifier in `df_ids`.

    Args:
        df_ids: Iterable of identifiers; each one is passed to
            ``get_gc_and_mw_from_gbid``.

    Returns:
        Two lists ``(gcs, mws)`` in the same order as `df_ids`.
    """
    # One lookup per identifier, in input order.
    lookups = [get_gc_and_mw_from_gbid(gbid) for gbid in df_ids]
    gcs = [gc for gc, _ in lookups]
    mws = [mw for _, mw in lookups]
    return gcs, mws

## Working with mouse data: exploratory analysis

- remove rows with null values
- filter all exons
- calculate GC contents and molecular weights

In [None]:
# Load the tab-separated mouse annotation table and preview its first rows.
mouse = pandas.read_table('data/GRCm38.gff3')
mouse.head()

In [None]:
# Discard rows containing null values (in place), then list the feature types left.
mouse.dropna(inplace=True)
print(mouse['type'].unique())

In [None]:
# Keep only the rows whose feature type is 'exon'.
exon_mouse = mouse.loc[mouse.type == 'exon']

In [None]:
# Preview the first rows of the exon-only table.
exon_mouse.head()

In [None]:
# Work on the first nine exons only - presumably to keep the number of NCBI
# lookups small during exploration.
small_exon_mouse = exon_mouse.iloc[:9]

In [None]:
# Look up GC content and molecular weight for the 'gbid' of each selected exon
# (get_gc_and_mw_from_gbids performs one NCBI request per identifier).
gcs, mws = get_gc_and_mw_from_gbids(small_exon_mouse['gbid'])

In [None]:
# Append the exon length as the last column; the +1 counts both the start and
# the end position as part of the feature.
small_exon_mouse.insert(
    loc=small_exon_mouse.shape[1],
    column='len',
    value=small_exon_mouse['end'] - small_exon_mouse['start'] + 1,
)

In [None]:
# Append the GC contents fetched earlier as the last column.
small_exon_mouse.insert(
    loc=small_exon_mouse.shape[1],
    column='gc',
    value=gcs,
)

In [None]:
# Append the molecular weights fetched earlier as the last column.
small_exon_mouse.insert(
    loc=small_exon_mouse.shape[1],
    column='mw',
    value=mws,
)

In [None]:
# Preview the first rows of the nine-exon subset.
small_exon_mouse.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.boxplot([small_exon_mouse['gc']], labels=['mouse'])
plt.ylabel('GC content (%)')
plt.show()

## Creating a reusable function to apply the analysis to other datasets

In [None]:
def get_exons(data, limit=9):
    """Return the first exon rows of `data` with length, GC content and molecular weight added.

    Args:
        data: DataFrame providing at least the ``type``, ``start``, ``end``
            and ``gbid`` columns. It is left unmodified.
        limit: Maximum number of exon rows to process. Each row triggers a
            lookup through ``get_gc_and_mw_from_gbids``, so the default keeps
            the original cap of 9; pass ``None`` to process every exon.

    Returns:
        A new DataFrame holding the selected exon rows with three columns
        appended: ``len``, ``gc`` and ``mw``.
    """
    # dropna() without inplace leaves the caller's DataFrame untouched.
    complete_rows = data.dropna()
    exons = complete_rows[complete_rows['type'] == 'exon']
    # Copy the slice so the new columns land on an independent frame.
    exons = exons.iloc[:limit].copy()
    gcs, mws = get_gc_and_mw_from_gbids(exons['gbid'])
    # The +1 counts both the start and the end position as part of the exon.
    exons.insert(len(exons.columns), 'len', exons['end'] - exons['start'] + 1)
    exons.insert(len(exons.columns), 'gc', gcs)
    exons.insert(len(exons.columns), 'mw', mws)
    return exons

## Loading all four datasets and calculating new values

In [None]:
# Mouse: load the tab-separated annotation table, derive the exon metrics, preview.
mouse = pandas.read_table('data/GRCm38.gff3')
mouse_small_exons = get_exons(mouse)
mouse_small_exons.head()

In [None]:
# Human: load the tab-separated annotation table, derive the exon metrics, preview.
human = pandas.read_table('data/GRCh38.gff3')
human_small_exons = get_exons(human)
human_small_exons.head()

In [None]:
# Zebrafish: load the tab-separated annotation table, derive the exon metrics, preview.
zebrafish = pandas.read_table('data/GRCz11.gff3')
zebrafish_small_exons = get_exons(zebrafish)
zebrafish_small_exons.head()

In [None]:
# Panda: load the tab-separated annotation table, derive the exon metrics, preview.
panda = pandas.read_table('data/AilMel.gff3')
panda_small_exons = get_exons(panda)
panda_small_exons.head()

## Visualising data

### Comparing exon length, GC content and molecular weight across four species

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.boxplot([mouse_small_exons['len'], 
             human_small_exons['len'], 
             zebrafish_small_exons['len'],
             panda_small_exons['len']], 
            labels=['mouse', 'human', 'zebrafish', 'panda']
            )
plt.ylabel('Feature length (bp)')
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.boxplot([mouse_small_exons['gc'], 
             human_small_exons['gc'], 
             zebrafish_small_exons['gc'],
             panda_small_exons['gc']], 
            labels=['mouse', 'human', 'zebrafish', 'panda']
            )
plt.ylabel('GC content (%)')
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.boxplot([mouse_small_exons['mw'], 
             human_small_exons['mw'], 
             zebrafish_small_exons['mw'],
             panda_small_exons['mw']], 
            labels=['mouse', 'human', 'zebrafish', 'panda']
            )
plt.ylabel('Molecular weight')
plt.show()