In [5]:
import os
import numpy as np
import pandas as pd
import glob
from Bio import SeqIO

In [16]:
def genbank_seqio(gbkf, verbose=False):
    """
    Get the parser stream
    :param gbkf: path to the genbank file
    :param verbose: currently unused; kept so existing callers keep working
    :return: a generator yielding the records produced by SeqIO.parse
    """

    # Own the handle inside a ``with`` block so it is closed once the records
    # have been consumed (or the generator is closed), instead of leaking the
    # open file. The file is opened lazily, on the first iteration.
    with open(gbkf, 'r') as handle:
        yield from SeqIO.parse(handle, "genbank")

def get_features_of_type(seqio, ftype):
    """
    Lazily yield every feature of the requested type
    :param seqio: iterable of records, each exposing a ``features`` sequence
    :param ftype: feature type to keep (compared with ``==`` to ``feature.type``)
    :return: a generator of the matching features, in record/feature order
    """

    # Nothing is accumulated here: matches are handed to the caller one by one.
    for seq in seqio:
        for feature in seq.features:
            if feature.type == ftype:
                yield feature

def GC(seq):
    """
    Calculate GC content
    :param seq: the sequence, as any sized iterable of single-character bases
    :return: fraction of bases that are G or C (either case); 0.0 for an
        empty sequence
    """

    length = len(seq)
    if length == 0:
        # Avoid dividing by zero: an empty sequence has no GC content.
        return 0.0

    # Count soft-masked (lowercase) bases as well as uppercase ones.
    gc = sum(1 for base in seq if base in ('G', 'C', 'g', 'c'))
    return gc / length

def get_average_cds_length(seqio):
    """
    Get average CDS length
    :param seqio: iterable of records (a one-shot iterator is fine)
    :return: mean length over all CDS features in all records, or 0.0 when
        there are no CDS features
    """

    total_length = 0
    cds_count = 0
    for seq in seqio:
        for feature in seq.features:
            if feature.type == 'CDS':
                total_length += len(feature.location.extract(seq).seq)
                cds_count += 1

    if cds_count == 0:
        # No CDS found: report 0.0 rather than dividing by zero.
        return 0.0

    # Divide by the number of CDS features, not by the number of records;
    # this also avoids calling len() on an iterator, which has no length.
    return total_length / cds_count

def get_median_cds_length(seqio):
    """
    Get median CDS length
    :param seqio: iterable of records, each exposing a ``features`` sequence
    :return: the result of np.median over the lengths of all CDS features
    """

    # One length per CDS feature, collected across every record.
    lengths = [
        len(feat.location.extract(record).seq)
        for record in seqio
        for feat in record.features
        if feat.type == 'CDS'
    ]
    return np.median(lengths)

def get_coding_density(seqiorec, ftypes=('CDS',)):
    """
    Get coding density for a single record
    :param seqiorec: a single record exposing ``seq`` and ``features``
    :param ftypes: collection of feature types that count as coding
    :return: fraction of positions covered by at least one matching feature;
        0.0 for a record with an empty sequence
    """

    length = len(seqiorec.seq)
    if length == 0:
        # Avoid dividing by zero on an empty record.
        return 0.0

    # One flag per position; overlapping features only mark a base once.
    covered = np.zeros(length, dtype=bool)
    for feature in seqiorec.features:
        if feature.type in ftypes:
            # Sort so the slice is always low:high whatever the orientation.
            # NOTE(review): only the overall start/end of the location is
            # used, so a multi-part location presumably counts the bases
            # between its parts too -- confirm this is intended.
            start, stop = map(int, sorted([feature.location.start, feature.location.end]))
            covered[start:stop] = True
    return np.count_nonzero(covered) / length



In [23]:
# Print the coding density of every record in one GenBank file.
infile = '../selgenome.phanotate2/Bc01.fasta.TAA,TAG,TGA.gbk'# no gap line
# infile = '../test.gbk' # with gap line
for record in genbank_seqio(infile):
    # Only features whose type is exactly '3 CDS' are counted as coding here.
    # NOTE(review): '3 CDS' is an unusual feature type -- confirm it matches
    # the feature types actually present in the input file.
    cd = get_coding_density(record, ['3 CDS'])
    print(cd)

0.9494116001678288


