## Playground

In [1]:
# https://biopython.org/wiki/GFF_Parsing

from typing import List, Dict, Iterable, Tuple
import pprint
import itertools as it
import collections

import Bio
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from BCBio.GFF import GFFExaminer
from BCBio import GFF

# in_file = "../samples/prokka.gff3"
# in_file = "../samples/augustus.gff3"
in_file = "../samples/maker.gff3"


In [2]:
recs = list(GFF.parse(in_file))
pprint.pp(recs)

[SeqRecord(seq=UnknownSeq(1274862, character='?'), id='4', name='<unknown name>', description='<unknown description>', dbxrefs=[])]


In [3]:
entry = recs[0]
pprint.pp(entry)

SeqRecord(seq=UnknownSeq(1274862, character='?'), id='4', name='<unknown name>', description='<unknown description>', dbxrefs=[])


In [4]:
f = entry.features[0]
f

SeqFeature(FeatureLocation(ExactPosition(53460), ExactPosition(64200), strand=-1), type='gene', id='maker-4-est_gff_Cufflinks-gene-0.0')

In [5]:
f.qualifiers

{'ID': ['maker-4-est_gff_Cufflinks-gene-0.0'],
 'Name': ['maker-4-est_gff_Cufflinks-gene-0.0'],
 'score': ['7.864828'],
 'source': ['maker']}

## Formatting `SeqRecord` as DDBJ annotation table

In [6]:
len(recs)

1

In [7]:
collections.Counter(f.type for f in recs[0].features) 

Counter({'gene': 69})

In [8]:
a_gene = recs[0].features[0]
collections.Counter(f.type for f in a_gene.sub_features)

Counter({'mRNA': 1})

In [9]:
a_transcript = a_gene.sub_features[0]
collections.Counter(f.type for f in a_transcript.sub_features)

Counter({'exon': 5, 'five_prime_UTR': 1, 'CDS': 5, 'three_prime_UTR': 1})

In [10]:
a_transcript.sub_features

[SeqFeature(FeatureLocation(ExactPosition(63539), ExactPosition(64200), strand=-1), type='exon', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:exon:4'),
 SeqFeature(FeatureLocation(ExactPosition(57141), ExactPosition(61911), strand=-1), type='exon', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:exon:3'),
 SeqFeature(FeatureLocation(ExactPosition(56499), ExactPosition(57083), strand=-1), type='exon', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:exon:2'),
 SeqFeature(FeatureLocation(ExactPosition(53816), ExactPosition(53999), strand=-1), type='exon', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:exon:1'),
 SeqFeature(FeatureLocation(ExactPosition(53460), ExactPosition(53751), strand=-1), type='exon', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:exon:0'),
 SeqFeature(FeatureLocation(ExactPosition(64050), ExactPosition(64200), strand=-1), type='five_prime_UTR', id='maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:five_prime_utr'),
 SeqFeature(FeatureLocation(ExactPosition(63539), ExactPosition(

In [11]:
target_type = "CDS"
target_features = [f for f in recs[0].features[0].sub_features[0].sub_features if f.type == target_type]
locs = [f.location for f in target_features]
compound_loc = CompoundLocation(locs)
feature_packed = SeqFeature(compound_loc, target_type)
feature_packed

SeqFeature(CompoundLocation([FeatureLocation(ExactPosition(63539), ExactPosition(64050), strand=-1), FeatureLocation(ExactPosition(57141), ExactPosition(61911), strand=-1), FeatureLocation(ExactPosition(56499), ExactPosition(57083), strand=-1), FeatureLocation(ExactPosition(53816), ExactPosition(53999), strand=-1), FeatureLocation(ExactPosition(53643), ExactPosition(53751), strand=-1)], 'join'), type='CDS', location_operator='join')

In [12]:
print(compound_loc)

join{[63539:64050](-), [57141:61911](-), [56499:57083](-), [53816:53999](-), [53643:53751](-)}


In [13]:
target_features[2].qualifiers

{'ID': ['maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1:cds'],
 'Parent': ['maker-4-est_gff_Cufflinks-gene-0.0-mRNA-1'],
 'source': ['maker'],
 'phase': ['2']}

In [14]:
def table_to_str(table: List[List[str]]) -> str:
    return "\n".join("\t".join(items) for items in table)

In [15]:
def format_location(loc: Bio.SeqFeature.FeatureLocation) -> str:
    """Format Bio.SeqFeature.FeatureLocation
    
    """
    return "{start}..{end}".format(start=loc.start, end=loc.end)

In [16]:
def to_ddbj_table(rec: Bio.SeqRecord):
    """Convert GFF.SeqRecord into DDBJ annotation table format
    
    DDBJ annotation table format is TSV (tab-separated variables) with 5 columns
        - Column 1: Sequence ID
        - Column 2: Feature Key
        - Column 3: Location
        - Column 4: Qualifier Key
        - Column 5: Qualifier Value
    
    Many entries are blank.
    
    Example:
    [
        ["CLN01", "source", "1..12297"                         , "organism"   ,  "Mus musculus"  ],
        [       ,         ,                                    , "mol_type"   ,  "genomic DNA"   ],
        [       ,         ,                                    , "clone"      ,  "PC0110"        ],      
        [       ,         ,                                    , "chromosome" ,  "8"             ],      
        [       ,  "CDS"  , "join(<1..456,609..879,1070..1213)", "product"    ,  "protein kinase"],
        [       ,         ,                                    , "codon_start",  "2"             ],
    ]

    """
    lines = sum(
        len(vals)
        for feature in rec.features 
        for vals in feature.qualifiers.values()
    )
    print("lines: {}".format(lines))
    table = [["" for _ in range(5)] for _ in range(lines)]
    
    idx = 0
    for (feature_idx, feature) in enumerate(rec.features):
        for (qualifier_idx, (qualifier_key, values)) in enumerate(feature.qualifiers.items()):
            for qualifier_value in values:
                if qualifier_idx == 0:
                    table[idx][1] = feature.type
                    if feature.location is not None:
                        table[idx][2] = format_location(feature.location)
                table[idx][3] = qualifier_key
                table[idx][4] = qualifier_value
                idx += 1
    return table

In [17]:
tbl = to_ddbj_table(entry)
s = table_to_str(tbl)
print(s)

lines: 276
	gene	53460..64200	ID	maker-4-est_gff_Cufflinks-gene-0.0
			Name	maker-4-est_gff_Cufflinks-gene-0.0
			score	7.864828
			source	maker
	gene	90013..128029	ID	maker-4-est_gff_Cufflinks-gene-1.0
			Name	maker-4-est_gff_Cufflinks-gene-1.0
			score	6.732889
			source	maker
	gene	152875..163632	ID	maker-4-est_gff_Cufflinks-gene-1.1
			Name	maker-4-est_gff_Cufflinks-gene-1.1
			score	16.569730
			source	maker
	gene	68503..76929	ID	maker-4-est_gff_Cufflinks-gene-1.2
			Name	maker-4-est_gff_Cufflinks-gene-1.2
			score	20.410109
			source	maker
	gene	86132..87863	ID	maker-4-est_gff_Cufflinks-gene-1.3
			Name	maker-4-est_gff_Cufflinks-gene-1.3
			score	2132.029560
			source	maker
	gene	135602..150357	ID	maker-4-est_gff_Cufflinks-gene-1.4
			Name	maker-4-est_gff_Cufflinks-gene-1.4
			score	5.745459
			source	maker
	gene	230940..235351	ID	maker-4-est_gff_Cufflinks-gene-2.0
			Name	maker-4-est_gff_Cufflinks-gene-2.0
			score	39.430468
			source	maker
	gene	248672..250746	ID	maker-4-est_gf

In [18]:
def get_containment_relation(features: Iterable[SeqFeature]) -> Dict[SeqFeature, List[SeqFeature]]:
    """Find containment relation and return dictionary such that
    a key contains its value as sub-segments.
    """
    res = collections.defaultdict(list)
    for f1, f2 in it.combinations(features, 2):
        start1 = f1.location.start
        end1 = f1.location.end
        start2 = f2.location.start
        end2 = f2.location.end
        if start1 <= start2 and end2 <= end1:
            res[f1].append(f2)
        elif start2 <= start1 and end1 <= end2:
            res[f2].append(f1)
        elif end1 <= start2 or end2 <= start1:
            # note that biopython's segment is [start, end) 
            pass
        else:
            loc1 = str(f1.location)
            loc2 = str(f2.location)
            print("Intersection (0-based): {}:{} and {}:{}".format(f1.type, loc1, f2.type, loc2))
    return res

def group_by_id(features: Iterable[SeqFeature]) -> Dict[Tuple[str, str], List[SeqFeature]]:
    """Group segments by (ID, featureKey)
    """
    res = collections.defaultdict(list)
    for feature in features:
        tup = (feature.id, feature.type)
        res[tup].append(feature)
    return res

In [19]:
# Containment and intersection of segments do not work as different CDS can overlap anyway
get_containment_relation(entry.features)

Intersection (0-based): gene:[1208177:1213081](+) and gene:[1213069:1217101](-)


defaultdict(list, {})

In [20]:
# Grouping by (ID, featureKey)
d = group_by_id(entry.features)
for k, vs in d.items():
    print("{}: {}".format(k, len(vs)))

('maker-4-est_gff_Cufflinks-gene-0.0', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-1.0', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-1.1', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-1.2', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-1.3', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-1.4', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.0', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.1', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.2', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.3', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.4', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.5', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.6', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.7', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.8', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-2.9', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-3.0', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-3.1', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-3.2', 'gene'): 1
('maker-4-est_gff_Cufflinks-gene-3.3', 'gene'): 1


In [23]:
import collections
collections.Counter(f.type for f in entry.features[0].sub_features)

Counter({'mRNA': 1})

In [24]:
collections.Counter(f.type for f in entry.features[0].sub_features[0].sub_features)

Counter({'exon': 5, 'five_prime_UTR': 1, 'CDS': 5, 'three_prime_UTR': 1})

## Testing translators in `src`

In [26]:
import json
import pathlib
import sys

pwd = pathlib.Path.cwd().parent / "src"
sys.path.append(str(pwd))
from modules import translators

paths = ["../src/modules/gff_column_to_ddbj_qualifier.json", "../src/modules/gff_attribute_to_ddbj_qualifier.json"]
paths = [pathlib.Path(p).absolute() for p in paths]

In [27]:
qualifier_converter = translators.GFF3AttributesToQualifiers(paths)

In [28]:
entry = qualifier_converter.run(entry)

In [29]:
entry.features[-3].sub_features[0].qualifiers

{'note': ['source:maker'],
 '_AED': ['0.11'],
 '_eAED': ['0.11'],
 '_QI': ['30|1|1|1|0|0|5|103|441']}

In [30]:
cds = [f for f in entry.features[0].sub_features[0].sub_features if f.type == "CDS"]
cds[0].qualifiers

{'note': ['source:maker'], 'codon_start': ['0']}

In [31]:
collections.Counter(f.type for f in entry.features[0].sub_features[0].sub_features)

Counter({'exon': 5, 'five_prime_UTR': 1, 'CDS': 5, 'three_prime_UTR': 1})

In [32]:
paths = ["../src/modules/gff_type_to_ddbj_feature.json"]
paths = [pathlib.Path(p).absolute() for p in paths]
feature_converter = translators.GFF3TypesToFeatures(paths)

In [33]:
entry = feature_converter.run(entry)

In [34]:
collections.Counter(f.type for f in entry.features[0].sub_features[0].sub_features)

Counter({'exon': 5, "5'UTR": 1, 'CDS': 5, "3'UTR": 1})

In [35]:
type(entry.features)

list