# Handling GFF3 files


In [1]:
import biu

In [2]:
# Parse the example GFF3 file into a GFF3 object (the input source here is a file path).
gff = biu.formats.GFF3('example_files/example.gff3')

D: GFF input source is file.


In [3]:
# Print a text summary of the object: its source, entry count and top-level feature counts.
print(gff)

GFF3 object
 Where: example_files/example.gff3
 Entries: 13
 Top level statistics:
  * Gene : 1



## Get top-level features, by type

In [4]:
# Top-level features grouped by feature type: a dict mapping type -> list of entry IDs.
gff.topLevel

{'Gene': ['gene0001']}

## Get entries from GFF based on ID


In [5]:
# Look up a single entry by its ID.
gff.getIDEntry('gene0001')

GFF3Entry(seqid:ctg123, source:., feature:Gene, start:1300, end:9001, score:., strand:+, phase:., attr:ID=gene0001;Name=sonichedgehog)

## Get entries based on parent

In [6]:
# Entries that are children of gene0001; the parent entry itself is not returned.
gff.getChildren('gene0001')

[GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.2;Name=sonichedgehog.1),
 GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1301, end:9001, score:., strand:+, phase:., attr:ID=mrna0001.2;Name=sonichedgehog.2)]

In [7]:
# Same query, but containParent=True also returns the parent entry (listed first in the output).
gff.getChildren('gene0001', containParent=True)

[GFF3Entry(seqid:ctg123, source:., feature:Gene, start:1300, end:9001, score:., strand:+, phase:., attr:ID=gene0001;Name=sonichedgehog),
 GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.2;Name=sonichedgehog.1;Parent=gene0001),
 GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1301, end:9001, score:., strand:+, phase:., attr:ID=mrna0001.2;Name=sonichedgehog.2;Parent=gene0001)]

## Create sub-structures

In [8]:
# A GFF3 object can also be built from a list of GFF3Entry objects:
# collect the children of gene0001 first, then wrap them in a new GFF3.
gene_children = gff.getChildren('gene0001')
subgff = biu.formats.GFF3(gene_children)
print(subgff)

GFF3 object
 Where: 0x7f7f22bf0780
 Entries: 2
 Top level statistics:
  * mRNA : 2



D: GFF input source is list of GFF3Entries.


# C. elegans genome

In [9]:
# Genome object for the C. elegans genome, identified by "WBcel235".
# NOTE(review): the debug output of later cells suggests the GFF3/FASTA resources
# are only initialized on first access, not here — confirm.
ce = biu.db.Genome("WBcel235")

In [10]:
# Print a summary of the genome's GFF3 annotation (source file, entry count, top-level feature counts).
print(ce.gff)

D: Initializing the GFF3ResourceManager object NOW
D: GFF input source is file.


GFF3 object
 Where: /home/tgehrmann/repos/BIU/docs/genome_WBcel235/genome.gff3
 Entries: 517453
 Top level statistics:
  * region : 7
  * gene : 44830
  * pseudogene : 1901
  * primary_transcript : 14
  * tRNA : 22
  * rRNA : 2
  * sequence_feature : 2



In [11]:
# ID of the first top-level feature of type "gene".
ce.gff.topLevel["gene"][0]

'gene0'

In [12]:
# For a negative-strand feature: list the exon children of rna4, then fetch
# the sequence of the last exon in that list from the genome FASTA.
rna4_exons = ce.gff.getChildren("rna4", feature="exon")  # query once, reuse below
print(rna4_exons)
rna4_exons[-1].seq(ce.genome["all"]).seq

[GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:exon, start:26733, end:26778, score:., strand:-, phase:., attr:ID=id12;Dbxref=GeneID:171592,Genbank:NM_058262.4,WormBase:WBGene00022278;end_range=26778,.;gbkey=mRNA;gene=rcor-1;partial=true;product=RCOR (REST CO-Repressor) homolog;transcript_id=NM_058262.4), GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:exon, start:26590, end:26681, score:., strand:-, phase:., attr:ID=id13;Dbxref=GeneID:171592,Genbank:NM_058262.4,WormBase:WBGene00022278;gbkey=mRNA;gene=rcor-1;partial=true;product=RCOR (REST CO-Repressor) homolog;transcript_id=NM_058262.4), GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:exon, start:26371, end:26524, score:., strand:-, phase:., attr:ID=id14;Dbxref=GeneID:171592,Genbank:NM_058262.4,WormBase:WBGene00022278;gbkey=mRNA;gene=rcor-1;partial=true;product=RCOR (REST CO-Repressor) homolog;transcript_id=NM_058262.4), GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:exon, start:25273, end:25472, score:., strand:-, p

D: Initializing the FastaResourceManager object NOW
D: Fasta input source is file


'gtcaatcgACACGTATCGCGCCGAGCTCAAGTCAAAACTCGGCAAATA'

In [13]:
# For a positive-strand feature: sequence of the second child entry of gene2.
ce.gff.getChildren("gene2")[1].seq(ce.genome["all"]).seq

'ATGGTCGCGTGGCAGCCGATGAAGCGGTCGATGATCAATGAGGATTCTAGAGCTCCATGTAAGTTAGTGGTGGTGGCCggaaaagagaaaactcggccaagCTGCTcggagtttttgaatttttgataatccgaaataaaaattgattgctCGAAAAGGAACaatcttttggaaaaaaacgaattttgtcatttttttcagcaaaaattgattttcgaatttttccaataaaaaatcgataatttctccCCGTGCAGTGGAAaacaaacaatatttttttgttgatcgTTCTCTTCCAAACCCGGAATAGGTACACACATTCCTGCGTCATCCCATTCTCttatcacacttttttttcgaaaataaaagtgTAGAGACGGAAAAGTGAGAAAGGAGTCAATTTTATGCGAAATTTTGCATGATAATAcactcaaattaaaaaaactgcgtggcgtgcactgcagaaaacctcatatttaggccccgcctttttctcgtccactcacggagaaaaggcaaaaatttggggaccaaccaatatcaggccgccgacatcctacgggttccgcgcgccgctatgtttaactcgctgtgggtgtggcgagctgtctccgcccgctgcgagttaaacatagcggcgcgcggaacccgtaggaagtcggcggcctgatattgttggtccccaaattttttccttttctccgtgagtggacgagaaaaaggcggggcctaattatgaggttttctgcagtacacgccacgcagtttttttattttgagtgtatagGTCTCGATTCTCGAAAGTATGACAGTTATTTAAATGATGAACTCGTGATGactgttaaatttttggaaatttcgggggaattatatcgatttttcgataaatttacaggaaaaaagtccaaaatctAGGTAttccatggtaggcaggcgcgaTTTCTTGACGCCTGCCTGGAATCTGTCCGCCTCAcaccaaaaaatgtcaatcattttgct