# Handling GFF3 files


In [3]:
import biu

In [4]:
# Parse the bundled example annotation file into a GFF3 object.
gff_path = 'example_files/example.gff3'
gff = biu.formats.GFF3(gff_path)

D: GFF input source is file.


In [5]:
# Printing a GFF3 object gives a summary: source, entry count, index status
# and the number of entries per feature type.
print(gff)

GFF3 object
 Where: example_files/example.gff3
 Entries: 13
 Indexed: Yes
 Feature statistics:
  * Gene : 1
  * mRNA : 2
  * exon : 10



In [4]:
# Iterating over a GFF3 object yields its entries one at a time.
for record in gff:
    print(record)

GFF3Entry(seqid:ctg123, source:., feature:Gene, start:1300, end:9001, score:., strand:+, phase:., attr:ID=gene0001;Name=sonichedgehog)
GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.1;Name=sonichedgehog.1;Parent=gene0001)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1300, end:1500, score:., strand:+, phase:., attr:ID=exon00001.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1050, end:1500, score:., strand:+, phase:., attr:ID=exon00002.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:3000, end:3902, score:., strand:+, phase:., attr:ID=exon00003.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:5000, end:5500, score:., strand:+, phase:., attr:ID=exon00004.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:7000, end:9000, score:., strand:+, phase:., attr:ID=exon00005.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, s

## Get feature IDs, by type

In [5]:
# Mapping of feature type -> list of the feature IDs of that type.
gff.features

{'Gene': ['gene0001'],
 'mRNA': ['mrna0001.1', 'mrna0001.2'],
 'exon': ['exon00001.1',
  'exon00002.1',
  'exon00003.1',
  'exon00004.1',
  'exon00005.1',
  'exon00001.2',
  'exon00002.2',
  'exon00003.2',
  'exon00004.2',
  'exon00005.2']}

## Get entries from GFF based on ID

In [6]:
# Index with a feature ID (string) to get the matching entry.
gff['gene0001']

GFF3Entry(seqid:ctg123, source:., feature:Gene, start:1300, end:9001, score:., strand:+, phase:., attr:ID=gene0001;Name=sonichedgehog)

## Get entries from GFF based on integer index

In [7]:
# Index with an integer to get the entry at that position
# (in the recorded output, 1 is the second entry of the file).
gff[1]

GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.1;Name=sonichedgehog.1;Parent=gene0001)

## Get entries based on parent

In [8]:
# Collect the entries whose Parent is 'mrna0001.1'; containParent=True also
# keeps the parent entry itself. The result is a new GFF3 object.
children = gff.getChildren('mrna0001.1', containParent=True)
print(children)

GFF3 object
 Where: 0x7f9b0683b288
 Entries: 6
 Indexed: Yes
 Feature statistics:
  * mRNA : 1
  * exon : 5



D: GFF input source is list of GFF3Entries.


In [9]:
# The GFF3 object returned by getChildren is iterable like the original one.
for node in children:
    print(node)

GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.1;Name=sonichedgehog.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1300, end:1500, score:., strand:+, phase:., attr:ID=exon00001.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1050, end:1500, score:., strand:+, phase:., attr:ID=exon00002.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:3000, end:3902, score:., strand:+, phase:., attr:ID=exon00003.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:5000, end:5500, score:., strand:+, phase:., attr:ID=exon00004.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:7000, end:9000, score:., strand:+, phase:., attr:ID=exon00005.1;Parent=mrna0001.1)


In [10]:
# Same lookup for the second transcript, parent included.
# depth=None presumably lifts any limit on how deep to descend -- TODO confirm.
mrna2_children = gff.getChildren('mrna0001.2', containParent=True, depth=None)
for node in mrna2_children:
    print(node)

GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1301, end:9001, score:., strand:+, phase:., attr:ID=mrna0001.2;Name=sonichedgehog.2)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1301, end:1501, score:., strand:+, phase:., attr:ID=exon00001.2;Parent=mrna0001.2)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1051, end:1501, score:., strand:+, phase:., attr:ID=exon00002.2;Parent=mrna0001.2)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:3001, end:3901, score:., strand:+, phase:., attr:ID=exon00003.2;Parent=mrna0001.2)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:5001, end:5501, score:., strand:+, phase:., attr:ID=exon00004.2;Parent=mrna0001.2)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:7001, end:9001, score:., strand:+, phase:., attr:ID=exon00005.2;Parent=mrna0001.2)


D: GFF input source is list of GFF3Entries.


## Create sub-structures

By default, `getChildren` and `query` return another GFF3 object that can be queried in the same way.

Sometimes, you are only interested in the raw GFF3 entries. These can be retrieved with the `raw=True` option.


In [7]:
# raw=True makes getChildren hand back a plain list of GFF3Entry objects
# instead of a GFF3 object.
entries = gff.getChildren('mrna0001.1', containParent=True, raw=True)
container_type = type(entries)
element_type = type(entries[0])
print(container_type, element_type)

<class 'list'> <class 'biu.formats.gff3Utils.GFF3Entry'>


You can envelop these in another GFF3 structure if you wish:

In [9]:
# A GFF3 object can be built directly from a list of GFF3Entry objects.
subgff = biu.formats.GFF3(entries)
print(subgff)

GFF3 object
 Where: 0x7fc004be45e8
 Entries: 6
 Indexed: Yes
 Feature statistics:
  * mRNA : 1
  * exon : 5



D: GFF input source is list of GFF3Entries.


## Query based on intervals

You can also use the `raw=True` option with `query`.

In [12]:
# Interval query on sequence "ctg123", positions 1301-1500. The query is run
# twice: once to print the summary of the resulting GFF3 object, and once to
# iterate over the matching entries.
print(subgff.query("ctg123", 1301, 1500))
for hit in subgff.query("ctg123", 1301, 1500):
    print(hit)

GFF3 object
 Where: 0x7f9b0683b348
 Entries: 3
 Indexed: Yes
 Feature statistics:
  * exon : 2
  * mRNA : 1

GFF3Entry(seqid:ctg123, source:., feature:exon, start:1300, end:1500, score:., strand:+, phase:., attr:ID=exon00001.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:exon, start:1050, end:1500, score:., strand:+, phase:., attr:ID=exon00002.1;Parent=mrna0001.1)
GFF3Entry(seqid:ctg123, source:., feature:mRNA, start:1300, end:9000, score:., strand:+, phase:., attr:ID=mrna0001.1;Name=sonichedgehog.1)


D: GFF input source is list of GFF3Entries.
D: GFF input source is list of GFF3Entries.


In [13]:
# Print the summary again: in the recorded output it now also lists
# "Interval Indexes" (presumably built lazily by the query above -- TODO confirm).
print(subgff)

GFF3 object
 Where: 0x7f9b0683b5e8
 Entries: 6
 Indexed: Yes
 Interval Indexes:
  * exon,mrna
 Feature statistics:
  * mRNA : 1
  * exon : 5



## Get gene/transcript/protein sequences from the GFF file.
We examine this in the *C. elegans* genome.

In [14]:
# Create a Genome handle for the "WBcel235" assembly.
ce = biu.db.Genome("WBcel235")

In [15]:
# In the recorded run, this access to ce.gff triggers initialization of the
# GFF3 resource (see the debug lines in the output) before the summary is printed.
print(ce.gff)

D: Initializing the GFF3ResourceManager object NOW
D: GFF input source is file.


GFF3 object
 Where: /home/tgehrmann/repos/BIU/docs/genome_WBcel235/genome.gff3
 Entries: 517453
 Indexed: Yes
 Feature statistics:
  * region : 7
  * gene : 44830
  * snoRNA : 345
  * exon : 219765
  * mRNA : 28134
  * CDS : 196662
  * pseudogene : 1901
  * piRNA : 15364
  * ncRNA : 7769
  * transcript : 667
  * pseudogenic_tRNA : 209
  * tRNA : 634
  * antisense_RNA : 104
  * lnc_RNA : 181
  * primary_transcript : 271
  * miRNA : 454
  * snRNA : 130
  * rRNA : 22
  * pseudogenic_rRNA : 1
  * scRNA : 1
  * sequence_feature : 2



In [16]:
# ID of the first feature of type "gene".
ce.gff.features["gene"][0]

'gene0'

In [17]:
# For a negative-stranded feature: show the first exon child of "rna4",
# then print the translated sequence obtained for "rna4" from the genome.
print(ce.gff.getChildren("rna4", feature="exon")[0])

print("\n>rna4")
print(ce.gff.seq("rna4", ce.genome["all"]).translate())

GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:exon, start:26733, end:26778, score:., strand:-, phase:., attr:ID=id12;Dbxref=GeneID:171592,Genbank:NM_058262.4,WormBase:WBGene00022278;end_range=26778,.;gbkey=mRNA;gene=rcor-1;partial=true;product=RCOR (REST CO-Repressor) homolog;transcript_id=NM_058262.4)

>rna4


D: GFF input source is list of GFF3Entries.
D: GFF input source is list of GFF3Entries.
D: Initializing the FastaResourceManager object NOW
D: Fasta input source is file


MDSYTSSDEDASRKENEGLNMLNASPEPMEEDDPEEQAEQEEETSRMARPIRSMRKRETTSGESMGDEDEDLEDEEDEDEEAEAREHHESGAHDTSFSNPLSNVDNLIHVGTEYQAIIQPTAEQLEKEPCRDQQIWAFPDEMNENRLTEYISEATGRYQLPIDRALFILNKQSNDFDAAMVQAMRRKEIHDDWTAEEISLFSTCFFHFGKRFKKIHAAMPQRSLSSIIQYYYNTKKVQNYKTMINVHLNETDTYDELFKEVNHLERVPSGYCENCNAKSDLLILNRVMSRHECKPCILYFRLMRVPRPASLRALTKRRQRVLCPEYMKIYVYGYLELMEPANGKAIKRLGIGKEKEEDDDIMVVDDCLLRKPSGPYIVEQSIEADPIDENTCRMTRCFDTPAALALIDNIKRKHHMCVPLVWRVKQTKCMEENEILNEEARQQMFRATMTYSRVPKGEIANWKKDMMALKGRFERFTPELDTTATNGNRSGKVRINYGWSPEEKKNAIRCFHWYKDNFELIAELMATKTVEQIKKFYMDNEKLILESIDTYRAELKSKLGK*


In [18]:
# For a positive-stranded feature: show the first child of "gene2"
# (an mRNA in the recorded output), then print the translated sequence.
print(ce.gff.getChildren("gene2")[0])

print("\n>gene2")
# NOTE(review): unlike the rna4 cell, this prints `.seq` of the translation, and
# the recorded output contains internal '*' characters -- confirm that passing
# the gene ID (rather than the transcript ID) is intended.
print(ce.gff.seq("gene2", ce.genome["all"]).translate().seq)

GFF3Entry(seqid:NC_003279.8, source:RefSeq, feature:mRNA, start:11641, end:16585, score:., strand:+, phase:., attr:ID=rna2;Dbxref=GeneID:171591,Genbank:NM_058259.4,WormBase:WBGene00022276;Name=NM_058259.4;end_range=16585,.;gbkey=mRNA;gene=nlp-40;partial=true;product=Peptide P4;start_range=.,11641;transcript_id=NM_058259.4)

>gene2
MKLVILLSFVATVAVFAAPSAPAGLEEKLRALQEQLYSLEKENGVDVKQKEQPAAADTFLGFVPQKRMVAWQPMKRSMINEDSRAPYGRVAADEAVDDQ*GF*SSIAPRNRSPLGRSVESRRTPRSQPGGSFGGSSCS*SIPIMLHAIEARLAEVLRAGERLGVNPEEVLADLRARNQFQ*


D: GFF input source is list of GFF3Entries.
D: GFF input source is list of GFF3Entries.


In [19]:
# `gff` still refers to the small example file loaded at the top of the
# notebook, not to the WBcel235 annotation.
gff.features

{'Gene': ['gene0001'],
 'mRNA': ['mrna0001.1', 'mrna0001.2'],
 'exon': ['exon00001.1',
  'exon00002.1',
  'exon00003.1',
  'exon00004.1',
  'exon00005.1',
  'exon00001.2',
  'exon00002.2',
  'exon00003.2',
  'exon00004.2',
  'exon00005.2']}