In [1]:
# Copyright (C) 2017 Sur Herrera Paredes
import pandas as pd
import pybedtools as bed
# import numpy as np

In [2]:
def split_gene_annotations(functions, sep1=';',
                           sep2=':', sep3=',',
                           append_which = False):
    """Take annotation string from MIDAS database and expand it
    into a table of Annotation<->Type.

    The string is read as a list of entries separated by ``sep1``. Each
    entry has the form ``<db><sep2><id><sep3><id>...``, i.e. a database
    name followed by the identifiers that belong to it.

    Parameters
    ----------
    functions : str
        Annotation string of a single gene.
    sep1 : str
        Separator between database entries.
    sep2 : str
        Separator between the database name and its identifiers. Only the
        first occurrence in an entry is used, so identifiers may themselves
        contain ``sep2``.
    sep3 : str
        Separator between identifiers of the same database.
    append_which : bool
        If truthy, every identifier is prefixed with ``<db><sep2>``.

    Returns
    -------
    pandas.DataFrame
        One row per identifier, with columns ``Annotation`` and ``Type``.

    Raises
    ------
    ValueError
        If a non-empty entry does not contain ``sep2``.
    """

    # Split strings and create dictionary
    annotations = {}
    for entry in functions.split(sep1):
        # Empty entries come from leading/trailing/doubled separators
        if not entry:
            continue

        # Split on the FIRST sep2 only; unpacking a plain split() fails
        # when the identifiers themselves contain sep2
        db, found, annots = entry.partition(sep2)
        if not found:
            raise ValueError(
                "Annotation entry {!r} has no {!r} separator".format(entry, sep2))
        annots = annots.split(sep3)

        if append_which:
            annots = [sep2.join([db, i]) for i in annots]

        # Merge instead of overwrite if the same database appears twice
        annotations.setdefault(db, []).extend(annots)

    # Nothing to expand: return an empty table that still has the schema
    if not annotations:
        return pd.DataFrame(columns=['Annotation', 'Type'])

    # Get dataframe from dictionary
    return _create_annotation_dataframe(annotations)

def _create_annotation_dataframe(annotations):
    """Expands annotation of one gene"""
    
    d = pd.DataFrame()
    for k,a in annotations.items():
        d2 = pd.DataFrame(data = a, columns=['Annotation'])
        d2['Type'] = k
        #print(d2)
        d = d.append(d2)
    return(d)

In [3]:
# Parameters

# Annotation selection: `which` is compared against the Type column
# (a null value disables the filter); `append_which` is forwarded to
# split_gene_annotations()
which = "GO"
append_which = True

# Input files: tab-separated feature table and the FASTA file that
# sequences are extracted from
infile = "/home/sur/micropopgen/exp/2017/today3/Zymomonas_mobilis_61858/genome.features"
fasta = "/home/sur/micropopgen/exp/2017/today3/Zymomonas_mobilis_61858/genome.fna"

In [4]:
# Load the tab-separated feature table and keep only its first 10 rows
Feat = pd.read_csv(infile, sep="\t").head(n=10)

In [5]:
# Get annotations
# Counters: feature rows seen, rows that are CDS, CDS rows with a non-null
# annotation string, and CDS rows with at least one selected annotation
ngenes = 0
ncds = 0
nannot = 0
nwhich = 0
# Per-gene tables are collected here and concatenated once after the loop;
# DataFrame.append() was removed in pandas 2.0 and re-copied the growing
# table on every iteration
res_frames = []
for i, r in Feat.iterrows():
    g = r['gene_id']
    a = r['functions']
    t = r['gene_type']
    ngenes += 1

    # Keep only CDS
    if t != 'CDS':
        continue
    ncds += 1

    # Skip unannotated genes
    if pd.isnull(a):
        continue
    nannot += 1

    d = split_gene_annotations(functions=a,
                               append_which=append_which)
    d['Gene'] = g

    # Select annotation (a null `which` keeps every annotation type)
    if not pd.isnull(which):
        d = d.loc[d.Type == which, :]

    # Keep only genes that still have rows after the selection
    if len(d.index) > 0:
        nwhich += 1
        res_frames.append(d)

# pd.concat() raises on an empty list, so fall back to an empty table
# with the expected columns
if res_frames:
    Res = pd.concat(res_frames)
else:
    Res = pd.DataFrame(columns=['Annotation', 'Type', 'Gene'])

Res = Res.drop(['Type'], axis=1)
    

In [6]:
# Counts, in order: feature rows seen, CDS rows, CDS rows with a non-null
# annotation string, CDS rows with at least one annotation of type `which`
print(ngenes,ncds,nannot,nwhich)

10 10 8 3


In [7]:
# Display the Annotation/Gene table; the index restarts for every gene
# because the per-gene tables were stacked without re-indexing
Res

Unnamed: 0,Annotation,Gene
0,GO:0003723,579138.3.peg.1
1,GO:0003735,579138.3.peg.1
2,GO:0005622,579138.3.peg.1
3,GO:0005840,579138.3.peg.1
4,GO:0006412,579138.3.peg.1
0,GO:0005524,579138.3.peg.3
1,GO:0016887,579138.3.peg.3
0,GO:0004239,579138.3.peg.10


In [9]:
# Get BED format dataframe
bed_columns = ['scaffold_id', 'start', 'end', 'gene_id', 'gene_type', 'strand']
# reset_index() returns a new frame rather than modifying in place, so it
# is chained into the assignment to make sure its result is kept
Bed = Feat[bed_columns].copy().reset_index(drop=True)
print(Bed)

# Strand-specific coordinate shift: start - 1 for '+' features and
# end + 1 for '-' features
# NOTE(review): BED intervals are 0-based and half-open, so a 1-based
# inclusive table would normally need start - 1 on BOTH strands. Shifting
# `end` on '-' features keeps the length but moves the interval by one
# base - confirm against the coordinate convention of genome.features.
plus_strand = Bed.strand == '+'
minus_strand = Bed.strand == '-'
Bed.loc[plus_strand, 'start'] = Bed.loc[plus_strand, 'start'] - 1
Bed.loc[minus_strand, 'end'] = Bed.loc[minus_strand, 'end'] + 1
Bed

  scaffold_id  start    end          gene_id gene_type strand
0    CP002865    210    473   579138.3.peg.1       CDS      +
1    CP002865   1218   2669   579138.3.peg.2       CDS      +
2    CP002865   2764   4632   579138.3.peg.3       CDS      +
3    CP002865   5041   6132   579138.3.peg.4       CDS      +
4    CP002865   6197   6331   579138.3.peg.5       CDS      +
5    CP002865   6354   6866   579138.3.peg.6       CDS      +
6    CP002865   7204   7932   579138.3.peg.7       CDS      +
7    CP002865   8003   9004   579138.3.peg.8       CDS      -
8    CP002865   9021   9779   579138.3.peg.9       CDS      -
9    CP002865   9895  10725  579138.3.peg.10       CDS      +


Unnamed: 0,scaffold_id,start,end,gene_id,gene_type,strand
0,CP002865,209,473,579138.3.peg.1,CDS,+
1,CP002865,1217,2669,579138.3.peg.2,CDS,+
2,CP002865,2763,4632,579138.3.peg.3,CDS,+
3,CP002865,5040,6132,579138.3.peg.4,CDS,+
4,CP002865,6196,6331,579138.3.peg.5,CDS,+
5,CP002865,6353,6866,579138.3.peg.6,CDS,+
6,CP002865,7203,7932,579138.3.peg.7,CDS,+
7,CP002865,8003,9005,579138.3.peg.8,CDS,-
8,CP002865,9021,9780,579138.3.peg.9,CDS,-
9,CP002865,9894,10725,579138.3.peg.10,CDS,+


In [11]:
# Interval length (end - start) divided by 3; presumably a check that every
# CDS interval spans a whole number of codons - TODO confirm intent
(Bed.end - Bed.start)/3

0     88.0
1    484.0
2    623.0
3    364.0
4     45.0
5    171.0
6    243.0
7    334.0
8    253.0
9    277.0
dtype: float64

In [12]:
# create BedTool and obtain sequences
# NOTE: `Bed` is rebound from a pandas DataFrame to a BedTool here, so cells
# that use DataFrame attributes of `Bed` must run before this one
Bed = bed.BedTool.from_dataframe(Bed)
# Extract sequences for the intervals from `fasta`; the result is read back
# later through Bed.seqfn. s=True / name=True presumably map to the
# `bedtools getfasta` -s (strand-aware) and -name (name column as header)
# options - confirm against the pybedtools documentation
Bed.sequence(fi=fasta, s=True, name=True)

<BedTool(/tmp/pybedtools.g5e9_m5f.tmp)>

In [13]:
# Show results
# Bed.seqfn is the path of the FASTA file produced by Bed.sequence(); the
# context manager makes sure the file handle is closed after reading
with open(Bed.seqfn) as fasta_handle:
    print(fasta_handle.read())

>579138.3.peg.1(+)
ATGGCGAATACGCCGCAGGCAAAAAAGCGCATCCGTCGTAATGACCGTCGCGCTGAGATTAACGGCAATCGTATTAGCCGTATCCGTACTTTTATCAAAAAGGTTGAGTCTGCTATTGAAGCGGGCAATAAAACCGATGCAGAACAAGCTTTAGCCGCTGCACAGCCGGAATTGTTCCGCGGTGTTTCTAAAGGCATTTTGCATAAAAATACGGCTTCACGGAAATTCTCCCGTTTGGTTAAGAGCGTTACCGCTTTAGCCTAA
>579138.3.peg.2(+)
ATGAATATCCCGGTACAAGACCAGGCATCATTACTGCCGGCTGCATGGAATGAAGTGCGTCAAATTTTGCGTAAAAAATGTGGTACACGTACTTTTGAAAGCTGGCTAAAATCTCTGGCACTCGCCGATTTTGATAACACTCATAAAATAATACGTTTGGCCTGTCCCAGTGAGTTTATGGCCAATTGGGTTTCTGCTCATTTATCTGATGAATTATTATTAGCATGGCGGACAGTCTGGCCTGGTATTACCGAAGTAAAGATAACCGTTCGCAATCCAAGTGCTCAACCTTTATTATTAGACGTTACAGAAATCGAATTACCCTTAGGCGACCATCCTCAACCCGTTGTCAAAAAAGCGAGCAAAAAAAAACAACCTTCGGCTACCATTGTTCCCCCTGCGTCAGCAGATGAATCCGAAAAGGATAGCCAAGGCCAGTTTGAAGAGCGCTATAATTTCGATAATTTTGTTGTTGGCAAGGCAAATGACCTCGCTTATCGCGCGGCCTGCACTTTTGCAGAAGGTGGAAAGCTCGATTTCAACCCGCTTTTTTTACATGGTGGAACGGGTCTTGGTAAAACCCATTTGATGCATGCGGTCGGGCTCGAATATTTAAAGCGTCATCCTAACAGTTCGGCAATTTATATGTCTGCCGAAAAATTCATGTATGATTTTGTCGCGGCCATGCGGGCGAAAG

In [None]:
# Feature length computed from the original table with both `start` and
# `end` counted (end - start + 1), divided by 3; presumably the length in
# codons if the coordinates are inclusive - TODO confirm
((Feat['end'] - Feat['start']) + 1) / 3

In [None]:
# Feature length computed from the original table as end - start + 1,
# i.e. counting both `start` and `end` positions
((Feat['end'] - Feat['start']) + 1)