In [32]:
# Copyright (C) 2017 Sur Herrera Paredes
import pandas as pd
import pybedtools as bed
# import numpy as np

In [33]:
def split_gene_annotations(functions, sep1=';',
                           sep2=':', sep3=',',
                           append_which = False):
    """Expand one gene's MIDAS annotation string into a table.

    The string is read as a `sep1`-separated list of entries. Each entry
    is a database name, followed by `sep2`, followed by a `sep3`-separated
    list of annotation ids.

    Parameters
    ----------
    functions : str
        Annotation string of a single gene.
    sep1 : str
        Separator between database entries.
    sep2 : str
        Separator between the database name and its list of ids. Only
        the first occurrence in each entry is used, so ids may themselves
        contain `sep2`.
    sep3 : str
        Separator between the ids of one database.
    append_which : bool
        If truthy, every id is prefixed with its database name and `sep2`.

    Returns
    -------
    pandas.DataFrame
        The table built by `_create_annotation_dataframe` from the
        ``{database: [ids]}`` mapping parsed here.

    Raises
    ------
    ValueError
        If a non-empty entry does not contain `sep2`.
    """

    # Split strings and create dictionary
    annotations = {}
    for entry in functions.split(sep=sep1):
        # A trailing or doubled sep1 yields empty entries; ignore them
        if entry == '':
            continue

        # Split on the first sep2 only, so ids such as "GO:0003723" survive
        pieces = entry.split(sep=sep2, maxsplit=1)
        if len(pieces) != 2:
            raise ValueError(
                "Annotation entry {!r} has no {!r} separator".format(entry,
                                                                     sep2))
        db, annots = pieces
        annots = annots.split(sep=sep3)

        if append_which:
            annots = [sep2.join([db, i]) for i in annots]

        # Merge instead of overwrite when a database name is repeated
        annotations.setdefault(db, []).extend(annots)

    # Get dataframe from dictionary
    return _create_annotation_dataframe(annotations)

def _create_annotation_dataframe(annotations):
    """Expands annotation of one gene"""
    
    d = pd.DataFrame()
    for k,a in annotations.items():
        d2 = pd.DataFrame(data = a, columns=['Annotation'])
        d2['Type'] = k
        #print(d2)
        d = d.append(d2)
    return(d)

In [34]:
# Parameters
# Feature table of the genome; read below as a tab-separated file.
infile = "/home/sur/micropopgen/exp/2017/today3/Zymomonas_mobilis_61858/genome.features"
# Genome FASTA file; not referenced by the cells visible in this notebook.
fasta = "/home/sur/micropopgen/exp/2017/today3/Zymomonas_mobilis_61858/genome.fna"
# Annotation type to keep (compared against the 'Type' column); a null
# value disables the filter.
which = "GO"
# Passed to split_gene_annotations: prefix each annotation id with its type.
append_which = True

In [35]:
# Load the tab-separated feature table and keep only its first five rows
Feat = pd.read_csv(infile, sep="\t").head()

In [38]:
# Get annotations
# Counters reported in the next cell:
#   ngenes - rows seen in the feature table
#   ncds   - rows whose gene_type is 'CDS'
#   nannot - CDS rows with a non-null 'functions' string
#   nwhich - annotated CDS rows with at least one annotation of type `which`
ngenes = 0
ncds = 0
nannot = 0
nwhich = 0
res_columns = ['Annotation', 'Type', 'Gene']
# Per-gene tables are collected here and concatenated once after the loop;
# DataFrame.append no longer exists in pandas >= 2.0 and was quadratic.
per_gene = []
for i, r in Feat.iterrows():
    g = r['gene_id']
    a = r['functions']
    t = r['gene_type']
    ngenes += 1

    # Keep only CDS
    if t != 'CDS':
        continue
    ncds += 1

    # Skip unannotated genes
    if pd.isnull(a):
        continue
    nannot += 1

    d = split_gene_annotations(functions=a,
                               append_which=append_which)
    d['Gene'] = g

    # Select annotation
    if not pd.isnull(which):
        d = d.loc[d.Type == which, :]

    # Keep only if it has rows
    if len(d.index) > 0:
        nwhich += 1
        per_gene.append(d)

# pd.concat refuses an empty list, so fall back to an empty table
if per_gene:
    Res = pd.concat(per_gene)[res_columns]
else:
    Res = pd.DataFrame(columns=res_columns)

Res = Res.drop(['Type'], axis = 1)
    

In [39]:
# Report, in order: rows processed, CDS rows, CDS rows with a non-null
# 'functions' string, and rows that yielded at least one kept annotation.
print(ngenes,ncds,nannot,nwhich)

5 5 4 2


In [40]:
# Display the Annotation<->Gene table (the 'Type' column was dropped above).
Res

Unnamed: 0,Annotation,Gene
0,GO:0003723,579138.3.peg.1
1,GO:0003735,579138.3.peg.1
2,GO:0005622,579138.3.peg.1
3,GO:0005840,579138.3.peg.1
4,GO:0006412,579138.3.peg.1
0,GO:0005524,579138.3.peg.3
1,GO:0016887,579138.3.peg.3
