# Purpose:

2014-12-26 (Friday)

Create code to make tables with info from `argot2` annotations for all genes in a gene-list.

# Implementation:

## Imports:

In [12]:
# imports
from collections import defaultdict

import pandas as pd

from spartan.utils.annotations.ensembl.gff3 import parse_gff3_attributes
from spartan.utils.files import tableFile2namedTuple

## File paths:

In [6]:
# Define the input file paths used by the rest of this notebook.
# NOTE(review): `bpth` is an absolute, machine-specific directory (it looks like
# a remote mount); it must be adjusted before running on another machine.
bpth = "/home/gus/remote_mounts/louise/data/"

## Basic genome files
# HDF5 file with the argot2 functional annotations; it is read with
# pd.read_hdf in the "Set up annotation database" section.
fanno = bpth + "genomes/glossina_fuscipes/annotations/functional/GfusI1.1_pre/argot2_out/argot_functional_annotations_ts150.h5"

## Project-specific files
# Tab-separated gene lists from the ddrad58 "genes_near_SNPs" directory.
# NOTE(review): going by the file names these are genes within a 5000-bp window
# of the top-1 / top-5 SNPs for the environment and infection analyses --
# confirm what "top1" / "top5" mean (e.g. percentiles) with the author.
top1_environment = bpth + "projects/ddrad58/SNPs_of_interest/genes_near_SNPs/tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window.5000.top1_env.gene_list.tsv"
top1_infection = bpth + "projects/ddrad58/SNPs_of_interest/genes_near_SNPs/tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window.5000.top1_infection.gene_list.tsv"

top5_environment = bpth + "projects/ddrad58/SNPs_of_interest/genes_near_SNPs/tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window.5000.top5_env.gene_list.tsv"
top5_infection = bpth + "projects/ddrad58/SNPs_of_interest/genes_near_SNPs/tsetseFINAL_14Oct2014_f2_53_v_GfusI1.1.window.5000.top5_infection.gene_list.tsv"



## Set up main data variables:

### Set up annotation database:

In [7]:
# Load the argot2 annotation table stored under the 'dataframe' key of the HDF5 file.
argot2 = pd.read_hdf(fanno, key="dataframe")

In [9]:
# Preview the first rows to check the columns of the loaded annotation table.
argot2.head()

Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
0,GFUI034947-PA,P,GO:0006508,proteolysis,270.313447,0.496543,8.247696
1,GFUI035874-PA,F,GO:0005515,protein binding,529.038456,0.5,5.471582
2,GFUI033625-PA,P,GO:0005980,glycogen catabolic process,307.758251,0.284597,13.42445
3,GFUI033625-PA,F,GO:0004134,4-alpha-glucanotransferase activity,159.513252,0.177063,10.153643
4,GFUI033625-PA,F,GO:0004135,"amylo-alpha-1,6-glucosidase activity",184.352303,0.177063,11.734746


### Set up gene/SNP relationship data:

#### Function to create dictionary-based retrieval object for gene/SNP data:

In [11]:
def bed3_v_gff3_window_by_gff3_ID(bedtools_window_out):
    """Load a bed3-vs-gff3 window table into a nested, dictionary-based object.

    The file is read with ``tableFile2namedTuple`` as a tab-separated table
    whose 12 columns are named, in order, as the 3 bed3 columns followed by
    the 9 gff3 columns (presumably the output of ``bedtools window`` run with
    a bed3 file against a gff3 file -- confirm against how the file is made).

    Parameters
    ----------
    bedtools_window_out : str
        Path to the tab-separated window output file.

    Returns
    -------
    collections.defaultdict
        Auto-vivifying nested dictionary shaped as::

            data['gff3_rec']['info']      -> gff3 fields ('seq', 'source',
                                             'type', 'start', 'end', 'score',
                                             'strand', 'phase', 'attributes');
                                             the first value seen is kept
            data['gff3_rec']['bed3_hits'] -> list of {'seq', 'start', 'end'}
                                             dicts, one per row

        Because every level is a ``defaultdict``, looking up a missing key
        creates an empty entry instead of raising ``KeyError``.
    """
    headers = [
        "bed3_seq",
        "bed3_start",
        "bed3_end",
        "gff3_seq",
        "gff3_source",
        "gff3_type",
        "gff3_start",
        "gff3_end",
        "gff3_score",
        "gff3_strand",
        "gff3_phase",
        "gff3_attributes",
    ]

    table = tableFile2namedTuple(bedtools_window_out,
                                 sep='\t',
                                 headers=headers)

    def _tree():
        # Nested defaultdict: intermediate levels are created on first access.
        return defaultdict(_tree)

    data = _tree()

    gff3_fields = ('seq', 'source', 'type', 'start', 'end',
                   'score', 'strand', 'phase', 'attributes')

    # NOTE(review): rows are indexed by column name (row['gff3_seq']); if
    # tableFile2namedTuple yields plain namedtuples this needs attribute
    # access instead -- confirm against the helper.
    for row in table:
        # NOTE(review): every row is filed under the literal key 'gff3_rec',
        # which looks like a placeholder for a per-feature ID (the function
        # name says "by_gff3_ID") -- confirm the intended key with the author.
        record = data['gff3_rec']

        # setdefault keeps the first value seen for each gff3 field.
        info = record['info']
        for field in gff3_fields:
            info.setdefault(field, row['gff3_' + field])

        bed3_hit = {'seq': row['bed3_seq'],
                    'start': row['bed3_start'],
                    'end': row['bed3_end']}

        # setdefault (not get) so the list is stored in the record and the
        # hits accumulate; get() would append to a throwaway list.
        record.setdefault('bed3_hits', []).append(bed3_hit)

    return data