## Imports:

In [1]:
# imports
import csv

from collections import defaultdict

import pandas as pd
pd.set_option('display.max_columns', 60)

import munch

from IPython.display import display, HTML

import pybedtools as pbt

import sh

import tabulate
tbl = tabulate.tabulate

from spartan.utils.annotations.ensembl.gff3 import parse_gff3
from spartan.utils.annotations.ensembl.gff3 import parse_gff3_attributes

from spartan.utils.fastas import ParseFastA

from spartan.utils.files import tableFile2namedTuple

from spartan.utils.genome_specific.GfusI1 import GfusI1_0

## File paths:

In [2]:
# File Paths

## Fasta file for renaming contigs ----------------------------------------------------------
fasta = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"

## Functional annotation (Argot2)
fanno = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/functional/GfusI1.1_pre/argot2_out/argot_functional_annotations_ts150.h5"

## For setting up the BEDTOOLS phase
btools_gene_models_gff3 = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/Glossina-fuscipes-IAEA_BASEFEATURES_GfusI1.1.gff3"

## Input BED etc. if using a file (left empty here; not referenced by the cells shown below)
bed_path = ""

## Input a small set of BED coordinates as a string, if that is preferred
bed_string = """Scaffold150 13770 13771
"""

In [3]:
# Build the query BedTool directly from the inline `bed_string` rather than from a file.
query_bedtool = pbt.BedTool(bed_string, from_string=True)

In [4]:
# Build `name_map` from the headers of the assembly FASTA; it is indexed by accession in the next cell.
name_map = GfusI1_0.get_name_map_from_fasta_headers(fasta)

In [5]:
# Spot-check the mapping: this accession should resolve to the scaffold named in `bed_string`.
name_map['KK351935.1']

'Scaffold150'

In [6]:
# Load the gene models into a pybedtools object and keep only `gene` features
# (index 2 is the third GFF3 column, which holds the feature type).
btools_gene_models_pbt = pbt.BedTool(btools_gene_models_gff3)
# NOTE(review): `.saveas()` presumably materializes the filtered result so that
# `genes` can be reused by several later cells -- confirm against the pybedtools docs.
genes = btools_gene_models_pbt.filter(lambda x: x[2] == 'gene').saveas()

In [7]:
# Load the Argot2 functional annotations stored under the 'dataframe' key of the HDF5 file.
argot2 = pd.read_hdf(path_or_buf=fanno, key='dataframe')

In [8]:
# Derive the gene ID by dropping the trailing three-character suffix from each
# `Sequence` value (e.g. 'GFUI034947-PA' -> 'GFUI034947'). The vectorized `.str`
# slice yields NaN for a missing value instead of raising, which a Python-level
# lambda applied element by element would do.
argot2['gene_id'] = argot2['Sequence'].str[:-3]
# Subset of the annotations whose Total Score is at least 200.
argot2_200 = argot2[argot2['Total Score'] >= 200]
argot2.head()

Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content,gene_id
0,GFUI034947-PA,P,GO:0006508,proteolysis,270.313447,0.496543,8.247696,GFUI034947
1,GFUI035874-PA,F,GO:0005515,protein binding,529.038456,0.5,5.471582,GFUI035874
2,GFUI033625-PA,P,GO:0005980,glycogen catabolic process,307.758251,0.284597,13.42445,GFUI033625
3,GFUI033625-PA,F,GO:0004134,4-alpha-glucanotransferase activity,159.513252,0.177063,10.153643,GFUI033625
4,GFUI033625-PA,F,GO:0004135,"amylo-alpha-1,6-glucosidase activity",184.352303,0.177063,11.734746,GFUI033625


#### Functions to build DataFrames of the genes near the query intervals (optionally joined with functional annotations):

In [15]:
def snp_vs_gff_to_DF(bedtools_out):
    """Load a BED3-vs-GFF3 bedtools result into a labelled DataFrame.

    The tab-separated file at ``bedtools_out.fn`` is read with twelve column
    names: three ``bed3_*`` columns for the query interval followed by nine
    ``gff3_*`` columns for the matched feature. An extra ``gff3_rec`` column
    is then added holding the ``ID`` entry of each row's parsed
    ``gff3_attributes``.

    Parameters
    ----------
    bedtools_out : object with an ``fn`` attribute
        In this notebook, the result of ``BedTool.window``; ``fn`` is the path
        handed to ``pd.read_csv``.
        NOTE(review): the column names assume the query side has exactly
        three columns -- confirm before passing wider BED files.

    Returns
    -------
    pandas.DataFrame
        One row per query/feature pair, with the thirteen columns described
        above.
    """
    query_columns = ["bed3_seq", "bed3_start", "bed3_end"]
    feature_columns = [
        "gff3_seq",
        "gff3_source",
        "gff3_type",
        "gff3_start",
        "gff3_end",
        "gff3_score",
        "gff3_strand",
        "gff3_phase",
        "gff3_attributes",
    ]

    def record_id(attributes):
        # Pull the 'ID' entry out of one raw GFF3 attributes string.
        return parse_gff3_attributes(attributes)['ID']

    pairs = pd.read_csv(bedtools_out.fn,
                        sep='\t',
                        names=query_columns + feature_columns)
    pairs['gff3_rec'] = pairs['gff3_attributes'].apply(record_id)

    return pairs

def genes_near_bed(query_bedtool, gene_bedtool, annotations, w=1000):
    """Find the genes near each query interval and attach their annotations.

    Runs ``query_bedtool.window(gene_bedtool, w=w)``, converts the result
    with ``snp_vs_gff_to_DF``, and left-joins ``annotations`` onto it by
    matching the ``gff3_rec`` column against the annotations' ``gene_id``
    column.

    Parameters
    ----------
    query_bedtool : object with a ``window`` method
        The query intervals (a pybedtools ``BedTool`` in this notebook).
    gene_bedtool :
        The gene features passed to ``window`` as the set to search.
    annotations : pandas.DataFrame
        Annotation table; must contain a ``gene_id`` column.
    w : int, optional
        Forwarded unchanged as the ``w`` argument of ``window``
        (default 1000).

    Returns
    -------
    pandas.DataFrame
        Every query/gene pair, with the annotation columns appended. Because
        the join is a left join, pairs without a matching ``gene_id`` are
        kept and their annotation columns are left empty.
    """
    windowed = query_bedtool.window(gene_bedtool, w=w)
    nearby_genes = snp_vs_gff_to_DF(windowed)
    return nearby_genes.merge(annotations,
                              how='left',
                              left_on='gff3_rec',
                              right_on='gene_id')

# Genes located near `query_bedtool`

In [11]:
# Pair each query interval with the gene features returned by `window` with w=1000.
query_out = snp_vs_gff_to_DF(query_bedtool.window(genes, w=1000))

In [12]:
# Preview the first rows of the query/gene pairs.
query_out.head()

Unnamed: 0,bed3_seq,bed3_start,bed3_end,gff3_seq,gff3_source,gff3_type,gff3_start,gff3_end,gff3_score,gff3_strand,gff3_phase,gff3_attributes,gff3_rec
0,Scaffold150,13770,13771,Scaffold150,VectorBase,gene,13476,16476,.,-,.,ID=GFUI009284;biotype=protein_coding,GFUI009284
1,Scaffold150,13770,13771,Scaffold150,VectorBase,gene,14669,18725,.,+,.,ID=GFUI009279;biotype=protein_coding,GFUI009279


In [13]:
# Same window query (default w=1000), now left-joined with the Argot2 annotations.
genes_near_bed(query_bedtool=query_bedtool, gene_bedtool=genes, annotations=argot2)

Unnamed: 0,bed3_seq,bed3_start,bed3_end,gff3_seq,gff3_source,gff3_type,gff3_start,gff3_end,gff3_score,gff3_strand,gff3_phase,gff3_attributes,gff3_rec,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content,gene_id
0,Scaffold150,13770,13771,Scaffold150,VectorBase,gene,13476,16476,.,-,.,ID=GFUI009284;biotype=protein_coding,GFUI009284,,,,,,,,
1,Scaffold150,13770,13771,Scaffold150,VectorBase,gene,14669,18725,.,+,.,ID=GFUI009279;biotype=protein_coding,GFUI009279,,,,,,,,
