## Imports:

In [1]:
# imports
import csv

from collections import defaultdict

import pandas as pd
pd.set_option('display.max_columns', 60)

import munch

from IPython.display import display, HTML

import pybedtools as pbt

import sh

import tabulate
tbl = tabulate.tabulate

from spartan.utils.annotations.ensembl.gff3 import parse_gff3
from spartan.utils.annotations.ensembl.gff3 import parse_gff3_attributes

from spartan.utils.fastas import ParseFastA

from spartan.utils.files import tableFile2namedTuple

from spartan.utils.genome_specific.GfusI1 import GfusI1_0

from gs_ddRAD2015.scripts import ld_figures as ldfigs

## File paths:

In [2]:
# File Paths

## LD analysis dir
ld_dir = "/home/gus/MEGAsync/projects/ddRAD_phase2/repos/ddRAD_phase2/scratch/gs_2015_ld/OT_MS_NB_indiv.geno.ld"


## Fasta file for renaming contigs ----------------------------------------------------------
fasta = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/assemblies/GfusI1/Glossina-fuscipes-IAEA_SCAFFOLDS_GfusI1.fa"

## Functional annatation (Argot2)
fanno = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/functional/GfusI1.1_pre/argot2_out/argot_functional_annotations_ts150.h5"

## For setting up the BEDTOOLS phase
btools_gene_models_gff3 = "/home/gus/remote_mounts/louise/data/genomes/glossina_fuscipes/annotations/Glossina-fuscipes-IAEA_BASEFEATURES_GfusI1.1.gff3"

## Input BEDs etc if using files
Selected_PopPairwiseMSOT_Environm_path = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/data_from_andrea/2015-03-17_env_selection/Selected_PopPairwiseMSOT_Environm.bed"
Selected_PopPairwiseMSNB_Environm_path = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/data_from_andrea/2015-03-17_env_selection/Selected_PopPairwiseMSNB_Environm.bed"
Top10_corrected_PopPairwiseOverlap_Infection2015_path = "/home/gus/Documents/YalePostDoc/project_stuff/g_f_fucipes_uganda/ddrad58/data_from_andrea/2015-07-23_overlap_infection/Top10_corrected_PopPairwiseOverlap_Infection2015.bed"



In [3]:
# load LD analysis results
figs = ldfigs.Figures(out_dir=ld_dir, formats='png')
figs.load_data_tables()
ld_tables = munch.Munch(figs.d)
ld_tables.keys()


ld_figures: loading data_tables.




['ld',
 'd_bins',
 'contig_info',
 'd_bin_v_others_melt',
 'len_contigs_per_bin',
 'd_bin_v_others',
 'contigs_per_bin',
 'mean_bin_r2_all',
 'sp_contigs',
 'ld_contig']

In [4]:
ld = ld_tables.ld
del(ld_tables)

In [5]:
# load BED files
Selected_PopPairwiseMSOT_Environm = pbt.BedTool(Selected_PopPairwiseMSOT_Environm_path)
Selected_PopPairwiseMSNB_Environm = pbt.BedTool(Selected_PopPairwiseMSNB_Environm_path)
Top10_corrected_PopPairwiseOverlap_Infection2015 = pbt.BedTool(Top10_corrected_PopPairwiseOverlap_Infection2015_path)

In [6]:
# load gene models into pybedtools object and filter for only gene features
btools_gene_models_pbt = pbt.BedTool(btools_gene_models_gff3)
genes = btools_gene_models_pbt.filter(lambda x: x[2] == 'gene').saveas()

In [7]:
# load functional annotations
argot2 = pd.read_hdf(path_or_buf=fanno, key='dataframe')

In [8]:
argot2['gene_id'] = argot2.Sequence.apply(lambda x: x[:-3])
argot2_200 = argot2[argot2['Total Score'] >= 200]
argot2.head()

Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content,gene_id
0,GFUI034947-PA,P,GO:0006508,proteolysis,270.313447,0.496543,8.247696,GFUI034947
1,GFUI035874-PA,F,GO:0005515,protein binding,529.038456,0.5,5.471582,GFUI035874
2,GFUI033625-PA,P,GO:0005980,glycogen catabolic process,307.758251,0.284597,13.42445,GFUI033625
3,GFUI033625-PA,F,GO:0004134,4-alpha-glucanotransferase activity,159.513252,0.177063,10.153643,GFUI033625
4,GFUI033625-PA,F,GO:0004135,"amylo-alpha-1,6-glucosidase activity",184.352303,0.177063,11.734746,GFUI033625


#### Function to create dictionary-based retrieval object for gene/SNP data:

In [9]:
def snp_vs_gff_to_DF(bedtools_out):
    headers = ["bed3_seq",
               "bed3_start",
               "bed3_end",
               "gff3_seq",
               "gff3_source",
               "gff3_type",
               "gff3_start",
               "gff3_end",
               "gff3_score",
               "gff3_strand",
               "gff3_phase",
               "gff3_attributes",]
    df = pd.read_csv(bedtools_out.fn, sep='\t', names=headers)
    
    gene_id = lambda x: parse_gff3_attributes(x)['ID']
    
    df['gff3_rec'] = df.gff3_attributes.apply(gene_id)
    
    return df

def genes_near_bed(query_bedtool, gene_bedtool, annotations, w=1000):
    df = snp_vs_gff_to_DF(query_bedtool.window(gene_bedtool, w=w))
    df = df.merge(right=annotations,
                  how='left',
                  on=None,
                  left_on='gff3_rec', 
                  right_on='gene_id')
    return df

In [10]:
# - generate set of non-redundant andrea snps from joined table as set of tuples
# - convert to bedtool object(s)
def reduce_joined_SNP_coords(snps_interest_df):
    df = snps_interest_df  # less typing

    if len(df) == 0:
        return None
    
    
    snp_set = set()
    snp_set.update(list(df.apply(lambda x: str((x.seq_x, x.start_x, x.end_x)),1)))
    snp_set.update(list(df.apply(lambda x: str((x.seq_y, x.start_y, x.end_y)),1)))

    snp_set.discard("(nan, nan, nan)")
    
    return pbt.BedTool('\n'.join((l.replace(".0",'').replace("'","").replace('(','').replace(')','').replace(',','\t') for l in snp_set)), from_string=True)


# - get windowed intersection intersection with gene models using pybedtools

def get_win_isec(a,b,win=1000):
    if a is None:
        return None
    return a.window(b, w=win)


def add_SNP_to_gene_distance(df):
    if df is not None:
        df['d_to_gene'] = df.apply(d_to_gene,1)


def d_to_gene(x):

    low,high = ((x.bed3_end - x.gff3_start),(x.bed3_end - x.gff3_end))

    if (low >= 0) and (high <= 0):
        return 0
    else:
        return min([abs(low),abs(high)])

    

# get annotations for genes:
def join_genes_with_annos(genes, annotations, how="inner"):
    if genes is None:
        return None
    return pd.merge(left=genes, right=annotations, 
                    how=how, 
                    on=None, 
                    left_on='gff3_rec', right_on='gene_id')

In [11]:
def make_table_selected_snps_vs_LD_filter(anno_dfs):
    
    new_headers = ["SNP group",
                   "Gene",
                   "Scaffold",
                   "SNP Location",
                   "Distance to Gene",
                   "Top C",
                   "Top P",
                   "Top F"]

    old_headers = ["snp_group",
                   "bed3_seq",
                   "gff3_rec",
                   "bed3_end",
                   "d_to_gene"]


    
    concat_df = concat_dfs(anno_dfs)


    # group concat_df
    groups = concat_df.groupby(by=old_headers)

    # Collect row data
    out_rows = []

    for name,group in groups.groups.items():
        out_rows.append(new_row_from_group(groups.get_group(name)))

    # create final table
#     out_table = pd.DataFrame(data=out_rows, index=None, columns=new_headers, dtype=None, copy=False)
#     gene_mask = out_table.Gene.notnull()
#     out_table = out_table[gene_mask]
    return pd.DataFrame(data=out_rows, index=None, columns=new_headers, dtype=None, copy=False)



def new_row_from_group(group):

    row  = get_old_header_values(group)
    row["Top C"] = get_top_3_terms(group=group,aspect="C")
    row["Top P"] = get_top_3_terms(group=group,aspect="P")
    row["Top F"] = get_top_3_terms(group=group,aspect="F")

    return pd.Series(row)

def get_old_header_values(group):
    vals = munch.Munch()

    vals["SNP group"] = group.snp_group.iloc[0]
    vals["Scaffold"] = group.bed3_seq.iloc[0]
    vals["Gene"] = group.gff3_rec.iloc[0]
    vals["SNP Location"] = group.bed3_end.iloc[0]
    vals["Distance to Gene"] = group.d_to_gene.iloc[0]

    return vals


def get_top_3_terms(group,aspect):
    df = group.query("Aspect == '{aspect}'".format(aspect=aspect))

    return ";".join(df.sort("Total Score").Name[:3].values)

def concat_dfs(dfs):

    concat_df = pd.DataFrame()

    # add snp-group name to rows and concat to new table
    for name, df in dfs.items():

        try:
            df["snp_group"] = name
        except TypeError:
            pass

        concat_df = pd.concat(objs=[concat_df,df], axis=0, join='outer', join_axes=None, ignore_index=True, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

    return concat_df

#  Genes located near by `query_bedtool`

In [12]:
NEARBY_GENES = {}
NEARBY_GENES['Top10_corrected_PopPairwiseOverlap_Infection2015'] = genes_near_bed(query_bedtool=Top10_corrected_PopPairwiseOverlap_Infection2015, gene_bedtool=genes, annotations=argot2_200)
NEARBY_GENES['Selected_PopPairwiseMSOT_Environm'] = genes_near_bed(query_bedtool=Selected_PopPairwiseMSOT_Environm, gene_bedtool=genes, annotations=argot2_200)
NEARBY_GENES['Selected_PopPairwiseMSNB_Environm'] = genes_near_bed(query_bedtool=Selected_PopPairwiseMSNB_Environm, gene_bedtool=genes, annotations=argot2_200)


In [13]:
NEARBY_GENES['Top10_corrected_PopPairwiseOverlap_Infection2015']

Unnamed: 0,bed3_seq,bed3_start,bed3_end,gff3_seq,gff3_source,gff3_type,gff3_start,gff3_end,gff3_score,gff3_strand,gff3_phase,gff3_attributes,gff3_rec,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content,gene_id
0,Scaffold2,2593208,2593209,Scaffold2,VectorBase,gene,2593175,2594101,.,-,.,ID=GFUI023810;biotype=protein_coding,GFUI023810,,,,,,,,
1,Scaffold355,306767,306768,Scaffold355,VectorBase,gene,287768,311543,.,-,.,ID=GFUI027633;biotype=protein_coding,GFUI027633,,,,,,,,
2,Scaffold163,513433,513434,Scaffold163,VectorBase,gene,513883,527197,.,-,.,ID=GFUI010652;biotype=protein_coding,GFUI010652,GFUI010652-PA,P,GO:0006200,ATP catabolic process,474.917555,0.939484,8.421864,GFUI010652
3,Scaffold163,513433,513434,Scaffold163,VectorBase,gene,513883,527197,.,-,.,ID=GFUI010652;biotype=protein_coding,GFUI010652,GFUI010652-PA,C,GO:0019028,viral capsid,1239.312392,1.0,6.609099,GFUI010652
4,Scaffold873,18833,18834,Scaffold873,VectorBase,gene,11605,18148,.,+,.,ID=GFUI050068;biotype=protein_coding,GFUI050068,,,,,,,,
5,Scaffold3,958026,958027,Scaffold3,VectorBase,gene,950967,964016,.,+,.,ID=GFUI030595;biotype=protein_coding,GFUI030595,,,,,,,,
6,Scaffold3,958026,958027,Scaffold3,VectorBase,gene,957656,965402,.,-,.,ID=GFUI030591;biotype=protein_coding,GFUI030591,,,,,,,,
7,Scaffold44,665069,665070,Scaffold44,VectorBase,gene,661632,664661,.,-,.,ID=GFUI033643;biotype=protein_coding,GFUI033643,GFUI033643-PA,C,GO:0005737,cytoplasm,243.857525,0.335343,3.178476,GFUI033643
8,Scaffold44,665069,665070,Scaffold44,VectorBase,gene,661632,664661,.,-,.,ID=GFUI033643;biotype=protein_coding,GFUI033643,GFUI033643-PA,C,GO:0005737,cytoplasm,243.857525,0.335343,3.178476,GFUI033643
9,Scaffold443,92558,92559,Scaffold443,VectorBase,gene,79974,99681,.,-,.,ID=GFUI033311;biotype=protein_coding,GFUI033311,GFUI033311-PA,F,GO:0003676,nucleic acid binding,335.079554,0.34425,3.285748,GFUI033311


In [14]:
for name, value in NEARBY_GENES.items():
        if value is None:
            pass
        elif (len(value) > 0):
            add_SNP_to_gene_distance(value)
        else:
            NEARBY_GENES[name] = None

In [15]:
table1 = make_table_selected_snps_vs_LD_filter(NEARBY_GENES)

In [16]:
table1

Unnamed: 0,SNP group,Gene,Scaffold,SNP Location,Distance to Gene,Top C,Top P,Top F
0,Top10_corrected_PopPairwiseOverlap_Infection2015,GFUI030595,Scaffold3,958027,0,,,
1,Selected_PopPairwiseMSOT_Environm,GFUI034819,Scaffold473,223159,0,,,
2,Selected_PopPairwiseMSOT_Environm,GFUI010534,Scaffold162,576272,0,membrane;cytoplasmic vesicle membrane;cytoplas...,transmembrane transport,
3,Selected_PopPairwiseMSOT_Environm,GFUI033307,Scaffold443,242064,0,,,
4,Selected_PopPairwiseMSOT_Environm,GFUI004250,Scaffold113,393483,0,,,
5,Selected_PopPairwiseMSNB_Environm,GFUI035185,Scaffold47,462630,0,,,
6,Top10_corrected_PopPairwiseOverlap_Infection2015,GFUI030591,Scaffold3,958027,0,,,
7,Selected_PopPairwiseMSOT_Environm,GFUI005292,Scaffold11,1908584,0,,,
8,Selected_PopPairwiseMSOT_Environm,GFUI046107,Scaffold730,54980,0,,,
9,Selected_PopPairwiseMSOT_Environm,GFUI050285,Scaffold87,761673,0,,removal of superoxide radicals;superoxide meta...,metal ion binding;oxidoreductase activity;supe...


In [17]:
# convert LD filtered SNP-pairs into table of single seq:location data

ld_snp_pairs = ld.query("one_minus_cdf_BH <= 0.01")
ld_snps_a = ld_snp_pairs[["CHR_A","BP_A"]].rename(columns={"CHR_A":"Scaffold","BP_A":"SNP Location"})
ld_snps_b = ld_snp_pairs[["CHR_A","BP_B"]].rename(columns={"CHR_A":"Scaffold","BP_B":"SNP Location"})
ld_snps = pd.concat([ld_snps_a,ld_snps_b]).sort()
ld_snps["LD Filtered"] = 'True'
ld_snps.head()

Unnamed: 0,Scaffold,SNP Location,LD Filtered
11,Scaffold0,13388,True
11,Scaffold0,86267,True
21,Scaffold0,183680,True
21,Scaffold0,13388,True
84,Scaffold0,13388,True


In [18]:
table1 = table1.merge(ld_snps, 
             how='left', 
             on=["Scaffold","SNP Location"], left_on=None, right_on=None,  
             sort=False, suffixes=('_x', '_y'), copy=True).drop_duplicates()

In [19]:
table1.to_excel('/home/gus/src/repos/git/markdown-docs/manuscripts/Gloria-Soria/2015/ddRAD58/tables/report_to_andrea_to_authors.xls',index=False)