In [1]:
%matplotlib inline

In [2]:
from IPython.display import HTML

In [3]:
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

import os

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 300)
pd.set_option("max_colwidth",1000)


from collections import defaultdict

import munch

from IPython.display import display, HTML

from spartan.utils.annotations.ensembl.gff3 import parse_gff3_attributes
from spartan.utils.files import tableFile2namedTuple

# Files

In [4]:
# Directory of the gmm_to_gff pipeline; the results CSV and the BED file both live under it.
_PIPELINE_DIR = "/home/gus/MEGAsync/zim/main/Yale/Collaborations/Hongyu-tsetse/gmm_to_gff_pipeline"

# CSV that is loaded into `data` with pd.read_csv.
results_ = _PIPELINE_DIR + "/pipeline_runs/gmm_to_gff_testing_filterPSL_100bp/snps_near_homologous_de/snps_near_homologous_de_distance_0.csv"
# Tab-separated BED file that add_url_col reads to obtain per-gene coordinates.
bed_ = _PIPELINE_DIR + "/pipeline_prep/produced/Glossina-fuscipes-IAEA_BASEFEATURES_GfusI1.3.sorted.bed"
# HDF5 file read with pd.read_hdf(key='dataframe') into `argot2`.
fanno = "/run/media/gus/Storage/louise/data/genomes/glossina_fuscipes/annotations/functional/GfusI1.1_pre/argot2_out/argot_functional_annotations_ts150.h5"

# Link Templates

In [5]:
# Genome-browser URL template. The placeholders are written as [chrom], [left] and
# [right]; add_url_col rewrites them to {chrom}/{left}/{right} before calling str.format.
url = "https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=[chrom]:[left]-[right]"

# Vectorbase Link Templates
# Each template is an HTML anchor filled with %-formatting from a dict that
# supplies `gene_name` (see vb_link).
gene_page          = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Gene/Summary?db=core;g=%(gene_name)s"> %(gene_name)s: gene home page</a>'''
protein_summary    = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Transcript/ProteinSummary?db=core;g=%(gene_name)s"> %(gene_name)s: protein summary </a>'''
# NOTE(review): the query string below ends with a bare `;r` (no value) -- this looks
# like a truncated region parameter; confirm against the VectorBase URL scheme.
gene_region_detail = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Location/View?db=core;g=%(gene_name)s;r"> %(gene_name)s: genome browser </a>'''
# Gene Ontology views: biological process (bp), cellular component (cp), molecular function (mf).
gene_ontology_bp   = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Transcript/Ontology/biological_process?db=core;g=%(gene_name)s;oid=biological_process"> %(gene_name)s: GO: biological process </a>'''
gene_ontology_cp   = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Transcript/Ontology/cellular_component?db=core;g=%(gene_name)s;oid=cellular_component"> %(gene_name)s: GO: cellular component </a>'''
gene_ontology_mf   = '''<a href="https://www.vectorbase.org/Glossina_fuscipes/Transcript/Ontology/molecular_function?db=core;g=%(gene_name)s;oid=molecular_function"> %(gene_name)s: GO: molecular function </a>'''

# Functions

In [6]:
def create_multiindex(df, levels, data_cols=None):
    """Return the data columns of ``df`` indexed by a MultiIndex built from ``levels``.

    Args:
        df: source DataFrame; it is not modified.
        levels: list of column names of ``df`` whose values become the index
            levels, in the given order.
        data_cols: optional list of column names to keep as data columns.
            When None, every column of ``df`` that is not named in ``levels``
            is kept.

    Returns:
        A new DataFrame holding the selected data columns, whose index is a
        MultiIndex with one level per entry of ``levels`` (level names are the
        column names).

    Raises:
        AssertionError: if ``data_cols`` is given but is not a list.
        KeyError: if a name in ``levels`` or ``data_cols`` is not a column of ``df``.
    """
    # select data columns
    if data_cols is None:
        # Derive the default from the frame that was passed in, not from the
        # notebook-level `data` variable, so the function works on any frame.
        data_cols = [c for c in df.columns if c not in levels]
    else:
        assert isinstance(data_cols, list)

    # Work on a copy so that replacing the index never touches the caller's frame.
    mi = df[data_cols].copy()

    # Build the MultiIndex explicitly from the level columns.
    mi.index = pd.MultiIndex.from_arrays([df[level] for level in levels],
                                         names=list(levels))

    return mi




# Column names, in file order, handed to pd.read_csv (names=...) when
# add_url_col loads the header-less, tab-separated BED file.
bed_headers = [
    "chrom", "chromStart", "chromEnd",
    "name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb",
    "blockCount", "blockSizes", "blockStarts",
]


def add_url_col(results, url, bed, bed_headers):
    """Add a genome-browser ``url`` column to the results DataFrame.

    For every row of ``results`` whose ``gene_id_internal`` is found in the BED
    file, a URL is built that spans from the smallest to the largest of the
    gene coordinates (``chromStart``/``chromEnd``) and the SNP coordinates
    (``SNP_start``/``SNP_end``).

    Args:
        results: DataFrame with at least the columns ``gene_id_internal``,
            ``SNP_start`` and ``SNP_end``.
        url: URL template using ``[chrom]``, ``[left]`` and ``[right]`` as
            placeholders.
        bed: path or file-like object of a header-less, tab-separated BED
            file (anything ``pd.read_csv`` accepts).
        bed_headers: column names to assign to the BED columns; must include
            ``chrom``, ``chromStart``, ``chromEnd`` and ``name``.

    Returns:
        A de-duplicated copy of ``results`` with an extra ``url`` column. Rows
        whose gene does not occur in the BED file are dropped (inner join).
        URLs are joined back on ``gene_id_internal`` only, so a gene with
        several distinct URLs yields one output row per URL.
    """
    print("begun adding urls.")
    # correct the templating of the url required by snakemake interpreting it as wildcards
    url = url.replace("[chrom]", "{chrom}").replace("[left]", "{left}").replace("[right]", "{right}")

    gene_coords = pd.read_csv(bed, sep='\t', names=bed_headers)[["chrom", "chromStart", "chromEnd", "name"]]
    gene_coords = gene_coords.rename(columns={"name": "gene_id_internal"})

    combo = pd.merge(left=results, right=gene_coords, how='inner', on="gene_id_internal")

    # Column-wise min/max instead of a Python-level apply per row: faster and
    # well-defined when the merge result is empty.
    coord_cols = ['chromStart', 'chromEnd', 'SNP_start', 'SNP_end']
    combo['left'] = combo[coord_cols].min(axis=1)
    combo['right'] = combo[coord_cols].max(axis=1)
    combo['url'] = [url.format(chrom=chrom, left=lo, right=hi)
                    for chrom, lo, hi in zip(combo['chrom'], combo['left'], combo['right'])]

    # Collapse to distinct (gene, url) pairs before joining back; otherwise the
    # join multiplies every results row by the number of rows of its gene.
    gene_urls = combo[["gene_id_internal", "url"]].drop_duplicates()

    results = pd.merge(left=results, right=gene_urls, how='inner', on="gene_id_internal")

    return results.drop_duplicates()

In [7]:
class ListTable(list):
    """List subclass that renders itself as an HTML table in IPython Notebook.

    The content is expected to be a 2-dimensional list of the form
    [[1,2,3],[4,5,6]]; the first inner list is rendered as the header row.
    """

    def _repr_html_(self):
        """Return the table markup: ``<th>`` cells for row 0, ``<td>`` cells otherwise."""
        pieces = ["<table>"]
        for row_number, cells in enumerate(self):
            # Only the very first row is formatted as a header.
            cell_template = "<th>{0}</th>" if row_number == 0 else "<td>{0}</td>"
            pieces.append("<tr>")
            pieces.extend(cell_template.format(cell) for cell in cells)
            pieces.append("</tr>")
        pieces.append("</table>")
        return ''.join(pieces)

def present_gene(gene_name, snp_coords, argot_df, tscore=200):
    """Display a summary block for one gene in the notebook.

    Shows, in order: a heading with the gene name and its feature set name(s),
    the list of SNP strings recorded for the gene, the functional-annotation
    table returned by ``gene_table``, and the VectorBase links for the gene.

    Args:
        gene_name: gene identifier; matched against
            ``snp_coords.gene_id_internal`` and passed to ``gene_table`` and
            ``vb_link``.
        snp_coords: DataFrame with the columns ``gene_id_internal``,
            ``feature_set_name`` and ``snp_str``.
        argot_df: annotation DataFrame handed to ``gene_table``.
        tscore: minimum "Total Score" handed to ``gene_table``; defaults to
            200, the value that used to be hard-coded.

    Returns:
        None; everything is emitted through ``display``.
    """
    # Select the rows of *this* gene from the argument, not from whatever the
    # notebook-level loop variable `gene` currently holds.
    gene_rows = snp_coords[snp_coords.gene_id_internal == gene_name]

    fset = ','.join(list(gene_rows.feature_set_name.values))
    display(HTML("<h2>{gene}({fset})</h2>".format(fset=fset, gene=gene_name)))

    nearby_snps = list(gene_rows.snp_str.values)
    display(HTML("<b>SNPs:</b><br>%s<br>" % ("<br>".join(nearby_snps))))

    display(gene_table(gene_name=gene_name,
                       argot_df=argot_df,
                       tscore=tscore))

    display(HTML("<p>"))
    # One link per VectorBase view, in the same order as before.
    for link_template in (gene_page,
                          protein_summary,
                          gene_region_detail,
                          gene_ontology_bp,
                          gene_ontology_cp,
                          gene_ontology_mf):
        display(vb_link(gene_name=gene_name, link_template=link_template))
    display(HTML("<hr>"))

In [8]:
def filter_by_gene(gene, argot_df):
    """Return the rows of ``argot_df`` whose ``Sequence`` value starts with ``gene``."""
    has_gene_prefix = argot_df["Sequence"].str.startswith(gene)
    return argot_df[has_gene_prefix]

def sort_by_TS(argot_df):
    """Return ``argot_df`` sorted by its 'Total Score' column, highest score first."""
    ranked = argot_df.sort_values(by='Total Score', ascending=False)
    return ranked

def filter_by_TS(tscore, argot_df):
    """Return the rows of ``argot_df`` whose 'Total Score' is at least ``tscore``."""
    meets_threshold = argot_df["Total Score"] >= tscore
    return argot_df[meets_threshold]

def gene_table(gene_name, argot_df, tscore):
    """Return the annotation rows for ``gene_name`` scoring at least ``tscore``, best score first.

    Applies, in order, ``filter_by_gene``, ``filter_by_TS`` and ``sort_by_TS``.
    """
    gene_rows = filter_by_gene(gene_name, argot_df)
    scored_rows = filter_by_TS(tscore, gene_rows)
    return sort_by_TS(scored_rows)

def vb_link(gene_name, link_template):
    """Fill ``link_template`` (a %-style template using ``gene_name``) and wrap it in ``HTML``."""
    substitutions = {"gene_name": gene_name}
    return HTML(link_template % substitutions)

def format_snps(snp_df):
    """Add a ``snp_str`` column of the form "<SNP_chrom>:<SNP_end>" to ``snp_df``.

    The frame is modified in place; nothing is returned.
    """
    def _describe(row):
        # Label for a single SNP row.
        return "{chrom}:{end}".format(chrom=row.SNP_chrom, end=row.SNP_end)

    snp_df['snp_str'] = snp_df.apply(_describe, axis=1)

# Load inputs

In [9]:
# Load the results table from the CSV file at `results_`.
data = pd.read_csv(results_)
# data.head()

In [10]:
# Label every row with its SNP position as "<SNP_chrom>:<SNP_end>".
# Built column-wise rather than with a per-row apply: faster, and it still
# works when `data` has no rows.
data['SNP_id'] = data['SNP_chrom'].astype(str) + ':' + data['SNP_end'].astype(str)

In [11]:
# data.head()

In [12]:
# Attach the genome-browser `url` column. The "Breakdowns" cell selects `url`,
# so this step has to run for the notebook to execute top to bottom on a fresh kernel.
data = add_url_col(data, url, bed_, bed_headers)

In [13]:
# data.head()

# Breakdowns

In [26]:
# data.head(60)

In [24]:
# Columns that identify a row; they become the MultiIndex levels, in this order.
levels = ["SNP_id", "feature_set_name", "gene_id_internal", "tcons_id", "de_id"]
# Every other column except SNP_end (the call below passes an explicit list instead).
data_cols = [col for col in data.columns.values
             if col not in levels and col != "SNP_end"]

# Restrict to the rows whose `program` column is 'edger' and show them indexed by `levels`.
edger_rows = data.query(""" program == 'edger' """)
create_multiindex(edger_rows, levels, data_cols=["fdr", "lg2_fc", "url"]).drop_duplicates()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,fdr,lg2_fc,url
SNP_id,feature_set_name,gene_id_internal,tcons_id,de_id,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Scaffold67:72303,novel_mapped_tx,GFUI043918,TCONS_00039298,EDGR|SG_0000245,1.029145e-10,1.475683,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold67:45542-72303
Scaffold67:72303,novel_mapped_tx,GFUI043918,TCONS_00039298,EDGR|PV_PLUS_MINUS_0000968,0.005578161,0.791775,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold67:45542-72303
Scaffold5:2342538,official_annotations,GFUI041241,TCONS_00048887,EDGR|SG_0001206,0.008699393,2.043383,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold5:2342491-2348210
Scaffold151:406664,novel_mapped_tx,GFUI009421,TCONS_00067903,EDGR|MIDGUT_0001209,2.463629e-07,2.201035,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold151:397135-406664
Scaffold27:959995,official_annotations,GFUI021833,TCONS_00080297,EDGR|MIDGUT_0001618,4.164748e-06,2.735993,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold27:930934-962675
Scaffold22:1291805,novel_mapped_tx,GFUI017734,TCONS_00034486,EDGR|PV_PLUS_MINUS_0000719,0.0008631194,0.632797,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold22:1291804-1306127
Scaffold22:1291805,novel_mapped_tx,GFUI017734,TCONS_00034486,EDGR|PV_PLUS_PLUS_0000729,0.001102155,0.618966,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold22:1291804-1306127
Scaffold9:1360708,official_annotations,GFUI053401,TCONS_00024643,EDGR|PB_0000256,0.001212582,1.308488,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold9:1358276-1393589
Scaffold9:1360708,official_annotations,GFUI053401,TCONS_00024643,EDGR|MIDGUT_0004905,0.03235413,2.119142,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold9:1358276-1393589
Scaffold2:2206337,official_annotations,GFUI023751,TCONS_00041418,EDGR|PB_0000641,0.01655064,-0.740946,https://www.vectorbase.org/Glossina_fuscipes/Location/View?r=Scaffold2:2203135-2223436


# Functional inference

In [16]:
# Load the annotation table stored under the key 'dataframe' in the HDF5 file at `fanno`.
argot2 = pd.read_hdf(path_or_buf=fanno, key='dataframe')

In [17]:
# Preview the first rows of the annotation table.
argot2.head()

Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
0,GFUI034947-PA,P,GO:0006508,proteolysis,270.313447,0.496543,8.247696
1,GFUI035874-PA,F,GO:0005515,protein binding,529.038456,0.5,5.471582
2,GFUI033625-PA,P,GO:0005980,glycogen catabolic process,307.758251,0.284597,13.42445
3,GFUI033625-PA,F,GO:0004134,4-alpha-glucanotransferase activity,159.513252,0.177063,10.153643
4,GFUI033625-PA,F,GO:0004135,"amylo-alpha-1,6-glucosidase activity",184.352303,0.177063,11.734746


In [18]:
# One row per distinct combination of SNP position, gene and feature set.
snp_coord_cols = ["SNP_chrom", "SNP_start", "SNP_end", "gene_id_internal", "feature_set_name"]
snp_coords = data.loc[:, snp_coord_cols].drop_duplicates()

In [19]:
# snp_coords

In [20]:
# Adds the `snp_str` column to snp_coords in place (format_snps returns nothing).
format_snps(snp_coords)
# snp_coords

In [21]:
# Show the SNP table, now including the `snp_str` column.
snp_coords

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,gene_id_internal,feature_set_name,snp_str
0,Scaffold67,72302,72303,GFUI043918,novel_mapped_tx,Scaffold67:72303
20,Scaffold5,2342537,2342538,GFUI041241,official_annotations,Scaffold5:2342538
22,Scaffold151,406663,406664,GFUI009421,novel_mapped_tx,Scaffold151:406664
42,Scaffold102,398873,398874,GFUI002252,official_annotations,Scaffold102:398874
43,Scaffold27,959994,959995,GFUI021833,official_annotations,Scaffold27:959995
47,Scaffold22,1291804,1291805,GFUI017734,novel_mapped_tx,Scaffold22:1291805
127,Scaffold3,2235177,2235178,GFUI030827,official_annotations,Scaffold3:2235178
128,Scaffold9,1360707,1360708,GFUI053401,official_annotations,Scaffold9:1360708
136,Scaffold2,2206336,2206337,GFUI023751,official_annotations,Scaffold2:2206337
158,Scaffold64,729351,729352,GFUI043027,official_annotations,Scaffold64:729352


In [22]:
# Present every gene exactly once, in sorted order. snp_coords can hold several
# rows for the same gene (one per SNP / feature set), so iterate the unique ids.
for gene in sorted(snp_coords.gene_id_internal.unique()):

    present_gene(gene_name=gene, snp_coords=snp_coords, argot_df=argot2)

Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
33277,GFUI007441-PA,C,GO:0005634,nucleus,5127.878792,0.434757,5.725783
33276,GFUI007441-PA,C,GO:0005737,cytoplasm,2514.451906,0.394663,3.178476
33271,GFUI007441-PA,F,GO:0003700,sequence-specific DNA binding transcription factor activity,1227.26982,0.290086,5.025891
33274,GFUI007441-PA,F,GO:0003677,DNA binding,915.199249,0.337374,3.83101
33275,GFUI007441-PA,F,GO:0043565,sequence-specific DNA binding,313.076408,0.147086,5.933594
33268,GFUI007441-PA,P,GO:0045944,positive regulation of transcription from RNA polymerase II promoter,257.799183,0.040785,12.597549
33267,GFUI007441-PA,P,GO:0006355,"regulation of transcription, DNA-templated",253.240196,0.189078,7.524266
33273,GFUI007441-PA,F,GO:0005515,protein binding,205.901924,0.300636,5.471582


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
39020,GFUI009421-PA,F,GO:0003682,chromatin binding,4041.622846,0.341053,8.759523
39021,GFUI009421-PA,C,GO:0005634,nucleus,1268.84323,0.465245,5.725783
39022,GFUI009421-PA,C,GO:0000785,chromatin,380.701117,0.105615,8.8033
39023,GFUI009421-PA,C,GO:0005694,chromosome,332.36308,0.248406,6.511289


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
42672,GFUI017734-PA,F,GO:0005215,transporter activity,941.190325,0.781837,2.590009
42675,GFUI017734-PA,F,GO:0005384,manganese ion transmembrane transporter activity,479.64632,0.076954,10.970281
42682,GFUI017734-PA,C,GO:0016021,integral component of membrane,424.62259,0.140479,3.692502
42680,GFUI017734-PA,C,GO:0005886,plasma membrane,334.68409,0.140552,4.357169
42681,GFUI017734-PA,C,GO:0016020,membrane,332.160061,0.563269,2.472824
42678,GFUI017734-PA,C,GO:0005770,late endosome,287.868062,0.070704,10.996288
42677,GFUI017734-PA,C,GO:0005737,cytoplasm,252.497154,0.469864,3.178476
42676,GFUI017734-PA,C,GO:0005764,lysosome,213.055689,0.065152,10.254827
42673,GFUI017734-PA,F,GO:0005375,copper ion transmembrane transporter activity,212.469471,0.052477,10.450138


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
40512,GFUI020790-PA,F,GO:0000062,fatty-acyl-CoA binding,9944.925842,0.334478,10.777529
40511,GFUI020790-PA,P,GO:0006810,transport,276.68226,0.264352,5.069855


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
59774,GFUI021833-PA,P,GO:0042478,regulation of eye photoreceptor cell development,1828.310576,0.104519,19.021834
59776,GFUI021833-PA,C,GO:0016021,integral component of membrane,1602.750453,0.328501,3.692502
59775,GFUI021833-PA,C,GO:0016020,membrane,1107.22045,0.666834,2.472824
59773,GFUI021833-PA,P,GO:0030154,cell differentiation,860.701718,0.233555,8.342626
59772,GFUI021833-PA,P,GO:0007275,multicellular organismal development,739.477899,0.252392,7.690822


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
62673,GFUI023751-PA,F,GO:0008017,microtubule binding,1647.254698,0.134332,9.436151
62680,GFUI023751-PA,C,GO:0005737,cytoplasm,858.000384,0.255226,3.178476
62672,GFUI023751-PA,F,GO:0005515,protein binding,550.247579,0.338247,5.471582
62674,GFUI023751-PA,C,GO:0043234,protein complex,537.480631,0.28168,4.565728
62675,GFUI023751-PA,C,GO:0000922,spindle pole,466.756769,0.104516,10.799745
62671,GFUI023751-PA,P,GO:0007067,mitotic nuclear division,265.490248,0.087227,11.153626
62669,GFUI023751-PA,P,GO:0007051,spindle organization,239.614465,0.066361,13.91492
62676,GFUI023751-PA,C,GO:0005813,centrosome,225.673324,0.066038,10.018345


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
31955,GFUI030827-PA,C,GO:0005634,nucleus,233.933929,0.319027,5.725783


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
19117,GFUI043027-PA,P,GO:0006260,DNA replication,5253.270634,0.492974,8.821469
19120,GFUI043027-PA,F,GO:0003887,DNA-directed DNA polymerase activity,4539.992874,0.322161,7.169783
19118,GFUI043027-PA,F,GO:0003677,DNA binding,2533.271379,0.250152,3.83101
19123,GFUI043027-PA,C,GO:0008622,epsilon DNA polymerase complex,2479.35269,0.332648,12.834245
19116,GFUI043027-PA,P,GO:0006261,DNA-dependent DNA replication,2445.618757,0.259546,10.374902
19122,GFUI043027-PA,C,GO:0005634,nucleus,1685.464129,0.804198,5.725783
19124,GFUI043027-PA,C,GO:0043231,intracellular membrane-bounded organelle,792.375979,0.946265,4.246234
19119,GFUI043027-PA,F,GO:0016740,transferase activity,569.214978,0.680699,2.4874
19121,GFUI043027-PA,F,GO:0016779,nucleotidyltransferase activity,409.609294,0.490394,5.055132


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
51722,GFUI043720-PA,P,GO:0006470,protein dephosphorylation,2077.021281,0.236174,11.013535
51725,GFUI043720-PA,P,GO:0035335,peptidyl-tyrosine dephosphorylation,1854.192243,0.104459,11.690944
51724,GFUI043720-PA,P,GO:0016311,dephosphorylation,1327.332234,0.334633,9.783756
51729,GFUI043720-PA,F,GO:0004725,protein tyrosine phosphatase activity,1106.817502,0.176805,8.510896
51726,GFUI043720-PA,F,GO:0016787,hydrolase activity,1088.596949,0.857645,2.643033
51730,GFUI043720-PA,F,GO:0008138,protein tyrosine/serine/threonine phosphatase activity,904.445986,0.147903,10.023433
51728,GFUI043720-PA,F,GO:0004721,phosphoprotein phosphatase activity,837.540183,0.541206,7.88706
51727,GFUI043720-PA,F,GO:0016791,phosphatase activity,663.461735,0.703193,6.651616
51733,GFUI043720-PA,C,GO:0005634,nucleus,555.072824,0.429894,5.725783
51732,GFUI043720-PA,C,GO:0005737,cytoplasm,497.84608,0.152037,3.178476


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
52251,GFUI043918-PA,C,GO:0016021,integral component of membrane,694.22171,0.165345,3.692502
52248,GFUI043918-PA,P,GO:0055085,transmembrane transport,368.487641,0.137289,6.541786
52247,GFUI043918-PA,P,GO:0006810,transport,314.148365,0.48244,5.069855
52250,GFUI043918-PA,C,GO:0016020,membrane,255.65687,0.494696,2.472824
52249,GFUI043918-PA,F,GO:0022857,transmembrane transporter activity,214.245007,0.40039,3.208825


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content


Unnamed: 0,Sequence,Aspect,GO ID,Name,Total Score,Internal Confidence,Information Content
