# Prep

## Imports

In [1]:
# Load the "autoreload" extension
%load_ext autoreload

# always reload modules marked with "%aimport"
%autoreload 1

In [2]:
import sys
sys.path.append('../python/')

from pathlib import Path

In [3]:
import pandas as pd
from munch import Munch

import datetime as dt

from tabulate import tabulate

from jinja2 import Template

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [4]:
%aimport linked_vs_orig
import linked_vs_orig as lvo

In [5]:
%aimport reporting
import reporting as rep

## Paths

In [6]:
# Both inputs sit in the same results directory and differ only in the
# ".linked" / ".original" part of the file name.
snps_near_de_stem_ = (
    '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local'
    '/snps_near_homologous_de/snps_near_homologous_de_distance_200'
)
linked_ = snps_near_de_stem_ + '.linked.csv'
orig_ = snps_near_de_stem_ + '.original.csv'

## Constants

### Genome region images

In [7]:
url_tmpl = 'https://www.vectorbase.org/Glossina_fuscipes/ImageExport/ImageFormats?component=ViewBottom;data_type=Location;db=core;r={chr}:{start}-{end}'

### Writing reports

In [8]:
report_tmpl = "../reports/templates/gene_set_comparison.md"

## Functions

### Genome region images

In [9]:
# See pipeline source

### Writing reports

In [10]:
def df_to_md_table(df):
    """Render ``df`` as a pipe-format (Markdown) table using ``tabulate``.

    The DataFrame's column labels are passed to ``tabulate`` as the headers.
    """
    column_names = list(df.columns.values)
    return tabulate(df, headers=column_names, tablefmt='pipe')

def load_template(source):
    """Build a jinja2 ``Template`` from a file path or from literal template text.

    ``source`` is first treated as a path and read from disk.  If that read
    raises ``OSError`` (e.g. the file does not exist), ``source`` itself is
    used as the template text instead.

    Parameters
    ----------
    source : str
        Path to a template file, or the template text itself.

    Returns
    -------
    jinja2.Template
    """
    try:
        template_text = Path(source).read_text()
    except OSError:
        # Best-effort fallback: anything that cannot be read as a file is
        # taken to be the template body.  Note that a mistyped path therefore
        # silently becomes a template consisting of the path string.
        template_text = source

    # Built outside the ``try`` so only the file read is guarded.
    return Template(source=template_text)

## Loading

In [11]:
# Read both CSVs with the same project loader (linked first, then original).
linked, orig = (lvo.load_snps_near_de(csv_path) for csv_path in (linked_, orig_))

In [12]:
linked.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,proximal_id,de_id,xloc_id,tcons_id,gene_id_external,gene_id_internal,lg2_fc,p,fdr,comparison,program,url
0,Scaffold66,864875,864876,official_annotations,GFUI043720,EDGR|MIDGUT_0001058,XLOC_027003,TCONS_00054385,GMOY007600,GFUI043720,1.93513,7.147837e-09,7.425508e-08,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
1,Scaffold66,864876,864877,official_annotations,GFUI043720,EDGR|MIDGUT_0001058,XLOC_027003,TCONS_00054385,GMOY007600,GFUI043720,1.93513,7.147837e-09,7.425508e-08,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
2,Scaffold66,864880,864881,official_annotations,GFUI043720,EDGR|MIDGUT_0001058,XLOC_027003,TCONS_00054385,GMOY007600,GFUI043720,1.93513,7.147837e-09,7.425508e-08,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
3,Scaffold66,866220,866221,official_annotations,GFUI043720,EDGR|MIDGUT_0001058,XLOC_027003,TCONS_00054385,GMOY007600,GFUI043720,1.93513,7.147837e-09,7.425508e-08,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
4,Scaffold66,864875,864876,official_annotations,GFUI043720,EDGR|MIDGUT_0001235,XLOC_026998,TCONS_00054371,GMOY007600,GFUI043720,1.64763,3.283071e-08,2.9218e-07,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...


In [13]:
orig.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,proximal_id,de_id,xloc_id,tcons_id,gene_id_external,gene_id_internal,lg2_fc,p,fdr,comparison,program,url
0,Scaffold27,960004,960005,official_annotations,GFUI021833,EDGR|MIDGUT_0001618,XLOC_038726,TCONS_00080297,GMOY011124,GFUI021833,2.735993,6.130982e-07,4e-06,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
1,Scaffold27,960004,960005,official_annotations,GFUI021833,EDGR|MIDGUT_0001618,XLOC_038726,TCONS_00080298,GMOY011124,GFUI021833,2.735993,6.130982e-07,4e-06,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
2,Scaffold27,960004,960005,official_annotations,GFUI021833,EDGR|MIDGUT_0001618,XLOC_038726,TCONS_00080299,GMOY011124,GFUI021833,2.735993,6.130982e-07,4e-06,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
3,Scaffold27,960004,960005,official_annotations,GFUI021833,EDGR|MIDGUT_0001618,XLOC_038726,TCONS_00080300,GMOY011124,GFUI021833,2.735993,6.130982e-07,4e-06,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
4,Scaffold27,960391,960392,official_annotations,GFUI021833,EDGR|MIDGUT_0001618,XLOC_038726,TCONS_00080297,GMOY011124,GFUI021833,2.735993,6.130982e-07,4e-06,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...


## Cleaning

# Comparisons

In [14]:
# Split each table into the name sets that are compared below
# (`.official_genes` and `.novel_tx` are read from the results later on).
split_linked, split_orig = map(lvo.split_official_novel, (linked, orig))

## Official Genes

In [15]:
# Compare the official gene IDs of the two runs; the resulting `.counts`
# and `.genes` tables are displayed in the next cells.
official = lvo.set_comparison_table(
    a=split_linked.official_genes, a_name="linked",
    b=split_orig.official_genes, b_name="original",
)

In [16]:
official.counts

Unnamed: 0,count
linked,7
original,12
linked AND original,6
linked NOT original,1
original NOT linked,6
linked OR original,13
one OR other NOT both,7


In [17]:
official.genes

Unnamed: 0,linked,original,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,GFUI002252,GFUI002252,GFUI002252,GFUI043025,GFUI007441,GFUI002252,GFUI007441
1,GFUI021833,GFUI007441,GFUI021833,,GFUI020790,GFUI007441,GFUI020790
2,GFUI030827,GFUI020790,GFUI030827,,GFUI020794,GFUI020790,GFUI020794
3,GFUI041241,GFUI020794,GFUI041241,,GFUI023751,GFUI020794,GFUI023751
4,GFUI043025,GFUI021833,GFUI043720,,GFUI043027,GFUI021833,GFUI043025
5,GFUI043720,GFUI023751,GFUI051070,,GFUI053401,GFUI023751,GFUI043027
6,GFUI051070,GFUI030827,,,,GFUI030827,GFUI053401
7,,GFUI041241,,,,GFUI041241,
8,,GFUI043027,,,,GFUI043025,
9,,GFUI043720,,,,GFUI043027,


In [18]:
official.genes['linked OR original']

0     GFUI002252
1     GFUI007441
2     GFUI020790
3     GFUI020794
4     GFUI021833
5     GFUI023751
6     GFUI030827
7     GFUI041241
8     GFUI043025
9     GFUI043027
10    GFUI043720
11    GFUI051070
12    GFUI053401
Name: linked OR original, dtype: object

## Novel Tx

In [19]:
split_orig.novel_tx

{'TCONS_00037543',
 'TCONS_00037545',
 'TCONS_00039298',
 'TCONS_00039299',
 'TCONS_00039300',
 'TCONS_00039301',
 'TCONS_00067396',
 'TCONS_00067903',
 'TCONS_00067904',
 'TCONS_00067905',
 'TCONS_00067906',
 'TCONS_00067907',
 'TCONS_00067908',
 'TCONS_00067909',
 'TCONS_00067910',
 'TCONS_00067911',
 'TCONS_00067912'}

In [20]:
split_linked.novel_tx

{'TCONS_00034477',
 'TCONS_00034478',
 'TCONS_00034479',
 'TCONS_00034480',
 'TCONS_00034481',
 'TCONS_00034482',
 'TCONS_00034483',
 'TCONS_00034484',
 'TCONS_00034485',
 'TCONS_00034486',
 'TCONS_00034487',
 'TCONS_00034488',
 'TCONS_00034489',
 'TCONS_00034490',
 'TCONS_00034491',
 'TCONS_00034494',
 'TCONS_00037543',
 'TCONS_00037545',
 'TCONS_00039298',
 'TCONS_00039299',
 'TCONS_00039300',
 'TCONS_00039301',
 'TCONS_00067396',
 'TCONS_00067903',
 'TCONS_00067904',
 'TCONS_00067905',
 'TCONS_00067906',
 'TCONS_00067907',
 'TCONS_00067908',
 'TCONS_00067909',
 'TCONS_00067910',
 'TCONS_00067911',
 'TCONS_00067912'}

In [21]:
# Same comparison as for the official genes, this time over the novel
# transcript IDs of the two runs.
novel = lvo.set_comparison_table(
    a=split_linked.novel_tx, a_name="linked",
    b=split_orig.novel_tx, b_name="original",
)

In [22]:
novel.counts

Unnamed: 0,count
linked,33
original,17
linked AND original,17
linked NOT original,16
original NOT linked,0
linked OR original,33
one OR other NOT both,16


In [23]:
novel.genes

Unnamed: 0,linked,original,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,TCONS_00034477,TCONS_00037543,TCONS_00037543,TCONS_00034477,,TCONS_00034477,TCONS_00034477
1,TCONS_00034478,TCONS_00037545,TCONS_00037545,TCONS_00034478,,TCONS_00034478,TCONS_00034478
2,TCONS_00034479,TCONS_00039298,TCONS_00039298,TCONS_00034479,,TCONS_00034479,TCONS_00034479
3,TCONS_00034480,TCONS_00039299,TCONS_00039299,TCONS_00034480,,TCONS_00034480,TCONS_00034480
4,TCONS_00034481,TCONS_00039300,TCONS_00039300,TCONS_00034481,,TCONS_00034481,TCONS_00034481
5,TCONS_00034482,TCONS_00039301,TCONS_00039301,TCONS_00034482,,TCONS_00034482,TCONS_00034482
6,TCONS_00034483,TCONS_00067396,TCONS_00067396,TCONS_00034483,,TCONS_00034483,TCONS_00034483
7,TCONS_00034484,TCONS_00067903,TCONS_00067903,TCONS_00034484,,TCONS_00034484,TCONS_00034484
8,TCONS_00034485,TCONS_00067904,TCONS_00067904,TCONS_00034485,,TCONS_00034485,TCONS_00034485
9,TCONS_00034486,TCONS_00067905,TCONS_00067905,TCONS_00034486,,TCONS_00034486,TCONS_00034486


# New proximal_id from LINKED

In [24]:
# IDs present in the linked run but absent from the original run.
new_novel, new_official = (
    split_linked.novel_tx - split_orig.novel_tx,
    split_linked.official_genes - split_orig.official_genes,
)

In [25]:
# One flat list: the new novel IDs followed by the new official IDs.
new_prox_ids = [*new_novel, *new_official]

In [26]:
# Keep only the linked rows whose proximal_id is one of the new IDs.
is_new_proximal = linked['proximal_id'].isin(new_prox_ids)
linked_new = linked.loc[is_new_proximal]

In [27]:
linked_new.shape

(104, 16)

In [28]:
linked_new

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,proximal_id,de_id,xloc_id,tcons_id,gene_id_external,gene_id_internal,lg2_fc,p,fdr,comparison,program,url
68,Scaffold64,732108,732109,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041812,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
69,Scaffold64,732108,732109,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041813,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
70,Scaffold64,732108,732109,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041814,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
71,Scaffold64,732119,732120,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041812,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
72,Scaffold64,732119,732120,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041813,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
73,Scaffold64,732119,732120,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041814,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
74,Scaffold64,732121,732122,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041812,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
75,Scaffold64,732121,732122,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041813,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
76,Scaffold64,732121,732122,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041814,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...
77,Scaffold64,732173,732174,official_annotations,GFUI043025,EDGR|MIDGUT_0003725,XLOC_020485,TCONS_00041812,GMOY005879,GFUI043025,-0.790157,0.001578,0.004656,MIDGUT,edger,https://www.vectorbase.org/Glossina_fuscipes/L...


In [29]:
# Scratch lookup: select the rows of `linked_new` whose proximal_id equals
# `gname` and keep only the SNP_chrom / SNP_end columns.
# NOTE(review): `gname` is the empty string here, so no row matches (the
# recorded output is an empty frame) - looks like a placeholder left over
# from interactive use; confirm which ID was meant.
# NOTE(review): `.astype(tuple)` is unusual - presumably the goal was
# (chrom, end) tuples per row; verify what it actually yields on a
# non-empty selection.
linked_new.query(""" {name_col} == '{gname}' """.format(name_col='proximal_id', gname=''))[['SNP_chrom','SNP_end']].astype(tuple)

Unnamed: 0,SNP_chrom,SNP_end


# Reporting

In [30]:
# def get_genes_info(gnames)

In [31]:
# Use the `report_tmpl` constant from the "Constants" section rather than
# repeating the template path here, so the path is defined in one place.
tmpl = load_template(source=report_tmpl)

In [32]:
def metadata(**kwargs):
    """Return a one-column DataFrame built from keyword names and values.

    Each keyword becomes one row: its value goes into the single column
    (labelled with the empty string) and its name, with everything up to and
    including the first ``_`` removed, becomes the row label.  A name that
    contains no ``_`` is kept unchanged.

    Rows are sorted by the *full* keyword name before the prefix is stripped,
    so the prefix (``i1_``, ``i2_``, ...) sets the order of the rows no
    matter in which order the keywords are passed.

    Parameters
    ----------
    **kwargs
        ``<prefix>_<label>=value`` pairs, e.g. ``i1_Date=..., i2_Source_Type=...``.

    Returns
    -------
    pandas.DataFrame
        One row per keyword, one column named ``''``.
    """
    df = pd.DataFrame(kwargs, index=['']).T
    # Sort explicitly: building a DataFrame from a dict is not guaranteed to
    # sort the keys (recent pandas keeps insertion order), and the documented
    # contract is that the name prefix decides the row order.
    df = df.sort_index()
    # `[-1]` instead of `[1]` so a name without any `_` does not raise.
    df.index = [name.split('_', maxsplit=1)[-1] for name in df.index.values]
    return df

In [33]:
# Collect the Markdown fragments for the report (presumably consumed by the
# template loaded into `tmpl` - confirm against the template's placeholders).
d = Munch()

d.metadata = rep.df_to_markdown(metadata(i1_Date=dt.date.today(), i2_Source_Type='snps_near_homologous_de'))
d.official_genes_set_counts = rep.df_to_markdown(official.counts)
d.official_genes_set_table = rep.df_to_markdown(official.genes)
# Fixed copy-paste slip: the novel-transcript entries were being filled from
# `official.*`, which duplicated the official-gene tables under these keys.
d.novel_tx_set_counts = rep.df_to_markdown(novel.counts)
d.novel_tx_set_table = rep.df_to_markdown(novel.genes)
# d.genes = get_genes_info(official.genes['linked OR original'])
# d.txs = get_genes_info(novel.genes['linked OR original'])

In [34]:
print(d.metadata)

|             |                         |
|:------------|:------------------------|
| Date        | 2016-10-03              |
| Source_Type | snps_near_homologous_de |


In [35]:
# Example region; the keys match the {chr}/{start}/{end} fields of `url_tmpl`.
# NOTE(review): the BED listing printed further down has a SNP at
# Scaffold144:593215-593216 and a different position on Scaffold104 -
# confirm 'Scaffold104' is the intended scaffold and not a typo.
coords = dict(chr='Scaffold104', start=593216, end=597969)

In [36]:
# browser = rep.init_browser(prefs={"download.default_directory" : rep.get_abs_path("../reports/figures/genome_regions")})

In [37]:
# BED file for the MS_NB_OT "original" set, built up from its path components.
bed = (
    Path('../data/processed/gmm_to_gff_testing_filterPSL_100bp_local')
    / 'make_snp_beds'
    / 'MS_NB_OT.original.bed'
)

In [38]:
# Read the whole BED file into one string (text mode, default encoding).
with bed.open() as bed_handle:
    data = bed_handle.read()

In [39]:
print(data)

JFJR01012925	2544	2545
Scaffold1	2237686	2237687
Scaffold104	688652	688653
Scaffold105	310780	310781
Scaffold133	276182	276183
Scaffold137	615406	615407
Scaffold144	593215	593216
Scaffold175	347131	347132
Scaffold2	2370780	2370781
Scaffold20	1394091	1394092
Scaffold3	1688399	1688400
Scaffold33	1387925	1387926
Scaffold389	259727	259728
Scaffold48	1013372	1013373
Scaffold50	1026780	1026781
Scaffold761	42841	42842
Scaffold8	1744225	1744226
Scaffold993	1107	1108



In [40]:
# List the *.bed files found under the make_linked_snps_beds and
# make_snp_beds result directories (in whatever order glob yields them).
bed_root_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local'
linked_paths = [p for p in Path().glob(bed_root_ + '/make_linked_snps_beds/*.bed')]
orig_paths = [p for p in Path().glob(bed_root_ + '/make_snp_beds/*.bed')]

In [41]:
orig_paths

[PosixPath('../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/make_snp_beds/MS_NB_OT.original.bed'),
 PosixPath('../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/make_snp_beds/MS.original.bed'),
 PosixPath('../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/make_snp_beds/NB.original.bed'),
 PosixPath('../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/make_snp_beds/OT.original.bed')]

### Activate all my data tracks

In [42]:
# browser = rep.init_browser(file_paths=sorted(orig_paths + linked_paths))

In [43]:
# rep.enable_all_personal_data_tracks(browser)

In [44]:
# browser.get("https://www.vectorbase.org/Glossina_fuscipes/Transcript/Domains?db=core;g=GFUI028557")

In [45]:
# domains = browser.find_element_by_xpath('//*[@id="DomainSpreadsheet"]/div[1]/div[1]/div[1]/div[3]/div/a[2]')

In [46]:
# Parse the HTML tables on the VectorBase "Domains" page of gene GFUI028557.
domains_url = 'https://www.vectorbase.org/Glossina_fuscipes/Transcript/Domains?db=core;g=GFUI028557'
dfs = pd.read_html(domains_url)

In [48]:
len(dfs)

1

In [49]:
dfs[0]

Unnamed: 0,Name,Transcript ID,bp,Protein,Translation ID,Biotype,Flags
0,Novel,GFUI028557-RA,1782,365aa,GFUI028557-PA,<p>Genes and/or transcript that contains an op...,
