# Prep

## Imports

In [1]:
import pandas as pd

## Paths

In [2]:
# Both input spreadsheets sit in the same directory; spell it out once.
features_dir = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features'

# Paths to the "linked" and "original" SNPs-in-features spreadsheets.
linked_ = features_dir + '/MS_NB_OT.linked.snps_in_features.xls'
orig_ = features_dir + '/MS_NB_OT.original.snps_in_features.xls'

## Constants

In [3]:
# Inclusive cutoff on the absolute value of the `distance` column; it is passed
# as `d` to filter_by_distance when the loaded tables are filtered.
# presumably measured in base pairs — TODO confirm against the upstream step
distance = 5000

## Functions

In [4]:
def filter_by_distance(df, d=5000):
    """Return the rows of ``df`` whose ``distance`` value lies within ``[-d, d]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a numeric ``distance`` column.
    d : number, default 5000
        Inclusive cutoff applied to the absolute distance.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # NOTE(review): in the loaded tables, rows whose feature columns are "."
    # carry distance == -1, so they satisfy this test too — confirm that
    # keeping them is intended.
    within_cutoff = df['distance'].abs() <= d
    return df[within_cutoff]

def set_comparison_table(a, b, a_name=None, b_name=None):
    """Tabulate the standard set comparisons between two collections.

    Parameters
    ----------
    a, b : iterable
        Collections of hashable, mutually sortable items. Each is converted
        to a ``set``, so duplicates are collapsed.
    a_name, b_name : str, optional
        Labels used to build the column names. When omitted they fall back to
        ``'a'`` and ``'b'`` so every column still gets a distinct label.

    Returns
    -------
    dict
        ``'genes'``  : DataFrame with one column per comparison (AND, the two
                       NOTs, OR, symmetric difference); each column lists the
                       sorted members and shorter columns are padded with NaN.
        ``'counts'`` : DataFrame with a single ``'count'`` column giving the
                       number of members in each comparison.
    """
    # Formatting None into the labels would name both difference columns
    # 'None NOT None', which makes them impossible to tell apart.
    if a_name is None:
        a_name = 'a'
    if b_name is None:
        b_name = 'b'

    a = set(a)
    b = set(b)

    comparisons = [
        ('{a} AND {b}', a & b),
        ('{a} NOT {b}', a - b),
        ('{b} NOT {a}', b - a),
        ('{a} OR {b}', a | b),
        ('one OR other NOT both', a ^ b),
    ]

    data = [pd.Series(sorted(members), name=label.format(a=a_name, b=b_name))
            for label, members in comparisons]

    # Each Series becomes a row labelled by its name; transposing turns the
    # comparisons into columns.
    df = pd.DataFrame(data).T
    counts = df.count().to_frame('count')

    return {'genes': df, 'counts': counts}
    
    
    

## Loading

In [5]:
# Read the linked and original spreadsheets (in that order) into DataFrames.
linked, orig = (pd.read_excel(path) for path in (linked_, orig_))

In [6]:
# Preview the first rows of the linked table as a sanity check on the load.
linked.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold1004,18777,18778,official_annotations,Scaffold1004,32744,37536,GFUI001832,0,+,32744,37533,000,5,490573141913349,02369299334674443,13967
1,Scaffold1262,1654,1655,.,.,-1,-1,.,-1,.,.,.,.,.,.,.,-1
2,Scaffold1262,3836,3837,.,.,-1,-1,.,-1,.,.,.,.,.,.,.,-1
3,Scaffold220,393448,393449,novel_mapped_tx,Scaffold220,356496,396965,TCONS_00067396,0,-,284569,405927,000,32,"121,10,100,288,143,71,3,232,165,127,53,143,105...","284569,284691,284702,284803,288373,289823,2913...",0
4,Scaffold220,393448,393449,novel_mapped_tx,Scaffold220,402092,402510,TCONS_00067396,0,-,284569,405927,000,32,"121,10,100,288,143,71,3,232,165,127,53,143,105...","284569,284691,284702,284803,288373,289823,2913...",8644


In [7]:
# Preview the first rows of the original table as a sanity check on the load.
orig.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold13,1931632,1931633,novel_mapped_tx,Scaffold13,1890917,1895398,TCONS_00041588,0,-,1884154,1895586,0,20,"147,59,13,131,39,17,48,121,62,1341,2182,1346,1...","1884154,1884302,1884365,1884378,1884513,188455...",-36235
1,Scaffold13,1931632,1931633,novel_mapped_tx,Scaffold13,1890917,1895398,TCONS_00041590,0,-,1885040,1895401,0,8,33991346101109651346413,"1885040,1888446,1889798,1889808,1894384,189503...",-36235
2,Scaffold13,1931632,1931633,novel_mapped_tx,Scaffold13,1890917,1895398,TCONS_00041591,0,-,1884154,1895401,0,18,"147,59,13,131,39,17,48,121,62,1341,2182,1346,1...","1884154,1884302,1884365,1884378,1884513,188455...",-36235
3,Scaffold13,1931632,1931633,novel_mapped_tx,Scaffold13,1890917,1891292,TCONS_00041593,0,-,1884154,1891292,0,15,"147,59,13,131,39,17,48,121,62,1341,2182,1346,1...","1884154,1884302,1884365,1884378,1884513,188455...",-40341
4,Scaffold13,1931632,1931633,novel_mapped_tx,Scaffold13,1884154,1885040,TCONS_00041588,0,-,1884154,1895586,0,20,"147,59,13,131,39,17,48,121,62,1341,2182,1346,1...","1884154,1884302,1884365,1884378,1884513,188455...",-46593


## Cleaning

In [8]:
# Working copies: keep only rows whose absolute distance is at most `distance`.
linked_wk = filter_by_distance(linked, distance)
orig_wk = filter_by_distance(orig, distance)

In [9]:
# (rows, columns) left in the linked table after the distance filter.
linked_wk.shape

(47, 17)

In [10]:
# (rows, columns) left in the original table after the distance filter.
orig_wk.shape

(36, 17)

# Comparisons

In [11]:
# Build the sets of feature names to compare: one set per filtered table
# and per annotation source.

def _feature_names(snps, feature_set):
    """Return the distinct `name` values of rows whose feature_set_name equals `feature_set`."""
    in_feature_set = snps['feature_set_name'] == feature_set
    return set(snps.loc[in_feature_set, 'name'])

linked_wk_official_genes = _feature_names(linked_wk, 'official_annotations')
linked_wk_novel_tx = _feature_names(linked_wk, 'novel_mapped_tx')

orig_wk_official_genes = _feature_names(orig_wk, 'official_annotations')
orig_wk_novel_tx = _feature_names(orig_wk, 'novel_mapped_tx')

## Official Genes

In [12]:
# Compare the official-annotation names hit by the linked vs. original tables.
official = set_comparison_table(
    linked_wk_official_genes,
    orig_wk_official_genes,
    a_name="linked",
    b_name="original",
)

In [13]:
# Number of official-annotation names in each comparison category.
official['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,8
original NOT linked,7
linked OR original,15
one OR other NOT both,15


In [14]:
# The official-annotation names themselves, one sorted column per comparison category.
official['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,GFUI016866,GFUI008017,GFUI008017,GFUI008017
1,,GFUI033037,GFUI008024,GFUI008024,GFUI008024
2,,GFUI043025,GFUI021833,GFUI016866,GFUI016866
3,,GFUI043027,GFUI021834,GFUI021833,GFUI021833
4,,GFUI043030,GFUI021857,GFUI021834,GFUI021834
5,,GFUI043057,GFUI030827,GFUI021857,GFUI021857
6,,GFUI043067,GFUI047097,GFUI030827,GFUI030827
7,,GFUI043720,,GFUI033037,GFUI033037
8,,,,GFUI043025,GFUI043025
9,,,,GFUI043027,GFUI043027


## Novel Transcripts

In [15]:
# Compare the novel-transcript names hit by the linked vs. original tables.
novel = set_comparison_table(
    linked_wk_novel_tx,
    orig_wk_novel_tx,
    a_name="linked",
    b_name="original",
)

In [16]:
# Number of novel-transcript names in each comparison category.
novel['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,3
original NOT linked,2
linked OR original,5
one OR other NOT both,5


In [17]:
# The novel-transcript names themselves, one sorted column per comparison category.
novel['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,TCONS_00041812,TCONS_00080293,TCONS_00041812,TCONS_00041812
1,,TCONS_00041813,TCONS_00080294,TCONS_00041813,TCONS_00041813
2,,TCONS_00067396,,TCONS_00067396,TCONS_00067396
3,,,,TCONS_00080293,TCONS_00080293
4,,,,TCONS_00080294,TCONS_00080294
