# Prep

## Imports

In [1]:
import pandas as pd

## Paths

In [2]:
# Relative paths of the two spreadsheets read in the Loading section.
# The trailing underscore keeps the path names distinct from the DataFrames
# (`linked`, `orig`) that are loaded from them.
linked_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features/MS.linked.snps_in_features.xls'
orig_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features/MS.original.snps_in_features.xls'

## Constants

In [3]:
# Cutoff handed to filter_by_distance as `d`: a row is kept when
# abs(distance) <= this value.
# NOTE(review): units are not stated in this notebook (presumably base pairs) - confirm.
distance = 5000

## Functions

In [4]:
def filter_by_distance(df, d=5000):
    """Return the rows of ``df`` whose ``distance`` is within ``d`` of zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a ``distance`` column.
    d : number, optional
        Inclusive cutoff applied to the absolute value of ``distance``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` selected by ``DataFrame.query``; the original
        index labels of the kept rows are preserved.
    """
    # The cutoff is written into the expression text before it is evaluated.
    cutoff_expr = """ abs(distance) <= {d} """.format(d=d)
    within_cutoff = df.query(cutoff_expr)
    return within_cutoff

def set_comparison_table(a, b, a_name=None, b_name=None):
    """Tabulate the standard set relations between two collections.

    Parameters
    ----------
    a, b : iterable
        Collections of hashable, mutually sortable items. Each is converted
        to a ``set``, so duplicates are ignored.
    a_name, b_name : str, optional
        Labels used to build the column names. When omitted they fall back to
        ``'a'`` and ``'b'`` respectively.

    Returns
    -------
    dict
        ``'genes'``  : DataFrame with one column per relation (intersection,
                       each difference, union, symmetric difference). Each
                       column lists the sorted members of that relation;
                       shorter columns are padded with missing values.
        ``'counts'`` : DataFrame with a single ``'count'`` column, indexed by
                       the relation labels, giving the number of members in
                       each relation.
    """
    a = set(a)
    b = set(b)

    # Without distinct fallbacks, both difference columns would be labelled
    # 'None NOT None', producing duplicate (ambiguous) column names.
    a_name = 'a' if a_name is None else a_name
    b_name = 'b' if b_name is None else b_name

    relations = [
        ('{a} AND {b}', a & b),
        ('{a} NOT {b}', a - b),
        ('{b} NOT {a}', b - a),
        ('{a} OR {b}', a | b),
        ('one OR other NOT both', a ^ b),
    ]

    data = [pd.Series(sorted(members), name=label.format(a=a_name, b=b_name))
            for label, members in relations]

    # Each Series becomes a row; transposing turns the relations into columns.
    df = pd.DataFrame(data).T
    # count() ignores the padding, so it yields the size of each relation.
    counts = df.count().to_frame(name='count')

    return {'genes': df, 'counts': counts}
    
    
    

## Loading

In [5]:
# Read both SNP-to-feature spreadsheets with read_excel's default settings.
# NOTE(review): the previews below show an 'Unnamed: 0' column (looks like a
# saved index) and, in the first rows of `orig`, blockSizes/blockStarts values
# whose commas are missing - confirm whether the Excel round trip is mangling
# those fields.
linked, orig = (pd.read_excel(path) for path in (linked_, orig_))

In [6]:
# Preview the first rows of the linked table as a sanity check on the load.
linked.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold10,1586111,1586112,novel_mapped_tx,Scaffold10,1513751,1543760,TCONS_00082660,0,-,1425420,1543760,0,30,"57,139,19,176,18,43,128,93,260,151,130,666,27,...","1425420,1425481,1425642,1425733,1425940,142595...",-42352
1,Scaffold10,1586111,1586112,novel_mapped_tx,Scaffold10,1513751,1543642,TCONS_00082661,0,-,1425420,1543642,0,26,"57,139,19,176,18,43,128,93,260,151,130,666,27,...","1425420,1425481,1425642,1425733,1425940,142595...",-42470
2,Scaffold10,1586111,1586112,novel_mapped_tx,Scaffold10,1513751,1543455,TCONS_00082667,0,-,1425420,1543455,0,29,"57,139,19,176,18,43,128,93,260,151,130,666,27,...","1425420,1425481,1425642,1425733,1425940,142595...",-42657
3,Scaffold10,1586111,1586112,novel_mapped_tx,Scaffold10,1513751,1543452,TCONS_00082662,0,-,1425420,1543452,0,23,"57,139,19,176,18,43,128,93,260,151,130,666,27,...","1425420,1425481,1425642,1425733,1425940,142595...",-42660
4,Scaffold10,1586111,1586112,novel_mapped_tx,Scaffold10,1513751,1543452,TCONS_00082663,0,-,1425420,1543452,0,27,"57,139,19,176,18,43,128,93,260,151,130,666,27,...","1425420,1425481,1425642,1425733,1425940,142595...",-42660


In [7]:
# Preview the first rows of the original table as a sanity check on the load.
orig.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold1,2366098,2366099,novel_mapped_tx,Scaffold1,2365004,2365581,TCONS_00071717,0,-,2365004,2365581,0,11,5521492148554153135115,"2365004,2365062,2365083,2365134,2365161,236521...",-518
1,Scaffold1,2366098,2366099,novel_mapped_tx,Scaffold1,2371177,2371335,TCONS_00071714,0,-,2371177,2371335,0,4,41271078,2371177237121923712462371257,5079
2,Scaffold1,2366098,2366099,novel_mapped_tx,Scaffold1,2428670,2428863,TCONS_00071722,0,+,2428670,2450038,0,34,"94,17,110,128,154,143,4,312,145,260,133,350,15...","2428670,2428770,2430803,2430997,2441830,244206...",62572
3,Scaffold1,2366098,2366099,novel_mapped_tx,Scaffold1,2432289,2441328,TCONS_00071722,0,+,2428670,2450038,0,34,"94,17,110,128,154,143,4,312,145,260,133,350,15...","2428670,2428770,2430803,2430997,2441830,244206...",66191
4,Scaffold1,2366098,2366099,novel_mapped_tx,Scaffold1,2453906,2472350,TCONS_00036328,0,-,2453906,2481291,0,31,"68,91,10,72,35,5,12,38,90,6,6,14,5,4,10,74,4,8...","2453906,2453975,2454068,2454083,2454158,245419...",87808


## Cleaning

In [8]:
# Working copies: keep only rows within the configured `distance` cutoff,
# leaving the loaded `linked` / `orig` frames untouched.
linked_wk, orig_wk = (
    filter_by_distance(df=frame, d=distance) for frame in (linked, orig)
)

In [9]:
# (rows, columns) remaining in the linked table after the distance filter.
linked_wk.shape

(61, 17)

In [10]:
# (rows, columns) remaining in the original table after the distance filter.
orig_wk.shape

(28, 17)

# Comparisons

In [11]:
# Build the sets of feature names to compare: one set per
# (table, feature_set_name) pair, taken from the `name` column.

linked_wk_official_genes = set(
    linked_wk.query(""" feature_set_name == 'official_annotations' """)['name']
)
linked_wk_novel_tx = set(
    linked_wk.query(""" feature_set_name == 'novel_mapped_tx' """)['name']
)

orig_wk_official_genes = set(
    orig_wk.query(""" feature_set_name == 'official_annotations' """)['name']
)
orig_wk_novel_tx = set(
    orig_wk.query(""" feature_set_name == 'novel_mapped_tx' """)['name']
)

## Official Genes

In [12]:
# Compare the official-annotation names found near linked vs. original SNPs.
official = set_comparison_table(
    linked_wk_official_genes,
    orig_wk_official_genes,
    a_name="linked",
    b_name="original",
)

In [13]:
# Number of official-annotation names in each set relation.
official['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,22
original NOT linked,17
linked OR original,39
one OR other NOT both,39


In [14]:
# Sorted official-annotation names in each set relation (one column per relation).
official['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,GFUI002233,GFUI002104,GFUI002104,GFUI002104
1,,GFUI002250,GFUI002120,GFUI002120,GFUI002120
2,,GFUI002252,GFUI014494,GFUI002233,GFUI002233
3,,GFUI002256,GFUI016866,GFUI002250,GFUI002250
4,,GFUI003051,GFUI020790,GFUI002252,GFUI002252
5,,GFUI003053,GFUI020792,GFUI002256,GFUI002256
6,,GFUI007952,GFUI020794,GFUI003051,GFUI003051
7,,GFUI009421,GFUI020797,GFUI003053,GFUI003053
8,,GFUI011175,GFUI020800,GFUI007952,GFUI007952
9,,GFUI011180,GFUI023751,GFUI009421,GFUI009421


## Novel Tx

In [15]:
# Compare the novel-transcript names found near linked vs. original SNPs.
novel = set_comparison_table(
    linked_wk_novel_tx,
    orig_wk_novel_tx,
    a_name="linked",
    b_name="original",
)

In [16]:
# Number of novel-transcript names in each set relation.
novel['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,32
original NOT linked,10
linked OR original,42
one OR other NOT both,42


In [17]:
# Sorted novel-transcript names in each set relation (one column per relation).
novel['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,TCONS_00023519,TCONS_00024646,TCONS_00023519,TCONS_00023519
1,,TCONS_00037543,TCONS_00041812,TCONS_00024646,TCONS_00024646
2,,TCONS_00037545,TCONS_00041813,TCONS_00037543,TCONS_00037543
3,,TCONS_00039298,TCONS_00053177,TCONS_00037545,TCONS_00037545
4,,TCONS_00039299,TCONS_00054066,TCONS_00039298,TCONS_00039298
5,,TCONS_00039300,TCONS_00054080,TCONS_00039299,TCONS_00039299
6,,TCONS_00039301,TCONS_00054081,TCONS_00039300,TCONS_00039300
7,,TCONS_00039303,TCONS_00054082,TCONS_00039301,TCONS_00039301
8,,TCONS_00067903,TCONS_00067396,TCONS_00039303,TCONS_00039303
9,,TCONS_00067904,TCONS_00071717,TCONS_00041812,TCONS_00041812
