# Prep

## Imports

In [1]:
import pandas as pd

## Paths

In [2]:
# Locations of the two "snps_in_features" spreadsheets (linked vs. original SNP
# sets) that are read with pd.read_excel in the Loading section.
linked_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features/NB.linked.snps_in_features.xls'
orig_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features/NB.original.snps_in_features.xls'

## Constants

In [3]:
# Largest absolute value of the `distance` column that is kept (inclusive);
# passed as `d` to filter_by_distance. Units are presumably base pairs — TODO confirm.
# NOTE(review): this constant shares its name with the `distance` column that
# filter_by_distance filters on; a distinct name would be less confusing.
distance = 5000

## Functions

In [4]:
def filter_by_distance(df, d=5000):
    """Keep the rows of ``df`` whose ``distance`` value lies within ``d`` of zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a numeric ``distance`` column.
    d : number, default 5000
        Largest absolute distance to keep (the bound is inclusive).

    Returns
    -------
    pandas.DataFrame
        The rows satisfying ``abs(distance) <= d``, with the original index
        labels and columns preserved. Rows with a missing distance are dropped.
    """
    # A boolean mask avoids pasting ``d`` into a query string, which failed for
    # thresholds whose text form is not a valid literal (e.g. ``float('inf')``).
    within_range = df['distance'].abs() <= d
    return df.loc[within_range]

def set_comparison_table(a, b, a_name=None, b_name=None):
    """Tabulate the standard set relations between two collections.

    Parameters
    ----------
    a, b : iterable
        Collections of hashable, mutually sortable items. Duplicates are
        dropped because both inputs are converted to sets.
    a_name, b_name : str, optional
        Labels used to build the column names. When omitted they fall back to
        ``'a'`` and ``'b'``.

    Returns
    -------
    dict
        ``'genes'``  : DataFrame with one column per relation (intersection,
        each one-sided difference, union, symmetric difference). Each column
        holds the sorted members and is padded with NaN to the longest column.
        ``'counts'`` : DataFrame with a single ``'count'`` column giving the
        number of members in each relation.
    """
    a = set(a)
    b = set(b)

    # Without a fallback the labels would read "None AND None" and the two
    # one-sided difference columns would carry the same "None NOT None" name.
    a_label = 'a' if a_name is None else a_name
    b_label = 'b' if b_name is None else b_name

    relations = [
        ('{a} AND {b}', a & b),
        ('{a} NOT {b}', a - b),
        ('{b} NOT {a}', b - a),
        ('{a} OR {b}', a | b),
        ('one OR other NOT both', a ^ b),
    ]

    # dtype=object gives empty relations an explicit dtype instead of relying
    # on the (version-dependent) default for empty Series.
    data = [pd.Series(sorted(members), dtype=object,
                      name=template.format(a=a_label, b=b_label))
            for template, members in relations]

    # Each Series is a row at this point; transposing turns every relation
    # into its own NaN-padded column.
    df = pd.DataFrame(data).T
    counts = df.count().to_frame(name='count')

    return {'genes': df, 'counts': counts}
    
    
    

## Loading

In [5]:
# Read both spreadsheets into DataFrames (first sheet, default options).
# NOTE(review): the previews below show an 'Unnamed: 0' column, which looks like
# a saved row index — consider index_col=0 after confirming nothing depends on it.
# NOTE(review): some blockSizes/blockStarts values in the linked preview appear as
# one long digit run without commas — verify the lists are not being collapsed upstream.
linked = pd.read_excel(linked_)
orig = pd.read_excel(orig_)

In [6]:
# Preview the first rows of the linked-SNP table.
linked.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold1,2237682,2237683,novel_mapped_tx,Scaffold1,2226915,2227788,TCONS_00023667,0,-,2226915,2227788,0,4,216114281260,2226915222713322272472227528,-9895
1,Scaffold1,2237682,2237683,novel_mapped_tx,Scaffold1,2268286,2268509,TCONS_00023697,0,+,2268286,2268509,0,3,5216140,226828622683452268369,30604
2,Scaffold1,2237682,2237683,novel_mapped_tx,Scaffold1,2144078,2165341,TCONS_00038064,0,-,2144078,2165341,0,15,601141731011912178099667749,"2144078,2164656,2164669,2164712,2164785,216488...",-72342
3,Scaffold1,2237682,2237683,novel_mapped_tx,Scaffold1,2365004,2365581,TCONS_00071717,0,-,2365004,2365581,0,11,5521492148554153135115,"2365004,2365062,2365083,2365134,2365161,236521...",127322
4,Scaffold1,2237682,2237683,novel_mapped_tx,Scaffold1,2108451,2108494,TCONS_00038070,0,+,2108451,2110472,0,6,3175101865311321,210845121088642109663210990621104372110451,-129189


In [7]:
# Preview the first rows of the original-SNP table.
orig.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold102,326971,326972,novel_mapped_tx,Scaffold102,365118,365134,TCONS_00081556,0,-,361821,365134,0,16,"31,104,231,237,4,19,19,169,391,346,104,180,515...","361821,361855,361959,362209,362447,362451,3624...",38147
1,Scaffold102,326971,326972,novel_mapped_tx,Scaffold102,365118,365134,TCONS_00083794,0,+,361821,365134,0,16,"31,104,25,96,224,4,19,16,172,391,346,104,180,1...","361821,361855,361959,362054,362222,362447,3624...",38147
2,Scaffold102,326971,326972,novel_mapped_tx,Scaffold102,410767,410831,TCONS_00083793,0,+,402801,410831,0,24,"69,31,24,57,54,115,13,55,899,194,545,921,490,5...","402801,402881,402917,402950,403007,403071,4038...",83796
3,Scaffold102,326971,326972,novel_mapped_tx,Scaffold102,410878,415275,TCONS_00081530,0,-,410878,446526,0,93,"10,6,40,34,15,6,41,23,39,8,85,4,130,62,199,14,...","410878,410888,410897,410943,410977,411011,4110...",83907
4,Scaffold102,326971,326972,novel_mapped_tx,Scaffold102,410878,415275,TCONS_00081531,0,-,410878,446526,0,93,"10,6,40,34,15,6,41,23,39,8,85,4,130,62,199,14,...","410878,410888,410897,410943,410977,411011,4110...",83907


## Cleaning

In [8]:
# Working copies: keep only rows whose absolute `distance` value is at most
# the `distance` constant set in the Constants section.
linked_wk = filter_by_distance(linked, d=distance)
orig_wk = filter_by_distance(orig, d=distance)

In [9]:
# (rows, columns) of the linked table after distance filtering.
linked_wk.shape

(38, 17)

In [10]:
# (rows, columns) of the original table after distance filtering.
orig_wk.shape

(105, 17)

# Comparisons

In [11]:
# Build the sets of feature names to compare: one set per annotation source
# (official annotations, novel mapped transcripts) for each filtered table.

official_query = """ feature_set_name == 'official_annotations' """
novel_query = """ feature_set_name == 'novel_mapped_tx' """

linked_wk_official_genes = set(linked_wk.query(official_query)['name'])
linked_wk_novel_tx = set(linked_wk.query(novel_query)['name'])

orig_wk_official_genes = set(orig_wk.query(official_query)['name'])
orig_wk_novel_tx = set(orig_wk.query(novel_query)['name'])

## Official Genes

In [12]:
# Set relations between the official-annotation names found near linked SNPs
# and those found near original SNPs.
official = set_comparison_table(
    linked_wk_official_genes,
    orig_wk_official_genes,
    a_name="linked",
    b_name="original",
)

In [13]:
# Number of official-annotation names in each set relation.
official['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,13
original NOT linked,18
linked OR original,31
one OR other NOT both,31


In [14]:
# Sorted official-annotation names per set relation (shorter columns are NaN-padded).
official['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,GFUI008496,GFUI002233,GFUI002233,GFUI002233
1,,GFUI011907,GFUI002250,GFUI002250,GFUI002250
2,,GFUI026687,GFUI002252,GFUI002252,GFUI002252
3,,GFUI026692,GFUI002256,GFUI002256,GFUI002256
4,,GFUI026694,GFUI003051,GFUI003051,GFUI003051
5,,GFUI026699,GFUI003053,GFUI003053,GFUI003053
6,,GFUI029720,GFUI007952,GFUI007952,GFUI007952
7,,GFUI035775,GFUI009421,GFUI008496,GFUI008496
8,,GFUI035780,GFUI019498,GFUI009421,GFUI009421
9,,GFUI035784,GFUI019501,GFUI011907,GFUI011907


## Novel Tx

In [15]:
# Set relations between the novel-transcript names found near linked SNPs
# and those found near original SNPs.
novel = set_comparison_table(
    linked_wk_novel_tx,
    orig_wk_novel_tx,
    a_name="linked",
    b_name="original",
)

In [16]:
# Number of novel-transcript names in each set relation.
novel['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,9
original NOT linked,31
linked OR original,40
one OR other NOT both,40


In [17]:
# Sorted novel-transcript names per set relation (shorter columns are NaN-padded).
novel['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,TCONS_00012154,TCONS_00023519,TCONS_00012154,TCONS_00012154
1,,TCONS_00032877,TCONS_00037543,TCONS_00023519,TCONS_00023519
2,,TCONS_00032910,TCONS_00037545,TCONS_00032877,TCONS_00032877
3,,TCONS_00032911,TCONS_00039298,TCONS_00032910,TCONS_00032910
4,,TCONS_00032912,TCONS_00039299,TCONS_00032911,TCONS_00032911
5,,TCONS_00032913,TCONS_00039300,TCONS_00032912,TCONS_00032912
6,,TCONS_00032914,TCONS_00039301,TCONS_00032913,TCONS_00032913
7,,TCONS_00041433,TCONS_00039303,TCONS_00032914,TCONS_00032914
8,,TCONS_00041435,TCONS_00048884,TCONS_00037543,TCONS_00037543
9,,,TCONS_00067903,TCONS_00037545,TCONS_00037545
