# Prep

## Imports

In [1]:
import pandas as pd

## Paths

In [2]:
# Both input tables sit in the same directory; only the file name differs.
features_dir_ = '../data/processed/gmm_to_gff_testing_filterPSL_100bp_local/get_nearest_k_features'

linked_ = features_dir_ + '/OT.linked.snps_in_features.xls'
orig_ = features_dir_ + '/OT.original.snps_in_features.xls'

## Constants

In [3]:
# Largest absolute value of the `distance` column that a row may have and still
# be kept; passed as `d` to filter_by_distance in the Cleaning section.
# NOTE(review): units are presumably base pairs -- confirm against the upstream step.
distance = 5000

## Functions

In [4]:
def filter_by_distance(df, d=5000):
    """Keep only the rows of ``df`` whose ``distance`` is within ``d`` of zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a numeric ``distance`` column.
    d : number, optional
        Inclusive upper bound on ``abs(distance)``. Defaults to 5000.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` satisfying ``abs(distance) <= d``, with their
        original index labels.

    Notes
    -----
    The test is on the absolute value, so a row with ``distance == -1`` is
    kept for any ``d >= 1``.
    NOTE(review): ``orig.head()`` shows a row with distance -1 and '.'
    placeholders in the feature columns -- confirm such rows should pass.
    """
    # The bound is substituted into the expression text before pandas evaluates it.
    template = """ abs(distance) <= {d} """
    expression = template.format(d=d)

    within_range = df.query(expression)
    return within_range

def set_comparison_table(a, b, a_name=None, b_name=None):
    """Tabulate the standard set comparisons between two collections.

    Parameters
    ----------
    a, b : iterable
        Collections of hashable items that can be sorted together. Each is
        converted to a ``set``, so duplicates are ignored.
    a_name, b_name : str, optional
        Labels used to build the column names. When omitted they fall back to
        ``"a"`` and ``"b"`` so that every column label stays distinct.

    Returns
    -------
    dict
        ``'genes'``
            DataFrame with one column per comparison, in this order:
            ``"{a} AND {b}"`` (intersection), ``"{a} NOT {b}"`` (in a only),
            ``"{b} NOT {a}"`` (in b only), ``"{a} OR {b}"`` (union) and
            ``"one OR other NOT both"`` (symmetric difference). Each column
            holds its members in sorted order; shorter columns are padded
            with missing values.
        ``'counts'``
            DataFrame indexed by the same labels with a single ``'count'``
            column giving the number of members in each comparison.
    """
    # Without a fallback, both defaults would format as "None", producing two
    # identical "None NOT None" columns that cannot be told apart.
    a_name = 'a' if a_name is None else a_name
    b_name = 'b' if b_name is None else b_name

    a = set(a)
    b = set(b)

    comparisons = [
        ('{a} AND {b}', a & b),
        ('{a} NOT {b}', a - b),
        ('{b} NOT {a}', b - a),
        ('{a} OR {b}', a | b),
        ('one OR other NOT both', a ^ b),
    ]

    # dtype=object keeps an empty comparison (e.g. no overlap) from relying on
    # pandas' default dtype for empty Series.
    data = [pd.Series(sorted(members),
                      name=label.format(a=a_name, b=b_name),
                      dtype=object)
            for label, members in comparisons]

    # Each Series becomes a row aligned on position; transposing yields one
    # column per comparison, padded to the length of the longest one.
    df = pd.DataFrame(data).T
    counts = pd.DataFrame(df.count(), columns=['count'])

    return {'genes': df, 'counts': counts}
    
    
    

## Loading

In [5]:
# Read both SNP-to-feature tables with pandas' default Excel settings.
# NOTE(review): in the previews, short blockSizes/blockStarts/itemRgb values show
# up as one run-together number (e.g. 48146540) while longer ones keep their
# commas -- confirm those columns are parsed as intended before relying on them.
linked, orig = (pd.read_excel(path) for path in (linked_, orig_))

In [6]:
# Preview the first rows of the linked table to check which columns were read.
linked.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,Scaffold106,349186,349187,novel_mapped_tx,Scaffold106,320040,320102,TCONS_00083698,0,+,319095,320102,0,3,48146540,319095319597320062,-29085
1,Scaffold106,349186,349187,novel_mapped_tx,Scaffold106,319095,319104,TCONS_00083698,0,+,319095,320102,0,3,48146540,319095319597320062,-30083
2,Scaffold106,349186,349187,novel_mapped_tx,Scaffold106,383136,402324,TCONS_00083693,0,+,383136,587072,0,33,"74,156,55,158,82,47,6,72,141,38,113,83,210,5,6...","383136,383424,383583,383638,383796,383878,4030...",33950
3,Scaffold106,349186,349187,novel_mapped_tx,Scaffold106,383136,402324,TCONS_00083694,0,+,383136,587072,0,33,"74,156,55,158,82,47,6,72,141,38,113,83,210,5,6...","383136,383424,383583,383638,383796,383878,4030...",33950
4,Scaffold106,349186,349187,novel_mapped_tx,Scaffold106,405951,433030,TCONS_00083688,0,+,403014,584462,0,23,"7,72,141,38,113,83,210,5,63,82,184,95,5,159,89...","403014,403022,403095,403236,403274,403387,4034...",56765


In [7]:
# Preview the first rows of the original table to check which columns were read.
orig.head()

Unnamed: 0,SNP_chrom,SNP_start,SNP_end,feature_set_name,chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,distance
0,JFJR01012925,2544,2545,.,.,-1,-1,.,-1,.,.,.,.,.,.,.,-1
1,Scaffold1,2237686,2237687,novel_mapped_tx,Scaffold1,2226915,2227788,TCONS_00023667,0,-,2226915,2227788,000,4,216114281260,2226915222713322272472227528,-9899
2,Scaffold1,2237686,2237687,novel_mapped_tx,Scaffold1,2268286,2268509,TCONS_00023697,0,+,2268286,2268509,000,3,5216140,226828622683452268369,30600
3,Scaffold1,2237686,2237687,novel_mapped_tx,Scaffold1,2144078,2165341,TCONS_00038064,0,-,2144078,2165341,000,15,601141731011912178099667749,"2144078,2164656,2164669,2164712,2164785,216488...",-72346
4,Scaffold1,2237686,2237687,novel_mapped_tx,Scaffold1,2365004,2365581,TCONS_00071717,0,-,2365004,2365581,000,11,5521492148554153135115,"2365004,2365062,2365083,2365134,2365161,236521...",127318


## Cleaning

In [8]:
# Working copies: keep only rows whose |distance| is within the configured bound.
linked_wk = linked.pipe(filter_by_distance, d=distance)
orig_wk = orig.pipe(filter_by_distance, d=distance)

In [9]:
# (rows, columns) left in the linked table after the distance filter.
linked_wk.shape

(32, 17)

In [10]:
# (rows, columns) left in the original table after the distance filter.
orig_wk.shape

(39, 17)

# Comparisons

In [11]:
# Collect the distinct feature names per annotation source so the linked and
# original tables can be compared source by source.

linked_wk_official_genes = set(
    linked_wk.loc[linked_wk['feature_set_name'] == 'official_annotations', 'name']
)
linked_wk_novel_tx = set(
    linked_wk.loc[linked_wk['feature_set_name'] == 'novel_mapped_tx', 'name']
)

orig_wk_official_genes = set(
    orig_wk.loc[orig_wk['feature_set_name'] == 'official_annotations', 'name']
)
orig_wk_novel_tx = set(
    orig_wk.loc[orig_wk['feature_set_name'] == 'novel_mapped_tx', 'name']
)

## Official Genes

In [12]:
# Compare the official-annotation names that passed the distance filter in the
# linked table against those from the original table.
official = set_comparison_table(
    linked_wk_official_genes,
    orig_wk_official_genes,
    a_name="linked",
    b_name="original",
)

In [13]:
# Number of official-annotation names falling in each comparison.
official['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,11
original NOT linked,13
linked OR original,24
one OR other NOT both,24


In [14]:
# Sorted official-annotation names, one column per comparison.
official['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,GFUI002898,GFUI002713,GFUI002713,GFUI002713
1,,GFUI008017,GFUI007441,GFUI002898,GFUI002898
2,,GFUI008024,GFUI008496,GFUI007441,GFUI007441
3,,GFUI017734,GFUI011907,GFUI008017,GFUI008017
4,,GFUI021833,GFUI026687,GFUI008024,GFUI008024
5,,GFUI021834,GFUI026699,GFUI008496,GFUI008496
6,,GFUI021857,GFUI029720,GFUI011907,GFUI011907
7,,GFUI024278,GFUI030741,GFUI017734,GFUI017734
8,,GFUI028910,GFUI030748,GFUI021833,GFUI021833
9,,GFUI030827,GFUI035775,GFUI021834,GFUI021834


## Novel Tx

In [15]:
# Compare the novel-transcript names that passed the distance filter in the
# linked table against those from the original table.
novel = set_comparison_table(
    linked_wk_novel_tx,
    orig_wk_novel_tx,
    a_name="linked",
    b_name="original",
)

In [16]:
# Number of novel-transcript names falling in each comparison.
novel['counts']

Unnamed: 0,count
linked AND original,0
linked NOT original,19
original NOT linked,20
linked OR original,39
one OR other NOT both,39


In [17]:
# Sorted novel-transcript names, one column per comparison.
novel['genes']

Unnamed: 0,linked AND original,linked NOT original,original NOT linked,linked OR original,one OR other NOT both
0,,TCONS_00031593,TCONS_00012154,TCONS_00012154,TCONS_00012154
1,,TCONS_00034477,TCONS_00032167,TCONS_00031593,TCONS_00031593
2,,TCONS_00034478,TCONS_00032172,TCONS_00032167,TCONS_00032167
3,,TCONS_00034479,TCONS_00032173,TCONS_00032172,TCONS_00032172
4,,TCONS_00034480,TCONS_00032174,TCONS_00032173,TCONS_00032173
5,,TCONS_00034481,TCONS_00032175,TCONS_00032174,TCONS_00032174
6,,TCONS_00034482,TCONS_00032176,TCONS_00032175,TCONS_00032175
7,,TCONS_00034483,TCONS_00032177,TCONS_00032176,TCONS_00032176
8,,TCONS_00034484,TCONS_00032877,TCONS_00032177,TCONS_00032177
9,,TCONS_00034485,TCONS_00032910,TCONS_00032877,TCONS_00032877
