In [1]:
%run ../../shared_setup.ipynb

docker image cggh/biipy:v1.6.0


In [5]:
def tabulate(f):
    """Decorator that turns a row-producing function into a lazy table class.

    The decorated name becomes a subclass of ``etl.Table``. Calling it with
    arguments does not produce any rows; it only records the arguments. Rows
    are produced by calling ``f`` with those arguments each time the table is
    iterated, so the resulting table can be traversed more than once.

    Parameters
    ----------
    f : callable
        Function returning an iterable of rows (header row first). It may be
        a generator function or return any other iterable such as a list.

    Returns
    -------
    type
        A subclass of ``etl.Table`` whose constructor accepts the same
        positional and keyword arguments as ``f``.
    """
    class Tabulated(etl.Table):
        """Table whose rows come from calling the wrapped function."""

        def __init__(self, *args, **kwargs):
            # Only remember the call arguments; rows are generated on demand.
            self.args = args
            self.kwargs = kwargs

        def __iter__(self):
            # Call f afresh on every traversal. iter() guarantees an iterator
            # is returned even when f returns a list/tuple rather than a
            # generator; returning a plain list here would raise TypeError.
            return iter(f(*self.args, **self.kwargs))

    return Tabulated
        

In [6]:
@tabulate
def tabulate_test(n):
    """Toy row source for trying out ``tabulate``.

    Yields the header ``('foo', 'bar')`` followed by ``n`` data rows
    ``(i, 'a' * i)`` for ``i`` in ``0 .. n-1``.
    """
    yield ('foo', 'bar')
    yield from ((idx, 'a' * idx) for idx in range(n))


In [16]:
# Build the toy table with n=6; no rows are generated until it is iterated.
tbl = tabulate_test(6)
# Bare name as the last expression so the notebook renders the table.
tbl

0|foo,1|bar
0,
1,a
2,aa
3,aaa
4,aaaa


In [31]:
@tabulate
def tabulate_core_windows(window_size=5000):
    """Tabulate fixed-size windows tiled across every 'Core' region.

    Reads the rows of ``tbl_regions_1b`` whose ``region_type`` equals
    ``'Core'`` and, for each one, emits windows starting at ``region_start``
    and stepping by ``window_size`` while the start is below ``region_stop``.

    Yields
    ------
    tuple
        Header ``('chrom', 'start', 'stop')`` followed by one
        ``(chrom, start, stop)`` row per window, where
        ``stop = start + window_size - 1``.
    """
    yield 'chrom', 'start', 'stop'
    core_regions = tbl_regions_1b.eq('region_type', 'Core')
    for region in core_regions.records():
        window_starts = range(region.region_start, region.region_stop, window_size)
        for window_start in window_starts:
            # NOTE(review): the final window's stop is not clipped, so it can
            # extend past region_stop - confirm this is intended.
            window_stop = window_start + window_size - 1
            yield region.region_chrom, window_start, window_stop
    

In [32]:
# Core-region windows using the default window_size of 5000.
tbl_5kb = tabulate_core_windows()
# Bare name as the last expression so the notebook renders the table.
tbl_5kb

0|chrom,1|start,2|stop
Pf3D7_01_v3,92901,97900
Pf3D7_01_v3,97901,102900
Pf3D7_01_v3,102901,107900
Pf3D7_01_v3,107901,112900
Pf3D7_01_v3,112901,117900


In [3]:
# Load the table stored in tbl_co.pickle under PUBLIC_DIR.
# NOTE(review): unpickling executes code embedded in the file - only load
# pickles from a trusted location.
tbl_co = etl.frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
display_with_nrows(tbl_co, caption='CO events')

0|sample,1|chrom,2|co_pos_mid,3|co_pos_min,4|co_pos_max,5|co_pos_range,6|cross,7|co_from_parent,8|co_to_parent
B1SD/PG0015-C/ERR019044,b'Pf3D7_01_v3',145052,144877,145227,350,hb3_dd2,hb3,dd2
GC03/PG0021-C/ERR015447,b'Pf3D7_01_v3',163584,163145,164024,879,hb3_dd2,dd2,hb3
XF12/PG0102-C/ERR029143,b'Pf3D7_01_v3',206769,205803,207736,1933,7g8_gb4,gb4,7g8
7C159/PG0040-Cx/ERR107475,b'Pf3D7_01_v3',206905,206074,207736,1662,hb3_dd2,hb3,dd2
CH3_61/PG0033-Cx/ERR175544,b'Pf3D7_01_v3',206905,206074,207736,1662,hb3_dd2,dd2,hb3


In [4]:
tbl_nco = (
    etl
    .frompickle(os.path.join(PUBLIC_DIR, 'tbl_conversion_tracts.pickle'))
    # keep only the rows whose tract_type is 'NCO'
    .eq('tract_type', 'NCO')
    # NOTE: a filter on very long tracts (.lt('tract_length_min', 18000))
    # used to sit here but is currently disabled, so long outliers are kept.
)
display_with_nrows(tbl_nco, caption='NCO tracts')

0|sample,1|cross,2|chrom,3|tract_start_min,4|tract_start_mid,5|tract_start_max,6|tract_stop_min,7|tract_stop_mid,8|tract_stop_max,9|tract_length_min,10|tract_length_mid,11|tract_length_max,12|tract_support,13|tract_is_complex,14|tract_blocks,15|co_pos_mid,16|co_pos_min,17|co_pos_max,18|co_pos_range,19|tract_type
1BB5/PG0023-C/ERR015449,hb3_dd2,b'Pf3D7_02_v3',551191,551710,552230,553769,554132,554496,1539,2422,3305,5,False,1,,,,,NCO
1BB5/PG0023-C/ERR015449,hb3_dd2,b'Pf3D7_08_v3',1293542,1294381,1295221,1296788,1297035,1297283,1567,2654,3741,4,False,1,,,,,NCO
1BB5/PG0023-C/ERR015449,hb3_dd2,b'Pf3D7_09_v3',950476,951138,951801,954256,954735,955215,2455,3597,4739,4,True,3,,,,,NCO
1BB5/PG0023-C/ERR015449,hb3_dd2,b'Pf3D7_12_v3',331487,331820,332154,336886,337497,338108,4732,5677,6621,7,False,1,,,,,NCO
1BB5/PG0023-C/ERR015449,hb3_dd2,b'Pf3D7_13_v3',756601,756957,757313,760183,761269,762355,2870,4312,5754,9,False,1,,,,,NCO


In [22]:
# Stack CO events and NCO tracts into one table with a common
# (chrom, start, stop, type) layout, ordered by chromosome then start.
tbl_recom = (
    etl
    .cat(
        # CO rows: interval runs from co_pos_min to co_pos_max
        (
            tbl_co
            .cut('chrom', 'co_pos_min', 'co_pos_max')
            .rename({'co_pos_min': 'start', 'co_pos_max': 'stop'})
            .addfield('type', 'CO')
        ),
        # NCO rows: interval runs from tract_start_min to tract_stop_max
        (
            tbl_nco
            .cut('chrom', 'tract_start_min', 'tract_stop_max')
            .rename({'tract_start_min': 'start', 'tract_stop_max': 'stop'})
            .addfield('type', 'NCO')
        ),
    )
    .sort(key=('chrom', 'start'))
)
tbl_recom

0|chrom,1|start,2|stop,3|type
b'Pf3D7_01_v3',127692,131098,NCO
b'Pf3D7_01_v3',144877,145227,CO
b'Pf3D7_01_v3',163145,164024,CO
b'Pf3D7_01_v3',175754,177355,NCO
b'Pf3D7_01_v3',205803,207736,CO


## Sandbox

In [5]:
# Load the combined callsets via the shared load_callsets helper.
# Per the log output of this cell, the filters drop variants, then calls,
# then the listed samples as each callset file is loaded.
combined = load_callsets(COMBINED_CALLSET_FN_TEMPLATE, 
                         variant_filter='FILTER_PASS',
                         call_filter=combined_conf_calls, 
                         sample_exclusions=excessive_recomb_samples)

2016-03-14 00:27:18.425202 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/3d7_hb3.combined.final.npz
2016-03-14 00:27:18.767055 :: filter variants: excluding 157 (0.4%) retaining 42087 (99.6%) of 42244 variants
2016-03-14 00:27:18.863999 :: filter calls: excluding 2439 (0.3%) retaining 881388 (99.7%) of 883827 calls
2016-03-14 00:27:18.866810 :: loading /data/plasmodium/pfalciparum/pf-crosses/data/public/20141022/hb3_dd2.combined.final.npz
2016-03-14 00:27:19.277826 :: filter variants: excluding 450 (1.2%) retaining 36461 (98.8%) of 36911 variants
2016-03-14 00:27:19.353628 :: filter samples: excluding ['SC01/PG0025-C/ERR019045'] including ['HB3/PG0004-CW/ERR012788', 'DD2/PG0008-CW/ERR012840', '1BB5/PG0023-C/ERR015449', '3BA6/PG0022-Cx/ERR126027', '3BD5/PG0024-C/ERR019053', '7C101/PG0074-C/ERR019048', '7C111/PG0038-C/ERR015457', '7C12/PG0035-Cx/ERR037704', '7C126/PG0047-C/ERR015452', '7C140/PG0039-C/ERR015454', '7C159/PG0040-Cx/ERR107475', '7C16/PG0036-C/ERR015

In [30]:
# Variants entry of the '3d7_hb3' callset loaded into `combined`.
variants = combined['3d7_hb3']['variants']

In [32]:
# POS values for the variants whose CHROM equals b'Pf3D7_01_v3'
# (presumably a numpy boolean-mask selection - TODO confirm the array type).
pos = variants['POS'][variants['CHROM'] == b'Pf3D7_01_v3']
# Show how many positions were selected.
pos.shape

(1170,)

In [45]:
tbl_intervals = (
    etl
    # one row per pair of consecutive entries in `pos`: (start, stop, length)
    .fromcolumns(
        (pos[:-1], pos[1:], np.diff(pos)),
        header=('start', 'stop', 'length'),
    )
    .addfield('chrom', b'Pf3D7_01_v3', index=0)
    # attach every recombination event overlapping the interval; intervals
    # without a match keep a single row whose recom_* fields are None
    .intervalleftjoin(
        tbl_recom.prefixheader('recom_'),
        lkey='chrom',
        lstart='start',
        lstop='stop',
        rkey='recom_chrom',
        rstart='recom_start',
        rstop='recom_stop',
        include_stop=True,
    )
    # collapse back to one row per interval, collecting the event types
    .aggregate(
        key=('chrom', 'start', 'stop', 'length'),
        aggregation=list,
        value='recom_type',
    )
    .rename('value', 'event_types')
    # an unmatched interval aggregates to [None]; normalise that to []
    .convert('event_types', lambda types: types if types != [None] else [])
    .addfield('n_events', lambda rec: len(rec.event_types))
)
# show up to 50 of the intervals that overlap at least one event
tbl_intervals.gt('n_events', 0).display(50)

0|chrom,1|start,2|stop,3|length,4|event_types,5|n_events
b'Pf3D7_01_v3',127256,127692,436,['NCO'],1
b'Pf3D7_01_v3',127692,127725,33,['NCO'],1
b'Pf3D7_01_v3',127725,128164,439,['NCO'],1
b'Pf3D7_01_v3',128164,128186,22,['NCO'],1
b'Pf3D7_01_v3',128186,128543,357,['NCO'],1
b'Pf3D7_01_v3',128543,128805,262,['NCO'],1
b'Pf3D7_01_v3',128805,128831,26,['NCO'],1
b'Pf3D7_01_v3',128831,128832,1,['NCO'],1
b'Pf3D7_01_v3',128832,128841,9,['NCO'],1
b'Pf3D7_01_v3',128841,128883,42,['NCO'],1
