In [1]:
%run ../../shared_setup.ipynb

docker image cggh/biipy:v1.6.0


In [2]:
def tabulate(f):
    """Decorator turning a row-producing function into a table class.

    The decorated name becomes a subclass of ``etl.Table``. Instantiating it
    only records the call arguments; ``f`` is invoked afresh with those
    arguments every time the table is iterated, so the table can be iterated
    any number of times.

    Parameters
    ----------
    f : callable
        Function returning an iterable of rows (typically a generator
        function whose first yielded row is the header).

    Returns
    -------
    type
        A subclass of ``etl.Table`` whose constructor accepts the same
        arguments as ``f``.
    """
    class Tabulated(etl.Table):
        """Table whose rows come from calling the wrapped function."""

        def __init__(self, *args, **kwargs):
            # Nothing is computed here: just remember how to call ``f``.
            self.args = args
            self.kwargs = kwargs

        def __iter__(self):
            # ``__iter__`` must return an iterator. ``iter()`` leaves a
            # generator untouched but also lets ``f`` return a plain iterable
            # such as a list without iteration raising TypeError.
            return iter(f(*self.args, **self.kwargs))

    # Carry the wrapped function's identity over to the generated class so
    # that help() and tracebacks point at the decorated function.
    Tabulated.__name__ = getattr(f, '__name__', Tabulated.__name__)
    Tabulated.__qualname__ = getattr(f, '__qualname__', Tabulated.__qualname__)
    if getattr(f, '__doc__', None):
        Tabulated.__doc__ = f.__doc__
    return Tabulated
        

In [3]:
@tabulate
def tabulate_core_windows(window_size):
    """Tile every 'Core' region of ``tbl_regions_1b`` with fixed-size windows.

    Yields a ``('chrom', 'start', 'stop')`` header row followed by one row per
    window. Window starts step through each region from ``region_start`` in
    increments of ``window_size`` (``region_stop`` itself is never used as a
    start), and each window's stop is ``start + window_size - 1``.

    Note: the stop is not clipped to ``region_stop``, so the last window of a
    region may extend past the end of that region.
    """
    yield 'chrom', 'start', 'stop'
    core_regions = tbl_regions_1b.eq('region_type', 'Core')
    for region in core_regions.records():
        chrom = region.region_chrom
        for window_start in range(region.region_start, region.region_stop, window_size):
            window_stop = window_start + window_size - 1
            yield chrom, window_start, window_stop
    

In [4]:
# Load the CO event table from the pickle in PUBLIC_DIR.
tbl_co = etl.frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
# 'chrom' values are bytes-like in the pickle; decode them to str so they can
# be compared with str chromosome names.
tbl_co = tbl_co.convert('chrom', lambda v: str(v, 'ascii'))
display_with_nrows(tbl_co, caption='CO events')

0|sample,1|chrom,2|co_pos_mid,3|co_pos_min,4|co_pos_max,5|co_pos_range,6|cross,7|co_from_parent,8|co_to_parent
B1SD/PG0015-C/ERR019044,Pf3D7_01_v3,145052,144877,145227,350,hb3_dd2,hb3,dd2
GC03/PG0021-C/ERR015447,Pf3D7_01_v3,163584,163145,164024,879,hb3_dd2,dd2,hb3
XF12/PG0102-C/ERR029143,Pf3D7_01_v3,206769,205803,207736,1933,7g8_gb4,gb4,7g8
7C159/PG0040-Cx/ERR107475,Pf3D7_01_v3,206905,206074,207736,1662,hb3_dd2,hb3,dd2
CH3_61/PG0033-Cx/ERR175544,Pf3D7_01_v3,206905,206074,207736,1662,hb3_dd2,dd2,hb3


In [5]:
# Size of the windows used to tile the core regions, in the same coordinate
# units as region_start / region_stop.
CORE_WINDOW_SIZE = 5000
tbl_windows = tabulate_core_windows(CORE_WINDOW_SIZE)
tbl_windows

0|chrom,1|start,2|stop
Pf3D7_01_v3,92901,97900
Pf3D7_01_v3,97901,102900
Pf3D7_01_v3,102901,107900
Pf3D7_01_v3,107901,112900
Pf3D7_01_v3,112901,117900


In [10]:
# Count CO events per window, broken down by cross.
tbl_windows_co = (
    tbl_windows
    # Left-join each window to the CO events on the same chromosome, matching
    # on the interval [co_pos_min, co_pos_max] (stop positions inclusive).
    .intervalleftjoin(
        tbl_co,
        lkey='chrom', lstart='start', lstop='stop',
        rkey='chrom', rstart='co_pos_min', rstop='co_pos_max',
        include_stop=True,
    )
    # Drop the field at index 4 -- presumably the duplicate 'chrom' column
    # contributed by tbl_co; verify if either table's layout changes.
    .cutout(4)
    # Collapse to one row per window, tallying the 'cross' values of the
    # joined COs; None values (no joined CO) are skipped.
    .aggregate(
        key=('chrom', 'start', 'stop'),
        aggregation=lambda vals: collections.Counter(v for v in vals if v is not None),
        value='cross',
    )
    .rename('value', 'co_count')
    # One integer column per cross; a Counter returns 0 for a missing key.
    .addfield('co_count_3d7_hb3', lambda row: row.co_count['3d7_hb3'])
    .addfield('co_count_hb3_dd2', lambda row: row.co_count['hb3_dd2'])
    .addfield('co_count_7g8_gb4', lambda row: row.co_count['7g8_gb4'])
)
tbl_windows_co

0|chrom,1|start,2|stop,3|co_count,4|co_count_3d7_hb3,5|co_count_hb3_dd2,6|co_count_7g8_gb4
Pf3D7_01_v3,92901,97900,Counter(),0,0,0
Pf3D7_01_v3,97901,102900,Counter(),0,0,0
Pf3D7_01_v3,102901,107900,Counter(),0,0,0
Pf3D7_01_v3,107901,112900,Counter(),0,0,0
Pf3D7_01_v3,112901,117900,Counter(),0,0,0


In [11]:
# Distribution of the per-window CO count for the 3d7_hb3 cross.
tbl_windows_co.valuecounts('co_count_3d7_hb3').displayall()

0|co_count_3d7_hb3,1|count,2|frequency
0,3811,0.912814371257485
1,314,0.0752095808383233
2,50,0.0119760479041916


In [12]:
# Distribution of the per-window CO count for the hb3_dd2 cross.
tbl_windows_co.valuecounts('co_count_hb3_dd2').displayall()

0|co_count_hb3_dd2,1|count,2|frequency
0,3464,0.8297005988023952
1,603,0.1444311377245509
2,101,0.024191616766467
3,7,0.0016766467065868


In [13]:
# Distribution of the per-window CO count for the 7g8_gb4 cross.
tbl_windows_co.valuecounts('co_count_7g8_gb4').displayall()

0|co_count_7g8_gb4,1|count,2|frequency
0,3688,0.8833532934131737
1,434,0.1039520958083832
2,48,0.0114970059880239
3,4,0.0009580838323353
4,1,0.0002395209580838


In [16]:
# Windows in which at least one cross has two or more COs.
tbl_hotspots = (
    tbl_windows_co
    # n_hot: number of crosses with >= 2 COs in the window.
    .addfield('n_hot', lambda row: sum(
        count >= 2
        for count in (row.co_count_3d7_hb3, row.co_count_hb3_dd2, row.co_count_7g8_gb4)
    ))
    # Keep only windows where some cross reaches that count.
    .select(lambda row: row.n_hot > 0)
)
tbl_hotspots

0|chrom,1|start,2|stop,3|co_count,4|co_count_3d7_hb3,5|co_count_hb3_dd2,6|co_count_7g8_gb4,7|n_hot
Pf3D7_01_v3,202901,207900,"Counter({'hb3_dd2': 2, '7g8_gb4': 1})",0,2,1,1
Pf3D7_01_v3,322901,327900,"Counter({'3d7_hb3': 2, 'hb3_dd2': 1})",2,1,0,1
Pf3D7_01_v3,402901,407900,"Counter({'7g8_gb4': 2, 'hb3_dd2': 1})",0,1,2,1
Pf3D7_01_v3,550312,555311,Counter({'7g8_gb4': 2}),0,0,2,1
Pf3D7_02_v3,205801,210800,Counter({'3d7_hb3': 2}),2,0,0,1


In [17]:
# How many windows in tbl_hotspots have each n_hot value (number of crosses with >= 2 COs).
tbl_hotspots.valuecounts('n_hot')

0|n_hot,1|count,2|frequency
1,197,0.965686274509804
2,7,0.034313725490196


In [18]:
# Total number of rows in tbl_hotspots.
tbl_hotspots.nrows()

204