In [1]:
%run ../../shared_setup.ipynb

docker image cggh/biipy:v1.6.0


In [2]:
fasta

<pyfasta.fasta.Fasta at 0x7fcc605c0a90>

In [5]:
sorted(fasta.keys())

['Pf3D7_01_v3',
 'Pf3D7_02_v3',
 'Pf3D7_03_v3',
 'Pf3D7_04_v3',
 'Pf3D7_05_v3',
 'Pf3D7_06_v3',
 'Pf3D7_07_v3',
 'Pf3D7_08_v3',
 'Pf3D7_09_v3',
 'Pf3D7_10_v3',
 'Pf3D7_11_v3',
 'Pf3D7_12_v3',
 'Pf3D7_13_v3',
 'Pf3D7_14_v3']

In [7]:
fasta['Pf3D7_01_v3']

NpyFastaRecord(0..640851)

In [58]:
class TandemRepeatTable(etl.Table):
    """Lazily generated table of perfect tandem repeats on one chromosome.

    The chromosome sequence is read from the module-level ``fasta`` object
    each time the table is iterated. Scanning proceeds left to right: at each
    position the next ``unit_length`` bases are taken as the candidate repeat
    unit and the number of immediately following identical copies is counted.

    Parameters
    ----------
    chrom : str
        Key of the chromosome in ``fasta``.
    unit_length : int
        Length of the repeat unit in bases; must be at least 1.
    min_tract_length : int
        Reporting threshold. Note that the comparison is strict: a tract is
        reported only when its length is *greater than* this value.

    Rows
    ----
    ``chrom``, ``pos`` (1-based start of the tract), ``ru`` (repeat unit),
    ``nu`` (number of units), ``unit_length``, ``tract_length`` and
    ``context`` (the tract plus up to one unit length of flanking sequence on
    each side).

    Raises
    ------
    ValueError
        If ``unit_length`` is less than 1.
    """

    def __init__(self, chrom, unit_length, min_tract_length):
        # with a unit length of zero the scan would compare empty slices
        # against each other and never terminate
        if unit_length < 1:
            raise ValueError('unit_length must be at least 1, got %r' % (unit_length,))
        self.chrom = chrom
        self.unit_length = unit_length
        self.min_tract_length = min_tract_length

    def __iter__(self):
        chrom = self.chrom
        unit_length = self.unit_length
        min_tract_length = self.min_tract_length
        log('generating tandem repeats for chromosome', chrom, 'unit length', unit_length)

        # header row
        yield ('chrom', 'pos', 'ru', 'nu', 'unit_length', 'tract_length', 'context')

        # obtain sequence
        seq = fasta[chrom]
        seq_length = len(seq)

        # begin iteration; stop once fewer than unit_length bases remain, so
        # that a truncated slice at the end is never treated as a full unit
        i = 0
        while i + unit_length <= seq_length:

            # current repeat unit
            ru = seq[i:i+unit_length]

            # number of repeat units found
            nu = 1

            # next potential unit to compare
            next_unit = seq[i+(nu*unit_length):i+((nu+1)*unit_length)]

            # discover units; a short or empty slice past the end of the
            # sequence never equals a full-length unit, so this terminates
            while next_unit == ru:
                nu += 1
                next_unit = seq[i+(nu*unit_length):i+((nu+1)*unit_length)]

            # compute tract length
            tract_length = nu * unit_length

            # decide if discovered (strictly longer than the threshold)
            if tract_length > min_tract_length:
                # clamp the left flank at the sequence start: a negative slice
                # start would be interpreted as an offset from the end
                context_start = max(0, i - unit_length)
                context = seq[context_start:i+tract_length+unit_length]
                yield (chrom, i+1, ru, nu, unit_length, tract_length, context)
                # skip past the whole tract so it is reported only once
                i += tract_length
            else:
                i += 1
        

In [59]:
# single-base repeats on chromosome 1 whose tract is longer than 6 bases
tbl1 = TandemRepeatTable(chrom='Pf3D7_01_v3', unit_length=1, min_tract_length=6)
tbl1

2016-03-11 23:57:25.347237 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 1
2016-03-11 23:57:25.469907 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 1


0|chrom,1|pos,2|ru,3|nu,4|unit_length,5|tract_length,6|context
Pf3D7_01_v3,4179,g,8,1,8,aggggggggt
Pf3D7_01_v3,4337,t,18,1,18,attttttttttttttttttc
Pf3D7_01_v3,4411,t,8,1,8,gtttttttta
Pf3D7_01_v3,4633,t,7,1,7,attttttta
Pf3D7_01_v3,5457,g,8,1,8,aggggggggt


In [60]:
# two-base repeats on chromosome 1 whose tract is longer than 9 bases
tbl2 = TandemRepeatTable(chrom='Pf3D7_01_v3', unit_length=2, min_tract_length=9)
tbl2

2016-03-11 23:57:32.072973 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 2
2016-03-11 23:57:32.422906 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 2


0|chrom,1|pos,2|ru,3|nu,4|unit_length,5|tract_length,6|context
Pf3D7_01_v3,4213,tg,6,2,12,tatgtgtgtgtgtgaa
Pf3D7_01_v3,4337,tt,9,2,18,aattttttttttttttttttcg
Pf3D7_01_v3,27416,aa,15,2,30,tcaaaaaaaaaaaaaaaaaaaaaaaaaaaaaatt
Pf3D7_01_v3,27499,aa,5,2,10,agaaaaaaaaaaga
Pf3D7_01_v3,27510,aa,7,2,14,agaaaaaaaaaaaaaaat


In [61]:
tbl_regions_1b

0|region_chrom,1|region_start,2|region_stop,3|region_type,4|region_size
Pf3D7_01_v3,1,27336,SubtelomericRepeat,27336
Pf3D7_01_v3,27337,92900,SubtelomericHypervariable,65564
Pf3D7_01_v3,92901,457931,Core,365031
Pf3D7_01_v3,457932,460311,Centromere,2380
Pf3D7_01_v3,460312,575900,Core,115589


In [62]:
def tabulate_tr(chrom, cache_dir='/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012'):
    """Return the tandem repeat table for one chromosome, using a pickle cache.

    On the first call for a chromosome the table is built by scanning for
    repeat units of length 1 to 10, keeping for each (chrom, pos) the row with
    the smallest unit length, and annotating each row with the ``region_type``
    of the region in ``tbl_regions_1b`` that contains its position. The result
    is pickled to ``<cache_dir>/<chrom>.tr.pickle``. Every call returns a
    table backed by that pickle file.

    Parameters
    ----------
    chrom : str
        Chromosome name, used both for the scan and for the cache file name.
    cache_dir : str, optional
        Directory holding the cache files. Defaults to the location that was
        previously hard-coded.

    Returns
    -------
    A petl table read from the cache file.
    """
    fn = os.path.join(cache_dir, '%s.tr.pickle' % chrom)
    if not os.path.exists(fn):
        # (unit_length, min_tract_length) pairs to scan; TandemRepeatTable
        # reports a tract only when it is strictly longer than the second value
        scan_params = (
            (1, 6),
            (2, 9),
            (3, 11),
            (4, 13),
            (5, 14),
            (6, 16),
            (7, 18),
            (8, 18),
            (9, 18),
            (10, 18),
        )
        tbl_tr = (
            etl
            .cat(*[
                TandemRepeatTable(chrom, unit_length, min_tract_length)
                for unit_length, min_tract_length in scan_params
            ])
            .groupselectmin(key=('chrom', 'pos'), value='unit_length')
            .intervalleftjoin(tbl_regions_1b, lkey='chrom', lstart='pos', lstop='pos', include_stop=True,
                              rkey='region_chrom', rstart='region_start', rstop='region_stop')
            .cutout('region_chrom', 'region_start', 'region_stop', 'region_size')
        )
        # write to a temporary name and move it into place only on success, so
        # an interrupted run cannot leave a truncated file that the existence
        # check above would later mistake for a complete cache
        tmp_fn = fn + '.tmp'
        try:
            tbl_tr.topickle(tmp_fn)
            os.replace(tmp_fn, fn)
        finally:
            # only present here if pickling or the move failed part-way
            if os.path.exists(tmp_fn):
                os.remove(tmp_fn)
    # NOTE(review): this unpickles the cache file, which can execute arbitrary
    # code; cache_dir must only ever point at trusted locations
    tbl_tr = etl.frompickle(fn)
    return tbl_tr
    

In [63]:
# build the combined repeat table for chromosome 1, or load it from its cache file
tbl = tabulate_tr(chrom='Pf3D7_01_v3')
tbl

0|chrom,1|pos,2|ru,3|nu,4|unit_length,5|tract_length,6|context,7|region_type
Pf3D7_01_v3,13,cctaaac,3,7,21,cctaaaacctaaaccctaaaccctaaaccctgaac,SubtelomericRepeat
Pf3D7_01_v3,285,cctaaac,4,7,28,cctaaaacctaaaccctaaaccctaaaccctaaaccctgaac,SubtelomericRepeat
Pf3D7_01_v3,322,aaaccta,3,7,21,tgaacctaaacctaaaacctaaaacctaaaaccct,SubtelomericRepeat
Pf3D7_01_v3,421,cttacttact,2,10,20,cttagtcttacttacttactcttacttacttactcttatc,SubtelomericRepeat
Pf3D7_01_v3,477,cttacttact,2,10,20,cttagtcttacttacttactcttacttacttactcttatc,SubtelomericRepeat


In [64]:
CHROMOSOMES

[b'Pf3D7_01_v3',
 b'Pf3D7_02_v3',
 b'Pf3D7_03_v3',
 b'Pf3D7_04_v3',
 b'Pf3D7_05_v3',
 b'Pf3D7_06_v3',
 b'Pf3D7_07_v3',
 b'Pf3D7_08_v3',
 b'Pf3D7_09_v3',
 b'Pf3D7_10_v3',
 b'Pf3D7_11_v3',
 b'Pf3D7_12_v3',
 b'Pf3D7_13_v3',
 b'Pf3D7_14_v3']

In [65]:
# whole-genome table: CHROMOSOMES holds bytes names, so decode each one to str
# before building (or loading) its per-chromosome table, then concatenate them
tbl_tr_wg = etl.cat(*(tabulate_tr(chrom.decode('ascii')) for chrom in CHROMOSOMES))

2016-03-11 23:57:48.282326 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 1
2016-03-11 23:57:48.283699 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 2
2016-03-11 23:57:48.284878 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 3
2016-03-11 23:57:48.285998 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 4
2016-03-11 23:57:48.287093 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 5
2016-03-11 23:57:48.288331 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 6
2016-03-11 23:57:48.289332 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 7
2016-03-11 23:57:48.290305 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 8
2016-03-11 23:57:48.291258 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 9
2016-03-11 23:57:48.292083 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 10
2016-03-11 23:59:38.755236 ::

In [66]:
tbl_tr_wg

0|chrom,1|pos,2|ru,3|nu,4|unit_length,5|tract_length,6|context,7|region_type
Pf3D7_01_v3,13,cctaaac,3,7,21,cctaaaacctaaaccctaaaccctaaaccctgaac,SubtelomericRepeat
Pf3D7_01_v3,285,cctaaac,4,7,28,cctaaaacctaaaccctaaaccctaaaccctaaaccctgaac,SubtelomericRepeat
Pf3D7_01_v3,322,aaaccta,3,7,21,tgaacctaaacctaaaacctaaaacctaaaaccct,SubtelomericRepeat
Pf3D7_01_v3,421,cttacttact,2,10,20,cttagtcttacttacttactcttacttacttactcttatc,SubtelomericRepeat
Pf3D7_01_v3,477,cttacttact,2,10,20,cttagtcttacttacttactcttacttacttactcttatc,SubtelomericRepeat


In [67]:
# keep only the repeats whose region_type is exactly 'Core'
tbl_tr_wg.eq('region_type', 'Core')

0|chrom,1|pos,2|ru,3|nu,4|unit_length,5|tract_length,6|context,7|region_type
Pf3D7_01_v3,93098,a,7,1,7,caaaaaaat,Core
Pf3D7_01_v3,93902,at,19,2,38,taatatatatatatatatatatatatatatatatatatataa,Core
Pf3D7_01_v3,94016,a,7,1,7,taaaaaaat,Core
Pf3D7_01_v3,94258,at,12,2,24,aaatatatatatatatatatatatataa,Core
Pf3D7_01_v3,94433,taa,4,3,12,ttttaataataataatat,Core
