In [1]:
%run ../../shared_setup.ipynb

docker image cggh/biipy:v1.6.0


In [2]:
fasta

<pyfasta.fasta.Fasta at 0x7f8ba809beb8>

In [3]:
sorted(fasta.keys())

['Pf3D7_01_v3',
 'Pf3D7_02_v3',
 'Pf3D7_03_v3',
 'Pf3D7_04_v3',
 'Pf3D7_05_v3',
 'Pf3D7_06_v3',
 'Pf3D7_07_v3',
 'Pf3D7_08_v3',
 'Pf3D7_09_v3',
 'Pf3D7_10_v3',
 'Pf3D7_11_v3',
 'Pf3D7_12_v3',
 'Pf3D7_13_v3',
 'Pf3D7_14_v3']

In [4]:
fasta['Pf3D7_01_v3']

NpyFastaRecord(0..640851)

In [5]:
class TandemRepeatTable(etl.Table):
    """Table of exact tandem repeats of one fixed unit length on one chromosome.

    Iterating the table scans ``fasta[chrom]`` from left to right and yields a
    header row followed by one row per repeat found. ``fasta`` and ``log`` are
    taken from the notebook namespace. The scan is re-run on every iteration of
    the table; nothing is cached here.

    Row fields:

    - ``chrom``: the chromosome name passed to the constructor.
    - ``start``: position of the first base of the tract (0-based index + 1).
    - ``stop``: ``start + tract_length``, i.e. one past the last base of the
      tract in the same coordinate system as ``start``.
    - ``repeat_unit``: the repeated unit, as sliced from the sequence.
    - ``n_units``: number of consecutive whole copies of the unit (always > 1).
    - ``unit_length``: length of the repeat unit.
    - ``last_unit_length``: number of leading bases of the unit that still
      match immediately after the last whole copy (a trailing partial copy).
    - ``tract_length``: ``n_units * unit_length + last_unit_length``.

    Units that are themselves a repetition of a shorter unit (e.g. ``atat`` for
    unit length 4) are skipped, so each tract is reported under its primitive
    unit length only.

    Parameters
    ----------
    chrom : key into ``fasta`` identifying the sequence to scan.
    unit_length : int
        Length of the repeat unit to search for; must be at least 1.

    Raises
    ------
    ValueError
        If ``unit_length`` is less than 1 (an empty unit would always equal the
        next empty unit and the scan would never terminate).
    """

    def __init__(self, chrom, unit_length):
        if unit_length < 1:
            raise ValueError('unit_length must be >= 1, got %r' % (unit_length,))
        self.chrom = chrom
        self.unit_length = unit_length

    def __iter__(self):
        chrom = self.chrom
        unit_length = self.unit_length

        # header row
        yield ('chrom', 'start', 'stop', 'repeat_unit', 'n_units', 'unit_length', 'last_unit_length', 'tract_length')

        log('generating tandem repeats for chromosome', chrom, 'unit length', unit_length)

        # obtain sequence
        seq = fasta[chrom]
        seq_length = len(seq)

        # proper divisors of the unit length; these are the only sub-unit
        # lengths that could tile the unit exactly (loop-invariant, so computed once)
        divisors = [n for n in range(1, unit_length) if unit_length % n == 0]

        # begin iteration
        i = 0
        while i < seq_length:

            # current repeat unit
            ru = seq[i:i+unit_length]

            # decide if repeat unit is not primitive, i.e. it is itself made of
            # identical copies of a shorter unit
            primitive = True
            for n in divisors:
                subs = set([ru[k:k+n] for k in range(0, unit_length, n)])
                if len(subs) == 1:
                    primitive = False
                    break

            if not primitive:
                i += 1
                continue

            # number of repeat units found
            nu = 1

            # next potential unit to compare
            next_unit = seq[i+(nu*unit_length):i+((nu+1)*unit_length)]

            # discover whole units
            while next_unit == ru:
                nu += 1
                next_unit = seq[i+(nu*unit_length):i+((nu+1)*unit_length)]

            # discovered a repeat
            if nu > 1:

                # compute length of last (partial) unit: count the leading bases
                # of next_unit that still match the repeat unit. Counting from
                # zero keeps the value defined when next_unit is empty or is a
                # fully matching truncated slice at the end of the sequence,
                # where no mismatching base is ever encountered.
                last_unit_length = 0
                for a, b in zip(ru, next_unit):
                    if a != b:
                        break
                    last_unit_length += 1

                # compute tract length
                tract_length = (nu * unit_length) + last_unit_length

                yield (chrom, i+1, i+1+tract_length, ru, nu, unit_length, last_unit_length, tract_length)

                # resume one base past the start of the last whole unit
                i += ((nu - 1) * unit_length) + 1

            else:
                i += 1
        

In [6]:
# Sanity check with unit length 1 (runs of a single repeated base); preview the first 10 rows.
tbl1 = TandemRepeatTable('Pf3D7_01_v3', 1)
tbl1.display(10)

2016-03-13 23:29:04.590688 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 1


0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length
Pf3D7_01_v3,3,5,a,2,1,0,2
Pf3D7_01_v3,5,8,c,3,1,0,3
Pf3D7_01_v3,9,13,a,4,1,0,4
Pf3D7_01_v3,13,15,c,2,1,0,2
Pf3D7_01_v3,16,19,a,3,1,0,3
Pf3D7_01_v3,19,22,c,3,1,0,3
Pf3D7_01_v3,23,26,a,3,1,0,3
Pf3D7_01_v3,26,29,c,3,1,0,3
Pf3D7_01_v3,30,33,a,3,1,0,3
Pf3D7_01_v3,33,36,c,3,1,0,3


In [7]:
tbl2 = TandemRepeatTable('Pf3D7_01_v3', 2)
tbl2.display(10)

2016-03-13 23:29:05.547180 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 2


0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length
Pf3D7_01_v3,401,405,ct,2,2,0,4
Pf3D7_01_v3,429,433,ct,2,2,0,4
Pf3D7_01_v3,443,447,ct,2,2,0,4
Pf3D7_01_v3,485,489,ct,2,2,0,4
Pf3D7_01_v3,499,503,ct,2,2,0,4
Pf3D7_01_v3,527,531,ct,2,2,0,4
Pf3D7_01_v3,569,573,ct,2,2,0,4
Pf3D7_01_v3,583,587,ct,2,2,0,4
Pf3D7_01_v3,597,601,ct,2,2,0,4
Pf3D7_01_v3,611,615,ct,2,2,0,4


In [8]:
# Look at dinucleotide repeats starting beyond position 94980. The table is
# regenerated from the start of the chromosome each time it is iterated, as the
# repeated log line shows.
tbl2.gt('start', 94980).display(10)

2016-03-13 23:29:06.536204 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 2


0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length
Pf3D7_01_v3,94981,94988,at,3,2,1,7
Pf3D7_01_v3,94991,94995,ca,2,2,0,4
Pf3D7_01_v3,94994,95022,at,14,2,0,28
Pf3D7_01_v3,95027,95032,ta,2,2,1,5
Pf3D7_01_v3,95034,95038,ct,2,2,0,4
Pf3D7_01_v3,95071,95075,ta,2,2,0,4
Pf3D7_01_v3,95075,95080,at,2,2,1,5
Pf3D7_01_v3,95089,95093,ca,2,2,0,4
Pf3D7_01_v3,95096,95103,ac,3,2,1,7
Pf3D7_01_v3,95157,95161,ca,2,2,0,4


In [9]:
tbl3 = TandemRepeatTable('Pf3D7_01_v3', 3)
tbl3.display(10)

2016-03-13 23:29:07.745614 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 3


0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length
Pf3D7_01_v3,367,378,ttc,3,3,2,11
Pf3D7_01_v3,379,386,tct,2,3,1,7
Pf3D7_01_v3,449,456,tct,2,3,1,7
Pf3D7_01_v3,505,512,tct,2,3,1,7
Pf3D7_01_v3,547,554,tct,2,3,1,7
Pf3D7_01_v3,617,624,tct,2,3,1,7
Pf3D7_01_v3,673,680,tct,2,3,1,7
Pf3D7_01_v3,759,770,ttc,3,3,2,11
Pf3D7_01_v3,771,778,tct,2,3,1,7
Pf3D7_01_v3,785,792,tct,2,3,1,7


In [10]:
tbl4 = TandemRepeatTable('Pf3D7_01_v3', 4)
tbl4.display(10)

2016-03-13 23:29:07.936114 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 4


0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length
Pf3D7_01_v3,403,415,ctta,3,4,0,12
Pf3D7_01_v3,417,431,ctta,3,4,2,14
Pf3D7_01_v3,431,445,ctta,3,4,2,14
Pf3D7_01_v3,473,487,ctta,3,4,2,14
Pf3D7_01_v3,487,501,ctta,3,4,2,14
Pf3D7_01_v3,529,543,ctta,3,4,2,14
Pf3D7_01_v3,571,585,ctta,3,4,2,14
Pf3D7_01_v3,585,599,ctta,3,4,2,14
Pf3D7_01_v3,599,613,ctta,3,4,2,14
Pf3D7_01_v3,641,655,ctta,3,4,2,14


In [11]:
tbl_regions_1b

0|region_chrom,1|region_start,2|region_stop,3|region_type,4|region_size
Pf3D7_01_v3,1,27336,SubtelomericRepeat,27336
Pf3D7_01_v3,27337,92900,SubtelomericHypervariable,65564
Pf3D7_01_v3,92901,457931,Core,365031
Pf3D7_01_v3,457932,460311,Centromere,2380
Pf3D7_01_v3,460312,575900,Core,115589


In [12]:
#!rm -v /data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/*.tr.pickle

rm: cannot remove '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/*.tr.pickle': No such file or directory


In [13]:
def tabulate_tr(chrom):
    """Return the tandem repeat table for one chromosome, building it if needed.

    The table concatenates ``TandemRepeatTable(chrom, ul)`` for unit lengths
    1 to 24 and annotates each repeat with the ``region_type`` of the
    overlapping rows of ``tbl_regions_1b`` (left join, so repeats with no
    overlapping region are kept). The result is cached as a pickle file named
    after the chromosome; when that file already exists it is loaded instead
    of rescanning the sequence.

    Parameters
    ----------
    chrom : str
        Chromosome name, used both as the sequence key and in the cache
        file name.

    Returns
    -------
    A petl table read back from the cache file.
    """
    fn = '/data/plasmodium/pfalciparum/pf-crosses/data/genome/sanger/version3/September_2012/%s.tr.pickle' % chrom
    if not os.path.exists(fn):
        # NOTE(review): TandemRepeatTable emits stop = start + tract_length
        # (one past the last repeat base), while the join below uses
        # include_stop=True. A tract ending on the last base of a region may
        # therefore also match the following region - confirm this is intended.
        tbl_tr = (
            etl
            .cat(*[TandemRepeatTable(chrom, ul) for ul in range(1, 25, 1)])
            .intervalleftjoin(tbl_regions_1b, lkey='chrom', lstart='start', lstop='stop', include_stop=True,
                              rkey='region_chrom', rstart='region_start', rstop='region_stop')
            .cutout('region_chrom', 'region_start', 'region_stop', 'region_size')
        )
        # Building the table takes minutes, so write to a temporary file and
        # move it into place only once complete. An interrupted run then never
        # leaves a partial pickle at fn that later calls would load as if it
        # were the full table.
        tmp_fn = fn + '.tmp'
        try:
            tbl_tr.topickle(tmp_fn)
            os.replace(tmp_fn, fn)
        finally:
            # only present here if the write or the move did not complete
            if os.path.exists(tmp_fn):
                os.remove(tmp_fn)
    # The cache is unpickled on load, so the cache directory must be trusted.
    tbl_tr = etl.frompickle(fn)
    return tbl_tr
    

In [14]:
# Build the repeat table for chromosome 1 (or load it from the pickle cache) and display it.
tbl = tabulate_tr('Pf3D7_01_v3')
tbl

2016-03-13 23:29:20.904568 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 1
2016-03-13 23:29:30.378943 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 2
2016-03-13 23:29:38.481936 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 3
2016-03-13 23:29:47.537447 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 4
2016-03-13 23:29:58.293079 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 5
2016-03-13 23:30:08.769620 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 6
2016-03-13 23:30:20.928325 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 7
2016-03-13 23:30:31.144460 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 8
2016-03-13 23:30:43.662936 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 9
2016-03-13 23:30:55.600663 :: generating tandem repeats for chromosome Pf3D7_01_v3 unit length 10
2016-03-13 23:31:08.803971 ::

0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length,8|region_type
Pf3D7_01_v3,3,5,a,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,5,8,c,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,9,13,a,4,1,0,4,SubtelomericRepeat
Pf3D7_01_v3,13,15,c,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,16,19,a,3,1,0,3,SubtelomericRepeat


In [15]:
# tabulate_tr concatenates one table per unit length, so rows arrive grouped by
# unit length; sort by position to see repeats of different unit lengths side by side.
tbl.sort(key=('chrom', 'start')).display(10)

0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length,8|region_type
Pf3D7_01_v3,3,5,a,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,5,8,c,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,9,13,a,4,1,0,4,SubtelomericRepeat
Pf3D7_01_v3,13,15,c,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,13,37,cctaaac,3,7,3,24,SubtelomericRepeat
Pf3D7_01_v3,16,19,a,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,19,22,c,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,23,26,a,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,24,65,aaccctaaaccctg,2,14,13,41,SubtelomericRepeat
Pf3D7_01_v3,26,29,c,3,1,0,3,SubtelomericRepeat


In [16]:
CHROMOSOMES

[b'Pf3D7_01_v3',
 b'Pf3D7_02_v3',
 b'Pf3D7_03_v3',
 b'Pf3D7_04_v3',
 b'Pf3D7_05_v3',
 b'Pf3D7_06_v3',
 b'Pf3D7_07_v3',
 b'Pf3D7_08_v3',
 b'Pf3D7_09_v3',
 b'Pf3D7_10_v3',
 b'Pf3D7_11_v3',
 b'Pf3D7_12_v3',
 b'Pf3D7_13_v3',
 b'Pf3D7_14_v3']

In [17]:
# CHROMOSOMES holds bytes while the fasta keys are str, so decode each name
# before tabulating; the per-chromosome tables are then concatenated into one
# whole-genome table.
tbl_tr_wg = etl.cat(*[tabulate_tr(str(chrom, 'ascii')) for chrom in CHROMOSOMES])

2016-03-13 23:34:38.935962 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 1
2016-03-13 23:34:52.743808 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 2
2016-03-13 23:35:04.465945 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 3
2016-03-13 23:35:17.622739 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 4
2016-03-13 23:35:32.424784 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 5
2016-03-13 23:35:46.880182 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 6
2016-03-13 23:36:04.431485 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 7
2016-03-13 23:36:19.590832 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 8
2016-03-13 23:36:38.275168 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 9
2016-03-13 23:36:55.889858 :: generating tandem repeats for chromosome Pf3D7_02_v3 unit length 10
2016-03-13 23:37:15.354497 ::

In [18]:
tbl_tr_wg

0|chrom,1|start,2|stop,3|repeat_unit,4|n_units,5|unit_length,6|last_unit_length,7|tract_length,8|region_type
Pf3D7_01_v3,3,5,a,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,5,8,c,3,1,0,3,SubtelomericRepeat
Pf3D7_01_v3,9,13,a,4,1,0,4,SubtelomericRepeat
Pf3D7_01_v3,13,15,c,2,1,0,2,SubtelomericRepeat
Pf3D7_01_v3,16,19,a,3,1,0,3,SubtelomericRepeat
