Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

562 lines (455 sloc) 19.839 kb
# cython: profile=True
# Time-stamp: <2012-07-02 15:57:48 Tao Liu>
"""Module Description: For pileup functions.
Copyright (c) 2011 Tao Liu <taoliu@jimmy.harvard.edu>
This code is free software; you can redistribute it and/or modify it
under the terms of the BSD License (see the file COPYING included with
the distribution).
@status: experimental
@version: $Revision$
@author: Tao Liu
@contact: taoliu@jimmy.harvard.edu
"""
# ------------------------------------
# python modules
# ------------------------------------
import sys
from array import array
from MACS2.IO.cFixWidthTrack import FWTrackIII
from MACS2.IO.cPairedEndTrack import PETrackI
from MACS2.IO.cBedGraph import bedGraphTrackI
from MACS2.Constants import *
import numpy as np
cimport numpy as np
from numpy cimport int32_t
from cpython cimport bool
# ------------------------------------
# functions
# ------------------------------------
cdef inline int int_max(int a, int b): return a if a >= b else b
cdef inline long long_max(long a, long b): return a if a >= b else b
cdef inline float float_max(float a, float b): return a if a >= b else b
# Unified pileup function #
cpdef unified_pileup_bdg(track,
ds,
scale_factors,
float baseline_value = 0.0,
bint directional = True,
bint halfextension = True):
chrs = track.get_chr_names()
# if track.rlengths is None:
# track.rlengths = dict([(k, INT_MAX) for k in chrs])
# else:
# rlengths2 = dict([(k, track.rlengths[k]) for k in chrs])
# track.rlengths = rlengths2
if type(ds) is list:
# multiple pileup
if isinstance(track, FWTrackIII):
return pileup_w_multiple_d_bdg(track, ds, scale_factors,
baseline_value,
directional, halfextension)
elif isinstance(track, PETrackI):
return pileup_w_multiple_d_bdg_pe(track, ds, scale_factors,
baseline_value)
else:
raise ValueError("track must be of type FWTrackIII or PETrackI")
else:
if isinstance(track, FWTrackIII):
return pileup_bdg_se(track, ds, scale_factors,
baseline_value,
directional, halfextension)
elif isinstance(track, PETrackI):
if ds is None:
return pileup_bdg_pe(track, scale_factors, baseline_value)
else:
return pileup_bdg_pe_w_ext(track, ds, scale_factors,
baseline_value)
else:
raise ValueError("track must be of type FWTrackIII or PETrackI")
## Fixed-width functions ##
cdef pileup_bdg_se(object trackI, int d,
float scale_factor = 1.0,
float baseline_value = 0.0,
bool directional = True,
bool halfextension = True ):
"""Pileup tags into bedGraphTrackI object with extension. Tag will
be extended towards 3' side with size of d if directional is Ture,
or both sides with d/2 if directional is False.
A tag is a single genomic location.
trackI : A FWTrackIII object with raw plus and minus 5' end positions
d : tag will be extended to this value to 3' direction, unless directional is False.
baseline_value : a value to be filled for missing values.
directional: if False, the strand or direction of tag will be ignored, so that extenstion will be both sides with d/2.
halfextension: only make a fragment of d/2 size centered at fragment center
Return a bedGraphTrackI object.
"""
cdef:
long five_shift, three_shift, l
int rlength
str chrom
Ends ends
np.ndarray[np.int32_t, ndim=1] plus_tags, minus_tags
dict chrlengths = trackI.get_rlengths ()
#cdef int * start_poss
#cdef int * end_poss
ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned.
if directional:
# only extend to 3' side
if halfextension:
five_shift = d/-4 # five shift is used to move cursor towards 5' direction to find the start of fragment
three_shift = d*3/4 # three shift is used to move cursor towards 3' direction to find the end of fragment
else:
five_shift = 0
three_shift = d
else:
# both sides
if halfextension:
five_shift = d/4
three_shift = five_shift
else:
five_shift = d/2
three_shift = d - five_shift
for chrom in sorted(chrlengths.keys()):
rlength = chrlengths[chrom]
(plus_tags, minus_tags) = trackI.get_locations_by_chr(chrom)
ends = start_and_end_poss( plus_tags, minus_tags, five_shift, three_shift , rlength)
ret.add_a_chromosome( chrom, pileup_a_chromosome ( ends.startposs, ends.endposs, scale_factor, baseline_value ) )
# free mem
ends.startposs.resize(100000, refcheck=False)
ends.startposs.resize(0, refcheck=False)
ends.endposs.resize(100000, refcheck=False)
ends.endposs.resize(0, refcheck=False)
return ret
cdef pileup_w_multiple_d_bdg(object trackI, list d_s, list scale_factor_s = [],
float baseline_value = 0.0,
bool directional = True,
bool halfextension = True):
"""Pileup tags into bedGraphTrackI object with extension. Tag will
be extended towards 3' side with size of d if directional is Ture,
or both sides with d/2 if directional is False.
A tag is a single genomic location.
trackI : A FWTrackIII object with raw plus and minus 5' end positions
d : tag will be extended to this value to 3' direction, unless directional is False.
baseline_value : a value to be filled for missing values.
directional: if False, the strand or direction of tag will be ignored, so that extenstion will be both sides with d/2.
halfextension: only make a fragment of d/2 size centered at fragment center
Return a bedGraphTrackI object.
"""
cdef:
long d, five_shift, three_shift, l, i
float scale_factor
dict chrlengths = trackI.get_rlengths()
Ends ends
assert len(d_s) == len(scale_factor_s), "Arguments d_s and scale_factor_s should have the same length!"
ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned.
chrs = trackI.get_chr_names()
five_shift_s = []
three_shift_s = []
for d in d_s:
if directional:
# only extend to 3' side
if halfextension:
five_shift_s.append(d/-4) # five shift is used to move cursor towards 5' direction to find the start of fragment
three_shift_s.append(d*3/4) # three shift is used to move cursor towards 3' direction to find the end of fragment
else:
five_shift_s.append(0)
three_shift_s.append(d)
else:
# both sides
if halfextension:
five_shift_s.append(d/4)
three_shift_s.append(d/4)
else:
five_shift_s.append(d/2)
three_shift_s.append(d - d/2)
for chrom in sorted(chrlengths.keys()):
rlength = chrlengths[chrom]
(plus_tags,minus_tags) = trackI.get_locations_by_chr(chrom)
prev_pileup = None
for i in range(len(d_s)):
five_shift = five_shift_s[i]
three_shift = three_shift_s[i]
scale_factor = scale_factor_s[i]
#start_poss = build_start_poss( plus_tags, minus_tags, five_shift, three_shift, l )
#end_poss = build_end_poss( plus_tags, minus_tags, five_shift, three_shift, l )
ends = start_and_end_poss( plus_tags, minus_tags, five_shift, three_shift, rlength )
#print chrom, baseline_value
tmp_pileup = pileup_a_chromosome ( ends.startposs, ends.endposs, scale_factor, baseline_value )
# free mem
#del(start_poss)
#del(end_poss)
ends.startposs.resize(100000, refcheck=False)
ends.startposs.resize(0, refcheck=False)
ends.endposs.resize(100000, refcheck=False)
ends.endposs.resize(0, refcheck=False)
if prev_pileup:
prev_pileup = max_over_two_pv_array ( prev_pileup, tmp_pileup )
else:
prev_pileup = tmp_pileup
ret.add_a_chromosome( chrom, prev_pileup )
return ret
## Paired-end functions ##
# baseline_value needs to be float not int, otherwise we cause error in
# poisson CDF
cdef pileup_bdg_pe(object trackI, float scale_factor, float baseline_value):
"""Pileup fragments into bedGraphTrackI object.
trackI : A PETrackI object with genomic locations
baseline_value : a value to be filled for missing values.
scale_factor : value to be multiplied at each position
Return a bedGraphTrackI object.
"""
cdef:
int rlength
str chrom
np.ndarray[np.int32_t, ndim=2] locs
dict chrlengths = trackI.get_rlengths ()
ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned.
for chrom in sorted(chrlengths.keys()):
rlength = chrlengths[chrom]
locs = trackI.get_locations_by_chr(chrom)
ret.add_a_chromosome(chrom, pileup_a_chromosome(locs[:,0], locs[:,1],
scale_factor,
baseline_value))
return ret
cdef pileup_bdg_pe_w_ext (object trackI, int d, float scale_factor = 1.0,
float baseline_value = 0.0):
"""Pileup fragments into bedGraphTrackI object with extension. Fragment will
be extended both directions from midpoint by distance d/2, or the original
width will be used if d = 0
trackI : A PETrackI object with genomic locations
d : fragments will be extended by 1/2 this value in both directions
from their midpoint
baseline_value : a value to be filled for missing values.
scale_factor : value to be multiplied at each position
Return a bedGraphTrackI object.
"""
cdef:
int five_shift, three_shift
int rlength
str chrom
np.ndarray[np.int32_t, ndim=2] locs
np.ndarray[np.int32_t, ndim=1] start_poss, end_poss
dict chrlengths = trackI.get_rlengths ()
ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned.
five_shift = d/2
three_shift = d - five_shift
for chrom in sorted(chrlengths.keys()):
rlength = chrlengths[chrom]
locs = trackI.get_locations_by_chr(chrom)
midpoints = locs[:,0] + (locs[:,1] - locs[:,0]) / 2
# fix negative coordinations
start_poss = midpoints - five_shift
start_poss = fix_coordinates(start_poss, rlength)
end_poss = midpoints + three_shift
end_poss = fix_coordinates( end_poss, rlength)
pileup = pileup_a_chromosome ( start_poss, end_poss, scale_factor,
baseline_value )
ret.add_a_chromosome( chrom, pileup )
# free mem
start_poss.resize(100000, refcheck=False)
start_poss.resize(0, refcheck=False)
end_poss.resize(100000, refcheck=False)
end_poss.resize(0, refcheck=False)
return ret
cdef pileup_w_multiple_d_bdg_pe ( object trackI, list d_s = [],
list scale_factor_s = [],
float baseline_value = 0):
"""Pileup fragments into bedGraphTrackI object with extension. Fragment will
be extended by d / 2 in both directions from midpoint
trackI : A PETrackI object
d_s : tag will be extended by 1/2 this value in both directions by
each d and multiplied by corresponding scale_factor
baseline_value : a value to be filled for missing values.
scale_factor_s: same length as d_s, scale factor for each d
scale_factor_0: scale factor for original fragments
Return a bedGraphTrackI object.
"""
cdef:
long d, five_shift, three_shift, l, i
float scale_factor
dict chrlengths = trackI.get_rlengths ()
#cdef int * start_poss
#cdef int * end_poss
assert len(d_s) == len(scale_factor_s), "Arguments d_s and scale_factor_s should have the same length!"
ret = bedGraphTrackI(baseline_value=baseline_value) # bedGraphTrackI object to be returned.
chrs = trackI.get_chr_names()
five_shift_s = [d / 2 for d in d_s[1:]]
three_shift_s = [d - d / 2 for d in d_s[1:]]
for chrom in sorted(chrlengths.keys()):
rlength = chrlengths[chrom]
locs = trackI.get_locations_by_chr(chrom)
midpoints = locs[:,0] + (locs[:,1] - locs[:,0]) / 2
prev_pileup = pileup_a_chromosome(locs[:,0], locs[:,1],
scale_factor_s[0], baseline_value)
for i in range(len(five_shift_s)):
five_shift = five_shift_s[i]
three_shift = three_shift_s[i]
scale_factor = scale_factor_s[i + 1]
# fix negative coordinations
start_poss = midpoints - five_shift
start_poss = fix_coordinates(start_poss, rlength)
end_poss = midpoints + three_shift
end_poss = fix_coordinates( end_poss, rlength)
#print chrom, baseline_value
tmp_pileup = pileup_a_chromosome(start_poss, end_poss, scale_factor,
baseline_value)
# free mem
#del(start_poss)
#del(end_poss)
start_poss.resize(100000, refcheck=False)
start_poss.resize(0, refcheck=False)
end_poss.resize(100000, refcheck=False)
end_poss.resize(0, refcheck=False)
prev_pileup = max_over_two_pv_array ( prev_pileup, tmp_pileup )
ret.add_a_chromosome( chrom, prev_pileup )
return ret
cdef class Ends:
cdef:
np.ndarray startposs
np.ndarray endposs
cdef start_and_end_poss ( np.ndarray plus_tags, np.ndarray minus_tags,
long five_shift, long three_shift, int rlength):
cdef:
long i
long lp = plus_tags.shape[0]
long lm = minus_tags.shape[0]
long l = lp + lm
start_poss = np.concatenate( ( plus_tags-five_shift, minus_tags-three_shift ) )
end_poss = np.concatenate( ( plus_tags+three_shift, minus_tags+five_shift ) )
# sort
start_poss.sort()
end_poss.sort()
# fix negative coordinations and those extends over end of chromosomes
ends = Ends()
ends.startposs = fix_coordinates(start_poss, rlength)
ends.endposs = fix_coordinates(end_poss, rlength)
return ends
cdef np.ndarray[np.int32_t, ndim=1] fix_coordinates(np.ndarray[np.int32_t, ndim=1] poss,
int rlength):
cdef long i
for i in range( poss.shape[0] ):
if poss[i] < 0:
poss[i] = 0
else:
break
for i in range( poss.shape[0]-1, -1, -1 ):
if poss[i] > rlength:
poss[i] = rlength
else:
break
return poss
cdef pileup_a_chromosome ( np.ndarray start_poss, np.ndarray end_poss, float scale_factor = 1, float baseline_value = 0 ):
"""Return pileup of one chromosome.
"""
cdef:
long i_s, i_e, i
int a, b, p, pre_p, pileup
long l = end_poss.shape[0]
tmp = [array(BYTE4,[]),array(FBYTE4,[])] # for (endpos,value)
tmppadd = tmp[0].append
tmpvadd = tmp[1].append
i_s = 0 # index of start_poss
i_e = 0 # index of end_poss
pileup = 0
if start_poss.shape[0] == 0: return tmp
pre_p = min(start_poss[0],end_poss[0])
#print pre_p
if pre_p != 0:
# the first chunk of 0
tmppadd( pre_p )
tmpvadd( float_max(0,baseline_value) )
pre_v = pileup
while i_s < l and i_e < l:
a = start_poss[i_s]
b = end_poss[i_e]
if a < b:
p = a
if p != pre_p:
tmppadd( p )
tmpvadd( float_max(pileup * scale_factor, baseline_value) )
pre_p = p
pileup += 1
i_s += 1
elif a > b:
p = b
if p != pre_p:
tmppadd( p )
tmpvadd( float_max(pileup * scale_factor, baseline_value) )
pre_p = p
pileup -= 1
i_e += 1
else:
i_s += 1
i_e += 1
if i_e < l:
# add rest of end positions
for i in range(i_e, l):
p = end_poss[i]
#for p in end_poss[i_e:]:
if p != pre_p:
tmppadd( p )
tmpvadd( float_max(pileup * scale_factor, baseline_value) )
#ret.add_loc(chrom,pre_p,p,pileup)
pre_p = p
pileup -= 1
# if i_s < l:
# # add rest of start positions ( I don't think this will happen )
# raise Exception("start positions can't be the only things left!")
return tmp
cdef max_over_two_pv_array ( tmparray1, tmparray2 ):
"""Merge two position-value arrays. For intersection regions, take
the maximum value within region.
"""
cdef:
int pre_p, p1, p2
float v1, v2
list tmp
long l1, l2, l
tmp = [array(BYTE4,[]),array(FBYTE4,[])] # for (endpos,value)
tmppadd = tmp[0].append
tmpvadd = tmp[1].append
(p1s,v1s) = tmparray1
p1n = iter(p1s).next # assign the next function to a viable to speed up
v1n = iter(v1s).next
l1 = len(p1s)
(p2s,v2s) = tmparray2
p2n = iter(p2s).next # assign the next function to a viable to speed up
v2n = iter(v2s).next
l2 = len(p2s)
pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret
try:
p1 = p1n()
v1 = v1n()
p2 = p2n()
v2 = v2n()
while True:
if p1 < p2:
# clip a region from pre_p to p1, then set pre_p as p1.
tmppadd( p1 )
tmpvadd( float_max(v1,v2) )
pre_p = p1
# call for the next p1 and v1
p1 = p1n()
v1 = v1n()
elif p2 < p1:
# clip a region from pre_p to p2, then set pre_p as p2.
tmppadd( p2 )
tmpvadd( float_max(v1,v2) )
pre_p = p2
# call for the next p2 and v2
p2 = p2n()
v2 = v2n()
elif p1 == p2:
# from pre_p to p1 or p2, then set pre_p as p1 or p2.
tmppadd( p1 )
tmpvadd( float_max(v1,v2) )
pre_p = p1
# call for the next p1, v1, p2, v2.
p1 = p1n()
v1 = v1n()
p2 = p2n()
v2 = v2n()
except StopIteration:
# meet the end of either bedGraphTrackI, simply exit
pass
return tmp
Jump to Line
Something went wrong with that request. Please try again.