Skip to content

Commit

Permalink
bug fixes; ard: bug with -c
Browse files Browse the repository at this point in the history
  • Loading branch information
tanlongzhi committed Sep 22, 2017
1 parent 2524984 commit 4015d4d
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 4 deletions.
74 changes: 74 additions & 0 deletions ard.py
@@ -0,0 +1,74 @@
import sys
import getopt
import gzip
import copy
from classes import ConData, file_to_con_data, LegData, counts_to_hist_num_with_zero, hist_num_to_string_with_zero, Leg

def ard(argv):
# default parameters
reference_file_name = None
min_separation = None
max_distance = 10000000

# progress display parameters
display_num_ref_cons = 10000

# read arguments
try:
opts, args = getopt.getopt(argv[1:], "c:s:d:")
except getopt.GetoptError as err:
sys.stderr.write("[E::" + __name__ + "] unknown command\n")
return 1
if len(args) == 0:
sys.stderr.write("Usage: metac ard [options] <in.con>\n")
sys.stderr.write("Options:\n")
sys.stderr.write(" -c <ref.con> contact file for reference points [<in.con> itself]\n")
sys.stderr.write(" -s INT only use intra-chromosomal reference points, min separation (bp) [only use inter-chromosomal] \n")
sys.stderr.write(" -d INT max distance (bp, L-inf norm) around reference points [" + str(max_distance) + "]\n")
return 1
for o, a in opts:
if o == "-c":
reference_file_name = a
elif o == "-s":
min_separation = int(a)
elif o == "-d":
max_distance = int(a)

# read CON file
con_file = gzip.open(args[0], "rb") if args[0].endswith(".gz") else open(args[0], "rb")
con_data = file_to_con_data(con_file)
sys.stderr.write("[M::" + __name__ + "] read " + str(con_data.num_cons()) + " contacts (" + str(round(100.0 * con_data.num_intra_chr() / con_data.num_cons(), 2)) + "% intra-chromosomal, " + str(round(100.0 * con_data.num_phased_legs() / con_data.num_cons() / 2, 2)) + "% legs phased)\n")

# read reference CON file
if reference_file_name is None:
# use itself
ref_con_data = copy.deepcopy(con_data)
else:
# open another file
ref_con_file = gzip.open(reference_file_name, "rb") if reference_file_name.endswith(".gz") else open(reference_file_name, "rb")
ref_con_data = file_to_con_data(ref_con_file)
sys.stderr.write("[M::" + __name__ + "] read " + str(ref_con_data.num_cons()) + " reference points (" + str(round(100.0 * ref_con_data.num_intra_chr() / ref_con_data.num_cons(), 2)) + "% intra-chromosomal)\n")

# keep only desired reference points
if min_separation is None:
# inter-chromosomal only
ref_con_data.clean_intra_chr()
else:
# intra-chromosmal only, remove small separations
ref_con_data.clean_inter_chr()
ref_con_data.clean_separation(min_separation)
sys.stderr.write("[M::" + __name__ + "] kept " + str(ref_con_data.num_cons()) + " reference points (" + str(round(100.0 * ref_con_data.num_intra_chr() / ref_con_data.num_cons(), 2)) + "% intra-chromosomal)\n")

# find relation positions
con_data.sort_cons()
num_ref_cons = 0
for ref_con in ref_con_data.get_cons():
num_ref_cons += 1
if num_ref_cons % display_num_ref_cons == 0:
sys.stderr.write("[M::" + __name__ + "] analyzed " + str(num_ref_cons) + " reference points\n")
for con in con_data.get_cons_near_inf(ref_con, max_distance):
sys.stdout.write(con.to_string_around(ref_con) + "\n")


return 0

45 changes: 42 additions & 3 deletions classes.py
Expand Up @@ -274,7 +274,9 @@ def same_chr_with(self, other):

def separation_with(self, other):
return abs(self.ref_locus - other.ref_locus)

def signed_separation_with(self, other):
return self.ref_locus - other.ref_locus

# check if a leg is in a region
def in_reg(self, reg):
if self.ref_name != reg.ref_name:
Expand Down Expand Up @@ -585,6 +587,11 @@ def is_isolated_phased(self, con_data, max_clean_distance, min_clean_count):

def to_string(self):
return "\t".join([leg.to_string() for leg in self.legs])

# ard: print relative locus with respect to a reference point
def to_string_around(self, other):
return str(self.leg_1().signed_separation_with(other.leg_1())) + "\t" + str(self.leg_2().signed_separation_with(other.leg_2()))

def ref_name_tuple_to_string(ref_name_tuple):
return ",".join(ref_name_tuple)

Expand Down Expand Up @@ -632,7 +639,20 @@ def get_cons_near(self, con, max_distance):
break
if self.cons[i].distance_half_with(con) <= max_distance:
yield self.cons[i]

# generator for contacts near a given contact (L-inf norm), assume sorted
def get_cons_near_inf(self, con, max_distance):
start_index = bisect.bisect_left(self.cons, con)
for i in range(start_index, len(self.cons)):
if self.cons[i].distance_leg_1_with(con) > max_distance:
break
if self.cons[i].distance_leg_2_with(con) <= max_distance:
yield self.cons[i]
for i in range(start_index - 1, -1, -1):
if self.cons[i].distance_leg_1_with(con) > max_distance:
break
if self.cons[i].distance_leg_2_with(con) <= max_distance:
yield self.cons[i]

def num_cons(self):
return(len(self.cons))
def num_phased_legs(self):
Expand Down Expand Up @@ -742,6 +762,7 @@ def apply_regs(self, inc_regs, exc_regs):

def to_string(self):
return "\n".join([con.to_string() for con in self.cons])


# a hashmap (tuples of two sorted chromosome names) of lists of contacts (a CON file)
class ConData:
Expand All @@ -753,11 +774,17 @@ def get_cons(self):
for ref_name_tuple in sorted(self.con_lists.keys()):
for con in self.con_lists[ref_name_tuple].get_cons():
yield con

# wrappers for generators
def get_cons_near(self, con, max_distance):
if con.ref_name_tuple() in self.con_lists:
for con in self.con_lists[con.ref_name_tuple()].get_cons_near(con, max_distance):
yield con

def get_cons_near_inf(self, con, max_distance):
if con.ref_name_tuple() in self.con_lists:
for con in self.con_lists[con.ref_name_tuple()].get_cons_near_inf(con, max_distance):
yield con

def add_empty_con_list(self, ref_name_tuple):
self.con_lists[ref_name_tuple] = ConList()

Expand All @@ -772,6 +799,18 @@ def merge_with(self, other):
self.con_lists[ref_name_tuple].merge_with(other.con_lists[ref_name_tuple])
else:
self.con_lists[ref_name_tuple] = other.con_lists[ref_name_tuple]

# remove all intra-chromosomal contacts
def clean_intra_chr(self):
for ref_name_tuple in self.con_lists.keys():
if ref_name_tuple[0] == ref_name_tuple[1]:
del self.con_lists[ref_name_tuple]

# remove all inter-chromosomal contacts
def clean_inter_chr(self):
for ref_name_tuple in self.con_lists.keys():
if ref_name_tuple[0] != ref_name_tuple[1]:
del self.con_lists[ref_name_tuple]

# wrappers for all ConList operations
def sort_cons(self):
Expand Down
2 changes: 1 addition & 1 deletion con_to_ncc.sh
Expand Up @@ -3,4 +3,4 @@
input_file="$1"
output_file=${input_file/.con.gz/.ncc}

gunzip -c "$input_file" | awk -F'[\t,]' -v OFS='\t' 'BEGIN {OFS = " "; haplotypes[0] = "pat"; haplotypes[1] = "mat"} {print $1"("haplotypes[$3]")",$2,$2,$2,$2,"+",$4"("haplotypes[$6]")",$5,$5,$5,$5,"+",NR,0,0;}' > "$output_file"
gunzip -c "$input_file" | awk -F'[\t,]' 'BEGIN {OFS = " "; haplotypes[0] = "pat"; haplotypes[1] = "mat"} {print $1"("haplotypes[$3]")",$2,$2,$2,$2,"+",$4"("haplotypes[$6]")",$5,$5,$5,$5,"+",NR,0,0;}' > "$output_file"
5 changes: 5 additions & 0 deletions metac
Expand Up @@ -16,6 +16,8 @@ def main():
sys.stderr.write(" impute3 impute missing haplotypes in a CON file based on a 3DG file\n")
sys.stderr.write(" clean3 remove contact-poor particles (e.g. centromeres) from a 3DG file\n")
sys.stderr.write(" vis visualize a 3DG file into mmCIF format, and color particles\n")
sys.stderr.write("\n")
sys.stderr.write(" ard contacts around certain points (e.g. other contacts)\n")
return 1
start_time = time.time()
if sys.argv[1] == "seg":
Expand Down Expand Up @@ -45,6 +47,9 @@ def main():
elif sys.argv[1] == "vis":
import vis
return_value = vis.vis(sys.argv[1:])
elif sys.argv[1] == "ard":
import ard
return_value = ard.ard(sys.argv[1:])
else:
sys.stderr.write("[E::" + __name__ + "] unknown command\n")
return 1
Expand Down

0 comments on commit 4015d4d

Please sign in to comment.