diff --git a/_oasis b/_oasis index 919d93d..df98e3b 100644 --- a/_oasis +++ b/_oasis @@ -75,6 +75,7 @@ Library biocaml , Biocaml_wig , Biocaml_mzData , Biocaml_table + , Biocaml_transcripts CSources: biocaml_pwm_stub.c, biocaml_mzData_stubs.c CCOpt: -O3 diff --git a/src/lib/biocaml_transcripts.ml b/src/lib/biocaml_transcripts.ml index cd799f1..3c6d220 100644 --- a/src/lib/biocaml_transcripts.ml +++ b/src/lib/biocaml_transcripts.ml @@ -1,8 +1,8 @@ open Biocaml_internal_pervasives - -type 'a transcript = { - exons : (int * int) list; - lo : int; +(* +type 'a transcript = { + exons : (int * int) list; + lo : int; hi : int; chr : string; info : 'a @@ -18,39 +18,39 @@ end module SIIMap = MMap.Make(String)(II) module SSMap = MMap.Make(String)(String) -let add_length_to_transcripts transcripts = - let f trx = - let length = +let add_length_to_transcripts transcripts = + let f trx = + let length = let g acc (lo,hi) = hi - lo + acc in - List.fold_left g 0 trx.exons + List.fold_left g 0 trx.exons in { trx with info = trx.info,length } in List.map f transcripts -let of_composite_channel - ?(chr_map=identity) - ?(increment_lo_hi=(0,0)) - ic = - let f acc l = +let of_composite_channel + ?(chr_map=identity) + ?(increment_lo_hi=(0,0)) + ic = + let f acc l = let lst = String.nsplit l "\t" in let inclo,inchi = increment_lo_hi in - let (nm,chr,st,fn) = + let (nm,chr,st,fn) = List.nth lst 0, chr_map (List.nth lst 1), int_of_string (List.nth lst 2) + inclo, int_of_string (List.nth lst 3) + inchi in - let g (nm,chr,st,fn) prev = match prev with - | None -> + let g (nm,chr,st,fn) prev = match prev with + | None -> { - exons = [st,fn]; + exons = [st,fn]; lo = st; hi= fn; chr = chr; info = nm; } - | Some trx -> + | Some trx -> { exons = (st,fn)::(trx.exons); lo = if st < trx.lo then st else trx.lo; @@ -66,29 +66,29 @@ let of_composite_channel let ans = List.rev (SSMap.fold folder [] ans) in add_length_to_transcripts ans -let of_composite_file ?(chr_map=identity) ?(increment_lo_hi=(0,0)) file = +let of_composite_file ?(chr_map=identity) ?(increment_lo_hi=(0,0)) file = try_finally (of_composite_channel ~chr_map ~increment_lo_hi) close_in (open_in file) -let of_bed_channel ?(chr_map=identity) ?(increment_lo_hi=(1,0)) ic = +let of_bed_channel ?(chr_map=identity) ?(increment_lo_hi=(1,0)) ic = let bed = Bed.to_list (Bed.of_channel ~chr_map ~increment_lo_hi ic) in - let f acc (chr,s,f) = + let f acc (chr,s,f) = { exons = [s,f]; lo = s; hi = f; chr = chr_map chr; info = ""; - }::acc + }::acc in let ans = List.rev (List.fold_left f [] bed) in add_length_to_transcripts ans -let of_bed_file ?(chr_map=identity) ?(increment_lo_hi=(1,0)) file = +let of_bed_file ?(chr_map=identity) ?(increment_lo_hi=(1,0)) file = try_finally (of_bed_channel ~chr_map ~increment_lo_hi) close_in (open_in file) let of_gff transcript_name_of_exon gff = - let f transcript_name row prev = + let f transcript_name row prev = let lo,hi = row.Gff.pos in match prev with | None -> { @@ -116,29 +116,29 @@ let of_gff transcript_name_of_exon gff = let ans = Gff.fold g StringMap.empty gff in StringMap.fold (fun _ x ans -> x::ans) ans [] -let all_probes_in - (trx_lst:'a t) - (prbs: (string * int * int * 'b) list) +let all_probes_in + (trx_lst:'a t) + (prbs: (string * int * int * 'b) list) : ('a * 'b array) t = let insert x prev = match prev with None -> [x] | Some l -> x::l in - let siimap_of_exons = + let siimap_of_exons = let f acc trx = SIIMap.add trx.chr (trx.lo,trx.hi) (trx.exons,trx.info) acc in List.fold_left f SIIMap.empty trx_lst - in - let stringmap_of_intervaltrees = + in + let stringmap_of_intervaltrees = let f acc trx = StringMap.add_with trx.chr (insert (trx.lo,trx.hi)) acc in let ans = List.fold_left f StringMap.empty trx_lst in StringMap.map IntervalTree.create ans in - let f acc (chr,s,f,v) = + let f acc (chr,s,f,v) = let itree = StringMap.find chr stringmap_of_intervaltrees in let trxs = IntervalTree.within itree (s,f) in - let g accu trx = + let g accu trx = let (exons,info) = SIIMap.find chr trx siimap_of_exons in - let g_insert (info,u) prev = - match prev with - | None -> info,[v] + let g_insert (info,u) prev = + match prev with + | None -> info,[v] | Some (i,lst) -> (assert (i = info); i,(v::lst)) in match IntervalTree.within (IntervalTree.create exons) (s,f) with @@ -149,8 +149,8 @@ let all_probes_in in let ans = List.fold_left f SIIMap.empty prbs in let ans = SIIMap.map (fun (info,lst) -> (info,Array.of_list lst)) ans in - let f acc trx = - try + let f acc trx = + try { exons = trx.exons; lo = trx.lo; @@ -163,9 +163,9 @@ let all_probes_in List.rev (List.fold_left f [] trx_lst) let all_points_in - (trx_lst:'a t) + (trx_lst:'a t) (points: (string * int * 'b) list) : ('a * 'b array) t = let probes = List.map (fun (x,y,z) -> (x,y,y,z)) points in all_probes_in trx_lst probes - +*) diff --git a/src/lib/biocaml_transcripts.mli b/src/lib/biocaml_transcripts.mli index 29ad2d1..1f93497 100644 --- a/src/lib/biocaml_transcripts.mli +++ b/src/lib/biocaml_transcripts.mli @@ -1,9 +1,9 @@ (** Transcripts are integer intervals containing a list of exons. Exons are themselves defined as a list of integer intervals. *) - -type 'a transcript = { - exons : (int * int) list; - lo : int; +(* +type 'a transcript = { + exons : (int * int) list; + lo : int; hi : int; chr : string; info : 'a @@ -27,7 +27,7 @@ val of_bed_file : with just a single exon in it, which are the coordinates of the transcript itself. Info of type string * int in answer are as in [of_composite_file] but the name is always the empty string. *) - + val of_gff : (Gff.row -> string option) -> Gff.t -> string t (** [of_gff f gff] converts [gff] to a list of [transcript]s. Function [f] will be applied to each row in [gff]. If the row is an exon it @@ -37,7 +37,7 @@ val of_gff : (Gff.row -> string option) -> Gff.t -> string t [info] in the answer is the name of the transcript. Raise [Failure] if [f] does anything erroneous such as map exons on different chromosomes to the same transcript. *) - + val all_probes_in : 'a t -> (string * int * int * 'b) list -> ('a * 'b array) t (** [all_probes_in transcripts probes] bins [probes] into [transcripts]. Each probe in [probes] has a genomic location and value @@ -48,3 +48,4 @@ val all_probes_in : 'a t -> (string * int * int * 'b) list -> ('a * 'b array) t val all_points_in : 'a t -> (string * int * 'b) list -> ('a * 'b array) t (** Like [all_probes_in] but the given "probes" are defined on single base pairs. *) +*)