Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
seqalign_pathing/src/lib.rs
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
140 lines (122 sloc)
4.42 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::collections::hash_map::{HashMap, Entry}; | |
use std::collections::binary_heap::BinaryHeap; | |
use std::cmp::max; | |
type Pos = (usize,usize); | |
pub type Score = i32; | |
/// The score for inserting or deleting a character, higher is better | |
pub const INDEL: Score = -1; | |
/// The score for two characters not changing | |
pub const MATCH: Score = 0; | |
/// The score for one character changing into another | |
pub const MISMATCH: Score = -1; | |
fn diag(pos: &Pos) -> i32 { | |
(pos.1 as i32) - (pos.0 as i32) | |
} | |
/// find the distance from the diagonal of `goal` times INDEL | |
fn heuristic(pos: &Pos, goal: &Pos) -> Score { | |
let goal_diag = diag(goal); | |
let our_diag = diag(pos); | |
let indel_dist = (our_diag - goal_diag).abs(); | |
// support non-zero MATCH cost | |
let total_dist = max(goal.0 - pos.0, goal.1 - pos.1) as i32; | |
let match_dist = total_dist - indel_dist; | |
return (indel_dist * INDEL) + (match_dist * MATCH); | |
} | |
/// Explore a successor state if we aren't already set to explore it | |
fn add_state(scores: &mut HashMap<Pos,Score>, q: &mut BinaryHeap<(Score,Pos)>, goal: &Pos, new_score: Score, pos: Pos) { | |
match scores.entry(pos) { | |
Entry::Vacant(e) => { e.insert(new_score); }, | |
Entry::Occupied(ref mut e) => { | |
if *(e.get()) >= new_score { return; } // obsolete | |
e.insert(new_score); | |
} | |
} | |
let priority = new_score + heuristic(&pos, goal); | |
q.push((priority, pos)); | |
} | |
/// Helper function to print an ASCII representation of which grid cells were explored | |
#[allow(dead_code)] | |
fn debug_print_table(scores: &HashMap<Pos,Score>, size: Pos) { | |
let (am, bm) = size; | |
for ai in 0..am { | |
for bi in 0..bm { | |
let inside = scores.contains_key(&(ai,bi)); | |
let chr = if inside { 'X' } else { ' ' }; | |
print!("{}", chr); | |
} | |
println!(""); | |
} | |
} | |
/// Find the score for aligning `a` and `b` based on the costs given by INDEL, | |
/// MATCH and MISMATCH. If INDEL and MISMATCH are -1 and MATCH is 0 then the | |
/// result is the negative of the Levenshtein distance of the two inputs. | |
pub fn alignment_score(a: &[u8], b: &[u8]) -> Score { | |
let start = (0,0); | |
let goal = (a.len(),b.len()); | |
let mut q = BinaryHeap::with_capacity(a.len()+b.len()); | |
q.push((heuristic(&start, &goal), start)); | |
let mut scores: HashMap<Pos,Score> = HashMap::with_capacity(a.len()+b.len()); | |
scores.insert(start,0); | |
let mut pop_count: usize = 0; | |
let mut eval_count: usize = 0; | |
loop { | |
let (score, pos) = q.pop().expect("should always reach goal"); | |
let score = score - heuristic(&pos, &goal); | |
pop_count += 1; | |
if pos == goal { | |
println!("scores: {:?}", scores.len()); | |
println!("pops: {:?}", pop_count); | |
println!("evals: {:?}", eval_count); | |
// debug_print_table(&scores, goal.clone()); | |
return score; // found it! | |
} | |
// this should be the best score we've found so far for this position | |
let best_score = scores[&pos]; | |
if score > best_score { | |
panic!("loop in DAG scores: {} < {}", score, best_score); | |
} else if score < best_score { | |
continue; // we've been here before, but better | |
} | |
eval_count += 1; | |
// explore successor states | |
let (ai,bi) = pos; | |
// indel | |
if ai < a.len() { | |
add_state(&mut scores, &mut q, &goal, score + INDEL, (ai+1, bi)); | |
} | |
if bi < b.len() { | |
add_state(&mut scores, &mut q, &goal, score + INDEL, (ai, bi+1)); | |
} | |
if ai < a.len() && bi < b.len() { | |
let diag_score = if a[ai] == b[bi] { | |
MATCH | |
} else { | |
MISMATCH | |
}; | |
add_state(&mut scores, &mut q, &goal, score + diag_score, (ai+1, bi+1)); | |
} | |
} | |
} | |
#[cfg(test)] | |
mod tests { | |
use super::*; | |
#[test] | |
fn it_works() { | |
let a = "GCATGCU"; | |
let b = "GATTACA"; | |
let score = alignment_score(a.as_bytes(), b.as_bytes()); | |
assert_eq!(2*MISMATCH+4*MATCH+2*INDEL, score); | |
} | |
#[test] | |
fn heuristic_1() { | |
let tests = vec![ | |
((0,0), (4,4), 0*INDEL+4*MATCH), | |
((4,3), (4,7), 4*INDEL), | |
((1,0), (4,4), 1*INDEL+3*MATCH), | |
((1,2), (4,4), 1*INDEL+2*MATCH), | |
]; | |
for (pos, goal, correct) in tests.into_iter() { | |
assert_eq!(correct, heuristic(&pos, &goal), "{:?} to {:?}", pos, goal); | |
} | |
} | |
} |