-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathlib.rs
140 lines (122 loc) · 4.42 KB
/
lib.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
use std::collections::hash_map::{HashMap, Entry};
use std::collections::binary_heap::BinaryHeap;
use std::cmp::max;
type Pos = (usize,usize);
pub type Score = i32;
/// The score for inserting or deleting a character, higher is better
pub const INDEL: Score = -1;
/// The score for two characters not changing
pub const MATCH: Score = 0;
/// The score for one character changing into another
pub const MISMATCH: Score = -1;
fn diag(pos: &Pos) -> i32 {
(pos.1 as i32) - (pos.0 as i32)
}
/// find the distance from the diagonal of `goal` times INDEL
fn heuristic(pos: &Pos, goal: &Pos) -> Score {
let goal_diag = diag(goal);
let our_diag = diag(pos);
let indel_dist = (our_diag - goal_diag).abs();
// support non-zero MATCH cost
let total_dist = max(goal.0 - pos.0, goal.1 - pos.1) as i32;
let match_dist = total_dist - indel_dist;
return (indel_dist * INDEL) + (match_dist * MATCH);
}
/// Explore a successor state if we aren't already set to explore it
fn add_state(scores: &mut HashMap<Pos,Score>, q: &mut BinaryHeap<(Score,Pos)>, goal: &Pos, new_score: Score, pos: Pos) {
match scores.entry(pos) {
Entry::Vacant(e) => { e.insert(new_score); },
Entry::Occupied(ref mut e) => {
if *(e.get()) >= new_score { return; } // obsolete
e.insert(new_score);
}
}
let priority = new_score + heuristic(&pos, goal);
q.push((priority, pos));
}
/// Helper function to print an ASCII representation of which grid cells were explored
#[allow(dead_code)]
fn debug_print_table(scores: &HashMap<Pos,Score>, size: Pos) {
let (am, bm) = size;
for ai in 0..am {
for bi in 0..bm {
let inside = scores.contains_key(&(ai,bi));
let chr = if inside { 'X' } else { ' ' };
print!("{}", chr);
}
println!("");
}
}
/// Find the score for aligning `a` and `b` based on the costs given by INDEL,
/// MATCH and MISMATCH. If INDEL and MISMATCH are -1 and MATCH is 0 then the
/// result is the negative of the Levenshtein distance of the two inputs.
pub fn alignment_score(a: &[u8], b: &[u8]) -> Score {
let start = (0,0);
let goal = (a.len(),b.len());
let mut q = BinaryHeap::with_capacity(a.len()+b.len());
q.push((heuristic(&start, &goal), start));
let mut scores: HashMap<Pos,Score> = HashMap::with_capacity(a.len()+b.len());
scores.insert(start,0);
let mut pop_count: usize = 0;
let mut eval_count: usize = 0;
loop {
let (score, pos) = q.pop().expect("should always reach goal");
let score = score - heuristic(&pos, &goal);
pop_count += 1;
if pos == goal {
println!("scores: {:?}", scores.len());
println!("pops: {:?}", pop_count);
println!("evals: {:?}", eval_count);
// debug_print_table(&scores, goal.clone());
return score; // found it!
}
// this should be the best score we've found so far for this position
let best_score = scores[&pos];
if score > best_score {
panic!("loop in DAG scores: {} < {}", score, best_score);
} else if score < best_score {
continue; // we've been here before, but better
}
eval_count += 1;
// explore successor states
let (ai,bi) = pos;
// indel
if ai < a.len() {
add_state(&mut scores, &mut q, &goal, score + INDEL, (ai+1, bi));
}
if bi < b.len() {
add_state(&mut scores, &mut q, &goal, score + INDEL, (ai, bi+1));
}
if ai < a.len() && bi < b.len() {
let diag_score = if a[ai] == b[bi] {
MATCH
} else {
MISMATCH
};
add_state(&mut scores, &mut q, &goal, score + diag_score, (ai+1, bi+1));
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn it_works() {
let a = "GCATGCU";
let b = "GATTACA";
let score = alignment_score(a.as_bytes(), b.as_bytes());
assert_eq!(2*MISMATCH+4*MATCH+2*INDEL, score);
}
#[test]
fn heuristic_1() {
let tests = vec![
((0,0), (4,4), 0*INDEL+4*MATCH),
((4,3), (4,7), 4*INDEL),
((1,0), (4,4), 1*INDEL+3*MATCH),
((1,2), (4,4), 1*INDEL+2*MATCH),
];
for (pos, goal, correct) in tests.into_iter() {
assert_eq!(correct, heuristic(&pos, &goal), "{:?} to {:?}", pos, goal);
}
}
}