forked from emk/subtitles-rs
-
Notifications
You must be signed in to change notification settings - Fork 2
/
align.rs
272 lines (247 loc) · 10.1 KB
/
align.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
//! Align two subtitle files.
use std::cmp::Ordering;
use time::MIN_SPACING;
use srt::{Subtitle, SubtitleFile};
use merge::merge_subtitles;
use clean::{strip_formatting, clean_subtitle_file};
use self::MatchQuality::{NoMatch, Nearby, Overlap};
// How well do two subtitles match each other, going solely by the time?
#[derive(PartialEq, Clone, Copy, Debug)]
enum MatchQuality {
NoMatch, // More than two seconds away.
Nearby(f32), // 0.0 <= seconds <= 2.0
Overlap(f32) // 0.0 < seconds
}
impl PartialOrd for MatchQuality {
fn partial_cmp(&self, other: &MatchQuality) -> Option<Ordering> {
match (self, other) {
(&Overlap(v1), &Overlap(v2)) => v1.partial_cmp(&v2),
(&Nearby(v1), &Nearby(v2)) =>
v1.partial_cmp(&v2).map(|c| c.reverse()),
(&NoMatch, &NoMatch) => Some(Ordering::Equal),
(&Overlap(_), _) => Some(Ordering::Greater),
(_, &Overlap(_)) => Some(Ordering::Less),
(&Nearby(_), _) => Some(Ordering::Greater),
(_, &Nearby(_)) => Some(Ordering::Less)
}
}
}
// Calculate the match quality of two subtitles.
fn match_quality(sub1: &Subtitle, sub2: &Subtitle) -> MatchQuality {
match sub1.period.distance(sub2.period) {
Some(distance) if distance > 2.0 => NoMatch,
Some(distance) => Nearby(distance),
None => Overlap(sub1.period.overlap(sub2.period)),
}
}
// Find the index of the best match for `sub` in `candidates`.
fn best_match(sub: &Subtitle, candidates: &Vec<Subtitle>) -> Option<usize> {
let mut best: Option<(usize, MatchQuality)> = None;
for (i, candidate) in candidates.iter().enumerate() {
let mq = match_quality(sub, candidate);
if mq == NoMatch { continue; }
match best {
None => { best = Some((i, mq)); }
Some((_, old_mq)) if mq > old_mq => { best = Some((i, mq)); }
_ => {}
}
}
best.map(|(i, _)| i)
}
// Find the index of the best match each subtitle in `subs` in `candidates`.
fn best_matches(subs: &Vec<Subtitle>, candidates: &Vec<Subtitle>) ->
Vec<Option<usize>>
{
// We could be a lot more efficient about this if we wanted.
subs.iter().map(|s| best_match(s, candidates)).collect()
}
// Make sure the subtitles are in-order and have sensible begin/end times.
//fn clean_subs(file: &SubtitleFile) -> Vec<Subtitle> {
// // Remove subtitles with bogus begin/end times.
// let result = file.subtitles.iter().filter(|s| s.begin < s.end).collect();
// // Sort subtitles by begin time.
// result.sort_by(|a, b| a.begin.cmp(b.begin));
// result
//}
/// Alignment specification, showing how to match up the specified indices
/// in two subtitle files.
type Alignment = Vec<(Vec<usize>, Vec<usize>)>;
// Returns true if `items[i].is_some()` and the value is found in `group`.
// Returns false if `i` is out of bounds.
fn group_contains(group: &Vec<usize>, items: &Vec<Option<usize>>, i: usize) ->
bool
{
if !(i < items.len()) { return false; }
match items[i] {
None => false,
Some(v) => group.iter().position(|e| *e == v).is_some()
}
}
/// Find a good way to align two subtitle files.
fn alignment(file1: &SubtitleFile, file2: &SubtitleFile) -> Alignment {
let (subs1, subs2) = (&file1.subtitles, &file2.subtitles);
// assert!(subs1 && subs2 contain valid subs in ascending order)
let matches1 = best_matches(subs1, subs2);
let matches2 = best_matches(subs2, subs1);
let mut alignment: Alignment = vec!();
let mut i1 = 0;
let mut i2 = 0;
while i1 < subs1.len() && i2 < subs2.len() {
debug!("subs1: {} matches {:?}, subs2: {} matches {:?}",
i1, matches1[i1], i2, matches2[i2]);
if subs1[i1].period.begin() < subs2[i2].period.begin() &&
matches1[i1] != Some(i2)
{
// Subs1 has an item which doesn't match subs2.
debug!("unmatched: [{}], []", i1);
alignment.push((vec!(i1), vec!()));
i1 += 1;
} else if subs2[i2].period.begin() < subs1[i1].period.begin() &&
matches2[i2] != Some(i1)
{
// Subs2 has an item which doesn't match subs1.
debug!("unmatched: [], [{}]", i2);
alignment.push((vec!(), vec!(i2)));
i2 += 1;
} else {
// We have some matches, so let's gather them all together.
assert!(matches1[i1] == Some(i2) || matches2[i2] == Some(i1));
let mut matched1 = vec!(i1); i1 += 1;
let mut matched2 = vec!(i2); i2 += 1;
while group_contains(&matched1, &matches2, i2) ||
group_contains(&matched2, &matches1, i1) {
if group_contains(&matched1, &matches2, i2) {
// i2 matches something in matched1, so add to matched2.
matched2.push(i2); i2 += 1;
} else if group_contains(&matched2, &matches1, i1) {
// i1 matches something in matched2, so add to matched1.
matched1.push(i1); i1 += 1;
}
debug!("grouping: {:?}, {:?}", matched1, matched2);
}
alignment.push((matched1, matched2));
}
}
alignment
}
#[test]
fn test_alignment() {
use std::path::Path;
// Load sample subtitles.
let path_es = Path::new("fixtures/sample.es.srt");
let srt_es = SubtitleFile::from_path(&path_es).unwrap();
let path_en = Path::new("fixtures/sample.en.srt");
let srt_en = SubtitleFile::from_path(&path_en).unwrap();
let expected =
vec!((vec!(0), vec!(0, 1)),
(vec!(), vec!(2)),
(vec!(1), vec!(3)),
(vec!(2), vec!(4)),
(vec!(3), vec!(5, 6)),
(vec!(4), vec!(7)));
assert_eq!(expected, alignment(&srt_es, &srt_en));
}
/// Align two subtitle files, merging subtitles as necessary.
pub fn align_files(file1: &SubtitleFile, file2: &SubtitleFile) ->
Vec<(Option<Subtitle>, Option<Subtitle>)>
{
fn merge(file: &SubtitleFile, indices: &[usize]) -> Option<Subtitle> {
let mut subs = vec!();
for &i in indices.iter() {
subs.push(file.subtitles[i].clone())
}
merge_subtitles(&subs)
}
alignment(file1, file2).iter().map(|&(ref indices1, ref indices2)| {
(merge(file1, &indices1),
merge(file2, &indices2))
}).collect()
}
/// If we have two files, align them. If one is missing, just return its
/// subtitles by themselves.
pub fn align_available_files(file1: &SubtitleFile,
file2_opt: Option<&SubtitleFile>) ->
Vec<(Option<Subtitle>, Option<Subtitle>)>
{
match file2_opt {
Some(ref file2) => align_files(file1, file2),
None => file1.subtitles.iter().cloned().map(|s| {
(Some(s), None)
}).collect(),
}
}
// Clone a subtitle and wrap its lines with formatting.
fn clone_as(sub: &Subtitle, before: &str, after: &str) -> Subtitle {
let lines = sub.lines.iter().map(|l| {
// For now, strip out existing formatting. We'll change this once
// color works.
let cleaned = strip_formatting(&l);
format!("{}{}{}", before, &cleaned, after)
}).collect();
Subtitle{index: sub.index, period: sub.period, lines: lines}
}
static STYLE1B: &'static str = "<font color=\"yellow\">";
static STYLE1E: &'static str = "</font>";
static STYLE2B: &'static str = "<i>";
static STYLE2E: &'static str = "</i>";
/// Combine two subtitle files into an aligned file.
pub fn combine_files(file1: &SubtitleFile, file2: &SubtitleFile)
-> SubtitleFile
{
let mut subs: Vec<Subtitle> = align_files(file1, file2).iter().map(|pair| {
match pair {
&(None, None) => panic!("Shouldn't have empty alignment pair!"),
&(Some(ref sub), None) => clone_as(sub, STYLE1B, STYLE1E),
&(None, Some(ref sub)) => clone_as(sub, STYLE2B, STYLE2E),
&(Some(ref sub1), Some(ref sub2)) => {
let mut new = clone_as(sub1, STYLE1B, STYLE1E);
let to_merge = clone_as(sub2, STYLE2B, STYLE2E);
let mut lines = to_merge.lines.clone();
lines.extend(new.lines);
new.lines = lines;
new
}
}
}).collect();
// Restore our invariants, which may have been thrown off by weird matches.
// TODO: Print out weird matches and analyze.
subs = clean_subtitle_file(&SubtitleFile{subtitles: subs}).unwrap()
.subtitles;
// Extend the time of each sub to account for increased text. We rely
// on clean_subtitle_file to clean up any remaining overlaps.
for i in 0..subs.len() {
debug!("growing: {:?} ({})", subs[i].period, subs[i].plain_text());
let mut wanted = subs[i].period.grow(2.0, 2.0);
if i != 0 {
debug!(" previous: {:?}", subs[i-1].period);
wanted.begin_after(subs[i-1].period.end()).unwrap();
}
if i+1 < subs.len() {
// Extend forward into any unused space, but don't intefere
// with the buffer we want to add before the next subtitle (or
// allow that space to cut back into space which was originally
// ours).
debug!(" next: {:?}", subs[i+1].period);
let limit = (subs[i+1].period.begin() - 2.0)
.max(subs[i].period.end() + MIN_SPACING);
wanted.end_before(limit).unwrap();
}
subs[i].period = wanted;
}
clean_subtitle_file(&SubtitleFile{subtitles: subs}).unwrap()
}
#[test]
fn test_combine_files() {
use difference::assert_diff;
use std::path::Path;
// Load sample subtitles.
let path_es = Path::new("fixtures/sample.es.srt");
let srt_es = SubtitleFile::from_path(&path_es).unwrap();
let path_en = Path::new("fixtures/sample.en.srt");
let srt_en = SubtitleFile::from_path(&path_en).unwrap();
let path_combined = Path::new("fixtures/combined.srt");
let expected = SubtitleFile::from_path(&path_combined).unwrap();
assert_diff(&expected.to_string(),
&combine_files(&srt_es, &srt_en).to_string(),
"\n", 0);
}