forked from emk/subtitles-rs
-
Notifications
You must be signed in to change notification settings - Fork 2
/
clean.rs
115 lines (95 loc) · 2.75 KB
/
clean.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
//! Tools for cleaning up subtitle files and getting them into some
//! kind of normalized format.
use regex::Regex;
use srt::{Subtitle,SubtitleFile};
use errors::*;
/// Strip inline markup tags from a single line of subtitle text.
///
/// Every substring that starts with `<`, followed by a lowercase ASCII
/// letter or `/`, and runs up to the next `>` is deleted.  For example,
/// `<i>` and `</i>` are removed.  The text between tags is left untouched.
pub fn strip_formatting(line: &str) -> String {
    // The pattern is a constant, so compiling it cannot fail at runtime.
    let tag_pattern = Regex::new(r"<[a-z/][^>]*>").unwrap();
    tag_pattern.replace_all(line, "")
}
// Remove closed-caption clutter from one line of subtitle text and
// normalize its whitespace.
fn clean_line(line: &str) -> String {
    // Matches the usual kinds of closed-caption noise:
    //
    //   SPEAKER:            two or more capitals, optional space, colon
    //   ( sound effect )    anything between parentheses
    //   ♪ music ♪           anything between a pair of music notes
    let clutter_pattern =
        Regex::new(r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)").unwrap();
    // Matches any run of one or more whitespace characters.
    let whitespace_run = Regex::new(r"\s+").unwrap();

    // Step 1: delete the clutter.
    let decluttered = clutter_pattern.replace_all(line, "");
    // Step 2: collapse each whitespace run into a single space.
    let collapsed = whitespace_run.replace_all(&decluttered, " ");
    // Step 3: drop leading and trailing whitespace.
    collapsed.trim().to_string()
}
// Clean up a subtitle, or discard it if it looks useless.
//
// Each line is passed through `clean_line`; lines that come back empty
// are dropped.  Returns `None` when no lines survive, otherwise a new
// `Subtitle` with the same `index` and `period` as `sub` and the cleaned
// lines.
fn clean_subtitle(sub: &Subtitle) -> Option<Subtitle> {
    // `clean_line` already returns an owned `String`, so the results can
    // be collected directly without copying them a second time.
    let lines: Vec<String> = sub.lines.iter()
        .map(|l| clean_line(l))
        .filter(|l| !l.is_empty())
        .collect();
    if lines.is_empty() {
        return None;
    }
    Some(Subtitle { index: sub.index, period: sub.period, lines: lines })
}
/// Produce a cleaned-up copy of a subtitle file.
///
/// The following problems are addressed:
///
/// * Out of order subtitles.
/// * Overlapping subtitles.
/// * Sound effects.
/// * Music symbols.
///
/// Subtitles that have no text left after cleaning are omitted, and the
/// remaining ones are renumbered starting from 1.
///
/// Returns an error if adjusting the end of an overlapping subtitle
/// fails.  Panics if two begin times cannot be compared.
pub fn clean_subtitle_file(file: &SubtitleFile) -> Result<SubtitleFile> {
    // Clean each subtitle, keeping only the ones that still have text.
    let mut cleaned: Vec<Subtitle> =
        file.subtitles.iter().filter_map(clean_subtitle).collect();

    // Order by begin time.
    cleaned.sort_by(|lhs, rhs| {
        let lhs_begin = lhs.period.begin();
        let rhs_begin = rhs.period.begin();
        lhs_begin.partial_cmp(&rhs_begin).unwrap()
    });

    // Fix overlaps: each subtitle must end before its successor begins.
    // The range is empty when there are fewer than two subtitles.
    for next in 1..cleaned.len() {
        let limit = cleaned[next].period.begin();
        try!(cleaned[next - 1].period.end_before(limit));
    }

    // Renumber consecutively, starting at 1.
    for (position, sub) in cleaned.iter_mut().enumerate() {
        sub.index = position + 1;
    }

    Ok(SubtitleFile { subtitles: cleaned })
}
// End-to-end check of `clean_subtitle_file`: parses a deliberately messy
// subtitle file, cleans it, and compares the serialized result with the
// expected text.
#[test]
fn test_clean_subtitle_file() {
// Input fixture.  It contains a sound effect in parentheses wrapped in
// <i> tags, subtitles consisting only of a sound effect or of music
// notes, a speaker label ("JOE:"), entries that are out of chronological
// order, and one entry (52) whose end time runs past the start of
// entry 53.
let dirty = SubtitleFile::from_str(r"19
00:01:03,163 --> 00:01:04,664
They've arrived.
( <i>door slams</i> )
20
00:01:07,700 --> 00:01:10,736
( <i>cheering</i> )
21
00:01:12,839 --> 00:01:13,840
♪ ♪
18
00:01:02,328 --> 00:01:03,162
JOE: Hey! ( waves arms )
53
00:02:47,965 --> 00:02:50,684
Out of order.
52
00:02:42,658 --> 00:02:48,865
Overlapping.
").unwrap();
// Expected output.  It starts with a U+FEFF byte-order mark, is sorted
// by begin time and renumbered from 1, omits the two subtitles that had
// no text left, and has the overlapping entry's end time pulled back to
// 00:02:47,964.
let cleaned = "\u{FEFF}1
00:01:02,328 --> 00:01:03,162
Hey!
2
00:01:03,163 --> 00:01:04,664
They've arrived.
3
00:02:42,658 --> 00:02:47,964
Overlapping.
4
00:02:47,965 --> 00:02:50,684
Out of order.
";
// Compare against the cleaned file rendered back to text.
assert_eq!(cleaned, &clean_subtitle_file(&dirty).unwrap().to_string());
}