Update to Unicode 11 #68

Merged · 6 commits · Oct 30, 2019
37 changes: 26 additions & 11 deletions scripts/unicode.py
@@ -54,13 +54,21 @@
# these are the surrogate codepoints, which are not valid rust characters
surrogate_codepoints = (0xd800, 0xdfff)

+UNICODE_VERSION = (11, 0, 0)
+
+UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
+
def is_surrogate(n):
return surrogate_codepoints[0] <= n <= surrogate_codepoints[1]

def fetch(f):
if not os.path.exists(os.path.basename(f)):
os.system("curl -O http://www.unicode.org/Public/10.0.0/ucd/%s"
% f)
if "emoji" in f:
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
else:
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
% (UNICODE_VERSION_NUMBER, f))

if not os.path.exists(os.path.basename(f)):
sys.stderr.write("cannot load %s" % f)
@@ -262,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
pub use self::%sCat::*;

#[allow(non_camel_case_types)]
-#[derive(Clone, Copy, PartialEq, Eq)]
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum %sCat {
""" % (name, Name, Name))

@@ -305,18 +313,13 @@ def emit_break_module(f, break_table, break_cats, name):
with open(r, "w") as rf:
# write the file's preamble
rf.write(preamble)

-# download and parse all the data
-fetch("ReadMe.txt")
-with open("ReadMe.txt") as readme:
-pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-unicode_version = re.search(pattern, readme.read()).groups()
rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-segmentation is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);
""" % unicode_version)
""" % UNICODE_VERSION)

+# download and parse all the data
gencats = load_gencats("UnicodeData.txt")
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
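Side note on the hunk above: baking the version into `UNICODE_VERSION` and interpolating it into the generated tables replaces the old scrape of `ReadMe.txt`, so the constant the crate exposes always matches the data files the script actually fetched. A minimal consumer-side sketch, assuming the generated constant is re-exported at the crate root (as unicode-segmentation's lib.rs does):

```rust
// Sketch: reading the version triple baked into the generated tables.
extern crate unicode_segmentation;

fn main() {
    let (major, minor, micro) = unicode_segmentation::UNICODE_VERSION;
    println!("tables generated from Unicode {}.{}.{}", major, minor, micro);
    assert!(major >= 11); // after this PR the tables come from Unicode 11.0.0
}
```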

@@ -341,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
grapheme_table = []
for cat in grapheme_cats:
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
+emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
+grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
grapheme_table.sort(key=lambda w: w[0])
-emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
+last = -1
+for chars in grapheme_table:
+if chars[0] <= last:
+raise Exception("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
+last = chars[1]
+emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
rf.write("\n")

word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -352,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, list(word_cats.keys()), "word")

+# There are some emoji which are also ALetter, so this needs to be stored separately
+# For efficiency, we could still merge the two tables and produce an ALetterEP state
+emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
+emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")

sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
for cat in sentence_cats:
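Note on the unicode.py changes above: the grapheme tables can absorb Extended_Pictographic only because, after sorting, none of its ranges overlaps an existing grapheme category (hence the new sanity loop), while for word breaking some Extended_Pictographic codepoints are also ALetter, so the script emits a separate `emoji` module instead. A small Rust sketch of that overlap test over hypothetical range tables (toy data, not the generated ones):

```rust
// A break table here is a sorted list of inclusive codepoint ranges plus a
// category name, mirroring what scripts/unicode.py emits.
type Range = (u32, u32, &'static str);

// True if the two sorted tables can be merged into one table without any
// codepoint landing in two categories -- the property the new sanity loop
// asserts for the grapheme table + Extended_Pictographic.
fn mergeable(a: &[Range], b: &[Range]) -> bool {
    let mut merged: Vec<Range> = a.iter().chain(b.iter()).cloned().collect();
    merged.sort_by_key(|r| r.0);
    merged.windows(2).all(|pair| pair[0].1 < pair[1].0)
}

fn main() {
    // Toy ranges for illustration only (not real UCD data): the second word
    // range and the emoji range cover the same codepoints, so those two
    // tables have to stay separate.
    let word_table = [(0x41, 0x5A, "ALetter"), (0x1F600, 0x1F64F, "ALetter")];
    let emoji_table = [(0x1F600, 0x1F64F, "Extended_Pictographic")];
    assert!(!mergeable(&word_table, &emoji_table));
}
```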
6 changes: 3 additions & 3 deletions scripts/unicode_gen_breaktests.py
@@ -172,7 +172,7 @@ def create_grapheme_data(f):
stype = "&'static [(&'static str, &'static [&'static str])]"
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/GraphemeBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
unicode.emit_table(f, "TEST_DIFF", test_diff, dtype, True, showfun, True)

@@ -187,7 +187,7 @@ def create_words_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/WordBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
@@ -201,7 +201,7 @@ def create_sentence_data(f):

wtype = "&'static [(&'static str, &'static [&'static str])]"
f.write(" // official Unicode test data\n")
f.write(" // http://www.unicode.org/Public/10.0.0/ucd/auxiliary/SentenceBreakTest.txt\n")
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

if __name__ == "__main__":
23 changes: 13 additions & 10 deletions src/grapheme.rs
@@ -147,8 +147,8 @@ enum GraphemeState {
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
Regional,
-// The codepoint after is in the E_Modifier category, so whether it's a boundary
-// depends on pre-context according to GB10.
+// The codepoint after is Extended_Pictographic,
+// so whether it's a boundary depends on pre-context according to GB11.
Emoji,
}

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
(_, GC_ZWJ) => NotBreak, // GB9
(_, GC_SpacingMark) => Extended, // GB9a
(GC_Prepend, _) => Extended, // GB9b
-(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
-(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
-(GC_Extend, GC_E_Modifier) => Emoji, // GB10
-(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
-(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
+(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
(_, _) => Break, // GB999
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {

fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
use tables::grapheme as gr;
-for ch in chunk.chars().rev() {
+let mut iter = chunk.chars().rev();
+if let Some(ch) = iter.next() {
+if gr::grapheme_category(ch) != gr::GC_ZWJ {
+self.decide(true);
+return;
+}
+}
+for ch in iter {
match gr::grapheme_category(ch) {
gr::GC_Extend => (),
-gr::GC_E_Base | gr::GC_E_Base_GAZ => {
+gr::GC_Extended_Pictographic => {
self.decide(false);
return;
}
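For context, the rule this hunk implements is GB11 as redefined in Unicode 11: `Extended_Pictographic Extend* ZWJ × Extended_Pictographic`. That is why the new code first insists the scalar immediately before the position is ZWJ and only then skips Extend codepoints while looking for an Extended_Pictographic. A self-contained sketch of that backward scan, with a toy classifier standing in for the generated tables (the ranges below are illustrative, not the real property data):

```rust
// Toy grapheme categories; the real code uses tables::grapheme.
#[derive(PartialEq)]
enum Cat { ExtendedPictographic, Extend, Zwj, Other }

fn classify(c: char) -> Cat {
    match c {
        '\u{200D}' => Cat::Zwj,                                // ZERO WIDTH JOINER
        '\u{FE0F}' | '\u{1F3FB}'..='\u{1F3FF}' => Cat::Extend, // illustrative Extend subset
        '\u{2600}'..='\u{27BF}' | '\u{1F300}'..='\u{1FAFF}' => Cat::ExtendedPictographic, // illustrative
        _ => Cat::Other,
    }
}

// GB11: do not break before an Extended_Pictographic when the text before the
// candidate boundary matches Extended_Pictographic Extend* ZWJ.
fn gb11_suppresses_break(before: &str) -> bool {
    let mut iter = before.chars().rev();
    if iter.next().map(classify) != Some(Cat::Zwj) {
        return false; // the scalar right before the boundary must be ZWJ
    }
    for c in iter {
        match classify(c) {
            Cat::Extend => continue,                  // skip Extend*
            Cat::ExtendedPictographic => return true, // found the emoji base
            _ => return false,
        }
    }
    false // ran out of pre-context without seeing an Extended_Pictographic
}

fn main() {
    // WOMAN (emoji) followed by ZWJ before the boundary: GB11 suppresses the break.
    assert!(gb11_suppresses_break("\u{1F469}\u{200D}"));
    // A ZWJ with no Extended_Pictographic before it does not.
    assert!(!gb11_suppresses_break("a\u{200D}"));
}
```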
@@ -484,7 +487,7 @@ impl GraphemeCursor {
let mut need_pre_context = true;
match self.cat_after.unwrap() {
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
-gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
+gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
_ => need_pre_context = self.cat_before.is_none(),
}
if need_pre_context {
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -29,7 +29,7 @@
//!
//! let s = "The quick (\"brown\") fox";
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//! assert_eq!(w, b);
//! }
//! ```
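The doc-test change above appears to come from word-break rule WB3d, new in Unicode 11 (`WSegSpace × WSegSpace`): a run of horizontal spaces now stays together as a single segment instead of splitting space by space, which is why the expected slice loses one of its space elements. A quick illustration using the crate's own `split_word_bounds` (the expected segments assume Unicode 11+ tables):

```rust
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // Two adjacent spaces form a single word-bound segment under WB3d.
    let w = "a  b".split_word_bounds().collect::<Vec<&str>>();
    let b: &[_] = &["a", "  ", "b"]; // previously ["a", " ", " ", "b"]
    assert_eq!(w, b);
}
```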
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
/// ```
/// # use self::unicode_segmentation::UnicodeSegmentation;
/// let swu1 = "The quick (\"brown\") fox".split_word_bounds().collect::<Vec<&str>>();
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
///
/// assert_eq!(&swu1[..], b);
/// ```