Improve UTF-8 handling

For manipulation of UTF-8 encoded strings (insert, delete, truncate, and size), conversion to wstring and back was usually necessary. This has two disadvantages:

* the size of wstring is implementation-defined
* the double conversion is slow

The purpose of this patch is to add functions that handle these tasks by means of a layer that handles the different indexing between codepoints and characters. This commit applies these functions in several places to replace the old implementations.

The byte_size_from_utf8_first() implementation will be replaced in a separate commit, as it relies on the count_leading_ones function provided by 8680.

Signed-off-by: aquileia <sk.aquileia@gmail.com>
wesnoth · Mar 22, 2014 · 52c80db · 52c80db
1 parent c928358
commit 52c80db
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 83 deletions.
diff --git a/src/dialogs.cpp b/src/dialogs.cpp
@@ -938,7 +938,7 @@ std::string load_game_dialog(display& disp, const config& game_config, bool* sel
 	std::vector<savegame::save_info>::const_iterator i;
 	for(i = games.begin(); i != games.end(); ++i) {
 		std::string name = i->name();
-		utils::truncate_as_wstring(name, std::min<size_t>(name.size(), 40));
+		utils::u8truncate(name, 40);	// truncate only acts if the name is longer
 
 		std::ostringstream str;
 		str << name << COLUMN_SEPARATOR << util::format_time_summary(i->modified());

diff --git a/src/gui/dialogs/addon_list.cpp b/src/gui/dialogs/addon_list.cpp
@@ -176,19 +176,16 @@ void taddon_list::pre_show(CVideo& /*video*/, twindow& window)
 			item["label"] = c["icon"];
 			data.insert(std::make_pair("icon", item));
 
-			std::string tmp = c["name"];
-			utils::truncate_as_wstring(tmp, 20);
-			item["label"] = tmp;
+			utf8_string tmp = c["name"];
+			item["label"] = utils::u8truncate(tmp, 20);
 			data.insert(std::make_pair("name", item));
 
 			tmp = c["version"].str();
-			utils::truncate_as_wstring(tmp, 12);
-			item["label"] = tmp;
+			item["label"] = utils::u8truncate(tmp, 12);
 			data.insert(std::make_pair("version", item));
 
 			tmp = c["author"].str();
-			utils::truncate_as_wstring(tmp, 16);
-			item["label"] = tmp;
+			item["label"] = utils::u8truncate(tmp, 16);
 			data.insert(std::make_pair("author", item));
 
 			item["label"] = c["downloads"];
@@ -212,19 +209,16 @@ void taddon_list::create_campaign(tpane& pane, const config& campaign)
 	item["label"] = campaign["icon"];
 	data.insert(std::make_pair("icon", item));
 
-	std::string tmp = campaign["name"];
-	utils::truncate_as_wstring(tmp, 20);
-	item["label"] = tmp;
+	utf8_string tmp = campaign["name"];
+	item["label"] = utils::u8truncate(tmp, 20);
 	data.insert(std::make_pair("name", item));
 
 	tmp = campaign["version"].str();
-	utils::truncate_as_wstring(tmp, 12);
-	item["label"] = tmp;
+	item["label"] = utils::u8truncate(tmp, 12);
 	data.insert(std::make_pair("version", item));
 
 	tmp = campaign["author"].str();
-	utils::truncate_as_wstring(tmp, 16);
-	item["label"] = tmp;
+	item["label"] = utils::u8truncate(tmp, 16);
 	data.insert(std::make_pair("author", item));
 
 	item["label"] = campaign["downloads"];

diff --git a/src/gui/widgets/password_box.cpp b/src/gui/widgets/password_box.cpp
@@ -40,7 +40,7 @@ namespace
 
 size_t get_text_length(const std::string& str)
 {
-	return utils::string_to_wstring(str).size();
+	return utils::u8size(str);
 }
 
 } // namespace

diff --git a/src/gui/widgets/text.cpp b/src/gui/widgets/text.cpp
@@ -142,23 +142,20 @@ void ttext_::insert_char(const Uint16 unicode)
 
 void ttext_::copy_selection(const bool mouse)
 {
-	int length = selection_length_;
-	unsigned start = selection_start_;
-
-	if(length == 0) {
-		return;
-	}
-
-	if(length < 0) {
-		length = -length;
-		start -= length;
+	if(selection_length_ == 0) return;
+
+	unsigned end,start = selection_start_;
+	const utf8_string txt = text_.text();
+
+	if(selection_length_  > 0) {
+		end   = utils::u8index(txt,start+selection_length_);
+		start = utils::u8index(txt,start);
+	} else {
+		// inverse selection: selection_start_ is in fact the end
+		end   = utils::u8index(txt,start);
+		start = utils::u8index(txt,start+selection_length_);
 	}
-
-	const wide_string& wtext = utils::string_to_wstring(text_.text());
-	const std::string& text = utils::wstring_to_string(
-			wide_string(wtext.begin() + start, wtext.begin() + start + length));
-
-	copy_to_clipboard(text, mouse);
+	copy_to_clipboard(txt.substr(start,end-start), mouse);
 }
 
 void ttext_::paste_selection(const bool mouse)

diff --git a/src/gui/widgets/text_box.cpp b/src/gui/widgets/text_box.cpp
@@ -208,11 +208,7 @@ void ttext_box::delete_selection()
 		start -= len;
 	}
 
-	// Update the text, we need to assume it's a wide string.
-	wide_string tmp = utils::string_to_wstring(get_value());
-	tmp.erase(tmp.begin() + start, tmp.begin() + start + len);
-	const std::string& text = utils::wstring_to_string(tmp);
-	set_value(text);
+	set_value(utils::u8erase(get_value(), start, len));
 	set_cursor(start, false);
 }
 

diff --git a/src/serialization/string_utils.cpp b/src/serialization/string_utils.cpp
@@ -840,26 +840,22 @@ std::vector< std::pair< int, int > > parse_ranges(std::string const &str)
 	return to_return;
 }
 
-static int byte_size_from_utf8_first(unsigned char ch)
+static int byte_size_from_utf8_first(const unsigned char ch)
 {
-	int count;
-
-	if ((ch & 0x80) == 0)
-		count = 1;
-	else if ((ch & 0xE0) == 0xC0)
-		count = 2;
-	else if ((ch & 0xF0) == 0xE0)
-		count = 3;
-	else if ((ch & 0xF8) == 0xF0)
-		count = 4;
-	else if ((ch & 0xFC) == 0xF8)
-		count = 5;
-	else if ((ch & 0xFE) == 0xFC)
-		count = 6;
-	else
-		throw invalid_utf8_exception(); // Stop on invalid characters
-
-	return count;
+	if (!(ch & 0x80)) {
+		return 1;  // US-ASCII character, 1 byte
+	}
+	/* first bit set: character not in US-ASCII, multiple bytes
+	 * number of set bits at the beginning = bytes per character
+	 * e.g. 11110xxx indicates a 4-byte character */
+	if (!(ch & 0x40)) throw invalid_utf8_exception();
+	switch (ch & 0x30) {
+	case 0x30:
+		if (ch & 0x08) throw invalid_utf8_exception();
+		return 4;
+	case 0x20: return 3;
+	default: return 2;
+	}
 }
 
 utf8_iterator::utf8_iterator(const std::string& str) :
@@ -1052,20 +1048,70 @@ utf8_string lowercase(const utf8_string& s)
 	return s;
 }
 
+unsigned int u8index(const utf8_string& str, const unsigned int index)
+{
+	// chr counts characters, i is the codepoint index
+	// remark: several functions rely on the fallback to str.length()
+	unsigned int chr, i = 0, len = str.size();
+	try {
+		for (chr=0; chr<index && i<len; ++chr) {
+			i += byte_size_from_utf8_first(str[i]);
+		}
+	} catch(invalid_utf8_exception&) {
+		ERR_GENERAL << "Invalid UTF-8 string.\n";
+	}
+	return i;
+}
+
+size_t u8size(const utf8_string& str)
+{
+	unsigned int chr, i = 0, len = str.size();
+	try {
+		for (chr=0; i<len; ++chr) {
+			i += byte_size_from_utf8_first(str[i]);
+		}
+	} catch(invalid_utf8_exception&) {
+		ERR_GENERAL << "Invalid UTF-8 string.\n";
+	}
+	return chr;
+}
+
+utf8_string& u8insert(utf8_string& str, const size_t pos, const utf8_string& insert)
+{
+	return str.insert(u8index(str, pos), insert);
+}
+
+utf8_string& u8erase(utf8_string& str, const size_t start, const size_t len)
+{
+	if (start > u8size(str)) return str;
+	unsigned pos = u8index(str, start);
+	if (len == std::string::npos) {
+		// without second argument, std::string::erase truncates
+		return str.erase(pos);
+	} else {
+		return str.erase(pos, u8index(str,start+len) - pos);
+	}
+}
+
+utf8_string& u8truncate(utf8_string& str, const size_t size)
+{
+	return u8erase(str, size);
+}
+
 void truncate_as_wstring(std::string& str, const size_t size)
 {
-	wide_string utf8_str = utils::string_to_wstring(str);
-	if(utf8_str.size() > size) {
-		utf8_str.resize(size);
-		str = utils::wstring_to_string(utf8_str);
+	wide_string wide = utils::string_to_wstring(str);
+	if(wide.size() > size) {
+		wide.resize(size);
+		str = utils::wstring_to_string(wide);
 	}
 }
 
 void ellipsis_truncate(std::string& str, const size_t size)
 {
 	const size_t prev_size = str.length();
 
-	truncate_as_wstring(str, size);
+	u8truncate(str, size);
 
 	if(str.length() != prev_size) {
 		str += ellipsis;

diff --git a/src/serialization/string_utils.hpp b/src/serialization/string_utils.hpp
@@ -358,6 +358,30 @@ std::string wchar_to_string(const wchar_t);
 /** Returns a lowercased version of the string. */
 utf8_string lowercase(const utf8_string&);
 
+/**
+ * index of the ...th character in an UTF-8 encoded string
+ * if there are less than index characters, return str.length()
+ */
+unsigned int u8index(const utf8_string& str, const unsigned int index);
+
+/** length in characters of an UTF-8 encoded string */
+size_t u8size(const utf8_string& str);
+
+/** insert at position pos into an UTF-8 encoded string */
+utf8_string& u8insert(utf8_string& str, const size_t pos, const utf8_string& insert);
+
+/**
+ * erase len characters at position start from an UTF-8 encoded string
+ * this implementation doesn't check for valid UTF-8, don't use for user input
+ */
+utf8_string& u8erase(utf8_string& str, const size_t start, const size_t len = std::string::npos);
+
+/**
+* truncate an UTF-8 encoded string after size characters
+* this implementation doesn't check for valid UTF-8, don't use for user input
+*/
+utf8_string& u8truncate(utf8_string& str, const size_t size);
+
 /**
  * Truncates a string.
  *

diff --git a/src/tests/test_serialization.cpp b/src/tests/test_serialization.cpp
@@ -33,5 +33,13 @@ BOOST_AUTO_TEST_CASE( utils_join_test )
 	fruit.push_back("lemons");
 
 	BOOST_CHECK( utils::join(fruit) == "apples,oranges,lemons" );
+
+	utf8_string unicode = "ünicod€ check";
+	BOOST_CHECK( u8size(unicode) == 13 );
+
+	int euro = u8index(unicode,6);
+	BOOST_CHECK( str.substr(euro,u8index(unicode,7)-euro) == "€" );
+
+	BOOST_CHECK( u8truncate(unicode,3) == "üni");
 }
 
diff --git a/src/text.cpp b/src/text.cpp
@@ -181,11 +181,22 @@ bool ttext::is_truncated() const
 
 unsigned ttext::insert_text(const unsigned offset, const std::string& text)
 {
-	if(text.empty()) {
+	if (text.empty() || length_ == maximum_length_) {
 		return 0;
 	}
 
-	return insert_unicode(offset, utils::string_to_wstring(text));
+	// do we really need that assert? u8insert will just append in this case, which seems fine
+	assert(offset <= length_);
+
+	unsigned len = utils::u8size(text);
+	if (length_ + len > maximum_length_) {
+		len = maximum_length_ - length_;
+	}
+	const utf8_string insert = text.substr(0, utils::u8index(text, len));
+	utf8_string tmp = text_;
+	set_text(utils::u8insert(tmp, offset, insert), false);
+	// report back how many characters were actually inserted (e.g. to move the cursor selection)
+	return len;
 }
 
 bool ttext::insert_unicode(const unsigned offset, const wchar_t unicode)
@@ -195,21 +206,8 @@ bool ttext::insert_unicode(const unsigned offset, const wchar_t unicode)
 
 unsigned ttext::insert_unicode(const unsigned offset, const wide_string& unicode)
 {
-	assert(offset <= length_);
-
-	if(length_ == maximum_length_) {
-		return 0;
-	}
-
-	const unsigned len = length_ + unicode.size() > maximum_length_
-		? maximum_length_ - length_  : unicode.size();
-
-	wide_string tmp = utils::string_to_wstring(text_);
-	tmp.insert(tmp.begin() + offset, unicode.begin(), unicode.begin() + len);
-
-	set_text(utils::wstring_to_string(tmp), false);
-
-	return len;
+	const utf8_string insert = utils::wstring_to_string(unicode);
+	return insert_text(offset, insert);
 }
 
 gui2::tpoint ttext::get_cursor_position(
@@ -449,10 +447,8 @@ ttext& ttext::set_maximum_length(const size_t maximum_length)
 	if(maximum_length != maximum_length_) {
 		maximum_length_ = maximum_length;
 		if(length_ > maximum_length_) {
-
-			wide_string tmp = utils::string_to_wstring(text_);
-			tmp.resize(maximum_length_);
-			set_text(utils::wstring_to_string(tmp), false);
+			utf8_string tmp = text_;
+			set_text(utils::u8truncate(tmp, maximum_length_), false);
 		}
 	}
 
@@ -598,7 +594,7 @@ struct decode_table
 	}
 };
 
-static decode_table decode_table;
+static struct decode_table decode_table;
 
 
 #ifndef _WIN32