Skip to content

Commit

Permalink
Improve UTF-8 handling
Browse files Browse the repository at this point in the history
For manipulation of UTF-8 encoded strings (insert, delete, truncate, and
size), conversion to wstring and back was usually necessary. This has
two disadvantages:
* the size of wstring is implementation-defined
* the double conversion is slow

The purpose of this patch is to add functions that handle these tasks
directly on the UTF-8 string, through a layer that translates between
byte offsets and character (codepoint) positions.

This commit applies these functions in
several places to replace the old implementations.

The byte_size_from_utf8_first() implementation will be replaced in a
separate commit as it relies on the count_leading_ones function by 8680.

Signed-off-by: aquileia <sk.aquileia@gmail.com>
  • Loading branch information
aquileia committed Mar 22, 2014
1 parent c928358 commit 52c80db
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 83 deletions.
2 changes: 1 addition & 1 deletion src/dialogs.cpp
Expand Up @@ -938,7 +938,7 @@ std::string load_game_dialog(display& disp, const config& game_config, bool* sel
std::vector<savegame::save_info>::const_iterator i;
for(i = games.begin(); i != games.end(); ++i) {
std::string name = i->name();
utils::truncate_as_wstring(name, std::min<size_t>(name.size(), 40));
utils::u8truncate(name, 40); // truncate only acts if the name is longer

std::ostringstream str;
str << name << COLUMN_SEPARATOR << util::format_time_summary(i->modified());
Expand Down
22 changes: 8 additions & 14 deletions src/gui/dialogs/addon_list.cpp
Expand Up @@ -176,19 +176,16 @@ void taddon_list::pre_show(CVideo& /*video*/, twindow& window)
item["label"] = c["icon"];
data.insert(std::make_pair("icon", item));

std::string tmp = c["name"];
utils::truncate_as_wstring(tmp, 20);
item["label"] = tmp;
utf8_string tmp = c["name"];
item["label"] = utils::u8truncate(tmp, 20);
data.insert(std::make_pair("name", item));

tmp = c["version"].str();
utils::truncate_as_wstring(tmp, 12);
item["label"] = tmp;
item["label"] = utils::u8truncate(tmp, 12);
data.insert(std::make_pair("version", item));

tmp = c["author"].str();
utils::truncate_as_wstring(tmp, 16);
item["label"] = tmp;
item["label"] = utils::u8truncate(tmp, 16);
data.insert(std::make_pair("author", item));

item["label"] = c["downloads"];
Expand All @@ -212,19 +209,16 @@ void taddon_list::create_campaign(tpane& pane, const config& campaign)
item["label"] = campaign["icon"];
data.insert(std::make_pair("icon", item));

std::string tmp = campaign["name"];
utils::truncate_as_wstring(tmp, 20);
item["label"] = tmp;
utf8_string tmp = campaign["name"];
item["label"] = utils::u8truncate(tmp, 20);
data.insert(std::make_pair("name", item));

tmp = campaign["version"].str();
utils::truncate_as_wstring(tmp, 12);
item["label"] = tmp;
item["label"] = utils::u8truncate(tmp, 12);
data.insert(std::make_pair("version", item));

tmp = campaign["author"].str();
utils::truncate_as_wstring(tmp, 16);
item["label"] = tmp;
item["label"] = utils::u8truncate(tmp, 16);
data.insert(std::make_pair("author", item));

item["label"] = campaign["downloads"];
Expand Down
2 changes: 1 addition & 1 deletion src/gui/widgets/password_box.cpp
Expand Up @@ -40,7 +40,7 @@ namespace

size_t get_text_length(const std::string& str)
{
return utils::string_to_wstring(str).size();
return utils::u8size(str);
}

} // namespace
Expand Down
29 changes: 13 additions & 16 deletions src/gui/widgets/text.cpp
Expand Up @@ -142,23 +142,20 @@ void ttext_::insert_char(const Uint16 unicode)

void ttext_::copy_selection(const bool mouse)
{
int length = selection_length_;
unsigned start = selection_start_;

if(length == 0) {
return;
}

if(length < 0) {
length = -length;
start -= length;
if(selection_length_ == 0) return;

unsigned end,start = selection_start_;
const utf8_string txt = text_.text();

if(selection_length_ > 0) {
end = utils::u8index(txt,start+selection_length_);
start = utils::u8index(txt,start);
} else {
// inverse selection: selection_start_ is in fact the end
end = utils::u8index(txt,start);
start = utils::u8index(txt,start+selection_length_);
}

const wide_string& wtext = utils::string_to_wstring(text_.text());
const std::string& text = utils::wstring_to_string(
wide_string(wtext.begin() + start, wtext.begin() + start + length));

copy_to_clipboard(text, mouse);
copy_to_clipboard(txt.substr(start,end-start), mouse);
}

void ttext_::paste_selection(const bool mouse)
Expand Down
6 changes: 1 addition & 5 deletions src/gui/widgets/text_box.cpp
Expand Up @@ -208,11 +208,7 @@ void ttext_box::delete_selection()
start -= len;
}

// Update the text, we need to assume it's a wide string.
wide_string tmp = utils::string_to_wstring(get_value());
tmp.erase(tmp.begin() + start, tmp.begin() + start + len);
const std::string& text = utils::wstring_to_string(tmp);
set_value(text);
set_value(utils::u8erase(get_value(), start, len));
set_cursor(start, false);
}

Expand Down
94 changes: 70 additions & 24 deletions src/serialization/string_utils.cpp
Expand Up @@ -840,26 +840,22 @@ std::vector< std::pair< int, int > > parse_ranges(std::string const &str)
return to_return;
}

static int byte_size_from_utf8_first(unsigned char ch)
static int byte_size_from_utf8_first(const unsigned char ch)
{
int count;

if ((ch & 0x80) == 0)
count = 1;
else if ((ch & 0xE0) == 0xC0)
count = 2;
else if ((ch & 0xF0) == 0xE0)
count = 3;
else if ((ch & 0xF8) == 0xF0)
count = 4;
else if ((ch & 0xFC) == 0xF8)
count = 5;
else if ((ch & 0xFE) == 0xFC)
count = 6;
else
throw invalid_utf8_exception(); // Stop on invalid characters

return count;
if (!(ch & 0x80)) {
return 1; // US-ASCII character, 1 byte
}
/* first bit set: character not in US-ASCII, multiple bytes
* number of set bits at the beginning = bytes per character
* e.g. 11110xxx indicates a 4-byte character */
if (!(ch & 0x40)) throw invalid_utf8_exception();
switch (ch & 0x30) {
case 0x30:
if (ch & 0x08) throw invalid_utf8_exception();
return 4;
case 0x20: return 3;
default: return 2;
}
}

utf8_iterator::utf8_iterator(const std::string& str) :
Expand Down Expand Up @@ -1052,20 +1048,70 @@ utf8_string lowercase(const utf8_string& s)
return s;
}

/**
 * Translates a character position into a byte offset within a UTF-8 string.
 *
 * @param str    UTF-8 encoded string to scan.
 * @param index  Zero-based character position to look up.
 * @return       Byte offset of the index-th character. If the string holds
 *               fewer characters, str.length() is returned. If an invalid
 *               lead byte is met, an error is logged and the offset of that
 *               byte is returned.
 */
unsigned int u8index(const utf8_string& str, const unsigned int index)
{
	// chr counts characters, i is the byte offset of the current character
	// remark: several functions rely on the fallback to str.length()
	const unsigned int len = str.size();
	unsigned int i = 0;
	try {
		for (unsigned int chr = 0; chr < index && i < len; ++chr) {
			i += byte_size_from_utf8_first(str[i]);
		}
	} catch(invalid_utf8_exception&) {
		ERR_GENERAL << "Invalid UTF-8 string.\n";
	}
	// A lead byte at the very end may announce more bytes than are left,
	// which would push i past the string; never report an offset that
	// std::string::erase/insert/substr would reject.
	return i < len ? i : len;
}

/**
 * Counts the characters of a UTF-8 encoded string.
 *
 * The string is walked lead byte by lead byte. When an invalid lead byte is
 * found, an error is logged and the number of characters counted so far is
 * returned.
 */
size_t u8size(const utf8_string& str)
{
	const unsigned int total_bytes = str.size();
	unsigned int offset = 0;
	unsigned int count = 0;
	try {
		while (offset < total_bytes) {
			// skip over the whole encoded character, then count it
			offset += byte_size_from_utf8_first(str[offset]);
			++count;
		}
	} catch(invalid_utf8_exception&) {
		ERR_GENERAL << "Invalid UTF-8 string.\n";
	}
	return count;
}

/**
 * Inserts a UTF-8 string in front of the pos-th character of str.
 *
 * @return reference to the modified str.
 */
utf8_string& u8insert(utf8_string& str, const size_t pos, const utf8_string& insert)
{
	// std::string works on bytes, so map the character position first
	const unsigned int byte_offset = u8index(str, pos);
	str.insert(byte_offset, insert);
	return str;
}

/**
 * Erases len characters, beginning at character position start, from a
 * UTF-8 encoded string.
 *
 * @param str    String to modify in place.
 * @param start  Zero-based position of the first character to remove; if it
 *               lies beyond the character count, str is left untouched.
 * @param len    Number of characters to remove; std::string::npos removes
 *               everything from start to the end of the string.
 * @return       Reference to the modified str.
 */
utf8_string& u8erase(utf8_string& str, const size_t start, const size_t len)
{
	if (start > u8size(str)) return str;

	// Byte offset of the first character to remove, kept inside the string
	// so that std::string::erase cannot throw std::out_of_range.
	const size_t bytes = str.size();
	size_t pos = u8index(str, start);
	if (pos > bytes) pos = bytes;

	if (len == std::string::npos) {
		// without second argument, std::string::erase truncates
		return str.erase(pos);
	}

	// start + len could wrap around, or be cut off when narrowed to the
	// unsigned int that u8index expects; saturate instead. start cannot
	// exceed max_chars here because it passed the u8size check above.
	const size_t max_chars = static_cast<unsigned int>(-1);
	const size_t last = (len > max_chars - start) ? max_chars : start + len;
	const size_t end = u8index(str, last);
	return str.erase(pos, end - pos);
}

/**
 * Cuts a UTF-8 encoded string down to at most size characters.
 *
 * @return reference to the modified str.
 */
utf8_string& u8truncate(utf8_string& str, const size_t size)
{
	// erasing from character `size` without a length limit drops the tail
	u8erase(str, size, std::string::npos);
	return str;
}

void truncate_as_wstring(std::string& str, const size_t size)
{
wide_string utf8_str = utils::string_to_wstring(str);
if(utf8_str.size() > size) {
utf8_str.resize(size);
str = utils::wstring_to_string(utf8_str);
wide_string wide = utils::string_to_wstring(str);
if(wide.size() > size) {
wide.resize(size);
str = utils::wstring_to_string(wide);
}
}

void ellipsis_truncate(std::string& str, const size_t size)
{
const size_t prev_size = str.length();

truncate_as_wstring(str, size);
u8truncate(str, size);

if(str.length() != prev_size) {
str += ellipsis;
Expand Down
24 changes: 24 additions & 0 deletions src/serialization/string_utils.hpp
Expand Up @@ -358,6 +358,30 @@ std::string wchar_to_string(const wchar_t);
/** Returns a lowercased version of the string. */
utf8_string lowercase(const utf8_string&);

/**
 * Byte offset of the index-th character (zero-based) in a UTF-8 encoded
 * string. If there are fewer than index characters, returns str.length().
 */
unsigned int u8index(const utf8_string& str, const unsigned int index);

/** Length, in characters rather than bytes, of a UTF-8 encoded string. */
size_t u8size(const utf8_string& str);

/**
 * Inserts the string insert at character position pos into a UTF-8 encoded
 * string. str is modified in place and returned by reference.
 */
utf8_string& u8insert(utf8_string& str, const size_t pos, const utf8_string& insert);

/**
 * Erases len characters at character position start from a UTF-8 encoded
 * string. With the default len (std::string::npos) everything from start to
 * the end is removed. str is modified in place and returned by reference.
 * This implementation doesn't check for valid UTF-8, don't use for user input.
 */
utf8_string& u8erase(utf8_string& str, const size_t start, const size_t len = std::string::npos);

/**
 * Truncates a UTF-8 encoded string after size characters.
 * str is modified in place and returned by reference.
 * This implementation doesn't check for valid UTF-8, don't use for user input.
 */
utf8_string& u8truncate(utf8_string& str, const size_t size);

/**
* Truncates a string.
*
Expand Down
8 changes: 8 additions & 0 deletions src/tests/test_serialization.cpp
Expand Up @@ -33,5 +33,13 @@ BOOST_AUTO_TEST_CASE( utils_join_test )
fruit.push_back("lemons");

BOOST_CHECK( utils::join(fruit) == "apples,oranges,lemons" );

utf8_string unicode = "ünicod€ check";
BOOST_CHECK( u8size(unicode) == 13 );

int euro = u8index(unicode,6);
BOOST_CHECK( str.substr(euro,u8index(unicode,7)-euro) == "" );

BOOST_CHECK( u8truncate(unicode,3) == "üni");
}

40 changes: 18 additions & 22 deletions src/text.cpp
Expand Up @@ -181,11 +181,22 @@ bool ttext::is_truncated() const

unsigned ttext::insert_text(const unsigned offset, const std::string& text)
{
if(text.empty()) {
if (text.empty() || length_ == maximum_length_) {
return 0;
}

return insert_unicode(offset, utils::string_to_wstring(text));
// do we really need that assert? u8insert will just append in this case, which seems fine
assert(offset <= length_);

unsigned len = utils::u8size(text);
if (length_ + len > maximum_length_) {
len = maximum_length_ - length_;
}
const utf8_string insert = text.substr(0, utils::u8index(text, len));
utf8_string tmp = text_;
set_text(utils::u8insert(tmp, offset, insert), false);
// report back how many characters were actually inserted (e.g. to move the cursor selection)
return len;
}

bool ttext::insert_unicode(const unsigned offset, const wchar_t unicode)
Expand All @@ -195,21 +206,8 @@ bool ttext::insert_unicode(const unsigned offset, const wchar_t unicode)

unsigned ttext::insert_unicode(const unsigned offset, const wide_string& unicode)
{
assert(offset <= length_);

if(length_ == maximum_length_) {
return 0;
}

const unsigned len = length_ + unicode.size() > maximum_length_
? maximum_length_ - length_ : unicode.size();

wide_string tmp = utils::string_to_wstring(text_);
tmp.insert(tmp.begin() + offset, unicode.begin(), unicode.begin() + len);

set_text(utils::wstring_to_string(tmp), false);

return len;
const utf8_string insert = utils::wstring_to_string(unicode);
return insert_text(offset, insert);
}

gui2::tpoint ttext::get_cursor_position(
Expand Down Expand Up @@ -449,10 +447,8 @@ ttext& ttext::set_maximum_length(const size_t maximum_length)
if(maximum_length != maximum_length_) {
maximum_length_ = maximum_length;
if(length_ > maximum_length_) {

wide_string tmp = utils::string_to_wstring(text_);
tmp.resize(maximum_length_);
set_text(utils::wstring_to_string(tmp), false);
utf8_string tmp = text_;
set_text(utils::u8truncate(tmp, maximum_length_), false);
}
}

Expand Down Expand Up @@ -598,7 +594,7 @@ struct decode_table
}
};

static decode_table decode_table;
static struct decode_table decode_table;


#ifndef _WIN32
Expand Down

0 comments on commit 52c80db

Please sign in to comment.