make unicode_cast more generic

unicode_cast is now a template function that works with different types of containers. Especialy it now works with both std::wstring and vector<wchar_t> Conflicts: src/gui/auxiliary/event/handler.cpp
wesnoth · Oct 26, 2014 · cdd35a4 · cdd35a4
1 parent abfb75d
commit cdd35a4
Show file tree

Hide file tree

Showing 5 changed files with 405 additions and 269 deletions.
diff --git a/src/serialization/ucs4_convert_impl.hpp b/src/serialization/ucs4_convert_impl.hpp
@@ -0,0 +1,233 @@
+/*
+   Copyright (C) 2003 - 2014 by David White <dave@whitevine.net>
+   Part of the Battle for Wesnoth Project http://www.wesnoth.org/
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY.
+
+   See the COPYING file for more details.
+*/
+
+#ifndef SERIALIZATION_UCS4_CONVERT_IMPL_HPP_INCLUDED
+#define SERIALIZATION_UCS4_CONVERT_IMPL_HPP_INCLUDED
+
+#include "unicode_types.hpp"
+#include "util.hpp"
+#include <cassert>
+
+namespace ucs4_convert_impl
+{
+	struct utf8_impl
+	{
+		static const char* get_name()  { return "utf8"; };
+		static size_t byte_size_from_ucs4_codepoint(ucs4::char_t ch)
+		{
+			if(ch < (1u << 7))
+				return 1;
+			else if(ch < (1u << 11))
+				return 2;
+			else if(ch < (1u << 16))
+				return 3;
+			else if(ch < (1u << 21))
+				return 4;
+			else if(ch < (1u << 26))
+				return 5;
+			else if(ch < (1u << 31))
+				return 6;
+			else
+				throw utf8::invalid_utf8_exception(); // Invalid UCS-4
+		}
+
+		static int byte_size_from_utf8_first(utf8::char_t ch)
+		{
+			if (!(ch & 0x80)) {
+				return 1;  // US-ASCII character, 1 byte
+			}
+			/* first bit set: character not in US-ASCII, multiple bytes
+			* number of set bits at the beginning = bytes per character
+			* e.g. 11110xxx indicates a 4-byte character */
+			int count = count_leading_ones(ch);
+			if (count == 1 || count > 6) {		// count > 4 after RFC 3629
+				throw utf8::invalid_utf8_exception(); // Stop on invalid characters
+			}
+			return count;
+		};
+
+		/**
+			@param out an object to write utf8::char_t. required operations are:
+				1) push(utf8::char_t) to write a single character
+				2) can_push(size_t n) to check whether there is still enough space
+					for n characters.
+			@param ch the ucs4 chracter to write to the stream.
+		*/
+		template<typename writer>
+		static inline void write(writer out, ucs4::char_t ch)
+		{
+			size_t count = byte_size_from_ucs4_codepoint(ch);
+			assert(out.can_push(count));
+			if(count == 1) {
+				out.push(static_cast<utf8::char_t>(ch));
+			} else {
+				for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
+					unsigned char c = (ch >> (6 * j)) & 0x3f;
+					c |= 0x80;
+					if(j == static_cast<int>(count) - 1) {
+						c |= 0xff << (8 - count);
+					}
+					out.push(c);
+				}
+			}
+		}	
+		/**
+			reads an ucs4 character from an utf8 stream
+			@param input an iterator pointing to the first character of a utf8 sequence to read
+			@param end an iterator poinint to the end of teh utf8 sequence to read.
+		*/
+		template<typename iitor_t>
+		static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
+		{
+			assert(input != end);
+			size_t size = byte_size_from_utf8_first(*input);
+
+			uint32_t current_char = static_cast<unsigned char>(*input);
+
+			// Convert the first character
+			if(size != 1) {
+				current_char &= 0xFF >> (size + 1);
+			}
+
+			// Convert the continuation bytes
+			// i == number of '++input'
+			++input;
+			for(size_t i = 1; i < size; ++i, ++input) {
+				// If the string ends occurs within an UTF8-sequence, this is bad.
+				if (input == end)
+					throw utf8::invalid_utf8_exception();
+
+				if ((*input & 0xC0) != 0x80)
+					throw utf8::invalid_utf8_exception();
+
+				current_char = (current_char << 6) | (static_cast<unsigned char>(*input) & 0x3F);
+			}
+			//i == size => input was increased size times.
+
+			// Check for non-shortest-form encoding
+			// This has been forbidden in Unicode 3.1 for security reasons
+			if (size > byte_size_from_ucs4_codepoint(current_char))
+				throw utf8::invalid_utf8_exception();
+			return current_char;
+		}
+	};
+
+	struct utf16_impl
+	{
+		static const char* get_name()  { return "utf16"; };
+		template<typename writer>
+		static inline void write(writer out, ucs4::char_t ch)
+		{
+			const uint32_t bit17 = 0x10000;
+
+			if(ch < bit17)
+			{
+				assert(out.can_push(1));
+				out.push(static_cast<utf16::char_t>(ch));
+			}
+			else 
+			{
+				assert(out.can_push(2));
+				const uint32_t char20 = ch - bit17;
+				assert(char20 < (1 << 20));
+				const ucs4::char_t lead = 0xD800 + (char20 >> 10);
+				const ucs4::char_t trail = 0xDC00 + (char20 & 0x3FF);
+				assert(lead < bit17);
+				assert(trail < bit17);
+				out.push(static_cast<utf16::char_t>(lead));
+				out.push(static_cast<utf16::char_t>(trail));
+			}
+		}
+
+		template<typename iitor_t>
+		static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
+		{
+			const int32_t last10 = 0x3FF;
+			const int32_t type_filter = 0xFC00;
+			const int32_t type_lead = 0xD800;
+			const int32_t type_trail = 0xDC00;
+
+			assert(input != end);
+			uint32_t current_char = static_cast<utf16::char_t>(*input);
+			++input;
+			uint32_t type = current_char & type_filter;
+			if(type == type_trail)
+			{
+				//found trail without head
+				throw utf8::invalid_utf8_exception();
+			}
+			else if(type == type_lead)
+			{
+				if(input == end)
+				{
+					//If the string ends occurs within an UTF16-sequence, this is bad.
+					throw utf8::invalid_utf8_exception();
+				}
+				if((*input & type_filter) != type_trail)
+				{
+					throw utf8::invalid_utf8_exception();
+				}
+				current_char &= last10;
+				current_char <<= 10;
+				current_char += (*input & last10);
+				current_char += 0x10000;
+				++input;
+			}
+			return current_char;
+		}
+	};
+
+	struct utf32_impl
+	{
+		static const char* get_name()  { return "UCS4"; };
+		template<typename writer>
+		static inline void write(writer out, ucs4::char_t ch)
+		{
+			assert(out.can_push(1));
+			out.push(ch);
+		}
+
+		template<typename iitor_t>
+		static inline ucs4::char_t read(iitor_t& input, const iitor_t& end)
+		{
+			assert(input != end);
+			uint32_t current_char = *input;
+			++input;
+			return current_char;
+		}
+	};
+
+	template<typename T_CHAR>
+	struct convert_impl {};
+
+	template<>
+	struct convert_impl<utf8::char_t>
+	{
+		typedef utf8_impl type;
+	};
+
+	template<>
+	struct convert_impl<utf16::char_t>
+	{
+		typedef utf16_impl type;
+	};
+
+	template<>
+	struct convert_impl<ucs4::char_t>
+	{
+		typedef utf32_impl type;
+	};
+}
+
+#endif