Skip to content

Commit

Permalink
Custom Binary Search; #529
Browse files Browse the repository at this point in the history
  • Loading branch information
the-moisrex committed Apr 11, 2024
1 parent 54b2c0a commit 8cae92c
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 31 deletions.
28 changes: 26 additions & 2 deletions tests/idna_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "../webpp/uri/uri.hpp"
#include "common/tests_common_pch.hpp"

// NOLINTBEGIN(*-magic-numbers)
using namespace webpp;

using Types =
Expand Down Expand Up @@ -77,10 +78,33 @@ TYPED_TEST(IDNATests, LabelSeparators) {
}
}

TYPED_TEST(IDNATests, MappingFindAlgorithmTest) {
TEST(BasicIDNATests, MappingFindAlgorithmTest) {
// 'A' should be mapped to 'a'
auto const pos = uri::idna::find_mapping_byte('A');
EXPECT_EQ(*pos, 2'147'483'713ULL)
<< "Position of the iterator: " << stl::distance(uri::idna::details::idna_mapping_table.begin(), pos);
<< "Position of the iterator: " << stl::distance(uri::idna::details::idna_mapping_table.begin(), pos)
<< "\nRange Start Character: " << (*pos & ~uri::idna::details::disallowed_mask);
EXPECT_EQ(*stl::next(pos), 'a');


// FD97..FD98 ; mapped ; 0646 062C 0645
// FD9E ; mapped ; 0628 062E 064A
// 'FD98' should be mapped to '0646 062C 0645'
auto mapped_pos = uri::idna::find_mapping_byte(0xFD98UL);
EXPECT_EQ(*mapped_pos, (0xFD97UL | uri::idna::details::mapped_mask | ((0xFD98UL - 0xFD97UL) << 24U)))
<< "Position of the iterator: "
<< stl::distance(uri::idna::details::idna_mapping_table.begin(), mapped_pos)
<< "\nRange Start Character: " << (*mapped_pos & ~uri::idna::details::disallowed_mask);
EXPECT_EQ(*(++mapped_pos), 0x0646);
EXPECT_EQ(*(++mapped_pos), 0x062C);
EXPECT_EQ(*(++mapped_pos), 0x0645);
}

// Checks the single-character mapping entry point: perform_mapping must accept 'A'
// (return true) and leave the mapped form "a" in the output string.
// NOTE(review): the append step inside perform_mapping is still a commented-out todo in this
// commit, so the second expectation presumably cannot pass yet -- confirm.
TEST(BasicIDNATests, PerformMappingTest) {
// 'A' should be mapped to 'a'
std::string out;
EXPECT_TRUE(uri::idna::perform_mapping('A', out));
EXPECT_EQ(out, "a");
}

// NOLINTEND(*-magic-numbers)
13 changes: 4 additions & 9 deletions webpp/uri/authority.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,10 @@ namespace webpp::uri {


webpp_static_constexpr ascii_bitmap interesting_characters =
[]() consteval noexcept -> ascii_bitmap {
if (!IsSpecial) {
return forbidden_hosts;
}
if (ctx_type::is_modifiable) {
return ascii_bitmap{forbidden_domains, ascii_bitmap{UPPER_ALPHA<char>}};
}
return forbidden_domains;
}();
!IsSpecial
? forbidden_hosts
: (ctx_type::is_modifiable ? ascii_bitmap{forbidden_domains, ascii_bitmap{UPPER_ALPHA<char>}}
: forbidden_domains);

auto const authority_begin = ctx.pos;
auto host_begin = authority_begin;
Expand Down
15 changes: 14 additions & 1 deletion webpp/uri/idna/details/generate_idna_mapping_table.js
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ class MapTable extends TableTraits {
disallowedMask = 0xFF000000;
mappedMask = 0x80000000;
lengthLimit = 126; // We have 7 bits, but 0xFF would equal to disallowedMask
endingCodePoint = 0xFFFFFFFF; // this.disallowedMask | 0x00FFFFFF;

constructor(max) {
super(max, uint32);
Expand Down Expand Up @@ -324,6 +325,13 @@ class MapTable extends TableTraits {
++this.index;
}

/// Call this method when you're done with adding things to the table.
finish() {
// add a valid first-byte at the end for simplifying the algorithm, so we
// won't have to check the length of the array while performing mapping.
this.bytes[this.index++] = this.endingCodePoint;
}

get length() { return this.index; }

serializeTable(appendFunc, cols = 20 - this.sizeof) {
Expand All @@ -334,7 +342,9 @@ class MapTable extends TableTraits {
const is_first_byte = (byte >>> 31) === 0b1;
const is_disallowed = (byte >>> 24) === (this.disallowedMask >>> 24);
appendFunc(`${byte}${postfix} `);
if (is_disallowed) {
if (byte === this.endingCodePoint) {
appendFunc('/* Ending Code Point */');
} else if (is_disallowed) {
appendFunc('/* Disallowed */');
} else if (is_first_byte) {
if ((pos + 1 !== this.length) && (this.bytes[pos + 1] >>> 31) === 0b1) {
Expand Down Expand Up @@ -429,6 +439,9 @@ const processCachedFile =
return `${codePoints}`;
});

refTable.finish?.();
mapTable.finish?.();

await createTableFile(version, creationDate, [ refTable, mapTable ]);

console.log('File processing completed.');
Expand Down
9 changes: 5 additions & 4 deletions webpp/uri/idna/details/idna_mapping_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Auto generated from: generate_idna_mapping_table.js
* IDNA Creation Date: 2023-08-10, 22:32:27 GMT
* This file's generation date: Mon, 08 Apr 2024 22:07:17 GMT
* This file's generation date: Wed, 10 Apr 2024 22:05:09 GMT
* IDNA Mapping Table Version: 15.1.0
*
* Details about the contents of this file can be found here:
Expand Down Expand Up @@ -8164,11 +8164,11 @@ namespace webpp::uri::idna::details {
* IDNA Mapping Table
*
* Table size:
* - in bits: 488704
* - in bytes: 61088 B
* - in bits: 488736
* - in bytes: 61092 B
* - in KibiBytes: 60 KiB
*/
static constexpr std::array<std::uint32_t, 15'272ULL> idna_mapping_table{
static constexpr std::array<std::uint32_t, 15'273ULL> idna_mapping_table{
2'147'483'713ULL /* Mapped */,
97ULL,
2'147'483'714ULL /* Mapped */,
Expand Down Expand Up @@ -23441,6 +23441,7 @@ namespace webpp::uri::idna::details {
4'010'672'512ULL /* Ignored */,
4'279'108'080ULL /* Disallowed */,
1'114'111ULL,
4'294'967'295ULL /* Ending Code Point */,
};

} // namespace webpp::uri::idna::details
Expand Down
101 changes: 86 additions & 15 deletions webpp/uri/idna/idna_mappings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "../../std/algorithm.hpp"
#include "../../std/string.hpp"
#include "../../std/type_traits.hpp"
#include "../../strings/unicode.hpp"
#include "./details/idna_mapping_table.hpp"

namespace webpp::uri::idna {
Expand All @@ -17,6 +18,7 @@ namespace webpp::uri::idna {

// Top byte of a first-byte entry with every bit set: marks a disallowed range.
static constexpr map_table_byte_type disallowed_mask = 0xFF00'0000;
// Highest bit: set on every first-byte entry of the mapping table.
static constexpr map_table_byte_type mapped_mask = 0x8000'0000;
// NOTE(review): 0xFF00'0000 & 0x8000'0000 evaluates to 0x8000'0000, i.e. the same value as
// mapped_mask. If the intent was "the length bits without the first-byte flag", this presumably
// should be `disallowed_mask & ~mapped_mask` (0x7F00'0000) -- confirm before relying on it.
static constexpr map_table_byte_type not_mapped_mask = disallowed_mask & mapped_mask;

// We have 7 bits, but 0xFF would equal to disallowedMask
static constexpr map_table_byte_type length_limit = 126;
Expand Down Expand Up @@ -60,21 +62,89 @@ namespace webpp::uri::idna {
using details::idna_mapping_table;
using details::map_table_byte_type;
using details::mapped_mask;
using details::not_mapped_mask;

map_table_byte_type const byte = inp_ch | mapped_mask;
auto const pos = stl::lower_bound(
idna_mapping_table.begin(),
idna_mapping_table.end(),
byte,
[](map_table_byte_type const lhs, map_table_byte_type const rhs) constexpr noexcept {
return (lhs | disallowed_mask) < (rhs | disallowed_mask);
});
map_table_byte_type const byte = static_cast<map_table_byte_type>(inp_ch) | disallowed_mask;

return pos;
// this is almost the same thing as std::partition_point and std::lower_bound, but with modifications.
auto length = idna_mapping_table.size();
auto first = idna_mapping_table.begin(); // NOLINT(*-qualified-auto)

while (length > 0) {
auto const half = length >> 1U;
auto middle = first; // NOLINT(*-qualified-auto)
std::advance(middle, half);

// non-first-characters are ignored here
while ((*middle & mapped_mask) == 0U) {
--middle;
}

if ((*middle | disallowed_mask) < byte) {
length = length - half - 1;
first = middle;
} else {
length = half;
}
}
return first;
}

template <istl::String OutStrT, typename Iter>
[[nodiscard]] static constexpr bool map(Iter beg, Iter end, OutStrT &out) {
/**
 * Perform the mapping for a single character.
 *
 * The entry of the IDNA mapping table that covers @p inp_ch is looked up with find_mapping_byte
 * and its first-byte is decoded as follows:
 *   - bit  31     : first-byte marker (mapped_mask)
 *   - bits 24..30 : length of the range; all ones (top byte == 0xFF) means "disallowed"
 *   - bits 0..23  : first code point of the range
 *
 * @tparam UseSTD3ASCIIRules not used yet (todo: handle STD3)
 * @param inp_ch the code point to map
 * @param out    output string; nothing is appended to it yet (todo: convert to utf-8)
 * @returns false if the character is not allowed
 */
template <bool UseSTD3ASCIIRules = false, typename CharT, istl::String OutStrT>
[[nodiscard]] static constexpr bool perform_mapping(CharT const inp_ch, OutStrT& out) {
    using details::disallowed_mask;
    using details::map_table_byte_type;
    using details::mapped_mask;

    // todo: uncomment this (note: sizeof counts bytes, not bits; revisit the bound before enabling)
    // static_assert(sizeof(CharT) >= 16U, // NOLINT(*-magic-numbers)
    // "UTF-8 characters should not be used here, "
    // "should first be converted to a 32bit character.");

    // only read by the assertions below, hence [[maybe_unused]] for NDEBUG builds
    [[maybe_unused]] auto const cur_char = static_cast<map_table_byte_type>(inp_ch);

    auto const pos        = find_mapping_byte(inp_ch);
    auto const first_byte = *pos;

    // the low 24 bits are the first code point of the range, for every kind of entry
    [[maybe_unused]] auto const range_start = first_byte & ~disallowed_mask;

    // An entry is disallowed when the whole top byte is set. This has to be tested on the
    // first-byte itself; the 7-bit length value can never be equal to the (shifted) mask.
    if ((first_byte & disallowed_mask) == disallowed_mask) {
        // We don't really need to check this, because in all the scenarios where this would not be
        // true, the user has passed an invalid character anyway.
        // The range end is stored right after the first-byte; it is only read inside the assertion.
        assert(cur_char >= range_start && cur_char <= *stl::next(pos));
        return false;
    }

    // mapping, or ignoring a character:
    {
        // Parentheses are required: `>>` binds tighter than `&`, so without them the shift would be
        // applied to the mask instead of to the masked first-byte.
        [[maybe_unused]] auto const length    = (first_byte & ~mapped_mask) >> 24U;
        [[maybe_unused]] auto const range_end = range_start + length;

        // We don't really need to check the range, because all the scenarios where this fails would
        // either be caught by the "reference table", or the character is already detected as invalid.
        assert(cur_char >= range_start && cur_char <= range_end);

        // loop until we find the next "first-byte"
        // we don't need to check the length of the array, there's a valid `first-byte` character at
        // the end of the table intentionally inserted for this purpose.
        for (auto cur_pos = stl::next(pos); (*cur_pos & mapped_mask) == 0; ++cur_pos) {
            // todo: convert to utf-8
            // out.append(*cur_pos);
        }
    }

    // todo: handle STD3

    return true;
}

template <bool UseSTD3ASCIIRules = false, istl::String OutStrT, typename Iter>
[[nodiscard]] static constexpr bool map(Iter beg, Iter end, OutStrT& out) {
using details::idna_reference_table;
using ref_table_byte_type = typename decltype(idna_reference_table)::value_type;

Expand All @@ -84,16 +154,17 @@ namespace webpp::uri::idna {
stl::size_t const byte_index = static_cast<stl::size_t>(*pos) / sizeof(ref_table_byte_type);
unsigned const rem_index = static_cast<stl::size_t>(*pos) % sizeof(ref_table_byte_type);
ref_table_byte_type const byte = idna_reference_table[byte_index];
bool const is_active = (byte & rem_index) != 0;

if (is_active) {
if ([[maybe_unused]] bool const should_map = (byte & rem_index) != 0) {
// now we should look at the mapping table

if (!perform_mapping<UseSTD3ASCIIRules>(*pos)) {
return false;
}
continue;
}

// it's a valid character
out += *pos;
out.append(*pos);
}
return true;
}
Expand Down

0 comments on commit 8cae92c

Please sign in to comment.