Skip to content

Commit

Permalink
Using Linear Search to fix everything else; #529
Browse files Browse the repository at this point in the history
  • Loading branch information
the-moisrex committed Apr 12, 2024
1 parent 0b82615 commit c822994
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 40 deletions.
28 changes: 23 additions & 5 deletions tests/idna_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,20 +117,28 @@ TEST(BasicIDNATests, MappingFindAlgorithmTest) {
<< "Position of the iterator: "
<< stl::distance(uri::idna::details::idna_mapping_table.begin(), first_pos)
<< "\nRange Start Character: " << (*first_pos & ~uri::idna::details::disallowed_mask);


auto const pos_4 = uri::idna::find_mapping_byte(0x2'fa39);
EXPECT_EQ((*pos_4 & ~uri::idna::details::disallowed_mask), 0x2'FA1E)
<< "Position of the iterator: " << stl::distance(uri::idna::details::idna_mapping_table.begin(), pos_4)
<< "\nRange Start Character: " << stl::hex << (*pos_4 & ~uri::idna::details::disallowed_mask)
<< stl::dec;
}

TEST(BasicIDNATests, TestingAllTheTable) {
stl::size_t errors = 0;
stl::size_t picking_last_one = 0;
stl::size_t picking_next_one = 0;

stl::uint32_t last_one = 0;
stl::uint32_t last_one = 0;
std::set<stl::uint32_t> faileds;
for (stl::uint32_t index = 0; index != uri::idna::details::idna_mapping_table.size(); ++index) {
auto const cur = uri::idna::details::idna_mapping_table[index];
if ((cur & uri::idna::details::mapped_mask) == 0) {
continue;
}
auto const length = cur & ~uri::idna::details::mapped_mask >> 24U;
auto length = (cur & ~uri::idna::details::mapped_mask) >> 24U;

auto range_start = cur & ~uri::idna::details::disallowed_mask;
auto range_end = uri::idna::details::idna_mapping_table[index + 1];
Expand All @@ -142,11 +150,15 @@ TEST(BasicIDNATests, TestingAllTheTable) {
action = "mapped/ignored";
}

length = range_end - range_start;

for (stl::uint32_t sub_index = range_start; sub_index <= range_end;) {
auto const sub_pos = uri::idna::find_mapping_byte(sub_index);
auto sub_pos = uri::idna::find_mapping_byte(sub_index);
std::string_view state = "";
if (*sub_pos != cur) {
++errors;
sub_pos = uri::idna::find_mapping_byte(sub_index);
faileds.insert(cur & ~uri::idna::details::disallowed_mask);
if (*sub_pos == last_one) {
state = " (last one) ";
++picking_last_one;
Expand All @@ -166,16 +178,19 @@ TEST(BasicIDNATests, TestingAllTheTable) {
<< "Index: " << index << "\n"
<< "Sub Index: " << sub_index << " HexChar: " << std::hex << sub_index << std::dec
<< " diff: " << (sub_index - range_start) << "\n"
<< "Current: " << stl::hex << cur << " " << (cur & ~uri::idna::details::disallowed_mask)
<< stl::dec << "\n"
<< "Range start: " << range_start << "\n"
<< "Range end: " << range_end << "\n"
<< "length: " << length << "\n"
<< "Position of the iterator: "
<< stl::distance(uri::idna::details::idna_mapping_table.begin(), sub_pos)
<< "\nCurrent: " << std::hex << (*sub_pos & ~uri::idna::details::disallowed_mask)
<< std::dec << state << "\nExpected: " << std::hex
<< (cur & ~uri::idna::details::disallowed_mask) << std::dec << "\naction: " << action;
}

auto const half = (range_end - range_start) / 2;
auto const half = length / 2;
sub_index += half;
if (half == 0) {
++sub_index;
Expand All @@ -184,7 +199,10 @@ TEST(BasicIDNATests, TestingAllTheTable) {

last_one = cur;
}
EXPECT_EQ(errors, 0);
EXPECT_EQ(errors, 0)
<< stl::accumulate(faileds.begin(), faileds.end(), std::string(), [](auto const& res, auto b) {
return res + ", " + stl::to_string(b);
}).substr(2);
EXPECT_EQ(picking_last_one, 0);
EXPECT_EQ(picking_next_one, 0);
}
Expand Down
24 changes: 17 additions & 7 deletions webpp/uri/idna/details/generate_idna_mapping_table.js
Original file line number Diff line number Diff line change
Expand Up @@ -253,14 +253,18 @@ class MapTable extends TableTraits {
splitIfNeeded(start, end, mappedTo) {
const length = end - start;
if (length > this.lengthLimit) {
console.log(`Splitting block: ${start}-${end}`);
let page = start
for (; page < end; page += this.lengthLimit + 1) {
console.log(`Splitting: ${page}-${page + this.lengthLimit}`);
this.map(page, page + this.lengthLimit, mappedTo);
console.log(`Splitting block: ${start}-${end}; length: ${length}`);
let page = start;
let remaining = length;
for (; remaining !== 0; page += this.lengthLimit + 1) {
const page_end = Math.min(page + this.lengthLimit, end);
this.map(page, page_end, mappedTo);
remaining = end - page_end;
console.log(`Splitting: ${page}-${page_end}; block length: ${
page_end - page}; remaining: ${remaining}`);
}
this.map(page - this.lengthLimit, end, mappedTo);
console.log(`Splitting: ${page - this.lengthLimit}-${end}`);
// this.map(page - this.lengthLimit, end, mappedTo);
// console.log(`Splitting: ${page - this.lengthLimit}-${end}`);
return true;
}
return false;
Expand Down Expand Up @@ -388,6 +392,7 @@ const processCachedFile =
const refTable = new MappingReferenceTable(200000);
const STD3Table = new STD3Mapper(1000);
const mapTable = new MapTable(100000);
let maxMappedCount = 0;
lines.forEach((line, index) => {
line = cleanComments(line)

Expand Down Expand Up @@ -433,6 +438,10 @@ const processCachedFile =
return;
}

if (mappedValues?.length > maxMappedCount) {
maxMappedCount = mappedValues.length;
}

// Process each line here
console.log(index, rangeStart, rangeEnd, status, mappedValues,
IDNA2008Status);
Expand All @@ -442,6 +451,7 @@ const processCachedFile =
refTable.finish?.();
mapTable.finish?.();

console.log("Max Mapped Count: ", maxMappedCount);
await createTableFile(version, creationDate, [ refTable, mapTable ]);

console.log('File processing completed.');
Expand Down
11 changes: 5 additions & 6 deletions webpp/uri/idna/details/idna_mapping_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Auto generated from: generate_idna_mapping_table.js
* IDNA Creation Date: 2023-08-10, 22:32:27 GMT
* This file's generation date: Wed, 10 Apr 2024 22:05:09 GMT
* This file's generation date: Fri, 12 Apr 2024 08:06:42 GMT
* IDNA Mapping Table Version: 15.1.0
*
* Details about the contents of this file can be found here:
Expand Down Expand Up @@ -8164,11 +8164,11 @@ namespace webpp::uri::idna::details {
* IDNA Mapping Table
*
* Table size:
* - in bits: 488736
* - in bytes: 61092 B
* - in bits: 488704
* - in bytes: 61088 B
* - in KibiBytes: 60 KiB
*/
static constexpr std::array<std::uint32_t, 15'273ULL> idna_mapping_table{
static constexpr std::array<std::uint32_t, 15'272ULL> idna_mapping_table{
2'147'483'713ULL /* Mapped */,
97ULL,
2'147'483'714ULL /* Mapped */,
Expand Down Expand Up @@ -23437,8 +23437,7 @@ namespace webpp::uri::idna::details {
4'278'395'824ULL /* Disallowed */,
917'759ULL,
4'262'330'624ULL /* Ignored */,
4'262'330'751ULL /* Ignored */,
4'010'672'512ULL /* Ignored */,
4'027'449'727ULL /* Ignored */,
4'279'108'080ULL /* Disallowed */,
1'114'111ULL,
4'294'967'295ULL /* Ending Code Point */,
Expand Down
75 changes: 53 additions & 22 deletions webpp/uri/idna/idna_mappings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,15 @@ namespace webpp::uri::idna {
using details::mapped_mask;
using details::not_mapped_mask;

map_table_byte_type const element = static_cast<map_table_byte_type>(inp_ch) | disallowed_mask;
auto const asked_char = static_cast<map_table_byte_type>(inp_ch);

// anything bigger than the last element is disallowed
if ((asked_char & disallowed_mask) != 0U) {
auto const last_element = idna_mapping_table.begin() + (idna_mapping_table.size() - 1);
return last_element;
}

map_table_byte_type const element = asked_char | disallowed_mask;

// this is almost the same thing as std::partition_point and std::lower_bound, but with modifications.
auto length = idna_mapping_table.size();
Expand All @@ -77,29 +85,52 @@ namespace webpp::uri::idna {
// | Should be ignored during binary search
// |
// first-byte: this is the byte we should find and compare against
for (;;) {
length >>= 1U; // divided by 2
auto middle = first; // NOLINT(*-qualified-auto)
std::advance(middle, length);

// non-first-characters are ignored here
decltype(length) remaining = 0;
while ((*middle & mapped_mask) == 0U) {
--middle;
++remaining;
// Binary Search:
// for (;;) {
// length >>= 1U; // divided by 2
// auto middle = first; // NOLINT(*-qualified-auto)
// std::advance(middle, length);
//
// // non-first-characters are ignored here
// decltype(length) remaining = 0;
// while ((*middle & mapped_mask) == 0U) {
// --middle;
// ++remaining;
// }
//
// if (first == middle) {
// stl::advance(middle, remaining);
// ++middle;
// while ((*middle & mapped_mask) == 0U) {
// ++middle;
// }
//
// if (element >= (*middle | disallowed_mask)) {
// first = middle;
// break;
// }
//
// break;
// }
// if (element < (*middle | disallowed_mask)) {
// length -= remaining;
// } else {
// // let's look into the upper half now
// first = middle;
// length += remaining;
// }
// }


// Alternative Linear Search:
for (auto cur = first; cur != idna_mapping_table.end(); ++cur) {
if ((*cur & mapped_mask) == 0U) {
continue;
}

if (middle == first) {
if ((*cur | disallowed_mask) > element) {
break;
}
if (element < (*middle | disallowed_mask)) {
length -= remaining;
} else {
// let's look into the upper half now
first = middle;
length += remaining;
++length;
}
first = cur;
}
return first;
}
Expand All @@ -122,7 +153,7 @@ namespace webpp::uri::idna {
auto const cur_char = static_cast<map_table_byte_type>(inp_ch);
auto const pos = find_mapping_byte(inp_ch);
auto const first_byte = *pos;
auto const length = first_byte & ~mapped_mask >> 24U;
auto const length = (first_byte & ~mapped_mask) >> 24U;

if ([[maybe_unused]] bool const is_disallowed = length == disallowed_mask) {
auto const range_start = first_byte & ~disallowed_mask;
Expand Down

0 comments on commit c822994

Please sign in to comment.