From c07cd121ccce5bda4a3f5810874f3edda892ece2 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 10:58:33 +0100 Subject: [PATCH 01/37] :zap: add EncodeBenchmarks for comprehensive encoding performance analysis --- .../QsNet.Benchmarks/EncodeBenchmarks.cs | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs diff --git a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs new file mode 100644 index 0000000..60fdd8a --- /dev/null +++ b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs @@ -0,0 +1,101 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Order; +using QsNet; +using QsNet.Models; +using QsNet.Enums; + +namespace QsNet.Benchmarks; + +[MemoryDiagnoser] +[SimpleJob(RuntimeMoniker.Net80)] +[Orderer(SummaryOrderPolicy.FastestToSlowest)] +public class EncodeBenchmarks +{ + // Size & shape + [Params(10, 100, 1000)] public int Count { get; set; } + [Params(8, 40)] public int ValueLen { get; set; } + [Params(0, 1, 10, 50)] public int NeedsEscPercent { get; set; } + + // Option toggles that materially affect Encode() + [Params(false, true)] public bool CommaLists { get; set; } + [Params(false, true)] public bool EncodeValuesOnly { get; set; } + [Params(false, true)] public bool EncodeDotInKeys { get; set; } + [Params(false, true)] public bool AllowDots { get; set; } + + private static string MakeValue(int len, int escPercent, Random rnd) + { + if (escPercent <= 0) + { + return new string('x', len); + } + + var chars = new char[len]; + for (int i = 0; i < len; i++) + { + bool needsEsc = rnd.Next(0, 100) < escPercent; + if (!needsEsc) + { + chars[i] = 'x'; + continue; + } + + // Mix of characters that typically require escaping + switch (rnd.Next(0, 4)) + { + case 0: chars[i] = ' '; break; // space -> %20 or + + case 1: chars[i] = '%'; break; // percent -> %25 + case 2: chars[i] = '\u00E4'; break; // non-ASCII -> UTF-8 percent-encoded + default: chars[i] = ','; break; // comma (should be encoded inside list items) + } + } + return new string(chars); + } + + private object _data = default!; + private EncodeOptions _options = default!; + + [GlobalSetup] + public void Setup() + { + var rnd = new Random(12345); + + // Build a realistic object graph to exercise the encoder: + // - list under key "a" (affected by ListFormat) + // - dotted key under nested dictionary (affected by EncodeDotInKeys) + // - a date and a boolean for primitive branches + var list = Enumerable.Range(0, Count) + .Select(_ => (object?)MakeValue(ValueLen, NeedsEscPercent, rnd)) + .ToList(); + + _data = new Dictionary + { + ["a"] = list, + ["b"] = new Dictionary + { + ["x.y"] = MakeValue(ValueLen, NeedsEscPercent, rnd), + ["inner"] = new Dictionary + { + ["z"] = MakeValue(ValueLen, NeedsEscPercent, rnd) + } + }, + ["c"] = new DateTimeOffset(2024, 1, 2, 3, 4, 5, TimeSpan.Zero), + ["d"] = true + }; + + _options = new EncodeOptions + { + ListFormat = CommaLists ? ListFormat.Comma : ListFormat.Indices, + EncodeValuesOnly = EncodeValuesOnly, + EncodeDotInKeys = EncodeDotInKeys, + AllowDots = AllowDots, + // Leave other toggles at defaults to mirror common usage. + }; + } + + [Benchmark] + public string Encode_Public() => Qs.Encode(_data, _options); +} \ No newline at end of file From 977a27434f9acc26363abb319397c747625e87f9 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 13:03:00 +0100 Subject: [PATCH 02/37] :zap: refactor EncodeBenchmarks to unify dot-related options into DotMode enum --- benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs index 60fdd8a..5404fe4 100644 --- a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs +++ b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs @@ -15,16 +15,22 @@ namespace QsNet.Benchmarks; [Orderer(SummaryOrderPolicy.FastestToSlowest)] public class EncodeBenchmarks { + public enum DotMode + { + None, // AllowDots=false, EncodeDotInKeys=false + AllowDots, // AllowDots=true, EncodeDotInKeys=false + AllowDotsAndEncode // AllowDots=true, EncodeDotInKeys=true + } + // Size & shape [Params(10, 100, 1000)] public int Count { get; set; } [Params(8, 40)] public int ValueLen { get; set; } - [Params(0, 1, 10, 50)] public int NeedsEscPercent { get; set; } + [Params(0, 50)] public int NeedsEscPercent { get; set; } // Option toggles that materially affect Encode() [Params(false, true)] public bool CommaLists { get; set; } [Params(false, true)] public bool EncodeValuesOnly { get; set; } - [Params(false, true)] public bool EncodeDotInKeys { get; set; } - [Params(false, true)] public bool AllowDots { get; set; } + [Params(DotMode.None, DotMode.AllowDots, DotMode.AllowDotsAndEncode)] public DotMode Dots { get; set; } private static string MakeValue(int len, int escPercent, Random rnd) { @@ -90,8 +96,8 @@ public void Setup() { ListFormat = CommaLists ? ListFormat.Comma : ListFormat.Indices, EncodeValuesOnly = EncodeValuesOnly, - EncodeDotInKeys = EncodeDotInKeys, - AllowDots = AllowDots, + AllowDots = Dots != DotMode.None, + EncodeDotInKeys = Dots == DotMode.AllowDotsAndEncode, // Leave other toggles at defaults to mirror common usage. }; } From b66435f5a2b3065077ecf18f5def34f50d1225e6 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 13:09:31 +0100 Subject: [PATCH 03/37] :zap: optimize Encoder sequence handling and dot encoding logic for improved performance and clarity --- QsNet/Internal/Encoder.cs | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index dd08133..c20844c 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -140,9 +140,8 @@ public static object Encode( return $"{fmt(keyPart)}={fmt(valuePart)}"; } - var values = new List(); if (undefined) - return values; + return Array.Empty(); // Detect sequence once and cache materialization for index access / counts var isSeq = false; @@ -156,7 +155,9 @@ public static object Encode( List objKeys; if (isCommaGen && obj is IEnumerable enumerable and not string and not IDictionary) { - List strings = []; + var strings = enumerable is ICollection { Count: > 0 } coll0 + ? new List(coll0.Count) + : new List(); if (encodeValuesOnly && encoder != null) foreach (var el in enumerable) strings.Add(el is null ? "" : encoder(el.ToString(), null, null)); @@ -229,9 +230,17 @@ public static object Encode( objKeys.Sort(Comparer.Create(sort)); } - values.Capacity = Math.Max(values.Capacity, objKeys.Count); + var values = new List(objKeys.Count); - var encodedPrefix = encodeDotInKeys ? keyPrefixStr.Replace(".", "%2E") : keyPrefixStr; +#if NETSTANDARD2_0 + var encodedPrefix = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + var encodedPrefix = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif var adjustedPrefix = crt && isSeq && seqList is { Count: 1 } ? $"{encodedPrefix}[]" @@ -336,8 +345,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => break; } - case IEnumerable ie - and not string: + case IEnumerable and not string: { var idx = key switch { @@ -345,7 +353,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, _ => -1 }; - var list2 = seqList ?? ie.Cast().ToList(); + var list2 = seqList!; if ((uint)idx < (uint)list2.Count) { value = list2[idx]; @@ -379,13 +387,13 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, #endif var keyPrefix = - obj is IEnumerable and not string and not IDictionary + isSeq ? gen(adjustedPrefix, encodedKey) : allowDots ? $"{adjustedPrefix}.{encodedKey}" : $"{adjustedPrefix}[{encodedKey}]"; - if (objKey is not null && obj is IDictionary or IEnumerable and not string) + if (objKey is not null && (obj is IDictionary || isSeq)) sideChannel.Set(objKey, step); var childSc = new SideChannelFrame(sideChannel); @@ -418,7 +426,9 @@ obj is IEnumerable and not string and not IDictionary addQueryPrefix ); - if (encoded is IEnumerable en and not string) + if (encoded is List enList) + values.AddRange(enList); + else if (encoded is IEnumerable en and not string) foreach (var item in en) values.Add(item); else From 6e806733f940f6e5c3c9c794f440b870beac7b0a Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 14:15:58 +0100 Subject: [PATCH 04/37] :zap: optimize Encoder leaf detection and sequence handling for improved performance and correctness --- QsNet/Internal/Encoder.cs | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index c20844c..9fae1b1 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -15,6 +15,12 @@ internal static class Encoder { private static readonly Formatter IdentityFormatter = s => s; + private static bool IsLeaf(object? v, bool skipNulls) + { + if (v is null) return skipNulls; + return Utils.IsNonNullishPrimitive(v) || v is byte[]; + } + /// /// Encodes the given data into a query string format. /// @@ -66,7 +72,7 @@ public static object Encode( var cs = charset ?? Encoding.UTF8; var gen = generateArrayPrefix ?? ListFormat.Indices.GetGenerator(); - var isCommaGen = ReferenceEquals(gen, ListFormat.Comma.GetGenerator()); + var isCommaGen = gen == ListFormat.Comma.GetGenerator(); var crt = commaRoundTrip ?? isCommaGen; var keyPrefixStr = prefix ?? (addQueryPrefix ? "?" : ""); @@ -149,18 +155,26 @@ public static object Encode( if (obj is IEnumerable seq0 and not string and not IDictionary) { isSeq = true; - seqList = seq0.Cast().ToList(); + if (obj is List already) + seqList = already; + else + seqList = seq0.Cast().ToList(); } List objKeys; if (isCommaGen && obj is IEnumerable enumerable and not string and not IDictionary) { - var strings = enumerable is ICollection { Count: > 0 } coll0 - ? new List(coll0.Count) - : new List(); + List strings; + if (obj is List listObj) + strings = new List(listObj.Count); + else if (enumerable is ICollection { Count: > 0 } coll0) + strings = new List(coll0.Count); + else + strings = []; + if (encodeValuesOnly && encoder != null) foreach (var el in enumerable) - strings.Add(el is null ? "" : encoder(el.ToString(), null, null)); + strings.Add(el is null ? "" : encoder(el, null, null)); else foreach (var el in enumerable) strings.Add(el?.ToString() ?? ""); @@ -396,7 +410,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, if (objKey is not null && (obj is IDictionary || isSeq)) sideChannel.Set(objKey, step); - var childSc = new SideChannelFrame(sideChannel); + var childSc = IsLeaf(value, skipNulls) ? sideChannel : new SideChannelFrame(sideChannel); var childEncoder = isCommaGen && encodeValuesOnly && obj is IEnumerable and not string From 52dab33f11c4be3c1fb28cbef11ced261eb38e87 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 16:17:51 +0100 Subject: [PATCH 05/37] :zap: optimize Encoder with fast paths for comma-joined arrays, direct iteration, and side-channel marking; improve value encoding and allocation efficiency --- QsNet/Internal/Encoder.cs | 380 +++++++++++++++++++++++++++++++++++--- 1 file changed, 351 insertions(+), 29 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 9fae1b1..34f5537 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -1,7 +1,9 @@ using System; using System.Collections; using System.Collections.Generic; +using System.Globalization; using System.Linq; +using System.Runtime.CompilerServices; using System.Text; using QsNet.Enums; using QsNet.Models; @@ -15,6 +17,109 @@ internal static class Encoder { private static readonly Formatter IdentityFormatter = s => s; + private static readonly char[] HexUpper = "0123456789ABCDEF".ToCharArray(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsAsciiAlphaNum(char ch) + { + return ch is >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void AppendPctEncodedByte(StringBuilder sb, byte b) + { + sb.Append('%'); + sb.Append(HexUpper[b >> 4]); + sb.Append(HexUpper[b & 0xF]); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static string ToInvariantString(object? value) + { + if (value is null) return string.Empty; + return value switch + { + bool b => b ? "true" : "false", + sbyte v => v.ToString(CultureInfo.InvariantCulture), + byte v => v.ToString(CultureInfo.InvariantCulture), + short v => v.ToString(CultureInfo.InvariantCulture), + ushort v => v.ToString(CultureInfo.InvariantCulture), + int v => v.ToString(CultureInfo.InvariantCulture), + uint v => v.ToString(CultureInfo.InvariantCulture), + long v => v.ToString(CultureInfo.InvariantCulture), + ulong v => v.ToString(CultureInfo.InvariantCulture), + float v => v.ToString(CultureInfo.InvariantCulture), + double v => v.ToString(CultureInfo.InvariantCulture), + decimal v => v.ToString(CultureInfo.InvariantCulture), + char ch => ch.ToString(), + _ => value.ToString() ?? string.Empty + }; + } + + // Encode a single value for the comma-values-only fast path, without re-encoding the comma separators. + // RFC3986 by default; RFC1738 maps space to '+'. Commas inside values are percent-encoded as %2C. + private static void AppendCommaEncodedValue(StringBuilder sb, object? value, Encoding cs, Format format) + { + var s = ToInvariantString(value); + + for (var i = 0; i < s.Length; i++) + { + var ch = s[i]; + + // ASCII fast-path + if (ch <= 0x7F) + { + // unreserved: ALPHA / DIGIT / '-' / '.' / '_' / '~' + if (IsAsciiAlphaNum(ch) || ch == '-' || ch == '_' || ch == '.' || ch == '~') + { + sb.Append(ch); + continue; + } + + if (format == Format.Rfc1738 && ch == ' ') + { + sb.Append('+'); + continue; + } + + // Comma inside a value must be encoded (separators are appended by the caller) + if (ch == ',') + { + sb.Append("%2C"); + continue; + } + + AppendPctEncodedByte(sb, (byte)ch); + continue; + } + + // Non-ASCII: encode using the provided charset (UTF-8 by default) + if (char.IsSurrogatePair(s, i)) + { +#if NETSTANDARD2_0 + var bytes = cs.GetBytes(s.Substring(i, 2)); +#else + var bytes = cs.GetBytes(s, i, 2); +#endif + foreach (var t in bytes) + AppendPctEncodedByte(sb, t); + + i++; // consumed the low surrogate as well + } + else + { +#if NETSTANDARD2_0 + var bytes = cs.GetBytes(s.Substring(i, 1)); +#else + var bytes = cs.GetBytes(s, i, 1); +#endif + foreach (var t in bytes) + AppendPctEncodedByte(sb, t); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool IsLeaf(object? v, bool skipNulls) { if (v is null) return skipNulls; @@ -77,28 +182,32 @@ public static object Encode( var keyPrefixStr = prefix ?? (addQueryPrefix ? "?" : ""); var obj = data; + var dotsAndEncode = allowDots && encodeDotInKeys; var objKey = data; // identity key var tmpSc = sideChannel; var step = 0; var found = false; - while (!found) - { - tmpSc = tmpSc.Parent; - if (tmpSc is null) - break; - step++; - if (objKey is not null && tmpSc.TryGet(objKey, out var pos)) + // Fast path (#3): skip cycle detection when the current value is a leaf. + // Leaves never recurse, so they can’t participate in cycles. + if (!IsLeaf(data, skipNulls)) + while (!found) { - if (pos == step) - throw new InvalidOperationException("Cyclic object value"); - found = true; - } + tmpSc = tmpSc.Parent; + if (tmpSc is null) + break; + step++; + if (objKey is not null && tmpSc.TryGet(objKey, out var pos)) + { + if (pos == step) + throw new InvalidOperationException("Cyclic object value"); + found = true; + } - if (tmpSc.Parent is null) - step = 0; - } + if (tmpSc.Parent is null) + step = 0; + } if (filter is FunctionFilter ff) obj = ff.Function(keyPrefixStr, obj); @@ -133,11 +242,7 @@ public static object Encode( { if (encoder == null) { - var s = obj switch - { - bool b => b ? "true" : "false", - _ => obj?.ToString() ?? "" - }; + var s = ToInvariantString(obj); return $"{fmt(keyPrefixStr)}={fmt(s)}"; } @@ -161,6 +266,217 @@ public static object Encode( seqList = seq0.Cast().ToList(); } + // Fast path (#1): when no sorting is requested, avoid building objKeys and + // iterate the structure directly to eliminate extra allocations and lookups. + if (sort == null && !(isCommaGen && obj is IEnumerable and not string and not IDictionary) && + filter is not IterableFilter) + { +#if NETSTANDARD2_0 + var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif + var adjustedPrefixFast = + crt && isSeq && seqList is { Count: 1 } + ? $"{encodedPrefixFast}[]" + : encodedPrefixFast; + + if (allowEmptyLists && isSeq && seqList is { Count: 0 }) + return $"{adjustedPrefixFast}[]"; + + // Fast path (#5): mark side-channel once per parent instead of per child + var markSideChannelFast = objKey is not null && (obj is IDictionary || isSeq); + if (markSideChannelFast) + sideChannel.Set(objKey!, step); + + List valuesFast; + + void AddKv(object? keyObj, object? val) + { + if (skipNulls && val is null) + return; + + var keyStr = keyObj?.ToString() ?? string.Empty; + var encodedKey = keyStr; +#if NETSTANDARD2_0 + if (dotsAndEncode && keyStr.IndexOf('.') >= 0) + encodedKey = keyStr.Replace(".", "%2E"); +#else + if (dotsAndEncode && keyStr.Contains('.', StringComparison.Ordinal)) + encodedKey = keyStr.Replace(".", "%2E", StringComparison.Ordinal); +#endif + var keyPrefixFast = + isSeq + ? gen(adjustedPrefixFast, encodedKey) + : allowDots + ? $"{adjustedPrefixFast}.{encodedKey}" + : $"{adjustedPrefixFast}[{encodedKey}]"; + + // Removed per-iteration sideChannel.Set + + var childSc = IsLeaf(val, skipNulls) ? sideChannel : new SideChannelFrame(sideChannel); + + var encoded = Encode( + val, + false, + childSc, + keyPrefixFast, + gen, + crt, + allowEmptyLists, + strictNullHandling, + skipNulls, + encodeDotInKeys, + encoder, + serializeDate, + sort, + filter, + allowDots, + format, + fmt, + encodeValuesOnly, + cs, + addQueryPrefix + ); + + switch (encoded) + { + case List enList: + valuesFast.AddRange(enList); + break; + case IEnumerable en and not string: + { + foreach (var item in en) + valuesFast.Add(item); + break; + } + default: + valuesFast.Add(encoded); + break; + } + } + + switch (obj) + { + case IDictionary dObj: + valuesFast = new List(dObj.Count); + foreach (var kv in dObj) + AddKv(kv.Key, kv.Value); + return valuesFast; + case IDictionary dStr: + valuesFast = new List(dStr.Count); + foreach (var kv in dStr) + AddKv(kv.Key, kv.Value); + return valuesFast; + case IDictionary map: + valuesFast = new List(map.Count); + foreach (DictionaryEntry de in map) + AddKv(de.Key, de.Value); + return valuesFast; + case Array arr: + valuesFast = new List(arr.Length); + for (var i = 0; i < arr.Length; i++) + AddKv(i, arr.GetValue(i)); + return valuesFast; + case IList list: + valuesFast = new List(list.Count); + for (var i = 0; i < list.Count; i++) + AddKv(i, list[i]); + return valuesFast; + default: + if (isSeq && seqList != null) + { + valuesFast = new List(seqList.Count); + for (var i = 0; i < seqList.Count; i++) + AddKv(i, seqList[i]); + return valuesFast; + } + + break; + } + // If we fall through (very uncommon), continue with the generic path below. + } + + // Fast path (#2): comma-joined arrays -> build the joined value once and short-circuit the generic path. + if (isCommaGen && obj is IEnumerable enumerableC and not string and not IDictionary && sort == null && + filter is not IterableFilter) + { +#if NETSTANDARD2_0 + var encodedPrefixC = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + var encodedPrefixC = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif + // Materialize once for count checks and iteration + var listC = seqList ?? enumerableC.Cast().ToList(); + var adjustedPrefixC = crt && listC.Count == 1 ? $"{encodedPrefixC}[]" : encodedPrefixC; + + // Honor empty list handling semantics + if (allowEmptyLists && listC.Count == 0) + return $"{adjustedPrefixC}[]"; + if (listC.Count == 0) + return Array.Empty(); + + string joinedC; + if (encodeValuesOnly && encoder != null) + { + // Stream-encode each element and append literal commas between them. + var sbJoined = new StringBuilder(listC.Count * 8); + for (var i = 0; i < listC.Count; i++) + { + if (i > 0) sbJoined.Append(','); // separator comma is never re-encoded + AppendCommaEncodedValue(sbJoined, listC[i], cs, format); + } + + joinedC = sbJoined.ToString(); + + // Match legacy semantics: if the joined value is empty, treat it like `null`. + if (!string.IsNullOrEmpty(joinedC)) return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + if (skipNulls) + return Array.Empty(); + + if (strictNullHandling) + return !encodeValuesOnly + ? fmt(encoder(adjustedPrefixC, cs, format)) + : adjustedPrefixC; + // not strict: fall through to return `key=` below + + // In values-only mode we do not encode the key via `encoder`. + return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + } + + // Join raw string representations; apply encoder to the full result if provided. + var tmp = new List(listC.Count); + foreach (var el in listC) + tmp.Add(ToInvariantString(el)); + joinedC = string.Join(",", tmp); + + // Match legacy semantics: if the joined value is empty, treat it like `null`. + if (string.IsNullOrEmpty(joinedC)) + { + if (skipNulls) + return Array.Empty(); + + if (strictNullHandling) + return encoder != null && !encodeValuesOnly + ? fmt(encoder(adjustedPrefixC, cs, format)) + : adjustedPrefixC; + // not strict: fall through to return `key=` below + } + + if (encoder == null) return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + var keyPartC = encoder(adjustedPrefixC, null, null); + var valuePartC = encoder(joinedC, null, null); + return $"{fmt(keyPartC)}={fmt(valuePartC)}"; + } + List objKeys; if (isCommaGen && obj is IEnumerable enumerable and not string and not IDictionary) { @@ -263,6 +579,18 @@ public static object Encode( if (allowEmptyLists && isSeq && seqList is { Count: 0 }) return $"{adjustedPrefix}[]"; + // Fast path (#5): mark side-channel once per parent instead of per element + var markSideChannel = objKey is not null && (obj is IDictionary || isSeq); + if (markSideChannel) + sideChannel.Set(objKey!, step); + + // Fast path (#4): hoist child-encoder decision out of the loop. + // For comma-joined arrays in values-only mode, do not re-encode children. + var childEncoderForElements = + isCommaGen && encodeValuesOnly && obj is IEnumerable and not string + ? null + : encoder; + for (var i = 0; i < objKeys.Count; i++) { var key = objKeys[i]; @@ -393,10 +721,10 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, var keyStr = key?.ToString() ?? ""; var encodedKey = keyStr; #if NETSTANDARD2_0 - if (allowDots && encodeDotInKeys && keyStr.IndexOf('.') >= 0) + if (dotsAndEncode && keyStr.IndexOf('.') >= 0) encodedKey = keyStr.Replace(".", "%2E"); #else - if (allowDots && encodeDotInKeys && keyStr.Contains('.', StringComparison.Ordinal)) + if (dotsAndEncode && keyStr.Contains('.', StringComparison.Ordinal)) encodedKey = keyStr.Replace(".", "%2E", StringComparison.Ordinal); #endif @@ -407,16 +735,10 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, ? $"{adjustedPrefix}.{encodedKey}" : $"{adjustedPrefix}[{encodedKey}]"; - if (objKey is not null && (obj is IDictionary || isSeq)) - sideChannel.Set(objKey, step); + // Removed per-iteration sideChannel.Set var childSc = IsLeaf(value, skipNulls) ? sideChannel : new SideChannelFrame(sideChannel); - var childEncoder = - isCommaGen && encodeValuesOnly && obj is IEnumerable and not string - ? null - : encoder; - var encoded = Encode( value, valueUndefined, @@ -428,7 +750,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, strictNullHandling, skipNulls, encodeDotInKeys, - childEncoder, + childEncoderForElements, serializeDate, sort, filter, From d215524b900985ca75ce8480610dc0466b7bfe3d Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 17:40:34 +0100 Subject: [PATCH 06/37] :zap: optimize Encoder comma-value encoding by delegating to ValueEncoder and centralizing comma escaping --- QsNet/Internal/Encoder.cs | 69 +++++++++------------------------------ 1 file changed, 15 insertions(+), 54 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 34f5537..5169190 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -58,65 +58,26 @@ private static string ToInvariantString(object? value) // Encode a single value for the comma-values-only fast path, without re-encoding the comma separators. // RFC3986 by default; RFC1738 maps space to '+'. Commas inside values are percent-encoded as %2C. - private static void AppendCommaEncodedValue(StringBuilder sb, object? value, Encoding cs, Format format) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void AppendCommaEncodedValue( + StringBuilder sb, + object? value, + Encoding cs, + Format format, + ValueEncoder? encoder + ) { - var s = ToInvariantString(value); - - for (var i = 0; i < s.Length; i++) - { - var ch = s[i]; - - // ASCII fast-path - if (ch <= 0x7F) - { - // unreserved: ALPHA / DIGIT / '-' / '.' / '_' / '~' - if (IsAsciiAlphaNum(ch) || ch == '-' || ch == '_' || ch == '.' || ch == '~') - { - sb.Append(ch); - continue; - } - - if (format == Format.Rfc1738 && ch == ' ') - { - sb.Append('+'); - continue; - } - - // Comma inside a value must be encoded (separators are appended by the caller) - if (ch == ',') - { - sb.Append("%2C"); - continue; - } - - AppendPctEncodedByte(sb, (byte)ch); - continue; - } + var encoded = encoder != null ? encoder(value, cs, format) : Utils.Encode(value, cs, format); - // Non-ASCII: encode using the provided charset (UTF-8 by default) - if (char.IsSurrogatePair(s, i)) - { #if NETSTANDARD2_0 - var bytes = cs.GetBytes(s.Substring(i, 2)); + if (encoded.IndexOf(',') >= 0) + encoded = encoded.Replace(",", "%2C"); // commas inside values must be encoded #else - var bytes = cs.GetBytes(s, i, 2); + if (encoded.Contains(',', StringComparison.Ordinal)) + encoded = encoded.Replace(",", "%2C", StringComparison.Ordinal); #endif - foreach (var t in bytes) - AppendPctEncodedByte(sb, t); - i++; // consumed the low surrogate as well - } - else - { -#if NETSTANDARD2_0 - var bytes = cs.GetBytes(s.Substring(i, 1)); -#else - var bytes = cs.GetBytes(s, i, 1); -#endif - foreach (var t in bytes) - AppendPctEncodedByte(sb, t); - } - } + sb.Append(encoded); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -432,7 +393,7 @@ void AddKv(object? keyObj, object? val) for (var i = 0; i < listC.Count; i++) { if (i > 0) sbJoined.Append(','); // separator comma is never re-encoded - AppendCommaEncodedValue(sbJoined, listC[i], cs, format); + AppendCommaEncodedValue(sbJoined, listC[i], cs, format, encoder); } joinedC = sbJoined.ToString(); From 5761bcc98b5df02b6d26d7686a47bbc91c1d199f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 17:52:38 +0100 Subject: [PATCH 07/37] :fire: remove unused hex encoding helpers from Encoder; clarify dot encoding gating logic with comments --- QsNet/Internal/Encoder.cs | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 5169190..ea1898b 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -17,22 +17,6 @@ internal static class Encoder { private static readonly Formatter IdentityFormatter = s => s; - private static readonly char[] HexUpper = "0123456789ABCDEF".ToCharArray(); - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsAsciiAlphaNum(char ch) - { - return ch is >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void AppendPctEncodedByte(StringBuilder sb, byte b) - { - sb.Append('%'); - sb.Append(HexUpper[b >> 4]); - sb.Append(HexUpper[b & 0xF]); - } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private static string ToInvariantString(object? value) { @@ -233,10 +217,12 @@ public static object Encode( filter is not IterableFilter) { #if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 ? keyPrefixStr.Replace(".", "%2E") : keyPrefixStr; #else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) : keyPrefixStr; @@ -367,10 +353,12 @@ void AddKv(object? keyObj, object? val) filter is not IterableFilter) { #if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefixC = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 ? keyPrefixStr.Replace(".", "%2E") : keyPrefixStr; #else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefixC = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) : keyPrefixStr; @@ -524,10 +512,12 @@ void AddKv(object? keyObj, object? val) var values = new List(objKeys.Count); #if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefix = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 ? keyPrefixStr.Replace(".", "%2E") : keyPrefixStr; #else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false var encodedPrefix = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) : keyPrefixStr; From 91b1b70e53855aebdc6d2533657160857f16000e Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 17:56:54 +0100 Subject: [PATCH 08/37] :bug: fix Encoder generator comparison to ensure correct comma generator detection --- QsNet/Internal/Encoder.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index ea1898b..72d88ca 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -122,7 +122,8 @@ public static object Encode( var cs = charset ?? Encoding.UTF8; var gen = generateArrayPrefix ?? ListFormat.Indices.GetGenerator(); - var isCommaGen = gen == ListFormat.Comma.GetGenerator(); + var commaGen = ListFormat.Comma.GetGenerator(); + var isCommaGen = gen == commaGen; var crt = commaRoundTrip ?? isCommaGen; var keyPrefixStr = prefix ?? (addQueryPrefix ? "?" : ""); From 404e7e4b8e9fa690aaf60bc9ec8e01acddabeac1 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 17:56:59 +0100 Subject: [PATCH 09/37] :zap: expand EncodeBenchmarks data with empty and nested list cases for broader coverage --- benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs index 5404fe4..aa7f459 100644 --- a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs +++ b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs @@ -80,6 +80,8 @@ public void Setup() _data = new Dictionary { ["a"] = list, + ["a_empty"] = new List(), + ["a_nested"] = new List { new List { MakeValue(ValueLen, NeedsEscPercent, rnd) } }, ["b"] = new Dictionary { ["x.y"] = MakeValue(ValueLen, NeedsEscPercent, rnd), From 21040cc15d49488a1e4c7092877564a4dbaaa1c9 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:10:32 +0100 Subject: [PATCH 10/37] :zap: add UtilsEncodeBenchmarks to evaluate Utils.Encode performance across encoding formats and data shapes --- .../QsNet.Benchmarks/UtilsEncodeBenchmarks.cs | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs diff --git a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs new file mode 100644 index 0000000..e8ba65d --- /dev/null +++ b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs @@ -0,0 +1,61 @@ +using System; +using System.Collections.Generic; +using System.Text; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Order; +using QsNet.Enums; +using QsNet.Models; +using QsNet.Internal; + +[MemoryDiagnoser] +public class UtilsEncodeBenchmarks +{ + [Params(0, 8, 40, 512, 4096)] + public int Len; + + [Params(Format.Rfc3986, Format.Rfc1738)] + public Format Fmt; + + // Encoding under test + [Params("UTF8", "Latin1")] + public string EncName { get; set; } = "UTF8"; + private Encoding _enc = default!; + + // Workload shape + [Params("ascii-safe", "utf8-mixed", "latin1-fallback", "reserved-heavy")] + public string DataKind { get; set; } = "ascii-safe"; + + private string _input = default!; + + [GlobalSetup] + public void Setup() + { + _enc = EncName == "Latin1" ? Encoding.Latin1 : new UTF8Encoding(false); + + // note: () included to exercise RFC1738 paren allowance + var asciiSafeBase = "abcDEF-_.~0123456789() "; + var utfMixedBase = "Café 北京 – ☕️ 😀 "; + var latin1Fallback = "Café – € àèìòù "; // '€' not in ISO-8859-1 -> numeric-entity fallback + var reservedHeavy = "name=obj[a]&b=c d/%[]()+="; + + var seed = DataKind switch + { + "ascii-safe" => asciiSafeBase, + "utf8-mixed" => utfMixedBase, + "latin1-fallback" => latin1Fallback, + "reserved-heavy" => reservedHeavy, + _ => asciiSafeBase + }; + + _input = string.Concat(Enumerable.Repeat(seed, Math.Max(1, (Len + seed.Length - 1) / seed.Length))) + .Substring(0, Len); + } + + [Benchmark(Baseline = true)] + public string Encode() => QsNet.Internal.Utils.Encode(_input, _enc, Fmt); + + // Orientation-only reference (different semantics for spaces/legacy, but useful for perf smell tests) + [Benchmark] + public string UriEscape() => Uri.EscapeDataString(_input); +} \ No newline at end of file From d3ccddfae57a72bcceb2d3354506bbc15def03dd Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:14:03 +0100 Subject: [PATCH 11/37] :bug: pass encoding context and format to encoder for correct key-value encoding --- QsNet/Internal/Encoder.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 72d88ca..45093a4 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -422,8 +422,8 @@ void AddKv(object? keyObj, object? val) } if (encoder == null) return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; - var keyPartC = encoder(adjustedPrefixC, null, null); - var valuePartC = encoder(joinedC, null, null); + var keyPartC = encoder(adjustedPrefixC, cs, format); + var valuePartC = encoder(joinedC, cs, format); return $"{fmt(keyPartC)}={fmt(valuePartC)}"; } From fdfd52a0c5385445c613fe394cc7075196cffdac Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:21:42 +0100 Subject: [PATCH 12/37] :bug: pass encoding context and format to encoder when encoding enumerable values --- QsNet/Internal/Encoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 45093a4..b6512c5 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -440,7 +440,7 @@ void AddKv(object? keyObj, object? val) if (encodeValuesOnly && encoder != null) foreach (var el in enumerable) - strings.Add(el is null ? "" : encoder(el, null, null)); + strings.Add(el is null ? "" : encoder(el, cs, format)); else foreach (var el in enumerable) strings.Add(el?.ToString() ?? ""); From 48fcd14f47af46816f9d789273f6ef92f328743d Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:22:28 +0100 Subject: [PATCH 13/37] :bug: pass encoding context and format to encoder for key-value encoding --- QsNet/Internal/Encoder.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index b6512c5..c9d73bf 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -192,8 +192,8 @@ public static object Encode( return $"{fmt(keyPrefixStr)}={fmt(s)}"; } - var keyPart = encodeValuesOnly ? keyPrefixStr : encoder(keyPrefixStr, null, null); - var valuePart = encoder(obj, null, null); + var keyPart = encodeValuesOnly ? keyPrefixStr : encoder(keyPrefixStr, cs, format); + var valuePart = encoder(obj, cs, format); return $"{fmt(keyPart)}={fmt(valuePart)}"; } From 33dbcd4139523d9e71a43855ded5763bdd988e64 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:23:51 +0100 Subject: [PATCH 14/37] :bug: always use encoder for child elements to ensure correct encoding in all cases --- QsNet/Internal/Encoder.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index c9d73bf..db87599 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -538,10 +538,7 @@ void AddKv(object? keyObj, object? val) // Fast path (#4): hoist child-encoder decision out of the loop. // For comma-joined arrays in values-only mode, do not re-encode children. - var childEncoderForElements = - isCommaGen && encodeValuesOnly && obj is IEnumerable and not string - ? null - : encoder; + var childEncoderForElements = encoder; for (var i = 0; i < objKeys.Count; i++) { From 3c065b1cf4888a0e0733c656806f575e74810ddd Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 18:24:36 +0100 Subject: [PATCH 15/37] :bug: treat string as leaf value in IsLeaf to ensure correct encoding behavior --- QsNet/Internal/Encoder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index db87599..0c1a7b2 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -68,7 +68,7 @@ private static void AppendCommaEncodedValue( private static bool IsLeaf(object? v, bool skipNulls) { if (v is null) return skipNulls; - return Utils.IsNonNullishPrimitive(v) || v is byte[]; + return v is string || v is byte[] || Utils.IsNonNullishPrimitive(v, skipNulls); } /// From 1e86570473a1442035bd8214a51ce8ac94c87ff0 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 19:47:09 +0100 Subject: [PATCH 16/37] chore: add missing System.Linq import to UtilsEncodeBenchmarks --- benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs index e8ba65d..cae242b 100644 --- a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs +++ b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Jobs; From e0dfa0ba192efb0c44cb1ca97986f9a5b11b56c2 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 19:51:11 +0100 Subject: [PATCH 17/37] :bug: prevent double encoding of comma-joined array elements in values-only mode --- QsNet/Internal/Encoder.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index 0c1a7b2..c222e71 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -428,6 +428,7 @@ void AddKv(object? keyObj, object? val) } List objKeys; + var commaElementsAlreadyEncoded = false; if (isCommaGen && obj is IEnumerable enumerable and not string and not IDictionary) { List strings; @@ -439,11 +440,16 @@ void AddKv(object? keyObj, object? val) strings = []; if (encodeValuesOnly && encoder != null) + { foreach (var el in enumerable) strings.Add(el is null ? "" : encoder(el, cs, format)); + commaElementsAlreadyEncoded = true; + } else + { foreach (var el in enumerable) strings.Add(el?.ToString() ?? ""); + } if (strings.Count != 0) { @@ -537,8 +543,8 @@ void AddKv(object? keyObj, object? val) sideChannel.Set(objKey!, step); // Fast path (#4): hoist child-encoder decision out of the loop. - // For comma-joined arrays in values-only mode, do not re-encode children. - var childEncoderForElements = encoder; + // For comma-joined arrays in values-only mode, do not re-encode the joined string. + var childEncoderForElements = commaElementsAlreadyEncoded ? null : encoder; for (var i = 0; i < objKeys.Count; i++) { From 0b57cf38e80839dd01f23ebd752c0ece26fbec9b Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 19:53:40 +0100 Subject: [PATCH 18/37] chore: use Encoding.GetEncoding for Latin1 to ensure compatibility in benchmarks --- benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs index cae242b..3227278 100644 --- a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs +++ b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs @@ -32,7 +32,7 @@ public class UtilsEncodeBenchmarks [GlobalSetup] public void Setup() { - _enc = EncName == "Latin1" ? Encoding.Latin1 : new UTF8Encoding(false); + _enc = EncName == "Latin1" ? Encoding.GetEncoding("ISO-8859-1") : new UTF8Encoding(false); // note: () included to exercise RFC1738 paren allowance var asciiSafeBase = "abcDEF-_.~0123456789() "; From fece10621051f1f8e04a885918219db032b69834 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 20:11:44 +0100 Subject: [PATCH 19/37] :zap: optimize UTF-8 percent-encoding by switching to single-pass algorithm with lazy allocation --- QsNet/Internal/Utils.cs | 134 +++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 70 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index e52d4b6..9fbdb1b 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -399,7 +399,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo return string.Empty; var nonNullStr = str!; - if (Equals(encoding, Encoding.GetEncoding("ISO-8859-1"))) + if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) { #pragma warning disable CS0618 // Type or member is obsolete return MyRegex1() @@ -419,86 +419,80 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo #pragma warning restore CS0618 // Type or member is obsolete } - var buffer = new StringBuilder(); - var j = 0; - - while (j < nonNullStr.Length) + // Single-pass UTF-8 encoder with lazy builder to avoid allocations for already-safe strings. + StringBuilder? sb = null; + var safeFrom = 0; + var i = 0; + while (i < nonNullStr.Length) { - // Take up to SegmentLimit characters, but never split a surrogate pair across the boundary. - var remaining = nonNullStr.Length - j; - var segmentLen = remaining >= SegmentLimit ? SegmentLimit : remaining; - - // If the last char of this segment is a high surrogate and the next char exists and is a low surrogate, - // shrink the segment by one so the pair is encoded together in the next iteration. - if ( - segmentLen < remaining && - char.IsHighSurrogate(nonNullStr[j + segmentLen - 1]) && - char.IsLowSurrogate(nonNullStr[j + segmentLen]) - ) - segmentLen--; // keep the high surrogate with its low surrogate in the next chunk - - var segment = nonNullStr.Substring(j, segmentLen); + int c = nonNullStr[i]; - var i = 0; - while (i < segment.Length) + // Allowed unreserved: - . _ ~, digits, letters, and RFC1738 parentheses + if (c is 0x2D or 0x2E or 0x5F or 0x7E + || c is >= 0x30 and <= 0x39 + || c is >= 0x41 and <= 0x5A + || c is >= 0x61 and <= 0x7A + || (fmt == Format.Rfc1738 && c is 0x28 or 0x29)) { - var c = (int)segment[i]; + i++; + continue; + } - switch (c) - { - case 0x2D or 0x2E or 0x5F or 0x7E: - case >= 0x30 and <= 0x39: - case >= 0x41 and <= 0x5A: - case >= 0x61 and <= 0x7A: - case 0x28 or 0x29 when fmt == Format.Rfc1738: - buffer.Append(segment[i]); - i++; - continue; - // ASCII - case < 0x80: - buffer.Append(HexTable.Table[c]); - i++; - continue; - // 2 bytes - case < 0x800: - buffer.Append(HexTable.Table[0xC0 | (c >> 6)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - continue; - case < 0xD800: - // 3 bytes - case >= 0xE000: - buffer.Append(HexTable.Table[0xE0 | (c >> 12)]); - buffer.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - continue; - } + // First time we need to encode: materialize the builder and copy the safe prefix + sb ??= new StringBuilder(nonNullStr.Length + (nonNullStr.Length >> 1)); + if (i > safeFrom) sb.Append(nonNullStr, safeFrom, i - safeFrom); - // 4 bytes (surrogate pair) – only if valid pair; otherwise treat as 3-byte fallback - if (i + 1 >= segment.Length || !char.IsSurrogatePair(segment[i], segment[i + 1])) - { - // Fallback: percent-encode the single surrogate code unit to remain lossless - buffer.Append(HexTable.Table[0xE0 | (c >> 12)]); - buffer.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); + switch (c) + { + case < 0x80: + // ASCII but reserved -> %XX + sb.Append(HexTable.Table[c]); i++; - continue; - } + break; + case < 0x800: + // 2-byte UTF-8 sequence + sb.Append(HexTable.Table[0xC0 | (c >> 6)]); + sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); + i++; + break; + case < 0xD800 or >= 0xE000: + // 3-byte UTF-8 sequence + sb.Append(HexTable.Table[0xE0 | (c >> 12)]); + sb.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); + i++; + break; + default: + { + // Surrogate area + if (i + 1 < nonNullStr.Length && char.IsSurrogatePair(nonNullStr[i], nonNullStr[i + 1])) + { + var codePoint = char.ConvertToUtf32(nonNullStr[i], nonNullStr[i + 1]); + sb.Append(HexTable.Table[0xF0 | (codePoint >> 18)]); + sb.Append(HexTable.Table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(HexTable.Table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(HexTable.Table[0x80 | (codePoint & 0x3F)]); + i += 2; + } + else + { + // Lone surrogate -> encode the single code unit as 3-byte sequence, preserving data + sb.Append(HexTable.Table[0xE0 | (c >> 12)]); + sb.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); + i++; + } - var nextC = segment[i + 1]; - var codePoint = char.ConvertToUtf32((char)c, nextC); - buffer.Append(HexTable.Table[0xF0 | (codePoint >> 18)]); - buffer.Append(HexTable.Table[0x80 | ((codePoint >> 12) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | ((codePoint >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (codePoint & 0x3F)]); - i += 2; // Skip the next character as it's part of the surrogate pair + break; + } } - j += segment.Length; // advance by the actual processed count + safeFrom = i; } - return buffer.ToString(); + return sb is null + ? nonNullStr + : sb.Append(nonNullStr, safeFrom, nonNullStr.Length - safeFrom).ToString(); } /// From 087509300c33f54e4c6e8e98f2d61bf080f3cdf3 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 20:13:33 +0100 Subject: [PATCH 20/37] :fire: remove unused SegmentLimit constant from Utils --- QsNet/Internal/Utils.cs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 9fbdb1b..44cdf03 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -22,11 +22,6 @@ internal static class Utils internal static partial class Utils #endif { - /// - /// The maximum length of a segment to encode in a single pass. - /// - private const int SegmentLimit = 1024; - /// /// A regex to match percent-encoded characters in the format %XX. /// From 9110868460297c3d2668af9db258322d3dbb5419 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 20:14:30 +0100 Subject: [PATCH 21/37] :zap: use encoding.CodePage for Latin1 check to improve reliability --- QsNet/Internal/Utils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 44cdf03..0227d02 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -501,7 +501,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo encoding ??= Encoding.UTF8; var strWithoutPlus = str?.Replace('+', ' '); - if (Equals(encoding, Encoding.GetEncoding("ISO-8859-1"))) + if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) try { return MyRegex() From 85d306aba08924b541c50e5afc9a123f80e53d65 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 21:15:09 +0100 Subject: [PATCH 22/37] :zap: optimize URL encoding by adding escape-heavy mode and fast ASCII scan --- QsNet/Internal/Utils.cs | 238 +++++++++++++++++++++++++++------------- 1 file changed, 162 insertions(+), 76 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 0227d02..fc3ab5a 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -366,6 +366,17 @@ out var code return sb.ToString(); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsUnreservedAscii(char ch, Format fmt) + { + // -, ., _, ~, 0-9, A-Z, a-z (+ () for RFC1738) + return ch is '-' or '.' or '_' or '~' + || ch is >= '0' and <= '9' + || ch is >= 'A' and <= 'Z' + || ch is >= 'a' and <= 'z' + || (fmt == Format.Rfc1738 && ch is '(' or ')'); + } + /// /// Encodes a value into a URL-encoded string. /// @@ -392,102 +403,177 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (string.IsNullOrEmpty(str)) return string.Empty; - var nonNullStr = str!; + var s = str!; - if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) + // Latin-1 (ISO-8859-1) path with fast skip when no %u-escapes are present + if (encoding.CodePage == 28591) { -#pragma warning disable CS0618 // Type or member is obsolete - return MyRegex1() - .Replace( - Escape(str!, fmt), - match => - { +#pragma warning disable CS0618 // Escape is obsolete but intentional here + var escaped = Escape(s, fmt); +#pragma warning restore CS0618 #if NETSTANDARD2_0 - var code = int.Parse(match.Value.Substring(2), NumberStyles.HexNumber, - CultureInfo.InvariantCulture); + if (escaped.IndexOf("%u", StringComparison.OrdinalIgnoreCase) < 0) + return escaped; #else - var code = int.Parse(match.Value[2..], NumberStyles.HexNumber, CultureInfo.InvariantCulture); + if (!escaped.Contains("%u", StringComparison.OrdinalIgnoreCase)) + return escaped; #endif - return $"%26%23{code}%3B"; - } - ); -#pragma warning restore CS0618 // Type or member is obsolete + return MyRegex1().Replace(escaped, m => + { +#if NETSTANDARD2_0 + var code = int.Parse(m.Value.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture); +#else + var code = int.Parse(m.Value.AsSpan(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture); +#endif + return $"%26%23{code}%3B"; + }); } - // Single-pass UTF-8 encoder with lazy builder to avoid allocations for already-safe strings. - StringBuilder? sb = null; - var safeFrom = 0; + // UTF-8 path with two strategies: + // 1) run-copy mode for mixed/mostly-safe inputs (lazy flush of safe runs) + // 2) escape-heavy mode for mostly-unsafe inputs (big prealloc, simpler loop) +#if NETSTANDARD2_0 + var len = s.Length; +#else + ReadOnlySpan src = s.AsSpan(); + int len = src.Length; +#endif + + // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) var i = 0; - while (i < nonNullStr.Length) + while (i < len && s[i] <= 0x7F && IsUnreservedAscii(s[i], fmt)) i++; + if (i == len) + return s; // all safe ASCII + + // Sample up to 64 chars after first unsafe to decide whether it's escape-heavy + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) { - int c = nonNullStr[i]; + var ch = s[k]; + if (ch > 0x7F || !IsUnreservedAscii(ch, fmt)) + unsafeCount++; + } - // Allowed unreserved: - . _ ~, digits, letters, and RFC1738 parentheses - if (c is 0x2D or 0x2E or 0x5F or 0x7E - || c is >= 0x30 and <= 0x39 - || c is >= 0x41 and <= 0x5A - || c is >= 0x61 and <= 0x7A - || (fmt == Format.Rfc1738 && c is 0x28 or 0x29)) - { - i++; - continue; - } + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe - // First time we need to encode: materialize the builder and copy the safe prefix - sb ??= new StringBuilder(nonNullStr.Length + (nonNullStr.Length >> 1)); - if (i > safeFrom) sb.Append(nonNullStr, safeFrom, i - safeFrom); + var cap = escapeHeavy + ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 + : len + 16; - switch (c) + var sb = new StringBuilder(cap); + var table = HexTable.Table; + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) { - case < 0x80: - // ASCII but reserved -> %XX - sb.Append(HexTable.Table[c]); - i++; - break; - case < 0x800: - // 2-byte UTF-8 sequence - sb.Append(HexTable.Table[0xC0 | (c >> 6)]); - sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - break; - case < 0xD800 or >= 0xE000: - // 3-byte UTF-8 sequence - sb.Append(HexTable.Table[0xE0 | (c >> 12)]); - sb.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - break; - default: - { - // Surrogate area - if (i + 1 < nonNullStr.Length && char.IsSurrogatePair(nonNullStr[i], nonNullStr[i + 1])) - { - var codePoint = char.ConvertToUtf32(nonNullStr[i], nonNullStr[i + 1]); - sb.Append(HexTable.Table[0xF0 | (codePoint >> 18)]); - sb.Append(HexTable.Table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(HexTable.Table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(HexTable.Table[0x80 | (codePoint & 0x3F)]); - i += 2; - } - else + int c = s[idx]; + var safeAscii = c <= 0x7F && IsUnreservedAscii((char)c, fmt); + if (safeAscii) + continue; + + // flush preceding safe run + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + switch (c) + { + case < 0x80: + sb.Append(table[c]); + break; + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: { - // Lone surrogate -> encode the single code unit as 3-byte sequence, preserving data - sb.Append(HexTable.Table[0xE0 | (c >> 12)]); - sb.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; + // surrogate handling + if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) + { + var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; } + } - break; - } + lastSafe = idx + 1; } - safeFrom = i; + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + // Escape-heavy mode: no run bookkeeping, big prealloc + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when IsUnreservedAscii((char)c, fmt): + sb.Append((char)c); + continue; + case < 0x80: + sb.Append(table[c]); + break; + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: + { + if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) + { + var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; + } + } + } } - return sb is null - ? nonNullStr - : sb.Append(nonNullStr, safeFrom, nonNullStr.Length - safeFrom).ToString(); + return sb.ToString(); } /// From 6197f88d91fab1487087e178a53e2483eb2680d4 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 22:20:58 +0100 Subject: [PATCH 23/37] :zap: split IsUnreservedAscii into RFC3986 and RFC1738 variants for more accurate percent-encoding --- QsNet/Internal/Utils.cs | 367 +++++++++++++++++++++++++++------------- 1 file changed, 247 insertions(+), 120 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index fc3ab5a..b765e58 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -367,14 +367,17 @@ out var code } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsUnreservedAscii(char ch, Format fmt) + private static bool IsUnreservedAscii3986(char ch) { - // -, ., _, ~, 0-9, A-Z, a-z (+ () for RFC1738) - return ch is '-' or '.' or '_' or '~' - || ch is >= '0' and <= '9' - || ch is >= 'A' and <= 'Z' - || ch is >= 'a' and <= 'z' - || (fmt == Format.Rfc1738 && ch is '(' or ')'); + // -, ., _, ~, 0-9, A-Z, a-z + return ch is '-' or '.' or '_' or '~' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsUnreservedAscii1738(char ch) + { + // RFC1738 extends RFC3986 set with parentheses + return ch is '(' or ')' || IsUnreservedAscii3986(ch); } /// @@ -432,148 +435,272 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo // UTF-8 path with two strategies: // 1) run-copy mode for mixed/mostly-safe inputs (lazy flush of safe runs) // 2) escape-heavy mode for mostly-unsafe inputs (big prealloc, simpler loop) -#if NETSTANDARD2_0 var len = s.Length; -#else - ReadOnlySpan src = s.AsSpan(); - int len = src.Length; -#endif - // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) - var i = 0; - while (i < len && s[i] <= 0x7F && IsUnreservedAscii(s[i], fmt)) i++; - if (i == len) - return s; // all safe ASCII - - // Sample up to 64 chars after first unsafe to decide whether it's escape-heavy - var sampleEnd = Math.Min(len, i + 64); - var unsafeCount = 0; - for (var k = i; k < sampleEnd; k++) + if (fmt == Format.Rfc1738) { - var ch = s[k]; - if (ch > 0x7F || !IsUnreservedAscii(ch, fmt)) - unsafeCount++; - } - - var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe - - var cap = escapeHeavy - ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 - : len + 16; + // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) + var i = 0; + while (i < len && s[i] <= 0x7F && IsUnreservedAscii1738(s[i])) i++; + if (i == len) + return s; // all safe ASCII + + // Sample up to 64 chars after first unsafe to decide whether it's escape-heavy + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !IsUnreservedAscii1738(ch)) + unsafeCount++; + } - var sb = new StringBuilder(cap); - var table = HexTable.Table; + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + var table = HexTable.Table; - if (!escapeHeavy) - { - var lastSafe = 0; - for (var idx = 0; idx < len; idx++) + if (!escapeHeavy) { - int c = s[idx]; - var safeAscii = c <= 0x7F && IsUnreservedAscii((char)c, fmt); - if (safeAscii) - continue; + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) + { + int c = s[idx]; + var safeAscii = c <= 0x7F && IsUnreservedAscii1738((char)c); + if (safeAscii) + continue; - // flush preceding safe run - if (idx > lastSafe) - sb.Append(s, lastSafe, idx - lastSafe); + // flush preceding safe run + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); - switch (c) - { - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: - { - // surrogate handling - if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) - { - var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - idx++; // consume low surrogate - } - else + switch (c) + { + case < 0x80: + sb.Append(table[c]); + break; + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + // surrogate handling + if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) + { + var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; } + } - break; - } + lastSafe = idx + 1; } - lastSafe = idx + 1; + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + // Escape-heavy mode: no run bookkeeping, big prealloc + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when IsUnreservedAscii1738((char)c): + sb.Append((char)c); + continue; + case < 0x80: + sb.Append(table[c]); + break; + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: + { + if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) + { + var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; + } + } + } } - if (lastSafe < len) - sb.Append(s, lastSafe, len - lastSafe); + return sb.ToString(); } else { - // Escape-heavy mode: no run bookkeeping, big prealloc - if (i > 0) sb.Append(s, 0, i); - - for (var j = i; j < len; j++) + // RFC3986 path (no parentheses allowed) + var i = 0; + while (i < len && s[i] <= 0x7F && IsUnreservedAscii3986(s[i])) i++; + if (i == len) + return s; + + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) { - int c = s[j]; + var ch = s[k]; + if (ch > 0x7F || !IsUnreservedAscii3986(ch)) + unsafeCount++; + } + + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + var table = HexTable.Table; - switch (c) + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) { - case <= 0x7F when IsUnreservedAscii((char)c, fmt): - sb.Append((char)c); + int c = s[idx]; + var safeAscii = c <= 0x7F && IsUnreservedAscii3986((char)c); + if (safeAscii) continue; - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: - { - if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) - { - var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - j++; - } - else + + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + switch (c) + { + case < 0x80: + sb.Append(table[c]); + break; + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) + { + var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; } + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + if (i > 0) sb.Append(s, 0, i); + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when IsUnreservedAscii3986((char)c): + sb.Append((char)c); + continue; + case < 0x80: + sb.Append(table[c]); break; - } + case < 0x800: + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + case < 0xD800: + case >= 0xE000: + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + break; + default: + { + if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) + { + var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + break; + } + } } } - } - return sb.ToString(); + return sb.ToString(); + } } /// From b39ec6c480c29623674030d8ef0947b9b7a413f5 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 23:48:53 +0100 Subject: [PATCH 24/37] :zap: optimize Latin-1 URL encoding by removing regex dependency and adding fast ASCII scan for RFC1738 and RFC3986 formats --- QsNet/Internal/Utils.cs | 218 +++++++++++++++++++++++++++++++++------- 1 file changed, 184 insertions(+), 34 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index b765e58..4a2e2b7 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -37,21 +37,6 @@ private static Regex MyRegex() private static partial Regex MyRegex(); #endif - /// - /// A regex to match Unicode percent-encoded characters in the format %uXXXX. - /// -#if NETSTANDARD2_0 - private static readonly Regex MyRegex1Instance = new("%u[0-9a-f]{4}", RegexOptions.IgnoreCase); - - private static Regex MyRegex1() - { - return MyRegex1Instance; - } -#else - [GeneratedRegex("%u[0-9a-f]{4}", RegexOptions.IgnoreCase, "en-GB")] - private static partial Regex MyRegex1(); -#endif - /// /// Merges two objects, where the source object overrides the target object. If the source is a /// Dictionary, it will merge its entries into the target. If the source is an IEnumerable, it will append @@ -380,6 +365,26 @@ private static bool IsUnreservedAscii1738(char ch) return ch is '(' or ')' || IsUnreservedAscii3986(ch); } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSafeLatin1Ascii1738(char ch) + { + // Legacy Latin-1 encode behavior: + // - treat '+' as safe (do not encode) + // - treat '~' as unsafe (percent-encode) + // - RFC1738 adds '(' and ')' + return ch is '+' or '(' or ')' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' + or >= 'a' and <= 'z'; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsSafeLatin1Ascii3986(char ch) + { + // Legacy Latin-1 encode behavior: + // - treat '+' as safe (do not encode) + // - treat '~' as unsafe (percent-encode) + return ch is '+' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; + } + /// /// Encodes a value into a URL-encoded string. /// @@ -407,35 +412,180 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (string.IsNullOrEmpty(str)) return string.Empty; var s = str!; + var len = s.Length; // Latin-1 (ISO-8859-1) path with fast skip when no %u-escapes are present if (encoding.CodePage == 28591) { -#pragma warning disable CS0618 // Escape is obsolete but intentional here - var escaped = Escape(s, fmt); -#pragma warning restore CS0618 -#if NETSTANDARD2_0 - if (escaped.IndexOf("%u", StringComparison.OrdinalIgnoreCase) < 0) - return escaped; -#else - if (!escaped.Contains("%u", StringComparison.OrdinalIgnoreCase)) - return escaped; -#endif - return MyRegex1().Replace(escaped, m => + var table = HexTable.Table; + + if (fmt == Format.Rfc1738) { -#if NETSTANDARD2_0 - var code = int.Parse(m.Value.Substring(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture); -#else - var code = int.Parse(m.Value.AsSpan(2), NumberStyles.HexNumber, CultureInfo.InvariantCulture); -#endif - return $"%26%23{code}%3B"; - }); + // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) + // Scan to first unsafe ASCII (anything non-ASCII is unsafe for this pass) + var i = 0; + while (i < len && s[i] <= 0x7F && IsSafeLatin1Ascii1738(s[i])) i++; + if (i == len) + return s; // all safe ASCII + + // Sample to decide escape density + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !IsSafeLatin1Ascii1738(ch)) + unsafeCount++; + } + + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) + { + int c = s[idx]; + var safeAscii = c <= 0x7F && IsSafeLatin1Ascii1738((char)c); + if (safeAscii) + continue; + + // flush preceding safe run + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + if (c <= 0xFF) + { + sb.Append(table[c]); // %XX for Latin-1 bytes + } + else + { + // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + // Escape-heavy mode: no run bookkeeping + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when IsSafeLatin1Ascii1738((char)c): + sb.Append((char)c); + continue; + case <= 0xFF: + sb.Append(table[c]); + break; + default: + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + break; + } + } + } + + return sb.ToString(); + } + else + { + // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) + // RFC3986 path (no parentheses allowed) + var i = 0; + while (i < len && s[i] <= 0x7F && IsSafeLatin1Ascii3986(s[i])) i++; + if (i == len) + return s; // all safe ASCII + + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !IsSafeLatin1Ascii3986(ch)) + unsafeCount++; + } + + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) + { + int c = s[idx]; + var safeAscii = c <= 0x7F && IsSafeLatin1Ascii3986((char)c); + if (safeAscii) + continue; + + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + if (c <= 0xFF) + { + sb.Append(table[c]); + } + else + { + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when IsSafeLatin1Ascii3986((char)c): + sb.Append((char)c); + continue; + case <= 0xFF: + sb.Append(table[c]); + break; + default: + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + break; + } + } + } + + return sb.ToString(); + } } // UTF-8 path with two strategies: // 1) run-copy mode for mixed/mostly-safe inputs (lazy flush of safe runs) // 2) escape-heavy mode for mostly-unsafe inputs (big prealloc, simpler loop) - var len = s.Length; if (fmt == Format.Rfc1738) { From d00acbf2e3d199dac5c295d4f2ba2a4ced99c119 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 23:50:12 +0100 Subject: [PATCH 25/37] :zap: inline code variable declaration in entity parsing for improved clarity --- QsNet/Internal/Utils.cs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 4a2e2b7..214832a 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -1196,14 +1196,13 @@ public static string InterpretNumericEntities(string str) if (j < n && str[j] == ';' && j > startDigits) { - int code; #if NETSTANDARD2_0 var digits = str.Substring(startDigits, j - startDigits); var ok = int.TryParse( digits, hex ? NumberStyles.HexNumber : NumberStyles.Integer, CultureInfo.InvariantCulture, - out code + out var code ); #else var digits = str.AsSpan(startDigits, j - startDigits); @@ -1211,7 +1210,7 @@ out code digits, hex ? NumberStyles.HexNumber : NumberStyles.Integer, CultureInfo.InvariantCulture, - out code + out var code ); #endif if (!ok) From 17339dd6b62cd3b3ef38a3f82b82fedcdd0718c4 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Sun, 31 Aug 2025 23:58:10 +0100 Subject: [PATCH 26/37] :zap: emit U+FFFD for unpaired surrogates in UTF-8 encoding to improve correctness --- QsNet/Internal/Utils.cs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 214832a..86d1570 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -653,9 +653,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + // Unpaired surrogate: emit U+FFFD (EF BF BD) + sb.Append(table[0xEF]); + sb.Append(table[0xBF]); + sb.Append(table[0xBD]); } break; @@ -708,9 +709,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + // Unpaired surrogate: emit U+FFFD (EF BF BD) + sb.Append(table[0xEF]); + sb.Append(table[0xBF]); + sb.Append(table[0xBD]); } break; @@ -784,9 +786,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + // Unpaired surrogate: emit U+FFFD (EF BF BD) + sb.Append(table[0xEF]); + sb.Append(table[0xBF]); + sb.Append(table[0xBD]); } break; @@ -838,9 +841,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); + // Unpaired surrogate: emit U+FFFD (EF BF BD) + sb.Append(table[0xEF]); + sb.Append(table[0xBF]); + sb.Append(table[0xBD]); } break; From 4aafaf402a76cc64a5f8d90caff7d509d293063b Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Mon, 1 Sep 2025 00:13:45 +0100 Subject: [PATCH 27/37] :zap: improve Latin-1 detection by supporting iso-8859-1 WebName in encoding checks --- QsNet/Internal/Utils.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 86d1570..9bcf979 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -415,7 +415,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo var len = s.Length; // Latin-1 (ISO-8859-1) path with fast skip when no %u-escapes are present - if (encoding.CodePage == 28591) + if (encoding.CodePage == 28591 || string.Equals(encoding.WebName, "iso-8859-1", StringComparison.OrdinalIgnoreCase)) { var table = HexTable.Table; From 594e03413bd8d5873e5670656cb4cae54feeba1e Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Mon, 1 Sep 2025 08:46:25 +0100 Subject: [PATCH 28/37] :zap: enhance UTF-8 encoding to handle unpaired surrogates by emitting U+FFFD and improve clarity in encoding logic --- QsNet/Internal/Utils.cs | 294 +++++++++++++++++++++------------------- 1 file changed, 156 insertions(+), 138 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 9bcf979..d2e8b6a 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -385,6 +385,8 @@ private static bool IsSafeLatin1Ascii3986(char ch) return ch is '+' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; } + private const string Utf8ReplacementPercent = "%EF%BF%BD"; // percent-encoded UTF-8 for U+FFFD + /// /// Encodes a value into a URL-encoded string. /// @@ -415,7 +417,8 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo var len = s.Length; // Latin-1 (ISO-8859-1) path with fast skip when no %u-escapes are present - if (encoding.CodePage == 28591 || string.Equals(encoding.WebName, "iso-8859-1", StringComparison.OrdinalIgnoreCase)) + if (encoding.CodePage == 28591 || + string.Equals(encoding.WebName, "iso-8859-1", StringComparison.OrdinalIgnoreCase)) { var table = HexTable.Table; @@ -624,43 +627,46 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (idx > lastSafe) sb.Append(s, lastSafe, idx - lastSafe); - switch (c) + // fast UTF-8 encode, surrogate-aware + if ((uint)c < 0x80) { - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + // Surrogates range + if ((uint)(c - 0xD800) <= 0x03FF && idx + 1 < len) + { + int d = s[idx + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) { - // surrogate handling - if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) - { - var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - idx++; // consume low surrogate - } - else - { - // Unpaired surrogate: emit U+FFFD (EF BF BD) - sb.Append(table[0xEF]); - sb.Append(table[0xBF]); - sb.Append(table[0xBD]); - } - - break; + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired high surrogate } + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired low surrogate + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); } lastSafe = idx + 1; @@ -677,46 +683,50 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var j = i; j < len; j++) { int c = s[j]; - - switch (c) + if ((uint)c < 0x80) { - case <= 0x7F when IsUnreservedAscii1738((char)c): + if (IsUnreservedAscii1738((char)c)) + { sb.Append((char)c); continue; - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: - { - if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) - { - var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - j++; - } - else - { - // Unpaired surrogate: emit U+FFFD (EF BF BD) - sb.Append(table[0xEF]); - sb.Append(table[0xBF]); - sb.Append(table[0xBD]); - } + } - break; + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + if ((uint)(c - 0xD800) <= 0x03FF && j + 1 < len) + { + int d = s[j + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(Utf8ReplacementPercent); } + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); } } } @@ -758,42 +768,46 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (idx > lastSafe) sb.Append(s, lastSafe, idx - lastSafe); - switch (c) + // fast UTF-8 encode, surrogate-aware + if ((uint)c < 0x80) { - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + // Surrogates range + if ((uint)(c - 0xD800) <= 0x03FF && idx + 1 < len) + { + int d = s[idx + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) { - if (idx + 1 < len && char.IsSurrogatePair(s[idx], s[idx + 1])) - { - var codePoint = char.ConvertToUtf32(s[idx], s[idx + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - idx++; - } - else - { - // Unpaired surrogate: emit U+FFFD (EF BF BD) - sb.Append(table[0xEF]); - sb.Append(table[0xBF]); - sb.Append(table[0xBD]); - } - - break; + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired high surrogate + } + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired low surrogate + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); } lastSafe = idx + 1; @@ -809,46 +823,50 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var j = i; j < len; j++) { int c = s[j]; - - switch (c) + if ((uint)c < 0x80) { - case <= 0x7F when IsUnreservedAscii3986((char)c): + if (IsUnreservedAscii3986((char)c)) + { sb.Append((char)c); continue; - case < 0x80: - sb.Append(table[c]); - break; - case < 0x800: - sb.Append(table[0xC0 | (c >> 6)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - case < 0xD800: - case >= 0xE000: - sb.Append(table[0xE0 | (c >> 12)]); - sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); - sb.Append(table[0x80 | (c & 0x3F)]); - break; - default: - { - if (j + 1 < len && char.IsSurrogatePair(s[j], s[j + 1])) - { - var codePoint = char.ConvertToUtf32(s[j], s[j + 1]); - sb.Append(table[0xF0 | (codePoint >> 18)]); - sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); - sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); - sb.Append(table[0x80 | (codePoint & 0x3F)]); - j++; - } - else - { - // Unpaired surrogate: emit U+FFFD (EF BF BD) - sb.Append(table[0xEF]); - sb.Append(table[0xBF]); - sb.Append(table[0xBD]); - } + } - break; + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + if ((uint)(c - 0xD800) <= 0x03FF && j + 1 < len) + { + int d = s[j + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(Utf8ReplacementPercent); } + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); } } } From 6056ec9189d173fa974a32218441866e5b060653 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Mon, 1 Sep 2025 19:11:03 +0100 Subject: [PATCH 29/37] :zap: optimize ASCII scanning for RFC1738 and RFC3986 by implementing precomputed lookup tables --- QsNet/Internal/Utils.cs | 47 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index d2e8b6a..8323182 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -385,6 +385,33 @@ private static bool IsSafeLatin1Ascii3986(char ch) return ch is '+' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; } + // Precomputed ASCII lookup tables to speed up the initial all-ASCII fast scan + private static readonly bool[] Unreserved3986Ascii = BuildUnreserved3986Ascii(); + private static readonly bool[] Unreserved1738Ascii = BuildUnreserved1738Ascii(); + + private static bool[] BuildUnreserved3986Ascii() + { + var t = new bool[128]; + // RFC 3986 unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~" + for (var c = (int)'0'; c <= '9'; c++) t[c] = true; + for (var c = (int)'A'; c <= 'Z'; c++) t[c] = true; + for (var c = (int)'a'; c <= 'z'; c++) t[c] = true; + t['-'] = true; + t['.'] = true; + t['_'] = true; + t['~'] = true; + return t; + } + + private static bool[] BuildUnreserved1738Ascii() + { + // RFC1738 extends RFC3986's set with parentheses + var t = BuildUnreserved3986Ascii(); + t['('] = true; + t[')'] = true; + return t; + } + private const string Utf8ReplacementPercent = "%EF%BF%BD"; // percent-encoded UTF-8 for U+FFFD /// @@ -592,9 +619,15 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (fmt == Format.Rfc1738) { - // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) + // Scan to first unsafe ASCII using precomputed table (fewer calls/bounds checks) var i = 0; - while (i < len && s[i] <= 0x7F && IsUnreservedAscii1738(s[i])) i++; + for (; i < len; i++) + { + var ch = s[i]; + if ((uint)ch >= 128 || !Unreserved1738Ascii[ch]) + break; + } + if (i == len) return s; // all safe ASCII @@ -735,9 +768,15 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - // RFC3986 path (no parentheses allowed) + // RFC3986 path (no parentheses allowed) — faster ASCII run scan var i = 0; - while (i < len && s[i] <= 0x7F && IsUnreservedAscii3986(s[i])) i++; + for (; i < len; i++) + { + var ch = s[i]; + if ((uint)ch >= 128 || !Unreserved3986Ascii[ch]) + break; + } + if (i == len) return s; From 3009eb2f751deef3df456e9b2314ccca639aa89f Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Mon, 1 Sep 2025 20:07:32 +0100 Subject: [PATCH 30/37] :zap: optimize ASCII checks by implementing precomputed lookup tables for RFC1738 and RFC3986 --- QsNet/Internal/Utils.cs | 131 ++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 86 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 8323182..ca34ab2 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -351,64 +351,31 @@ out var code return sb.ToString(); } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsUnreservedAscii3986(char ch) - { - // -, ., _, ~, 0-9, A-Z, a-z - return ch is '-' or '.' or '_' or '~' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsUnreservedAscii1738(char ch) - { - // RFC1738 extends RFC3986 set with parentheses - return ch is '(' or ')' || IsUnreservedAscii3986(ch); - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSafeLatin1Ascii1738(char ch) - { - // Legacy Latin-1 encode behavior: - // - treat '+' as safe (do not encode) - // - treat '~' as unsafe (percent-encode) - // - RFC1738 adds '(' and ')' - return ch is '+' or '(' or ')' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' - or >= 'a' and <= 'z'; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool IsSafeLatin1Ascii3986(char ch) - { - // Legacy Latin-1 encode behavior: - // - treat '+' as safe (do not encode) - // - treat '~' as unsafe (percent-encode) - return ch is '+' or '-' or '.' or '_' or >= '0' and <= '9' or >= 'A' and <= 'Z' or >= 'a' and <= 'z'; - } - - // Precomputed ASCII lookup tables to speed up the initial all-ASCII fast scan - private static readonly bool[] Unreserved3986Ascii = BuildUnreserved3986Ascii(); - private static readonly bool[] Unreserved1738Ascii = BuildUnreserved1738Ascii(); - - private static bool[] BuildUnreserved3986Ascii() + // Precomputed ASCII membership tables for fast checks + // RFC 3986 unreserved: - . _ ~ 0-9 A-Z a-z + private static readonly bool[] UnreservedTable3986 = + CreateAsciiTable("-._~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // RFC 1738 extends RFC 3986 with '(' and ')' + private static readonly bool[] UnreservedTable1738 = + CreateAsciiTable("()-._~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // Legacy Latin-1 safe sets: + // - '+' is safe (NOT encoded) + // - '~' is NOT safe (WILL be encoded) + // RFC3986 (no parentheses) + private static readonly bool[] Latin1SafeTable3986 = + CreateAsciiTable("+-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // RFC1738 adds '(' and ')' + private static readonly bool[] Latin1SafeTable1738 = + CreateAsciiTable("()+-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + private static bool[] CreateAsciiTable(string chars) { var t = new bool[128]; - // RFC 3986 unreserved: ALPHA / DIGIT / "-" / "." / "_" / "~" - for (var c = (int)'0'; c <= '9'; c++) t[c] = true; - for (var c = (int)'A'; c <= 'Z'; c++) t[c] = true; - for (var c = (int)'a'; c <= 'z'; c++) t[c] = true; - t['-'] = true; - t['.'] = true; - t['_'] = true; - t['~'] = true; - return t; - } - - private static bool[] BuildUnreserved1738Ascii() - { - // RFC1738 extends RFC3986's set with parentheses - var t = BuildUnreserved3986Ascii(); - t['('] = true; - t[')'] = true; + foreach (var ch in chars) + t[ch] = true; return t; } @@ -453,8 +420,9 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo { // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) // Scan to first unsafe ASCII (anything non-ASCII is unsafe for this pass) + var asciiSafe = Latin1SafeTable1738; var i = 0; - while (i < len && s[i] <= 0x7F && IsSafeLatin1Ascii1738(s[i])) i++; + while (i < len && s[i] <= 0x7F && asciiSafe[s[i]]) i++; if (i == len) return s; // all safe ASCII @@ -464,7 +432,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var k = i; k < sampleEnd; k++) { var ch = s[k]; - if (ch > 0x7F || !IsSafeLatin1Ascii1738(ch)) + if (ch > 0x7F || !asciiSafe[ch]) unsafeCount++; } @@ -478,7 +446,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var idx = 0; idx < len; idx++) { int c = s[idx]; - var safeAscii = c <= 0x7F && IsSafeLatin1Ascii1738((char)c); + var safeAscii = c <= 0x7F && asciiSafe[c]; if (safeAscii) continue; @@ -515,7 +483,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo switch (c) { - case <= 0x7F when IsSafeLatin1Ascii1738((char)c): + case <= 0x7F when asciiSafe[c]: sb.Append((char)c); continue; case <= 0xFF: @@ -536,8 +504,9 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo { // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) // RFC3986 path (no parentheses allowed) + var asciiSafe = Latin1SafeTable3986; var i = 0; - while (i < len && s[i] <= 0x7F && IsSafeLatin1Ascii3986(s[i])) i++; + while (i < len && s[i] <= 0x7F && asciiSafe[s[i]]) i++; if (i == len) return s; // all safe ASCII @@ -546,7 +515,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var k = i; k < sampleEnd; k++) { var ch = s[k]; - if (ch > 0x7F || !IsSafeLatin1Ascii3986(ch)) + if (ch > 0x7F || !asciiSafe[ch]) unsafeCount++; } @@ -560,7 +529,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var idx = 0; idx < len; idx++) { int c = s[idx]; - var safeAscii = c <= 0x7F && IsSafeLatin1Ascii3986((char)c); + var safeAscii = c <= 0x7F && asciiSafe[c]; if (safeAscii) continue; @@ -594,7 +563,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo switch (c) { - case <= 0x7F when IsSafeLatin1Ascii3986((char)c): + case <= 0x7F when asciiSafe[c]: sb.Append((char)c); continue; case <= 0xFF: @@ -619,15 +588,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (fmt == Format.Rfc1738) { - // Scan to first unsafe ASCII using precomputed table (fewer calls/bounds checks) + // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) + var asciiUnreserved = UnreservedTable1738; var i = 0; - for (; i < len; i++) - { - var ch = s[i]; - if ((uint)ch >= 128 || !Unreserved1738Ascii[ch]) - break; - } - + while (i < len && s[i] <= 0x7F && asciiUnreserved[s[i]]) i++; if (i == len) return s; // all safe ASCII @@ -637,7 +601,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var k = i; k < sampleEnd; k++) { var ch = s[k]; - if (ch > 0x7F || !IsUnreservedAscii1738(ch)) + if (ch > 0x7F || !asciiUnreserved[ch]) unsafeCount++; } @@ -652,7 +616,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var idx = 0; idx < len; idx++) { int c = s[idx]; - var safeAscii = c <= 0x7F && IsUnreservedAscii1738((char)c); + var safeAscii = c <= 0x7F && asciiUnreserved[c]; if (safeAscii) continue; @@ -718,7 +682,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo int c = s[j]; if ((uint)c < 0x80) { - if (IsUnreservedAscii1738((char)c)) + if (asciiUnreserved[c]) { sb.Append((char)c); continue; @@ -768,15 +732,10 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } else { - // RFC3986 path (no parentheses allowed) — faster ASCII run scan + // RFC3986 path (no parentheses allowed) + var asciiUnreserved = UnreservedTable3986; var i = 0; - for (; i < len; i++) - { - var ch = s[i]; - if ((uint)ch >= 128 || !Unreserved3986Ascii[ch]) - break; - } - + while (i < len && s[i] <= 0x7F && asciiUnreserved[s[i]]) i++; if (i == len) return s; @@ -785,7 +744,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var k = i; k < sampleEnd; k++) { var ch = s[k]; - if (ch > 0x7F || !IsUnreservedAscii3986(ch)) + if (ch > 0x7F || !asciiUnreserved[ch]) unsafeCount++; } @@ -800,7 +759,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo for (var idx = 0; idx < len; idx++) { int c = s[idx]; - var safeAscii = c <= 0x7F && IsUnreservedAscii3986((char)c); + var safeAscii = c <= 0x7F && asciiUnreserved[c]; if (safeAscii) continue; @@ -864,7 +823,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo int c = s[j]; if ((uint)c < 0x80) { - if (IsUnreservedAscii3986((char)c)) + if (asciiUnreserved[c]) { sb.Append((char)c); continue; From 4a71214fdba86a0415cfc6fc8c37631a770c55b9 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Mon, 1 Sep 2025 21:33:44 +0100 Subject: [PATCH 31/37] :zap: enhance encoding logic by ensuring culture-invariant string conversion for percent-encoded numeric entities --- QsNet/Internal/Utils.cs | 63 +++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index ca34ab2..ccd6bae 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -462,7 +462,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo { // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B sb.Append("%26%23"); - sb.Append(c); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); sb.Append("%3B"); } @@ -491,7 +491,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo break; default: sb.Append("%26%23"); - sb.Append(c); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); sb.Append("%3B"); break; } @@ -543,7 +543,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo else { sb.Append("%26%23"); - sb.Append(c); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); sb.Append("%3B"); } @@ -571,7 +571,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo break; default: sb.Append("%26%23"); - sb.Append(c); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); sb.Append("%3B"); break; } @@ -881,31 +881,52 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo /// The decoded string, or null if the input is null. public static string? Decode(string? str, Encoding? encoding = null) { - encoding ??= Encoding.UTF8; - var strWithoutPlus = str?.Replace('+', ' '); + { + encoding ??= Encoding.UTF8; - if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) - try - { - return MyRegex() - .Replace(strWithoutPlus ?? string.Empty, + if (str is null) + return null; + + // Avoid allocating when there is no '+' to translate +#if NETSTANDARD2_0 + var hasPlus = str.IndexOf('+') >= 0; +#else + var hasPlus = str.Contains('+'); +#endif + var strWithoutPlus = hasPlus ? str.Replace('+', ' ') : str; + + // Fast path: if there is no percent, nothing to decode +#if NETSTANDARD2_0 + if (strWithoutPlus.IndexOf('%') == -1) + return strWithoutPlus; +#else + if (!strWithoutPlus.Contains('%')) + return strWithoutPlus; +#endif + + if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) + try + { + return MyRegex() #pragma warning disable CS0618 - match => Unescape(match.Value) + .Replace(strWithoutPlus, + match => Unescape(match.Value) #pragma warning restore CS0618 - ); + ); + } + catch + { + return strWithoutPlus; + } + + try + { + return HttpUtility.UrlDecode(strWithoutPlus, encoding); } catch { return strWithoutPlus; } - - try - { - return strWithoutPlus != null ? HttpUtility.UrlDecode(strWithoutPlus, encoding) : null; - } - catch - { - return strWithoutPlus; } } From 18ebe0ca17faa8daa25e2044763118c54f9cd887 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 07:14:45 +0100 Subject: [PATCH 32/37] Revert ":zap: enhance encoding logic by ensuring culture-invariant string conversion for percent-encoded numeric entities" This reverts commit 4a71214fdba86a0415cfc6fc8c37631a770c55b9. --- QsNet/Internal/Utils.cs | 63 ++++++++++++++--------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index ccd6bae..ca34ab2 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -462,7 +462,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo { // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B sb.Append("%26%23"); - sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append(c); sb.Append("%3B"); } @@ -491,7 +491,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo break; default: sb.Append("%26%23"); - sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append(c); sb.Append("%3B"); break; } @@ -543,7 +543,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo else { sb.Append("%26%23"); - sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append(c); sb.Append("%3B"); } @@ -571,7 +571,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo break; default: sb.Append("%26%23"); - sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append(c); sb.Append("%3B"); break; } @@ -881,52 +881,31 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo /// The decoded string, or null if the input is null. public static string? Decode(string? str, Encoding? encoding = null) { - { - encoding ??= Encoding.UTF8; - - if (str is null) - return null; - - // Avoid allocating when there is no '+' to translate -#if NETSTANDARD2_0 - var hasPlus = str.IndexOf('+') >= 0; -#else - var hasPlus = str.Contains('+'); -#endif - var strWithoutPlus = hasPlus ? str.Replace('+', ' ') : str; - - // Fast path: if there is no percent, nothing to decode -#if NETSTANDARD2_0 - if (strWithoutPlus.IndexOf('%') == -1) - return strWithoutPlus; -#else - if (!strWithoutPlus.Contains('%')) - return strWithoutPlus; -#endif - - if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) - try - { - return MyRegex() -#pragma warning disable CS0618 - .Replace(strWithoutPlus, - match => Unescape(match.Value) -#pragma warning restore CS0618 - ); - } - catch - { - return strWithoutPlus; - } + encoding ??= Encoding.UTF8; + var strWithoutPlus = str?.Replace('+', ' '); + if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) try { - return HttpUtility.UrlDecode(strWithoutPlus, encoding); + return MyRegex() + .Replace(strWithoutPlus ?? string.Empty, +#pragma warning disable CS0618 + match => Unescape(match.Value) +#pragma warning restore CS0618 + ); } catch { return strWithoutPlus; } + + try + { + return strWithoutPlus != null ? HttpUtility.UrlDecode(strWithoutPlus, encoding) : null; + } + catch + { + return strWithoutPlus; } } From 58a3afeab072cddcb6aed9f62f2115599f3d9073 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 08:02:52 +0100 Subject: [PATCH 33/37] :bulb: enhance URL encoding and decoding documentation to clarify behavior for UTF-8 and Latin-1 modes --- QsNet/Internal/Utils.cs | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index ca34ab2..9c4d90f 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -385,16 +385,26 @@ private static bool[] CreateAsciiTable(string chars) /// Encodes a value into a URL-encoded string. /// /// The value to encode. - /// The character encoding to use for encoding. Defaults to UTF-8. + /// + /// The character encoding to use for encoding. Defaults to UTF-8. If set to ISO‑8859‑1 (Latin‑1), + /// legacy rules apply (see remarks). + /// /// The encoding format to use. Defaults to RFC 3986. /// The encoded string. + /// + /// UTF‑8 mode uses precomputed ASCII lookups and a two‑strategy loop (copy runs of safe ASCII or escape‑heavy). + /// Latin‑1 mode preserves legacy behavior: '+' is considered safe; '~' is not. + /// Characters beyond 0xFF are emitted as percent‑encoded numeric entities (e.g., %26%23{code}%3B), + /// which decode back to &#{code};. Use after decoding + /// if you need those entities resolved to Unicode. + /// public static string Encode(object? value, Encoding? encoding = null, Format? format = null) { encoding ??= Encoding.UTF8; format ??= Format.Rfc3986; var fmt = format.GetValueOrDefault(); - // These cannot be encoded + // Non-scalar inputs (maps/sequences/Undefined) are not encoded by design: return empty. if (value is IEnumerable and not string and not byte[] or IDictionary or Undefined) return string.Empty; @@ -410,7 +420,13 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo var s = str!; var len = s.Length; - // Latin-1 (ISO-8859-1) path with fast skip when no %u-escapes are present + // Latin-1 (ISO-8859-1) path with an ASCII fast-path. + // Legacy rules in this mode: + // - '+' is treated as safe (never percent-encoded). + // - '~' is NOT safe. + // - Code points > 0xFF are emitted as percent-encoded numeric entities ("%26%23{code}%3B"), + // which decode back to "&#{code};". Call InterpretNumericEntities(...) afterwards + // if you need those resolved to Unicode characters. if (encoding.CodePage == 28591 || string.Equals(encoding.WebName, "iso-8859-1", StringComparison.OrdinalIgnoreCase)) { @@ -874,11 +890,17 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo } /// - /// Decodes a URL-encoded string into its original form. + /// Decodes a URL-encoded string. /// /// The URL-encoded string to decode. /// The character encoding to use for decoding. Defaults to UTF-8. /// The decoded string, or null if the input is null. + /// + /// In UTF‑8 mode this delegates to . + /// In Latin‑1 mode it decodes %XX byte escapes and leaves characters beyond 0xFF as numeric entities + /// (e.g., &#12345;) if they were produced by . Call + /// to convert those entities to Unicode code points if desired. + /// public static string? Decode(string? str, Encoding? encoding = null) { encoding ??= Encoding.UTF8; @@ -1122,7 +1144,10 @@ void AddOne(object? x) /// Checks if a value is a non-nullish primitive type. /// /// The value to check. - /// If true, empty strings and URIs are not considered non-nullish. + /// + /// If true, empty strings and values with an empty textual form are treated as + /// nullish. + /// /// True if the value is a non-nullish primitive, false otherwise. public static bool IsNonNullishPrimitive(object? value, bool skipNulls = false) { From 9aaf53eca2b75bb9951d9255b0e6b549834f5bd3 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 08:05:40 +0100 Subject: [PATCH 34/37] :zap: simplify encoding check for ISO-8859-1 by removing redundant WebName comparison --- QsNet/Internal/Utils.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 9c4d90f..6bf2171 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -427,8 +427,7 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo // - Code points > 0xFF are emitted as percent-encoded numeric entities ("%26%23{code}%3B"), // which decode back to "&#{code};". Call InterpretNumericEntities(...) afterwards // if you need those resolved to Unicode characters. - if (encoding.CodePage == 28591 || - string.Equals(encoding.WebName, "iso-8859-1", StringComparison.OrdinalIgnoreCase)) + if (encoding.CodePage == 28591) { var table = HexTable.Table; From e23be332c2261d02d60efbc2d8ebc364764ebf36 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 08:16:49 +0100 Subject: [PATCH 35/37] :bulb: enhance documentation for Encoder class to clarify encoding behavior and performance considerations --- QsNet/Internal/Encoder.cs | 109 +++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 25 deletions(-) diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index c222e71..8b60f97 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -13,10 +13,39 @@ namespace QsNet.Internal; /// /// A helper class for encoding data into a query string format. /// +/// +/// +/// Performance notes: This type sits on hot paths. It relies on Utils.Encode for percent-encoding. +/// The UTF-8 encoder path uses precomputed ASCII lookup tables for RFC 3986/1738 unreserved sets to fast-scan +/// ASCII and avoid per-char predicate cost. Latin-1 branches are intentionally left unchanged to preserve legacy +/// behavior and measurements. +/// +/// +/// Semantics: RFC3986 by default; RFC1738 only maps space to '+' (other bytes identical). When list +/// format is comma, the separator comma between elements is written literally and never re-encoded; commas +/// originating inside element values are encoded as "%2C". When allowDots and encodeDotInKeys are +/// both true, '.' in keys is encoded as "%2E" to avoid ambiguity. +/// +/// +/// Safety: The implementation avoids unsafe code. If an unsafe micro-optimization is +/// considered in the future, only add it when dedicated benchmarks show a real win and all unit/compat tests pass. +/// Encoding semantics must remain identical. +/// +/// Thread-safety: Stateless; safe to use concurrently. +/// +/// Benchmarks: See UtilsEncodeBenchmarks. Any change here or in Utils.Encode should be +/// validated against the UTF-8 and Latin-1 datasets (ascii-safe, latin1-fallback, reserved-heavy, utf8-mixed) to +/// prevent regressions. +/// +/// internal static class Encoder { private static readonly Formatter IdentityFormatter = s => s; + /// + /// Converts to a culture-invariant string. + /// Booleans become "true"/"false"; numeric types use InvariantCulture; null becomes an empty string. + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] private static string ToInvariantString(object? value) { @@ -40,8 +69,11 @@ private static string ToInvariantString(object? value) }; } - // Encode a single value for the comma-values-only fast path, without re-encoding the comma separators. - // RFC3986 by default; RFC1738 maps space to '+'. Commas inside values are percent-encoded as %2C. + // Encodes a single element for the comma-join fast path. + // - Uses the provided encoder (or Utils.Encode) according to `format` and `cs`. + // - The comma separator between elements is appended by the caller and is never re‑encoded. + // - Any commas that originate *inside* a value are percent-encoded as "%2C" to preserve round‑trip semantics. + // - RFC3986 is the default; RFC1738 only changes space handling (space => '+'). [MethodImpl(MethodImplOptions.AggressiveInlining)] private static void AppendCommaEncodedValue( StringBuilder sb, @@ -72,29 +104,53 @@ private static bool IsLeaf(object? v, bool skipNulls) } /// - /// Encodes the given data into a query string format. + /// Encodes into query-string fragments. + /// Returns either a single "key=value" fragment (as a string), a sequence of fragments (as an IEnumerable boxed as + /// object), + /// or an empty array when nothing should be emitted. Callers are expected to flatten and join with '&'. /// - /// The data to encode; can be any type. - /// If true, will not encode undefined values. - /// A dictionary for tracking cyclic references. - /// An optional prefix for the encoded string. - /// A generator for array prefixes. - /// If true, uses comma for array encoding. - /// If true, allows empty lists in the output. - /// If true, handles nulls strictly. - /// If true, skips null values in the output. - /// If true, encodes dots in keys. - /// An optional custom encoder function. - /// An optional date serializer function. - /// An optional sorter for keys. - /// An optional filter to apply to the data. - /// If true, allows dots in keys. - /// The format to use for encoding (default is RFC3986). - /// A custom formatter function. - /// If true, only encodes values without keys. - /// The character encoding to use (default is UTF-8). - /// If true, adds a '?' prefix to the output. - /// The encoded result. + /// The value to encode; may be any object, dictionary, list/array, or primitive. + /// If true, treats the current value as logically undefined (missing) and emits nothing. + /// + /// Cycle-detection frame used across recursion; pass the current frame to detect + /// self-references. + /// + /// Optional prefix for the current key path (e.g., an existing query or parent key). + /// Function that produces the key for array elements (indices, brackets, or comma mode). + /// + /// When using the comma list format, if true, appends "[]" to the key for single-element + /// arrays to preserve round‑trip parsing. + /// + /// If true, encodes empty lists as "key[]"; otherwise, empty lists produce no output. + /// If true, encodes null as the bare key (e.g., "k"); otherwise encodes as "k=". + /// If true, omits pairs whose value is null; also enables a leaf fast-path for cycle detection. + /// + /// If true and is true, encodes '.' in keys as "%2E" + /// to avoid ambiguity. + /// + /// Optional custom value encoder; when null, falls back to Utils.Encode. + /// + /// Optional serializer for values (ISO 8601 by default); applied to + /// comma arrays as well. + /// + /// Optional key sort comparer; when null, a faster unsorted path is used. + /// + /// Optional filter. If a FunctionFilter, it's applied to the current object/value; if an + /// IterableFilter, its iterable provides the key set. + /// + /// + /// If true, uses dotted notation for object navigation (e.g., "a.b"); otherwise uses bracket + /// notation (e.g., "a[b]"). + /// + /// Target escaping rules (RFC3986 by default; RFC1738 maps spaces to '+'). + /// Post-processing applied to each emitted string fragment; default is identity. + /// If true, values are encoded but keys are not passed to . + /// Character encoding for the encoder (UTF-8 by default). + /// If true, prepends '?' to the very first fragment (useful for top-level calls). + /// + /// A string fragment, a sequence of fragments, or an empty array when no output is produced. The caller is responsible + /// for joining with '&'. + /// public static object Encode( object? data, bool undefined, @@ -128,6 +184,7 @@ public static object Encode( var keyPrefixStr = prefix ?? (addQueryPrefix ? "?" : ""); var obj = data; + // Only encode '.' when both AllowDots and EncodeDotInKeys are true (preserves legacy behavior when AllowDots == false). var dotsAndEncode = allowDots && encodeDotInKeys; var objKey = data; // identity key @@ -381,7 +438,9 @@ void AddKv(object? keyObj, object? val) var sbJoined = new StringBuilder(listC.Count * 8); for (var i = 0; i < listC.Count; i++) { - if (i > 0) sbJoined.Append(','); // separator comma is never re-encoded + if (i > 0) + sbJoined.Append( + ','); // The separator comma is literal and never re-encoded; only commas originating inside element values become "%2C". AppendCommaEncodedValue(sbJoined, listC[i], cs, format, encoder); } From 639cb80b34983b8cba4d84eacc4881753de5b429 Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 09:07:43 +0100 Subject: [PATCH 36/37] :zap: optimize percent-encoding logic by refining case handling for space and Latin-1 characters --- QsNet/Internal/Utils.cs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index 6bf2171..e997870 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -469,16 +469,20 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (idx > lastSafe) sb.Append(s, lastSafe, idx - lastSafe); - if (c <= 0xFF) - { - sb.Append(table[c]); // %XX for Latin-1 bytes - } - else + switch (c) { - // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B - sb.Append("%26%23"); - sb.Append(c); - sb.Append("%3B"); + case 0x20: + sb.Append('+'); // RFC1738 space + break; + case <= 0xFF: + sb.Append(table[c]); // %XX for Latin-1 bytes + break; + default: + // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B + sb.Append("%26%23"); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append("%3B"); + break; } lastSafe = idx + 1; @@ -697,6 +701,11 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo int c = s[j]; if ((uint)c < 0x80) { + if (c == 0x20) + { + sb.Append('+'); // RFC1738 space + continue; + } if (asciiUnreserved[c]) { sb.Append((char)c); From d9f58bacc547253f04acff84f1656f95389dd11e Mon Sep 17 00:00:00 2001 From: Klemen Tusar Date: Tue, 2 Sep 2025 09:08:30 +0100 Subject: [PATCH 37/37] :zap: update UTF-8 encoding logic to handle spaces as '+' for RFC1738 compliance --- QsNet/Internal/Utils.cs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index e997870..aabdeab 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -644,7 +644,11 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo sb.Append(s, lastSafe, idx - lastSafe); // fast UTF-8 encode, surrogate-aware - if ((uint)c < 0x80) + if (c == 0x20) + { + sb.Append('+'); // RFC1738 space + } + else if ((uint)c < 0x80) { sb.Append(table[c]); }