diff --git a/QsNet/Internal/Encoder.cs b/QsNet/Internal/Encoder.cs index dd08133..8b60f97 100644 --- a/QsNet/Internal/Encoder.cs +++ b/QsNet/Internal/Encoder.cs @@ -1,7 +1,9 @@ using System; using System.Collections; using System.Collections.Generic; +using System.Globalization; using System.Linq; +using System.Runtime.CompilerServices; using System.Text; using QsNet.Enums; using QsNet.Models; @@ -11,34 +13,144 @@ namespace QsNet.Internal; /// /// A helper class for encoding data into a query string format. /// +/// +/// +/// Performance notes: This type sits on hot paths. It relies on Utils.Encode for percent-encoding. +/// The UTF-8 encoder path uses precomputed ASCII lookup tables for RFC 3986/1738 unreserved sets to fast-scan +/// ASCII and avoid per-char predicate cost. Latin-1 branches are intentionally left unchanged to preserve legacy +/// behavior and measurements. +/// +/// +/// Semantics: RFC3986 by default; RFC1738 only maps space to '+' (other bytes identical). When list +/// format is comma, the separator comma between elements is written literally and never re-encoded; commas +/// originating inside element values are encoded as "%2C". When allowDots and encodeDotInKeys are +/// both true, '.' in keys is encoded as "%2E" to avoid ambiguity. +/// +/// +/// Safety: The implementation avoids unsafe code. If an unsafe micro-optimization is +/// considered in the future, only add it when dedicated benchmarks show a real win and all unit/compat tests pass. +/// Encoding semantics must remain identical. +/// +/// Thread-safety: Stateless; safe to use concurrently. +/// +/// Benchmarks: See UtilsEncodeBenchmarks. Any change here or in Utils.Encode should be +/// validated against the UTF-8 and Latin-1 datasets (ascii-safe, latin1-fallback, reserved-heavy, utf8-mixed) to +/// prevent regressions. +/// +/// internal static class Encoder { private static readonly Formatter IdentityFormatter = s => s; /// - /// Encodes the given data into a query string format. 
/// <summary>
///     Converts <paramref name="value" /> to a culture-invariant string.
///     Booleans become "true"/"false"; every <see cref="IFormattable" /> value (all built-in numeric types, and
///     also types such as <see cref="DateTime" />) is formatted with <see cref="CultureInfo.InvariantCulture" />;
///     null becomes an empty string.
/// </summary>
/// <param name="value">The value to stringify; may be null.</param>
/// <returns>The invariant string form of <paramref name="value" />; never null.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static string ToInvariantString(object? value)
{
    return value switch
    {
        null => string.Empty,
        // Most common input: hand strings back untouched before any other type test.
        string s => s,
        bool b => b ? "true" : "false",
        char ch => ch.ToString(),
        // Covers sbyte/byte/short/ushort/int/uint/long/ulong/float/double/decimal exactly as before, and also
        // closes the gap where other formattable types fell through to the culture-sensitive ToString().
        IFormattable formattable => formattable.ToString(null, CultureInfo.InvariantCulture) ?? string.Empty,
        _ => value.ToString() ?? string.Empty
    };
}

/// <summary>
///     Encodes a single element for the comma-join fast path and appends the result to <paramref name="sb" />.
/// </summary>
/// <remarks>
///     - Uses the provided <paramref name="encoder" /> (or Utils.Encode when it is null) according to
///     <paramref name="format" /> and <paramref name="cs" />.
///     - The comma separator between elements is appended by the caller and is never re-encoded.
///     - Any commas that originate *inside* a value are percent-encoded as "%2C" to preserve round-trip semantics.
///     - RFC3986 is the default; RFC1738 only changes space handling (space => '+').
/// </remarks>
/// <param name="sb">Destination builder; receives the encoded element.</param>
/// <param name="value">The element to encode.</param>
/// <param name="cs">Character encoding handed to the encoder.</param>
/// <param name="format">Escaping format handed to the encoder.</param>
/// <param name="encoder">Optional custom encoder; when null, Utils.Encode is used.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void AppendCommaEncodedValue(
    StringBuilder sb,
    object? value,
    Encoding cs,
    Format format,
    ValueEncoder? encoder
)
{
    var encoded = encoder != null ? encoder(value, cs, format) : Utils.Encode(value, cs, format);

    // A literal comma can only survive a custom encoder; escape it so it is not mistaken for a separator.
#if NETSTANDARD2_0
    if (encoded.IndexOf(',') >= 0)
        encoded = encoded.Replace(",", "%2C"); // commas inside values must be encoded
#else
    if (encoded.Contains(',', StringComparison.Ordinal))
        encoded = encoded.Replace(",", "%2C", StringComparison.Ordinal);
#endif

    sb.Append(encoded);
}

/// <summary>
///     Tells whether <paramref name="v" /> is a leaf: a string, a byte array, or a value accepted by
///     Utils.IsNonNullishPrimitive. A null value counts as a leaf only when <paramref name="skipNulls" /> is true.
/// </summary>
/// <param name="v">The value to classify.</param>
/// <param name="skipNulls">Forwarded to Utils.IsNonNullishPrimitive; also the result returned for null.</param>
/// <returns>True when the value is treated as a leaf.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsLeaf(object? v, bool skipNulls)
{
    if (v is null) return skipNulls;
    return v is string || v is byte[] || Utils.IsNonNullishPrimitive(v, skipNulls);
}

/// <summary>
///     Encodes <paramref name="data" /> into query-string fragments.
///     Returns either a single "key=value" fragment (as a string), a sequence of fragments (as an IEnumerable boxed as
///     object),
///     or an empty array when nothing should be emitted. Callers are expected to flatten and join with '&amp;'.
/// </summary>
/// <param name="data">The value to encode; may be any object, dictionary, list/array, or primitive.</param>
/// <param name="undefined">If true, treats the current value as logically undefined (missing) and emits nothing.</param>
/// <param name="sideChannel">
///     Cycle-detection frame used across recursion; pass the current frame to detect
///     self-references.
/// </param>
/// <param name="prefix">Optional prefix for the current key path (e.g., an existing query or parent key).</param>
+ /// Function that produces the key for array elements (indices, brackets, or comma mode). + /// + /// When using the comma list format, if true, appends "[]" to the key for single-element + /// arrays to preserve round‑trip parsing. + /// + /// If true, encodes empty lists as "key[]"; otherwise, empty lists produce no output. + /// If true, encodes null as the bare key (e.g., "k"); otherwise encodes as "k=". + /// If true, omits pairs whose value is null; also enables a leaf fast-path for cycle detection. + /// + /// If true and is true, encodes '.' in keys as "%2E" + /// to avoid ambiguity. + /// + /// Optional custom value encoder; when null, falls back to Utils.Encode. + /// + /// Optional serializer for values (ISO 8601 by default); applied to + /// comma arrays as well. + /// + /// Optional key sort comparer; when null, a faster unsorted path is used. + /// + /// Optional filter. If a FunctionFilter, it's applied to the current object/value; if an + /// IterableFilter, its iterable provides the key set. + /// + /// + /// If true, uses dotted notation for object navigation (e.g., "a.b"); otherwise uses bracket + /// notation (e.g., "a[b]"). + /// + /// Target escaping rules (RFC3986 by default; RFC1738 maps spaces to '+'). + /// Post-processing applied to each emitted string fragment; default is identity. + /// If true, values are encoded but keys are not passed to . + /// Character encoding for the encoder (UTF-8 by default). + /// If true, prepends '?' to the very first fragment (useful for top-level calls). + /// + /// A string fragment, a sequence of fragments, or an empty array when no output is produced. The caller is responsible + /// for joining with '&'. + /// public static object Encode( object? data, bool undefined, @@ -66,33 +178,39 @@ public static object Encode( var cs = charset ?? Encoding.UTF8; var gen = generateArrayPrefix ?? 
ListFormat.Indices.GetGenerator(); - var isCommaGen = ReferenceEquals(gen, ListFormat.Comma.GetGenerator()); + var commaGen = ListFormat.Comma.GetGenerator(); + var isCommaGen = gen == commaGen; var crt = commaRoundTrip ?? isCommaGen; var keyPrefixStr = prefix ?? (addQueryPrefix ? "?" : ""); var obj = data; + // Only encode '.' when both AllowDots and EncodeDotInKeys are true (preserves legacy behavior when AllowDots == false). + var dotsAndEncode = allowDots && encodeDotInKeys; var objKey = data; // identity key var tmpSc = sideChannel; var step = 0; var found = false; - while (!found) - { - tmpSc = tmpSc.Parent; - if (tmpSc is null) - break; - step++; - if (objKey is not null && tmpSc.TryGet(objKey, out var pos)) + // Fast path (#3): skip cycle detection when the current value is a leaf. + // Leaves never recurse, so they can’t participate in cycles. + if (!IsLeaf(data, skipNulls)) + while (!found) { - if (pos == step) - throw new InvalidOperationException("Cyclic object value"); - found = true; - } + tmpSc = tmpSc.Parent; + if (tmpSc is null) + break; + step++; + if (objKey is not null && tmpSc.TryGet(objKey, out var pos)) + { + if (pos == step) + throw new InvalidOperationException("Cyclic object value"); + found = true; + } - if (tmpSc.Parent is null) - step = 0; - } + if (tmpSc.Parent is null) + step = 0; + } if (filter is FunctionFilter ff) obj = ff.Function(keyPrefixStr, obj); @@ -127,22 +245,17 @@ public static object Encode( { if (encoder == null) { - var s = obj switch - { - bool b => b ? "true" : "false", - _ => obj?.ToString() ?? "" - }; + var s = ToInvariantString(obj); return $"{fmt(keyPrefixStr)}={fmt(s)}"; } - var keyPart = encodeValuesOnly ? keyPrefixStr : encoder(keyPrefixStr, null, null); - var valuePart = encoder(obj, null, null); + var keyPart = encodeValuesOnly ? 
keyPrefixStr : encoder(keyPrefixStr, cs, format); + var valuePart = encoder(obj, cs, format); return $"{fmt(keyPart)}={fmt(valuePart)}"; } - var values = new List(); if (undefined) - return values; + return Array.Empty(); // Detect sequence once and cache materialization for index access / counts var isSeq = false; @@ -150,19 +263,252 @@ public static object Encode( if (obj is IEnumerable seq0 and not string and not IDictionary) { isSeq = true; - seqList = seq0.Cast().ToList(); + if (obj is List already) + seqList = already; + else + seqList = seq0.Cast().ToList(); + } + + // Fast path (#1): when no sorting is requested, avoid building objKeys and + // iterate the structure directly to eliminate extra allocations and lookups. + if (sort == null && !(isCommaGen && obj is IEnumerable and not string and not IDictionary) && + filter is not IterableFilter) + { +#if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefixFast = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif + var adjustedPrefixFast = + crt && isSeq && seqList is { Count: 1 } + ? $"{encodedPrefixFast}[]" + : encodedPrefixFast; + + if (allowEmptyLists && isSeq && seqList is { Count: 0 }) + return $"{adjustedPrefixFast}[]"; + + // Fast path (#5): mark side-channel once per parent instead of per child + var markSideChannelFast = objKey is not null && (obj is IDictionary || isSeq); + if (markSideChannelFast) + sideChannel.Set(objKey!, step); + + List valuesFast; + + void AddKv(object? keyObj, object? val) + { + if (skipNulls && val is null) + return; + + var keyStr = keyObj?.ToString() ?? 
string.Empty; + var encodedKey = keyStr; +#if NETSTANDARD2_0 + if (dotsAndEncode && keyStr.IndexOf('.') >= 0) + encodedKey = keyStr.Replace(".", "%2E"); +#else + if (dotsAndEncode && keyStr.Contains('.', StringComparison.Ordinal)) + encodedKey = keyStr.Replace(".", "%2E", StringComparison.Ordinal); +#endif + var keyPrefixFast = + isSeq + ? gen(adjustedPrefixFast, encodedKey) + : allowDots + ? $"{adjustedPrefixFast}.{encodedKey}" + : $"{adjustedPrefixFast}[{encodedKey}]"; + + // Removed per-iteration sideChannel.Set + + var childSc = IsLeaf(val, skipNulls) ? sideChannel : new SideChannelFrame(sideChannel); + + var encoded = Encode( + val, + false, + childSc, + keyPrefixFast, + gen, + crt, + allowEmptyLists, + strictNullHandling, + skipNulls, + encodeDotInKeys, + encoder, + serializeDate, + sort, + filter, + allowDots, + format, + fmt, + encodeValuesOnly, + cs, + addQueryPrefix + ); + + switch (encoded) + { + case List enList: + valuesFast.AddRange(enList); + break; + case IEnumerable en and not string: + { + foreach (var item in en) + valuesFast.Add(item); + break; + } + default: + valuesFast.Add(encoded); + break; + } + } + + switch (obj) + { + case IDictionary dObj: + valuesFast = new List(dObj.Count); + foreach (var kv in dObj) + AddKv(kv.Key, kv.Value); + return valuesFast; + case IDictionary dStr: + valuesFast = new List(dStr.Count); + foreach (var kv in dStr) + AddKv(kv.Key, kv.Value); + return valuesFast; + case IDictionary map: + valuesFast = new List(map.Count); + foreach (DictionaryEntry de in map) + AddKv(de.Key, de.Value); + return valuesFast; + case Array arr: + valuesFast = new List(arr.Length); + for (var i = 0; i < arr.Length; i++) + AddKv(i, arr.GetValue(i)); + return valuesFast; + case IList list: + valuesFast = new List(list.Count); + for (var i = 0; i < list.Count; i++) + AddKv(i, list[i]); + return valuesFast; + default: + if (isSeq && seqList != null) + { + valuesFast = new List(seqList.Count); + for (var i = 0; i < seqList.Count; i++) + 
AddKv(i, seqList[i]); + return valuesFast; + } + + break; + } + // If we fall through (very uncommon), continue with the generic path below. + } + + // Fast path (#2): comma-joined arrays -> build the joined value once and short-circuit the generic path. + if (isCommaGen && obj is IEnumerable enumerableC and not string and not IDictionary && sort == null && + filter is not IterableFilter) + { +#if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefixC = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefixC = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif + // Materialize once for count checks and iteration + var listC = seqList ?? enumerableC.Cast().ToList(); + var adjustedPrefixC = crt && listC.Count == 1 ? $"{encodedPrefixC}[]" : encodedPrefixC; + + // Honor empty list handling semantics + if (allowEmptyLists && listC.Count == 0) + return $"{adjustedPrefixC}[]"; + if (listC.Count == 0) + return Array.Empty(); + + string joinedC; + if (encodeValuesOnly && encoder != null) + { + // Stream-encode each element and append literal commas between them. + var sbJoined = new StringBuilder(listC.Count * 8); + for (var i = 0; i < listC.Count; i++) + { + if (i > 0) + sbJoined.Append( + ','); // The separator comma is literal and never re-encoded; only commas originating inside element values become "%2C". + AppendCommaEncodedValue(sbJoined, listC[i], cs, format, encoder); + } + + joinedC = sbJoined.ToString(); + + // Match legacy semantics: if the joined value is empty, treat it like `null`. 
+ if (!string.IsNullOrEmpty(joinedC)) return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + if (skipNulls) + return Array.Empty(); + + if (strictNullHandling) + return !encodeValuesOnly + ? fmt(encoder(adjustedPrefixC, cs, format)) + : adjustedPrefixC; + // not strict: fall through to return `key=` below + + // In values-only mode we do not encode the key via `encoder`. + return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + } + + // Join raw string representations; apply encoder to the full result if provided. + var tmp = new List(listC.Count); + foreach (var el in listC) + tmp.Add(ToInvariantString(el)); + joinedC = string.Join(",", tmp); + + // Match legacy semantics: if the joined value is empty, treat it like `null`. + if (string.IsNullOrEmpty(joinedC)) + { + if (skipNulls) + return Array.Empty(); + + if (strictNullHandling) + return encoder != null && !encodeValuesOnly + ? fmt(encoder(adjustedPrefixC, cs, format)) + : adjustedPrefixC; + // not strict: fall through to return `key=` below + } + + if (encoder == null) return $"{fmt(adjustedPrefixC)}={fmt(joinedC)}"; + var keyPartC = encoder(adjustedPrefixC, cs, format); + var valuePartC = encoder(joinedC, cs, format); + return $"{fmt(keyPartC)}={fmt(valuePartC)}"; } List objKeys; + var commaElementsAlreadyEncoded = false; if (isCommaGen && obj is IEnumerable enumerable and not string and not IDictionary) { - List strings = []; + List strings; + if (obj is List listObj) + strings = new List(listObj.Count); + else if (enumerable is ICollection { Count: > 0 } coll0) + strings = new List(coll0.Count); + else + strings = []; + if (encodeValuesOnly && encoder != null) + { foreach (var el in enumerable) - strings.Add(el is null ? "" : encoder(el.ToString(), null, null)); + strings.Add(el is null ? "" : encoder(el, cs, format)); + commaElementsAlreadyEncoded = true; + } else + { foreach (var el in enumerable) strings.Add(el?.ToString() ?? 
""); + } if (strings.Count != 0) { @@ -229,9 +575,19 @@ public static object Encode( objKeys.Sort(Comparer.Create(sort)); } - values.Capacity = Math.Max(values.Capacity, objKeys.Count); + var values = new List(objKeys.Count); - var encodedPrefix = encodeDotInKeys ? keyPrefixStr.Replace(".", "%2E") : keyPrefixStr; +#if NETSTANDARD2_0 + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefix = encodeDotInKeys && keyPrefixStr.IndexOf('.') >= 0 + ? keyPrefixStr.Replace(".", "%2E") + : keyPrefixStr; +#else + // Intentionally gate on encodeDotInKeys only to preserve legacy behavior when AllowDots = false + var encodedPrefix = encodeDotInKeys && keyPrefixStr.Contains('.', StringComparison.Ordinal) + ? keyPrefixStr.Replace(".", "%2E", StringComparison.Ordinal) + : keyPrefixStr; +#endif var adjustedPrefix = crt && isSeq && seqList is { Count: 1 } ? $"{encodedPrefix}[]" @@ -240,6 +596,15 @@ public static object Encode( if (allowEmptyLists && isSeq && seqList is { Count: 0 }) return $"{adjustedPrefix}[]"; + // Fast path (#5): mark side-channel once per parent instead of per element + var markSideChannel = objKey is not null && (obj is IDictionary || isSeq); + if (markSideChannel) + sideChannel.Set(objKey!, step); + + // Fast path (#4): hoist child-encoder decision out of the loop. + // For comma-joined arrays in values-only mode, do not re-encode the joined string. + var childEncoderForElements = commaElementsAlreadyEncoded ? null : encoder; + for (var i = 0; i < objKeys.Count; i++) { var key = objKeys[i]; @@ -336,8 +701,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => break; } - case IEnumerable ie - and not string: + case IEnumerable and not string: { var idx = key switch { @@ -345,7 +709,7 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, _ => -1 }; - var list2 = seqList ?? 
ie.Cast().ToList(); + var list2 = seqList!; if ((uint)idx < (uint)list2.Count) { value = list2[idx]; @@ -371,29 +735,23 @@ IConvertible when int.TryParse(key.ToString(), out var parsed) => parsed, var keyStr = key?.ToString() ?? ""; var encodedKey = keyStr; #if NETSTANDARD2_0 - if (allowDots && encodeDotInKeys && keyStr.IndexOf('.') >= 0) + if (dotsAndEncode && keyStr.IndexOf('.') >= 0) encodedKey = keyStr.Replace(".", "%2E"); #else - if (allowDots && encodeDotInKeys && keyStr.Contains('.', StringComparison.Ordinal)) + if (dotsAndEncode && keyStr.Contains('.', StringComparison.Ordinal)) encodedKey = keyStr.Replace(".", "%2E", StringComparison.Ordinal); #endif var keyPrefix = - obj is IEnumerable and not string and not IDictionary + isSeq ? gen(adjustedPrefix, encodedKey) : allowDots ? $"{adjustedPrefix}.{encodedKey}" : $"{adjustedPrefix}[{encodedKey}]"; - if (objKey is not null && obj is IDictionary or IEnumerable and not string) - sideChannel.Set(objKey, step); - - var childSc = new SideChannelFrame(sideChannel); + // Removed per-iteration sideChannel.Set - var childEncoder = - isCommaGen && encodeValuesOnly && obj is IEnumerable and not string - ? null - : encoder; + var childSc = IsLeaf(value, skipNulls) ? 
sideChannel : new SideChannelFrame(sideChannel); var encoded = Encode( value, @@ -406,7 +764,7 @@ obj is IEnumerable and not string and not IDictionary strictNullHandling, skipNulls, encodeDotInKeys, - childEncoder, + childEncoderForElements, serializeDate, sort, filter, @@ -418,7 +776,9 @@ obj is IEnumerable and not string and not IDictionary addQueryPrefix ); - if (encoded is IEnumerable en and not string) + if (encoded is List enList) + values.AddRange(enList); + else if (encoded is IEnumerable en and not string) foreach (var item in en) values.Add(item); else diff --git a/QsNet/Internal/Utils.cs b/QsNet/Internal/Utils.cs index e52d4b6..aabdeab 100644 --- a/QsNet/Internal/Utils.cs +++ b/QsNet/Internal/Utils.cs @@ -22,11 +22,6 @@ internal static class Utils internal static partial class Utils #endif { - /// - /// The maximum length of a segment to encode in a single pass. - /// - private const int SegmentLimit = 1024; - /// /// A regex to match percent-encoded characters in the format %XX. /// @@ -42,21 +37,6 @@ private static Regex MyRegex() private static partial Regex MyRegex(); #endif - /// - /// A regex to match Unicode percent-encoded characters in the format %uXXXX. - /// -#if NETSTANDARD2_0 - private static readonly Regex MyRegex1Instance = new("%u[0-9a-f]{4}", RegexOptions.IgnoreCase); - - private static Regex MyRegex1() - { - return MyRegex1Instance; - } -#else - [GeneratedRegex("%u[0-9a-f]{4}", RegexOptions.IgnoreCase, "en-GB")] - private static partial Regex MyRegex1(); -#endif - /// /// Merges two objects, where the source object overrides the target object. If the source is a /// Dictionary, it will merge its entries into the target. If the source is an IEnumerable, it will append @@ -371,20 +351,60 @@ out var code return sb.ToString(); } + // Precomputed ASCII membership tables for fast checks + // RFC 3986 unreserved: - . 
_ ~ 0-9 A-Z a-z + private static readonly bool[] UnreservedTable3986 = + CreateAsciiTable("-._~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // RFC 1738 extends RFC 3986 with '(' and ')' + private static readonly bool[] UnreservedTable1738 = + CreateAsciiTable("()-._~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // Legacy Latin-1 safe sets: + // - '+' is safe (NOT encoded) + // - '~' is NOT safe (WILL be encoded) + // RFC3986 (no parentheses) + private static readonly bool[] Latin1SafeTable3986 = + CreateAsciiTable("+-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + // RFC1738 adds '(' and ')' + private static readonly bool[] Latin1SafeTable1738 = + CreateAsciiTable("()+-._0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + private static bool[] CreateAsciiTable(string chars) + { + var t = new bool[128]; + foreach (var ch in chars) + t[ch] = true; + return t; + } + + private const string Utf8ReplacementPercent = "%EF%BF%BD"; // percent-encoded UTF-8 for U+FFFD + /// /// Encodes a value into a URL-encoded string. /// /// The value to encode. - /// The character encoding to use for encoding. Defaults to UTF-8. + /// + /// The character encoding to use for encoding. Defaults to UTF-8. If set to ISO‑8859‑1 (Latin‑1), + /// legacy rules apply (see remarks). + /// /// The encoding format to use. Defaults to RFC 3986. /// The encoded string. + /// + /// UTF‑8 mode uses precomputed ASCII lookups and a two‑strategy loop (copy runs of safe ASCII or escape‑heavy). + /// Latin‑1 mode preserves legacy behavior: '+' is considered safe; '~' is not. + /// Characters beyond 0xFF are emitted as percent‑encoded numeric entities (e.g., %26%23{code}%3B), + /// which decode back to &#{code};. Use after decoding + /// if you need those entities resolved to Unicode. + /// public static string Encode(object? value, Encoding? encoding = null, Format? 
format = null) { encoding ??= Encoding.UTF8; format ??= Format.Rfc3986; var fmt = format.GetValueOrDefault(); - // These cannot be encoded + // Non-scalar inputs (maps/sequences/Undefined) are not encoded by design: return empty. if (value is IEnumerable and not string and not byte[] or IDictionary or Undefined) return string.Empty; @@ -397,122 +417,508 @@ public static string Encode(object? value, Encoding? encoding = null, Format? fo if (string.IsNullOrEmpty(str)) return string.Empty; - var nonNullStr = str!; - - if (Equals(encoding, Encoding.GetEncoding("ISO-8859-1"))) + var s = str!; + var len = s.Length; + + // Latin-1 (ISO-8859-1) path with an ASCII fast-path. + // Legacy rules in this mode: + // - '+' is treated as safe (never percent-encoded). + // - '~' is NOT safe. + // - Code points > 0xFF are emitted as percent-encoded numeric entities ("%26%23{code}%3B"), + // which decode back to "&#{code};". Call InterpretNumericEntities(...) afterwards + // if you need those resolved to Unicode characters. + if (encoding.CodePage == 28591) { -#pragma warning disable CS0618 // Type or member is obsolete - return MyRegex1() - .Replace( - Escape(str!, fmt), - match => + var table = HexTable.Table; + + if (fmt == Format.Rfc1738) + { + // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) + // Scan to first unsafe ASCII (anything non-ASCII is unsafe for this pass) + var asciiSafe = Latin1SafeTable1738; + var i = 0; + while (i < len && s[i] <= 0x7F && asciiSafe[s[i]]) i++; + if (i == len) + return s; // all safe ASCII + + // Sample to decide escape density + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !asciiSafe[ch]) + unsafeCount++; + } + + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? 
int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) { -#if NETSTANDARD2_0 - var code = int.Parse(match.Value.Substring(2), NumberStyles.HexNumber, - CultureInfo.InvariantCulture); -#else - var code = int.Parse(match.Value[2..], NumberStyles.HexNumber, CultureInfo.InvariantCulture); -#endif - return $"%26%23{code}%3B"; + int c = s[idx]; + var safeAscii = c <= 0x7F && asciiSafe[c]; + if (safeAscii) + continue; + + // flush preceding safe run + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + switch (c) + { + case 0x20: + sb.Append('+'); // RFC1738 space + break; + case <= 0xFF: + sb.Append(table[c]); // %XX for Latin-1 bytes + break; + default: + // For non-Latin1 code units, emit percent-encoded numeric entity: %26%23{code}%3B + sb.Append("%26%23"); + sb.Append(c.ToString(CultureInfo.InvariantCulture)); + sb.Append("%3B"); + break; + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + // Escape-heavy mode: no run bookkeeping + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when asciiSafe[c]: + sb.Append((char)c); + continue; + case <= 0xFF: + sb.Append(table[c]); + break; + default: + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + break; + } } - ); -#pragma warning restore CS0618 // Type or member is obsolete + } + + return sb.ToString(); + } + else + { + // Legacy behavior: in Latin-1 mode, treat '+' as safe (do not percent-encode) + // RFC3986 path (no parentheses allowed) + var asciiSafe = Latin1SafeTable3986; + var i = 0; + while (i < len && s[i] <= 0x7F && asciiSafe[s[i]]) i++; + if (i == len) + return s; // all safe ASCII + + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !asciiSafe[ch]) + unsafeCount++; + } + 
+ var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) + { + int c = s[idx]; + var safeAscii = c <= 0x7F && asciiSafe[c]; + if (safeAscii) + continue; + + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + if (c <= 0xFF) + { + sb.Append(table[c]); + } + else + { + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + + switch (c) + { + case <= 0x7F when asciiSafe[c]: + sb.Append((char)c); + continue; + case <= 0xFF: + sb.Append(table[c]); + break; + default: + sb.Append("%26%23"); + sb.Append(c); + sb.Append("%3B"); + break; + } + } + } + + return sb.ToString(); + } } - var buffer = new StringBuilder(); - var j = 0; + // UTF-8 path with two strategies: + // 1) run-copy mode for mixed/mostly-safe inputs (lazy flush of safe runs) + // 2) escape-heavy mode for mostly-unsafe inputs (big prealloc, simpler loop) - while (j < nonNullStr.Length) + if (fmt == Format.Rfc1738) { - // Take up to SegmentLimit characters, but never split a surrogate pair across the boundary. - var remaining = nonNullStr.Length - j; - var segmentLen = remaining >= SegmentLimit ? 
SegmentLimit : remaining; + // Scan to first unsafe ASCII (anything non-ASCII is unsafe-by-definition for this pass) + var asciiUnreserved = UnreservedTable1738; + var i = 0; + while (i < len && s[i] <= 0x7F && asciiUnreserved[s[i]]) i++; + if (i == len) + return s; // all safe ASCII + + // Sample up to 64 chars after first unsafe to decide whether it's escape-heavy + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) + { + var ch = s[k]; + if (ch > 0x7F || !asciiUnreserved[ch]) + unsafeCount++; + } - // If the last char of this segment is a high surrogate and the next char exists and is a low surrogate, - // shrink the segment by one so the pair is encoded together in the next iteration. - if ( - segmentLen < remaining && - char.IsHighSurrogate(nonNullStr[j + segmentLen - 1]) && - char.IsLowSurrogate(nonNullStr[j + segmentLen]) - ) - segmentLen--; // keep the high surrogate with its low surrogate in the next chunk + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? 
int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + var table = HexTable.Table; - var segment = nonNullStr.Substring(j, segmentLen); + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) + { + int c = s[idx]; + var safeAscii = c <= 0x7F && asciiUnreserved[c]; + if (safeAscii) + continue; + + // flush preceding safe run + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + // fast UTF-8 encode, surrogate-aware + if (c == 0x20) + { + sb.Append('+'); // RFC1738 space + } + else if ((uint)c < 0x80) + { + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + // Surrogates range + if ((uint)(c - 0xD800) <= 0x03FF && idx + 1 < len) + { + int d = s[idx + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired high surrogate + } + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired low surrogate + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + + lastSafe = idx + 1; + } + + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + // Escape-heavy mode: no run bookkeeping, big prealloc + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) + { + int c = s[j]; + if ((uint)c < 0x80) + { + if (c == 0x20) + { + sb.Append('+'); // RFC1738 space + continue; + } + if (asciiUnreserved[c]) + { + sb.Append((char)c); + continue; + } + + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 
6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + if ((uint)(c - 0xD800) <= 0x03FF && j + 1 < len) + { + int d = s[j + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + } + } + + return sb.ToString(); + } + else + { + // RFC3986 path (no parentheses allowed) + var asciiUnreserved = UnreservedTable3986; var i = 0; - while (i < segment.Length) + while (i < len && s[i] <= 0x7F && asciiUnreserved[s[i]]) i++; + if (i == len) + return s; + + var sampleEnd = Math.Min(len, i + 64); + var unsafeCount = 0; + for (var k = i; k < sampleEnd; k++) { - var c = (int)segment[i]; + var ch = s[k]; + if (ch > 0x7F || !asciiUnreserved[ch]) + unsafeCount++; + } - switch (c) + var escapeHeavy = unsafeCount * 4 >= (sampleEnd - i) * 3; // ≥75% unsafe + var cap = escapeHeavy ? len >= int.MaxValue / 3 ? 
int.MaxValue : len * 3 : len + 16; + var sb = new StringBuilder(cap); + var table = HexTable.Table; + + if (!escapeHeavy) + { + var lastSafe = 0; + for (var idx = 0; idx < len; idx++) { - case 0x2D or 0x2E or 0x5F or 0x7E: - case >= 0x30 and <= 0x39: - case >= 0x41 and <= 0x5A: - case >= 0x61 and <= 0x7A: - case 0x28 or 0x29 when fmt == Format.Rfc1738: - buffer.Append(segment[i]); - i++; - continue; - // ASCII - case < 0x80: - buffer.Append(HexTable.Table[c]); - i++; - continue; - // 2 bytes - case < 0x800: - buffer.Append(HexTable.Table[0xC0 | (c >> 6)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - continue; - case < 0xD800: - // 3 bytes - case >= 0xE000: - buffer.Append(HexTable.Table[0xE0 | (c >> 12)]); - buffer.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; + int c = s[idx]; + var safeAscii = c <= 0x7F && asciiUnreserved[c]; + if (safeAscii) continue; + + if (idx > lastSafe) + sb.Append(s, lastSafe, idx - lastSafe); + + // fast UTF-8 encode, surrogate-aware + if ((uint)c < 0x80) + { + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + // Surrogates range + if ((uint)(c - 0xD800) <= 0x03FF && idx + 1 < len) + { + int d = s[idx + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + idx++; // consume low surrogate + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired high surrogate + } + } + else + { + sb.Append(Utf8ReplacementPercent); // unpaired low surrogate + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 
0x3F)]); + } + + lastSafe = idx + 1; } - // 4 bytes (surrogate pair) – only if valid pair; otherwise treat as 3-byte fallback - if (i + 1 >= segment.Length || !char.IsSurrogatePair(segment[i], segment[i + 1])) + if (lastSafe < len) + sb.Append(s, lastSafe, len - lastSafe); + } + else + { + if (i > 0) sb.Append(s, 0, i); + + for (var j = i; j < len; j++) { - // Fallback: percent-encode the single surrogate code unit to remain lossless - buffer.Append(HexTable.Table[0xE0 | (c >> 12)]); - buffer.Append(HexTable.Table[0x80 | ((c >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (c & 0x3F)]); - i++; - continue; - } + int c = s[j]; + if ((uint)c < 0x80) + { + if (asciiUnreserved[c]) + { + sb.Append((char)c); + continue; + } - var nextC = segment[i + 1]; - var codePoint = char.ConvertToUtf32((char)c, nextC); - buffer.Append(HexTable.Table[0xF0 | (codePoint >> 18)]); - buffer.Append(HexTable.Table[0x80 | ((codePoint >> 12) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | ((codePoint >> 6) & 0x3F)]); - buffer.Append(HexTable.Table[0x80 | (codePoint & 0x3F)]); - i += 2; // Skip the next character as it's part of the surrogate pair + sb.Append(table[c]); + } + else if (c < 0x800) + { + sb.Append(table[0xC0 | (c >> 6)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + else if ((uint)(c - 0xD800) <= 0x07FF) + { + if ((uint)(c - 0xD800) <= 0x03FF && j + 1 < len) + { + int d = s[j + 1]; + if ((uint)(d - 0xDC00) <= 0x03FF) + { + var codePoint = 0x10000 + (((c - 0xD800) << 10) | (d - 0xDC00)); + sb.Append(table[0xF0 | (codePoint >> 18)]); + sb.Append(table[0x80 | ((codePoint >> 12) & 0x3F)]); + sb.Append(table[0x80 | ((codePoint >> 6) & 0x3F)]); + sb.Append(table[0x80 | (codePoint & 0x3F)]); + j++; + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(Utf8ReplacementPercent); + } + } + else + { + sb.Append(table[0xE0 | (c >> 12)]); + sb.Append(table[0x80 | ((c >> 6) & 0x3F)]); + sb.Append(table[0x80 | (c & 0x3F)]); + } + } } - j += segment.Length; // 
advance by the actual processed count + return sb.ToString(); } - - return buffer.ToString(); } /// - /// Decodes a URL-encoded string into its original form. + /// Decodes a URL-encoded string. /// /// The URL-encoded string to decode. /// The character encoding to use for decoding. Defaults to UTF-8. /// The decoded string, or null if the input is null. + /// + /// In UTF‑8 mode this delegates to . + /// In Latin‑1 mode it decodes %XX byte escapes and leaves characters beyond 0xFF as numeric entities + /// (e.g., &#12345;) if they were produced by . Call + /// to convert those entities to Unicode code points if desired. + /// public static string? Decode(string? str, Encoding? encoding = null) { encoding ??= Encoding.UTF8; var strWithoutPlus = str?.Replace('+', ' '); - if (Equals(encoding, Encoding.GetEncoding("ISO-8859-1"))) + if (encoding.CodePage == 28591) // ISO-8859-1 (Latin-1) try { return MyRegex() @@ -750,7 +1156,10 @@ void AddOne(object? x) /// Checks if a value is a non-nullish primitive type. /// /// The value to check. - /// If true, empty strings and URIs are not considered non-nullish. + /// + /// If true, empty strings and values with an empty textual form are treated as + /// nullish. + /// /// True if the value is a non-nullish primitive, false otherwise. public static bool IsNonNullishPrimitive(object? value, bool skipNulls = false) { @@ -844,14 +1253,13 @@ public static string InterpretNumericEntities(string str) if (j < n && str[j] == ';' && j > startDigits) { - int code; #if NETSTANDARD2_0 var digits = str.Substring(startDigits, j - startDigits); var ok = int.TryParse( digits, hex ? NumberStyles.HexNumber : NumberStyles.Integer, CultureInfo.InvariantCulture, - out code + out var code ); #else var digits = str.AsSpan(startDigits, j - startDigits); @@ -859,7 +1267,7 @@ out code digits, hex ? 
NumberStyles.HexNumber : NumberStyles.Integer, CultureInfo.InvariantCulture, - out code + out var code ); #endif if (!ok) diff --git a/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs new file mode 100644 index 0000000..aa7f459 --- /dev/null +++ b/benchmarks/QsNet.Benchmarks/EncodeBenchmarks.cs @@ -0,0 +1,109 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Order; +using QsNet; +using QsNet.Models; +using QsNet.Enums; + +namespace QsNet.Benchmarks; + +[MemoryDiagnoser] +[SimpleJob(RuntimeMoniker.Net80)] +[Orderer(SummaryOrderPolicy.FastestToSlowest)] +public class EncodeBenchmarks +{ + public enum DotMode + { + None, // AllowDots=false, EncodeDotInKeys=false + AllowDots, // AllowDots=true, EncodeDotInKeys=false + AllowDotsAndEncode // AllowDots=true, EncodeDotInKeys=true + } + + // Size & shape + [Params(10, 100, 1000)] public int Count { get; set; } + [Params(8, 40)] public int ValueLen { get; set; } + [Params(0, 50)] public int NeedsEscPercent { get; set; } + + // Option toggles that materially affect Encode() + [Params(false, true)] public bool CommaLists { get; set; } + [Params(false, true)] public bool EncodeValuesOnly { get; set; } + [Params(DotMode.None, DotMode.AllowDots, DotMode.AllowDotsAndEncode)] public DotMode Dots { get; set; } + + private static string MakeValue(int len, int escPercent, Random rnd) + { + if (escPercent <= 0) + { + return new string('x', len); + } + + var chars = new char[len]; + for (int i = 0; i < len; i++) + { + bool needsEsc = rnd.Next(0, 100) < escPercent; + if (!needsEsc) + { + chars[i] = 'x'; + continue; + } + + // Mix of characters that typically require escaping + switch (rnd.Next(0, 4)) + { + case 0: chars[i] = ' '; break; // space -> %20 or + + case 1: chars[i] = '%'; break; // percent -> %25 + case 2: chars[i] = '\u00E4'; break; // non-ASCII -> UTF-8 percent-encoded 
+ default: chars[i] = ','; break; // comma (should be encoded inside list items) + } + } + return new string(chars); + } + + private object _data = default!; + private EncodeOptions _options = default!; + + [GlobalSetup] + public void Setup() + { + var rnd = new Random(12345); + + // Build a realistic object graph to exercise the encoder: + // - list under key "a" (affected by ListFormat) + // - dotted key under nested dictionary (affected by EncodeDotInKeys) + // - a date and a boolean for primitive branches + var list = Enumerable.Range(0, Count) + .Select(_ => (object?)MakeValue(ValueLen, NeedsEscPercent, rnd)) + .ToList(); + + _data = new Dictionary + { + ["a"] = list, + ["a_empty"] = new List(), + ["a_nested"] = new List { new List { MakeValue(ValueLen, NeedsEscPercent, rnd) } }, + ["b"] = new Dictionary + { + ["x.y"] = MakeValue(ValueLen, NeedsEscPercent, rnd), + ["inner"] = new Dictionary + { + ["z"] = MakeValue(ValueLen, NeedsEscPercent, rnd) + } + }, + ["c"] = new DateTimeOffset(2024, 1, 2, 3, 4, 5, TimeSpan.Zero), + ["d"] = true + }; + + _options = new EncodeOptions + { + ListFormat = CommaLists ? ListFormat.Comma : ListFormat.Indices, + EncodeValuesOnly = EncodeValuesOnly, + AllowDots = Dots != DotMode.None, + EncodeDotInKeys = Dots == DotMode.AllowDotsAndEncode, + // Leave other toggles at defaults to mirror common usage. 
+ }; + } + + [Benchmark] + public string Encode_Public() => Qs.Encode(_data, _options); +} \ No newline at end of file diff --git a/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs new file mode 100644 index 0000000..3227278 --- /dev/null +++ b/benchmarks/QsNet.Benchmarks/UtilsEncodeBenchmarks.cs @@ -0,0 +1,62 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Order; +using QsNet.Enums; +using QsNet.Models; +using QsNet.Internal; + +[MemoryDiagnoser] +public class UtilsEncodeBenchmarks +{ + [Params(0, 8, 40, 512, 4096)] + public int Len; + + [Params(Format.Rfc3986, Format.Rfc1738)] + public Format Fmt; + + // Encoding under test + [Params("UTF8", "Latin1")] + public string EncName { get; set; } = "UTF8"; + private Encoding _enc = default!; + + // Workload shape + [Params("ascii-safe", "utf8-mixed", "latin1-fallback", "reserved-heavy")] + public string DataKind { get; set; } = "ascii-safe"; + + private string _input = default!; + + [GlobalSetup] + public void Setup() + { + _enc = EncName == "Latin1" ? 
Encoding.GetEncoding("ISO-8859-1") : new UTF8Encoding(false); + + // note: () included to exercise RFC1738 paren allowance + var asciiSafeBase = "abcDEF-_.~0123456789() "; + var utfMixedBase = "Café 北京 – ☕️ 😀 "; + var latin1Fallback = "Café – € àèìòù "; // '€' not in ISO-8859-1 -> numeric-entity fallback + var reservedHeavy = "name=obj[a]&b=c d/%[]()+="; + + var seed = DataKind switch + { + "ascii-safe" => asciiSafeBase, + "utf8-mixed" => utfMixedBase, + "latin1-fallback" => latin1Fallback, + "reserved-heavy" => reservedHeavy, + _ => asciiSafeBase + }; + + _input = string.Concat(Enumerable.Repeat(seed, Math.Max(1, (Len + seed.Length - 1) / seed.Length))) + .Substring(0, Len); + } + + [Benchmark(Baseline = true)] + public string Encode() => QsNet.Internal.Utils.Encode(_input, _enc, Fmt); + + // Orientation-only reference (different semantics for spaces/legacy, but useful for perf smell tests) + [Benchmark] + public string UriEscape() => Uri.EscapeDataString(_input); +} \ No newline at end of file