Skip to content

Commit 0db0748

Browse files
committed
Parse UTF-8 streams as bytes for precise error offsets
1 parent 10f0364 commit 0db0748

File tree

2 files changed

+108
-6
lines changed

2 files changed

+108
-6
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
using System.IO;
2+
using NUnit.Framework;
3+
using Tomlyn.Model;
4+
5+
namespace Tomlyn.Tests;
6+
7+
/// <summary>
/// Exercises <c>TomlSerializer.Deserialize</c> on byte streams: the location reported for
/// malformed UTF-8 input and the handling of a leading UTF-8 byte order mark.
/// </summary>
public sealed class NewApiUtf8StreamExceptionLocationTests
{
    /// <summary>
    /// A malformed UTF-8 sequence inside a string value must surface as a
    /// <c>TomlException</c> whose offset, line and column point at the offending byte.
    /// </summary>
    [Test]
    public void Deserialize_Utf8Stream_InvalidUtf8_ThrowsTomlExceptionWithByteOffset()
    {
        // Document bytes: a = "<0xC3 0x28>"
        // 0xC3 opens a two-byte sequence, but 0x28 ('(') is not a continuation byte
        // (0x80..0xBF), so the sequence that starts at byte index 5 is invalid.
        byte[] payload =
        {
            (byte)'a', (byte)' ', (byte)'=', (byte)' ', (byte)'"',
            0xC3, 0x28,
            (byte)'"',
        };

        using var input = new MemoryStream(payload);

        var error = Assert.Throws<TomlException>(() => TomlSerializer.Deserialize<TomlTable>(input));

        Assert.That(error, Is.Not.Null);
        Assert.That(error!.Span, Is.Not.Null);

        Assert.That(error.Offset, Is.EqualTo(5), "Offset must be the byte position of the invalid sequence.");
        Assert.That(error.Line, Is.EqualTo(1));
        Assert.That(error.Column, Is.EqualTo(6));
    }

    /// <summary>
    /// A stream that begins with the UTF-8 byte order mark must deserialize normally.
    /// </summary>
    [Test]
    public void Deserialize_Utf8Stream_WithBom_Succeeds()
    {
        byte[] payload =
        {
            0xEF, 0xBB, 0xBF, // UTF-8 BOM
            (byte)'a', (byte)' ', (byte)'=', (byte)' ', (byte)'1', (byte)'\n',
        };

        using var input = new MemoryStream(payload);

        var document = TomlSerializer.Deserialize<TomlTable>(input);

        Assert.That(document, Is.Not.Null);
        Assert.That((long)document!["a"], Is.EqualTo(1L));
    }
}
52+

src/Tomlyn/TomlSerializer.cs

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public static class TomlSerializer
1919

2020
private static readonly bool ReflectionEnabledByDefault = TomlSerializerFeatureSwitches.IsReflectionEnabledByDefaultCalculated;
2121
private static readonly Encoding DefaultStreamEncoding = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);
22+
private static ReadOnlySpan<byte> Utf8Bom => [0xEF, 0xBB, 0xBF];
2223

2324
/// <summary>
2425
/// Gets a value indicating whether reflection-based serialization is enabled by default.
@@ -246,8 +247,11 @@ public static void Serialize(Stream utf8Stream, object? value, Type inputType, T
246247
public static T? Deserialize<T>(Stream utf8Stream, TomlSerializerOptions? options = null)
247248
{
248249
ArgumentGuard.ThrowIfNull(utf8Stream, nameof(utf8Stream));
249-
using var reader = new StreamReader(utf8Stream, DefaultStreamEncoding, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true);
250-
return Deserialize<T>(reader, options);
250+
251+
var utf8Bytes = ReadUtf8Bytes(utf8Stream);
252+
var effectiveOptions = options ?? TomlSerializerOptions.Default;
253+
var typeInfo = ResolveTypeInfo(effectiveOptions, typeof(T));
254+
return (T?)Deserialize(utf8Bytes, typeInfo);
251255
}
252256

253257
/// <summary>
@@ -257,8 +261,8 @@ public static void Serialize(Stream utf8Stream, object? value, Type inputType, T
257261
{
258262
ArgumentGuard.ThrowIfNull(utf8Stream, nameof(utf8Stream));
259263
ArgumentGuard.ThrowIfNull(typeInfo, nameof(typeInfo));
260-
using var reader = new StreamReader(utf8Stream, DefaultStreamEncoding, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true);
261-
return Deserialize<T>(reader, typeInfo);
264+
var utf8Bytes = ReadUtf8Bytes(utf8Stream);
265+
return (T?)Deserialize(utf8Bytes, (TomlTypeInfo)typeInfo);
262266
}
263267

264268
/// <summary>
@@ -270,8 +274,11 @@ public static void Serialize(Stream utf8Stream, object? value, Type inputType, T
270274
{
271275
ArgumentGuard.ThrowIfNull(utf8Stream, nameof(utf8Stream));
272276
ArgumentGuard.ThrowIfNull(returnType, nameof(returnType));
273-
using var reader = new StreamReader(utf8Stream, DefaultStreamEncoding, detectEncodingFromByteOrderMarks: true, bufferSize: 1024, leaveOpen: true);
274-
return Deserialize(reader, returnType, options);
277+
278+
var utf8Bytes = ReadUtf8Bytes(utf8Stream);
279+
var effectiveOptions = options ?? TomlSerializerOptions.Default;
280+
var typeInfo = ResolveTypeInfo(effectiveOptions, returnType);
281+
return Deserialize(utf8Bytes, typeInfo);
275282
}
276283

277284
/// <summary>
@@ -325,6 +332,21 @@ public static bool TryDeserialize(string toml, Type returnType, out object? valu
325332

326333
var options = typeInfo.Options;
327334
var reader = TomlReader.Create(toml, options);
335+
return DeserializeCore(reader, typeInfo, options);
336+
}
337+
338+
/// <summary>
/// Deserializes a TOML document supplied as a byte array, using the given type metadata.
/// </summary>
/// <param name="utf8Toml">The bytes of the TOML document; must not be <c>null</c>.</param>
/// <param name="typeInfo">Metadata for the target type; its <c>Options</c> configure the reader.</param>
/// <returns>The value produced by <c>DeserializeCore</c> for the document.</returns>
private static object? Deserialize(byte[] utf8Toml, TomlTypeInfo typeInfo)
{
    ArgumentGuard.ThrowIfNull(utf8Toml, nameof(utf8Toml));
    ArgumentGuard.ThrowIfNull(typeInfo, nameof(typeInfo));

    // The same options instance drives both the reader and the core deserialization step.
    var serializerOptions = typeInfo.Options;
    return DeserializeCore(TomlReader.Create(utf8Toml, serializerOptions), typeInfo, serializerOptions);
}
347+
348+
private static object? DeserializeCore(TomlReader reader, TomlTypeInfo typeInfo, TomlSerializerOptions options)
349+
{
328350
reader.Read(); // StartDocument
329351
reader.Read(); // value start
330352

@@ -359,6 +381,34 @@ public static bool TryDeserialize(string toml, Type returnType, out object? valu
359381
return typeInfo.ReadAsObject(reader);
360382
}
361383

384+
/// <summary>
/// Reads everything from the current position of <paramref name="utf8Stream"/> to its end
/// into a new byte array, dropping a leading UTF-8 byte order mark when one is present.
/// </summary>
/// <param name="utf8Stream">The stream to drain. It is left positioned at its end.</param>
/// <returns>A freshly allocated array holding the remaining bytes without a leading BOM.</returns>
private static byte[] ReadUtf8Bytes(Stream utf8Stream)
{
    // Fast path: copy straight out of an exposable MemoryStream buffer instead of going
    // through an intermediate stream.
    if (utf8Stream is MemoryStream memoryStream && memoryStream.TryGetBuffer(out var buffer))
    {
        var position = memoryStream.Position;
        var remaining = memoryStream.Length - position;

        // MemoryStream allows Position to be set past Length; "remaining" is then negative
        // and must not be used as a span length. Such streams fall through to the generic
        // path below, which simply yields no bytes.
        if (remaining >= 0 && position <= int.MaxValue && remaining <= int.MaxValue)
        {
            var data = new ReadOnlySpan<byte>(buffer.Array!, buffer.Offset + (int)position, (int)remaining);
            var result = StripUtf8Bom(data).ToArray();

            // Consume the stream so the caller observes the same end position as with the
            // CopyTo-based path, regardless of whether the buffer was exposable.
            memoryStream.Position = position + remaining;
            return result;
        }
    }

    using var copy = new MemoryStream();
    utf8Stream.CopyTo(copy);
    var bytes = copy.ToArray();
    var stripped = StripUtf8Bom(bytes);

    // Reuse the copied array when no BOM was removed; otherwise materialize the slice.
    return stripped.Length == bytes.Length ? bytes : stripped.ToArray();
}
401+
402+
/// <summary>
/// Returns <paramref name="data"/> without its first three bytes when they are the UTF-8
/// byte order mark (EF BB BF); otherwise returns <paramref name="data"/> unchanged.
/// </summary>
/// <param name="data">The bytes to inspect.</param>
/// <returns>A slice of <paramref name="data"/>; no bytes are copied.</returns>
private static ReadOnlySpan<byte> StripUtf8Bom(ReadOnlySpan<byte> data)
{
    var bom = Utf8Bom;

    // Inputs shorter than the BOM can never start with it, so they pass through untouched.
    if (!data.StartsWith(bom))
    {
        return data;
    }

    return data.Slice(bom.Length);
}
411+
362412
/// <summary>
363413
/// Serializes a value to a writer using explicit metadata.
364414
/// </summary>

0 commit comments

Comments
 (0)