Skip to content

Commit 48438ea

Browse files
anarazel authored and jaki committed
Add pg_encoding_set_invalid()
There are cases where we cannot / do not want to error out for invalidly encoded input. In such cases it can be useful to replace e.g. an incomplete multi-byte character with bytes that will trigger an error when getting validated as part of a larger string.

Unfortunately, until now, for some encodings no such sequence existed. For those encodings this commit removes one previously accepted input combination - we consider that to be ok, as the chosen bytes are outside of the valid ranges for the encodings, we just previously failed to detect that.

As we cannot add a new field to pg_wchar_table without breaking ABI, this is implemented "in-line" in the newly added function.

Author: Noah Misch <noah@leadboat.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Backpatch-through: 13
Security: CVE-2025-1094
(cherry picked from commit 703b3fd)
1 parent 8e241fd commit 48438ea

File tree

5 files changed

+117
-1
lines changed

5 files changed

+117
-1
lines changed

src/common/wchar.c

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,25 @@
1515
#include "mb/pg_wchar.h"
1616

1717

18+
/*
19+
* In today's multibyte encodings other than UTF8, this two-byte sequence
20+
* ensures pg_encoding_mblen() == 2 && pg_encoding_verifymbstr() == 0.
21+
*
22+
* For historical reasons, several verifychar implementations opt to reject
23+
* this pair specifically. Byte pair range constraints, in encoding
24+
* originator documentation, always excluded this pair. No core conversion
25+
* could translate it. However, longstanding verifychar implementations
26+
* accepted any non-NUL byte. big5_to_euc_tw and big5_to_mic even translate
27+
* pairs not valid per encoding originator documentation. To avoid tightening
28+
* core or non-core conversions in a security patch, we sought this one pair.
29+
*
30+
* PQescapeString() historically used spaces for BYTE1; many other values
31+
* could suffice for BYTE1.
32+
*/
33+
#define NONUTF8_INVALID_BYTE0 (0x8d)
34+
#define NONUTF8_INVALID_BYTE1 (' ')
35+
36+
1837
/*
1938
* Operations on multi-byte encodings are driven by a table of helper
2039
* functions.
@@ -1525,6 +1544,11 @@ pg_big5_verifychar(const unsigned char *s, int len)
15251544
if (len < l)
15261545
return -1;
15271546

1547+
if (l == 2 &&
1548+
s[0] == NONUTF8_INVALID_BYTE0 &&
1549+
s[1] == NONUTF8_INVALID_BYTE1)
1550+
return -1;
1551+
15281552
while (--l > 0)
15291553
{
15301554
if (*++s == '\0')
@@ -1574,6 +1598,11 @@ pg_gbk_verifychar(const unsigned char *s, int len)
15741598
if (len < l)
15751599
return -1;
15761600

1601+
if (l == 2 &&
1602+
s[0] == NONUTF8_INVALID_BYTE0 &&
1603+
s[1] == NONUTF8_INVALID_BYTE1)
1604+
return -1;
1605+
15771606
while (--l > 0)
15781607
{
15791608
if (*++s == '\0')
@@ -1623,6 +1652,11 @@ pg_uhc_verifychar(const unsigned char *s, int len)
16231652
if (len < l)
16241653
return -1;
16251654

1655+
if (l == 2 &&
1656+
s[0] == NONUTF8_INVALID_BYTE0 &&
1657+
s[1] == NONUTF8_INVALID_BYTE1)
1658+
return -1;
1659+
16261660
while (--l > 0)
16271661
{
16281662
if (*++s == '\0')
@@ -2066,6 +2100,19 @@ pg_utf8_islegal(const unsigned char *source, int length)
20662100
}
20672101

20682102

2103+
/*
2104+
* Fills the provided buffer with two bytes such that:
2105+
* pg_encoding_mblen(dst) == 2 && pg_encoding_verifymbstr(dst) == 0
2106+
*/
2107+
void
2108+
pg_encoding_set_invalid(int encoding, char *dst)
2109+
{
2110+
Assert(pg_encoding_max_length(encoding) > 1);
2111+
2112+
dst[0] = (encoding == PG_UTF8 ? 0xc0 : NONUTF8_INVALID_BYTE0);
2113+
dst[1] = NONUTF8_INVALID_BYTE1;
2114+
}
2115+
20692116
/*
20702117
*-------------------------------------------------------------------
20712118
* encoding info table
@@ -2188,5 +2235,11 @@ pg_encoding_max_length(int encoding)
21882235
{
21892236
Assert(PG_VALID_ENCODING(encoding));
21902237

2191-
return pg_wchar_table[encoding].maxmblen;
2238+
/*
2239+
* Check for the encoding despite the assert, due to some mingw versions
2240+
* otherwise issuing bogus warnings.
2241+
*/
2242+
return PG_VALID_ENCODING(encoding) ?
2243+
pg_wchar_table[encoding].maxmblen :
2244+
pg_wchar_table[PG_SQL_ASCII].maxmblen;
21922245
}

src/include/mb/pg_wchar.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,7 @@ extern int pg_valid_server_encoding_id(int encoding);
573573
* (in addition to the ones just above). The constant tables declared
574574
* earlier in this file are also available from libpgcommon.
575575
*/
576+
extern void pg_encoding_set_invalid(int encoding, char *dst);
576577
extern int pg_encoding_mblen(int encoding, const char *mbstr);
577578
extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr);
578579
extern int pg_encoding_dsplen(int encoding, const char *mbstr);

src/test/regress/expected/conversion.out

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55
\getenv libdir PG_LIBDIR
66
\getenv dlsuffix PG_DLSUFFIX
77
\set regresslib :libdir '/regress' :dlsuffix
8+
CREATE FUNCTION test_enc_setup() RETURNS void
9+
AS :'regresslib', 'test_enc_setup'
10+
LANGUAGE C STRICT;
11+
SELECT FROM test_enc_setup();
12+
--
13+
(1 row)
14+
815
CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
916
AS :'regresslib', 'test_enc_conversion'
1017
LANGUAGE C STRICT;

src/test/regress/regress.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,6 +1076,56 @@ test_opclass_options_func(PG_FUNCTION_ARGS)
10761076
PG_RETURN_NULL();
10771077
}
10781078

1079+
/* one-time tests for encoding infrastructure */
1080+
PG_FUNCTION_INFO_V1(test_enc_setup);
1081+
Datum
1082+
test_enc_setup(PG_FUNCTION_ARGS)
1083+
{
1084+
/* Test pg_encoding_set_invalid() */
1085+
for (int i = 0; i < _PG_LAST_ENCODING_; i++)
1086+
{
1087+
char buf[2],
1088+
bigbuf[16];
1089+
int len,
1090+
mblen,
1091+
valid;
1092+
1093+
if (pg_encoding_max_length(i) == 1)
1094+
continue;
1095+
pg_encoding_set_invalid(i, buf);
1096+
len = strnlen(buf, 2);
1097+
if (len != 2)
1098+
elog(WARNING,
1099+
"official invalid string for encoding \"%s\" has length %d",
1100+
pg_enc2name_tbl[i].name, len);
1101+
mblen = pg_encoding_mblen(i, buf);
1102+
if (mblen != 2)
1103+
elog(WARNING,
1104+
"official invalid string for encoding \"%s\" has mblen %d",
1105+
pg_enc2name_tbl[i].name, mblen);
1106+
valid = pg_encoding_verifymbstr(i, buf, len);
1107+
if (valid != 0)
1108+
elog(WARNING,
1109+
"official invalid string for encoding \"%s\" has valid prefix of length %d",
1110+
pg_enc2name_tbl[i].name, valid);
1111+
valid = pg_encoding_verifymbstr(i, buf, 1);
1112+
if (valid != 0)
1113+
elog(WARNING,
1114+
"first byte of official invalid string for encoding \"%s\" has valid prefix of length %d",
1115+
pg_enc2name_tbl[i].name, valid);
1116+
memset(bigbuf, ' ', sizeof(bigbuf));
1117+
bigbuf[0] = buf[0];
1118+
bigbuf[1] = buf[1];
1119+
valid = pg_encoding_verifymbstr(i, bigbuf, sizeof(bigbuf));
1120+
if (valid != 0)
1121+
elog(WARNING,
1122+
"trailing data changed official invalid string for encoding \"%s\" to have valid prefix of length %d",
1123+
pg_enc2name_tbl[i].name, valid);
1124+
}
1125+
1126+
PG_RETURN_VOID();
1127+
}
1128+
10791129
/*
10801130
* Call an encoding conversion or verification function.
10811131
*

src/test/regress/sql/conversion.sql

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@
88

99
\set regresslib :libdir '/regress' :dlsuffix
1010

11+
CREATE FUNCTION test_enc_setup() RETURNS void
12+
AS :'regresslib', 'test_enc_setup'
13+
LANGUAGE C STRICT;
14+
SELECT FROM test_enc_setup();
15+
1116
CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
1217
AS :'regresslib', 'test_enc_conversion'
1318
LANGUAGE C STRICT;

0 commit comments

Comments
 (0)