-
Notifications
You must be signed in to change notification settings - Fork 158
/
Copy pathhtml_entity_decode.sql
120 lines (106 loc) · 4.95 KB
/
html_entity_decode.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
--Depends on function public.html_entities() !
create or replace function public.html_entity_decode(str text)
returns text
immutable
returns null on null input
parallel safe -- postgres 10 or later
language plpgsql
set search_path = ''
as $func$
declare
rec record;
protect_char constant char(1) not null default e'\u0001'; --protect incorrect chain process, like &quot
/*
Protect chr() errors:
[54000] ERROR: null character not permitted
[54000] ERROR: requested character not valid for encoding: DDDDD
The maximum valid code point in Unicode is U+10FFFF
There are also a set of code points that are the surrogates for UTF-16. These are in the range U+D800 .. U+DFFF.
Details at https://stackoverflow.com/questions/27415935/does-unicode-have-a-defined-maximum-number-of-code-points
*/
codepoints_ranges constant int4range[] not null default array[
int4range(1, x'D800'::int, '[)'),
int4range(x'DFFF'::int, x'10FFFF'::int, '(]')
];
is_replaced bool not null default false;
begin
if position('&' in str) = 0 then --speed improvements
return str;
end if;
for rec in
with t as materialized (
select distinct
r.m[1] as entity
from regexp_matches(str, '&[a-zA-Z][a-zA-Z\d]+;?', 'g') as r(m)
)
select *
from t
cross join lateral (select public.html_entities()->t.entity->>'characters') as x(characters)
where x.characters is not null
loop
str := replace(str, rec.entity, concat(protect_char, rec.characters, protect_char));
is_replaced := true;
end loop;
if position('&#' in str) > 0 then --speed improvements
for rec in
select distinct
r.m[1] as entity,
r.m[2]::int as codepoint
from regexp_matches(str, '(&#(\d{1,7});)', 'g') as r(m) -- maximum valid code point in Unicode is 1114111 (U+10FFFF)
where r.m[2]::int <@ any (codepoints_ranges) -- https://postgrespro.ru/docs/postgresql/12/functions-range
loop
str := replace(str, rec.entity, concat(protect_char, chr(rec.codepoint), protect_char));
is_replaced := true;
end loop;
if position('&#x' in str) > 0 then --speed improvements
for rec in
select distinct
r.m[1] as entity,
x.codepoint
from regexp_matches(str, '(&#x([\da-fA-F]{1,6});)', 'g') as r(m) -- maximum valid code point in Unicode is U+10FFFF
-- https://stackoverflow.com/questions/8316164/convert-hex-in-text-representation-to-decimal-number
cross join lateral (select ('x' || lpad(r.m[2], 8, '0'))::bit(32)::int) as x(codepoint)
where x.codepoint <@ any (codepoints_ranges) -- https://postgrespro.ru/docs/postgresql/12/functions-range
loop
str := replace(str, rec.entity, concat(protect_char, chr(rec.codepoint), protect_char));
is_replaced := true;
end loop;
end if;
end if;
if is_replaced then
return replace(str, protect_char, '');
end if;
return str;
end;
$func$;
comment on function public.html_entity_decode(str text) is $$
Convert HTML entities to their corresponding characters.
Depends on function public.html_entities()
$$;
-- TEST
DO $$
DECLARE
rec record;
str_out_returned text;
BEGIN
for rec in select * from (values
(null, null),
-- Positive
('𝔄 ∾̳ ½ " &, &quot, > Ö 𝔹', '𝔄 ∾̳ ½ " &, ", > Ö 𝔹'), --Named entities
('" 7 𝕫 ', '" 7 𝕫 '), --Dec code entities
('■ ˜ 𝔹 ', '■ ˜ 𝔹 '), --Hex code entities
-- Negative
('abcde', 'abcde'),
('&unknown; &unk', '&unknown; &unk'), --Named entities
('� � � � � _5', '� � � � � _5'), --Dec code entities
('� � � � � %g0;', '� � � � � %g0;') --Hex code entities
) as t(str_in, str_out_expected)
loop
str_out_returned = public.html_entity_decode(rec.str_in);
assert
-- the result of the comparison should return boolean
str_out_returned is not distinct from rec.str_out_expected,
-- if the comparison result is not true, an error message will be returned
format('in %L, out expected %L, out returned %L', rec.str_in, rec.str_out_expected, str_out_returned);
end loop;
END $$;