Skip to content

Commit fe597b7

Browse files
authored
encoding.utf8: add is_number (#15931)
1 parent 3d2588f commit fe597b7

File tree

3 files changed

+169
-0
lines changed

3 files changed

+169
-0
lines changed

vlib/encoding/utf8/utf8_tables.v

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,7 @@ const (
538538
max_latin_1 = rune(0x00ff) // '\u00FF' // `ÿ`
539539
)
540540

541+
// Represents all Unicode code points in Unicode category L (letters).
541542
const letter_table = RangeTable{
542543
r16: [
543544
Range16{0x0041, 0x005a, 1},
@@ -1132,6 +1133,7 @@ const letter_table = RangeTable{
11321133
latin_offset: 6
11331134
}
11341135

1136+
// Represents all Unicode code points with the White_Space property (category Z plus white space control characters such as 0x0009..0x000d).
11351137
const white_space_table = RangeTable{
11361138
r16: [
11371139
Range16{0x0009, 0x000d, 1},
@@ -1146,6 +1148,146 @@ const white_space_table = RangeTable{
11461148
latin_offset: 2
11471149
}
11481150

1151+
// number_table represents all Unicode code points in Unicode category N (numbers).
// It is the table that is_number passes to is_excluding_latin.
// Each entry is written as three values; the third is presumably the stride
// between members of the range (so {0x00b9, 0x00bc, 3} would cover only 0x00b9
// and 0x00bc) - TODO confirm against the Range16/Range32 definitions.
1152+
const number_table = RangeTable{
1153+
r16: [ // every entry in this list is at most 0xffff
1154+
Range16{0x0030, 0x0039, 1},
1155+
Range16{0x00b2, 0x00b3, 1},
1156+
Range16{0x00b9, 0x00bc, 3},
1157+
Range16{0x00bd, 0x00be, 1},
1158+
Range16{0x0660, 0x0669, 1},
1159+
Range16{0x06f0, 0x06f9, 1},
1160+
Range16{0x07c0, 0x07c9, 1},
1161+
Range16{0x0966, 0x096f, 1},
1162+
Range16{0x09e6, 0x09ef, 1},
1163+
Range16{0x09f4, 0x09f9, 1},
1164+
Range16{0x0a66, 0x0a6f, 1},
1165+
Range16{0x0ae6, 0x0aef, 1},
1166+
Range16{0x0b66, 0x0b6f, 1},
1167+
Range16{0x0b72, 0x0b77, 1},
1168+
Range16{0x0be6, 0x0bf2, 1},
1169+
Range16{0x0c66, 0x0c6f, 1},
1170+
Range16{0x0c78, 0x0c7e, 1},
1171+
Range16{0x0ce6, 0x0cef, 1},
1172+
Range16{0x0d58, 0x0d5e, 1},
1173+
Range16{0x0d66, 0x0d78, 1},
1174+
Range16{0x0de6, 0x0def, 1},
1175+
Range16{0x0e50, 0x0e59, 1},
1176+
Range16{0x0ed0, 0x0ed9, 1},
1177+
Range16{0x0f20, 0x0f33, 1},
1178+
Range16{0x1040, 0x1049, 1},
1179+
Range16{0x1090, 0x1099, 1},
1180+
Range16{0x1369, 0x137c, 1},
1181+
Range16{0x16ee, 0x16f0, 1},
1182+
Range16{0x17e0, 0x17e9, 1},
1183+
Range16{0x17f0, 0x17f9, 1},
1184+
Range16{0x1810, 0x1819, 1},
1185+
Range16{0x1946, 0x194f, 1},
1186+
Range16{0x19d0, 0x19da, 1},
1187+
Range16{0x1a80, 0x1a89, 1},
1188+
Range16{0x1a90, 0x1a99, 1},
1189+
Range16{0x1b50, 0x1b59, 1},
1190+
Range16{0x1bb0, 0x1bb9, 1},
1191+
Range16{0x1c40, 0x1c49, 1},
1192+
Range16{0x1c50, 0x1c59, 1},
1193+
Range16{0x2070, 0x2074, 4},
1194+
Range16{0x2075, 0x2079, 1},
1195+
Range16{0x2080, 0x2089, 1},
1196+
Range16{0x2150, 0x2182, 1},
1197+
Range16{0x2185, 0x2189, 1},
1198+
Range16{0x2460, 0x249b, 1},
1199+
Range16{0x24ea, 0x24ff, 1},
1200+
Range16{0x2776, 0x2793, 1},
1201+
Range16{0x2cfd, 0x3007, 778}, // 0x3007 - 0x2cfd == 778, so presumably only these two code points
1202+
Range16{0x3021, 0x3029, 1},
1203+
Range16{0x3038, 0x303a, 1},
1204+
Range16{0x3192, 0x3195, 1},
1205+
Range16{0x3220, 0x3229, 1},
1206+
Range16{0x3248, 0x324f, 1},
1207+
Range16{0x3251, 0x325f, 1},
1208+
Range16{0x3280, 0x3289, 1},
1209+
Range16{0x32b1, 0x32bf, 1},
1210+
Range16{0xa620, 0xa629, 1},
1211+
Range16{0xa6e6, 0xa6ef, 1},
1212+
Range16{0xa830, 0xa835, 1},
1213+
Range16{0xa8d0, 0xa8d9, 1},
1214+
Range16{0xa900, 0xa909, 1},
1215+
Range16{0xa9d0, 0xa9d9, 1},
1216+
Range16{0xa9f0, 0xa9f9, 1},
1217+
Range16{0xaa50, 0xaa59, 1},
1218+
Range16{0xabf0, 0xabf9, 1},
1219+
Range16{0xff10, 0xff19, 1},
1220+
]
1221+
r32: [ // every entry in this list is above 0xffff
1222+
Range32{0x10107, 0x10133, 1},
1223+
Range32{0x10140, 0x10178, 1},
1224+
Range32{0x1018a, 0x1018b, 1},
1225+
Range32{0x102e1, 0x102fb, 1},
1226+
Range32{0x10320, 0x10323, 1},
1227+
Range32{0x10341, 0x1034a, 9},
1228+
Range32{0x103d1, 0x103d5, 1},
1229+
Range32{0x104a0, 0x104a9, 1},
1230+
Range32{0x10858, 0x1085f, 1},
1231+
Range32{0x10879, 0x1087f, 1},
1232+
Range32{0x108a7, 0x108af, 1},
1233+
Range32{0x108fb, 0x108ff, 1},
1234+
Range32{0x10916, 0x1091b, 1},
1235+
Range32{0x109bc, 0x109bd, 1},
1236+
Range32{0x109c0, 0x109cf, 1},
1237+
Range32{0x109d2, 0x109ff, 1},
1238+
Range32{0x10a40, 0x10a48, 1},
1239+
Range32{0x10a7d, 0x10a7e, 1},
1240+
Range32{0x10a9d, 0x10a9f, 1},
1241+
Range32{0x10aeb, 0x10aef, 1},
1242+
Range32{0x10b58, 0x10b5f, 1},
1243+
Range32{0x10b78, 0x10b7f, 1},
1244+
Range32{0x10ba9, 0x10baf, 1},
1245+
Range32{0x10cfa, 0x10cff, 1},
1246+
Range32{0x10d30, 0x10d39, 1},
1247+
Range32{0x10e60, 0x10e7e, 1},
1248+
Range32{0x10f1d, 0x10f26, 1},
1249+
Range32{0x10f51, 0x10f54, 1},
1250+
Range32{0x10fc5, 0x10fcb, 1},
1251+
Range32{0x11052, 0x1106f, 1},
1252+
Range32{0x110f0, 0x110f9, 1},
1253+
Range32{0x11136, 0x1113f, 1},
1254+
Range32{0x111d0, 0x111d9, 1},
1255+
Range32{0x111e1, 0x111f4, 1},
1256+
Range32{0x112f0, 0x112f9, 1},
1257+
Range32{0x11450, 0x11459, 1},
1258+
Range32{0x114d0, 0x114d9, 1},
1259+
Range32{0x11650, 0x11659, 1},
1260+
Range32{0x116c0, 0x116c9, 1},
1261+
Range32{0x11730, 0x1173b, 1},
1262+
Range32{0x118e0, 0x118f2, 1},
1263+
Range32{0x11950, 0x11959, 1},
1264+
Range32{0x11c50, 0x11c6c, 1},
1265+
Range32{0x11d50, 0x11d59, 1},
1266+
Range32{0x11da0, 0x11da9, 1},
1267+
Range32{0x11fc0, 0x11fd4, 1},
1268+
Range32{0x12400, 0x1246e, 1},
1269+
Range32{0x16a60, 0x16a69, 1},
1270+
Range32{0x16b50, 0x16b59, 1},
1271+
Range32{0x16b5b, 0x16b61, 1},
1272+
Range32{0x16e80, 0x16e96, 1},
1273+
Range32{0x1d2e0, 0x1d2f3, 1},
1274+
Range32{0x1d360, 0x1d378, 1},
1275+
Range32{0x1d7ce, 0x1d7ff, 1},
1276+
Range32{0x1e140, 0x1e149, 1},
1277+
Range32{0x1e2f0, 0x1e2f9, 1},
1278+
Range32{0x1e8c7, 0x1e8cf, 1},
1279+
Range32{0x1e950, 0x1e959, 1},
1280+
Range32{0x1ec71, 0x1ecab, 1},
1281+
Range32{0x1ecad, 0x1ecaf, 1},
1282+
Range32{0x1ecb1, 0x1ecb4, 1},
1283+
Range32{0x1ed01, 0x1ed2d, 1},
1284+
Range32{0x1ed2f, 0x1ed3d, 1},
1285+
Range32{0x1f100, 0x1f10c, 1},
1286+
Range32{0x1fbf0, 0x1fbf9, 1},
1287+
]
1288+
latin_offset: 4 // the first 4 r16 entries end at or below 0x00ff (max_latin_1)
1289+
}
1290+
11491291
struct RangeTable {
11501292
pub:
11511293
r16 []Range16
@@ -1167,6 +1309,7 @@ pub:
11671309
stride u32
11681310
}
11691311

1312+
// is_excluding_latin tests whether the rune `r` is in the given range table.
11701313
fn is_excluding_latin(table &RangeTable, r rune) bool {
11711314
r16 := &table.r16
11721315
off := table.latin_offset

vlib/encoding/utf8/utf8_util.v

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,14 @@ pub fn is_space(r rune) bool {
178178
return is_excluding_latin(white_space_table, r)
179179
}
180180

181+
// is_number reports whether the rune `r` is a Unicode number, i.e. a member of Unicode category N.
182+
pub fn is_number(r rune) bool {
183+
	if r > max_latin_1 { // beyond Latin-1: consult the range table
184+
		return is_excluding_latin(number_table, r)
185+
	}
186+
	return (props[u8(r)] & p_n) != 0 // Latin-1: test the p_n bit of the per-byte props entry
187+
}
188+
181189
// is_uchar_punct returns true if the input Unicode code point is a western Unicode punctuation character
182190
pub fn is_uchar_punct(uchar int) bool {
183191
return find_punct_in_table(uchar, utf8.unicode_punct_western) != 0

vlib/encoding/utf8/utf8_util_test.v

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,21 @@ fn test_is_space() {
105105
assert utf8.is_space(`\u2009`) == true
106106
assert utf8.is_space(`\u00A0`) == true
107107
}
108+
109+
// test_is_number checks is_number against every ASCII letter, every ASCII digit
// and a few number code points above Latin-1.
// The loops walk explicit rune lists because a `lo .. hi` range excludes `hi`:
// a range such as `0` .. `9` would stop at `8` and never check `9`.
fn test_is_number() {
110+
	for ra in 'abcdefghijklmnopqrstuvwxyz'.runes() {
111+
		assert utf8.is_number(ra) == false
112+
	}
113+
114+
	for ra in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.runes() {
115+
		assert utf8.is_number(ra) == false
116+
	}
117+
118+
	for ra in '0123456789'.runes() {
119+
		assert utf8.is_number(ra) == true
120+
	}
121+
122+
	assert utf8.is_number(`\u2164`) == true // above Latin-1, so not answered from the props lookup
123+
	assert utf8.is_number(`\u2188`) == true
124+
	assert utf8.is_number(`\u3029`) == true
125+
}

0 commit comments

Comments
 (0)