Skip to content

Commit e251662

Browse files
authored
fix: add FLAG num support for agglutinative languages (#1090)
1 parent 831ded9 commit e251662

3 files changed

Lines changed: 244 additions & 23 deletions

File tree

internal/spell/aff.go

Lines changed: 40 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -62,13 +62,39 @@ type dictConfig struct {
6262
TryChars string
6363
WordChars string
6464
CompoundOnly string
65-
AffixMap map[rune]affix
65+
AffixMap map[string]affix
6666
CamelCase int
6767
CompoundMin int64
68-
compoundMap map[rune][]string
68+
compoundMap map[string][]string
6969
NoSuggestFlag string
7070
}
7171

72+
// parseFlags splits a flag string into individual flags based on the FLAG type.
73+
//
74+
// Hunspell supports several flag formats:
75+
// - "ASCII" (default): each character is a flag
76+
// - "num": flags are comma-separated numbers (e.g., "14308,10482,4720")
77+
// - "UTF-8": each UTF-8 character is a flag
78+
// - "long": each pair of ASCII characters is a flag
79+
func (a dictConfig) parseFlags(flagStr string) []string {
80+
switch a.Flag {
81+
case "num":
82+
return strings.Split(flagStr, ",")
83+
case "long":
84+
flags := make([]string, 0, len(flagStr)/2)
85+
for i := 0; i+1 < len(flagStr); i += 2 {
86+
flags = append(flags, flagStr[i:i+2])
87+
}
88+
return flags
89+
default: // "ASCII" or "UTF-8"
90+
flags := make([]string, 0, len(flagStr))
91+
for _, r := range flagStr {
92+
flags = append(flags, string(r))
93+
}
94+
return flags
95+
}
96+
}
97+
7298
// expand expands a word/affix using dictionary/affix rules
7399
//
74100
// This also supports CompoundRule flags
@@ -87,11 +113,13 @@ func (a dictConfig) expand(wordAffix string, out []string) ([]string, error) {
87113
// safe
88114
word, keyString := wordAffix[:idx], wordAffix[idx+1:]
89115

116+
flags := a.parseFlags(keyString)
117+
90118
// check to see if any of the flags are in the
91119
// "compound only". If so then nothing to add
92120
compoundOnly := false
93-
for _, key := range keyString {
94-
if strings.ContainsRune(a.CompoundOnly, key) {
121+
for _, key := range flags {
122+
if key == a.CompoundOnly {
95123
compoundOnly = true
96124
continue
97125
}
@@ -110,12 +138,9 @@ func (a dictConfig) expand(wordAffix string, out []string) ([]string, error) {
110138
out = append(out, word)
111139
prefixes := make([]affix, 0, 5)
112140
suffixes := make([]affix, 0, 5)
113-
for _, key := range keyString {
114-
// want keyString to []?something?
115-
// then iterate over that
141+
for _, key := range flags {
116142
af, ok := a.AffixMap[key]
117143
if !ok {
118-
// TODO: How should we handle this?
119144
continue
120145
}
121146
if !af.CrossProduct {
@@ -161,8 +186,8 @@ func isCrossProduct(val string) (bool, error) {
161186
func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen
162187
aff := dictConfig{
163188
Flag: "ASCII",
164-
AffixMap: make(map[rune]affix),
165-
compoundMap: make(map[rune][]string),
189+
AffixMap: make(map[string]affix),
190+
compoundMap: make(map[string][]string),
166191
CompoundMin: 3, // default in Hunspell
167192
}
168193
scanner := bufio.NewScanner(file)
@@ -219,9 +244,9 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen
219244
aff.CompoundRule = make([]string, 0, val)
220245
} else {
221246
aff.CompoundRule = append(aff.CompoundRule, parts[1])
222-
for _, char := range parts[1] {
223-
if _, ok := aff.compoundMap[char]; !ok {
224-
aff.compoundMap[char] = []string{}
247+
for _, flag := range aff.parseFlags(parts[1]) {
248+
if _, ok := aff.compoundMap[flag]; !ok {
249+
aff.compoundMap[flag] = []string{}
225250
}
226251
}
227252
}
@@ -248,8 +273,7 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen
248273

249274
sections := len(parts)
250275
if sections > 4 {
251-
// does this need to be split out into suffix and prefix?
252-
flag := rune(parts[1][0])
276+
flag := parts[1]
253277
a, ok := aff.AffixMap[flag]
254278
if !ok {
255279
return nil, fmt.Errorf("got rules for flag %q but no definition", flag)
@@ -299,7 +323,7 @@ func newDictConfig(file io.Reader) (*dictConfig, error) { //nolint:funlen
299323
Type: atype,
300324
CrossProduct: cross,
301325
}
302-
flag := rune(parts[1][0])
326+
flag := parts[1]
303327
aff.AffixMap[flag] = a
304328
}
305329
default:

internal/spell/aff_test.go

Lines changed: 194 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,194 @@
1+
package spell
2+
3+
import (
4+
"strings"
5+
"testing"
6+
)
7+
8+
func TestParseFlagsASCII(t *testing.T) {
9+
dc := dictConfig{Flag: "ASCII"}
10+
flags := dc.parseFlags("ABC")
11+
if len(flags) != 3 || flags[0] != "A" || flags[1] != "B" || flags[2] != "C" {
12+
t.Errorf("ASCII parseFlags(%q) = %v, want [A B C]", "ABC", flags)
13+
}
14+
}
15+
16+
func TestParseFlagsNum(t *testing.T) {
17+
dc := dictConfig{Flag: "num"}
18+
flags := dc.parseFlags("14308,10482,4720")
19+
if len(flags) != 3 || flags[0] != "14308" || flags[1] != "10482" || flags[2] != "4720" {
20+
t.Errorf("num parseFlags(%q) = %v, want [14308 10482 4720]", "14308,10482,4720", flags)
21+
}
22+
}
23+
24+
func TestParseFlagsLong(t *testing.T) {
25+
dc := dictConfig{Flag: "long"}
26+
flags := dc.parseFlags("AABB")
27+
if len(flags) != 2 || flags[0] != "AA" || flags[1] != "BB" {
28+
t.Errorf("long parseFlags(%q) = %v, want [AA BB]", "AABB", flags)
29+
}
30+
}
31+
32+
func TestParseFlagsUTF8(t *testing.T) {
33+
dc := dictConfig{Flag: "UTF-8"}
34+
flags := dc.parseFlags("AğB")
35+
if len(flags) != 3 || flags[0] != "A" || flags[1] != "ğ" || flags[2] != "B" {
36+
t.Errorf("UTF-8 parseFlags(%q) = %v, want [A ğ B]", "AğB", flags)
37+
}
38+
}
39+
40+
func TestFlagNumAffixParsing(t *testing.T) {
41+
// Minimal FLAG num AFF file
42+
affContent := `SET UTF-8
43+
FLAG num
44+
45+
SFX 100 N 1
46+
SFX 100 0 ler .
47+
48+
SFX 200 N 1
49+
SFX 200 0 in .
50+
`
51+
aff, err := newDictConfig(strings.NewReader(affContent))
52+
if err != nil {
53+
t.Fatalf("newDictConfig error: %v", err)
54+
}
55+
56+
if aff.Flag != "num" {
57+
t.Errorf("Flag = %q, want %q", aff.Flag, "num")
58+
}
59+
60+
// Check that affix 100 exists with "ler" suffix
61+
a100, ok := aff.AffixMap["100"]
62+
if !ok {
63+
t.Fatal("AffixMap missing flag 100")
64+
}
65+
if len(a100.Rules) != 1 || a100.Rules[0].AffixText != "ler" {
66+
t.Errorf("flag 100 rules = %v, want [{ler}]", a100.Rules)
67+
}
68+
69+
// Check that affix 200 exists with "in" suffix
70+
a200, ok := aff.AffixMap["200"]
71+
if !ok {
72+
t.Fatal("AffixMap missing flag 200")
73+
}
74+
if len(a200.Rules) != 1 || a200.Rules[0].AffixText != "in" {
75+
t.Errorf("flag 200 rules = %v, want [{in}]", a200.Rules)
76+
}
77+
}
78+
79+
func TestFlagNumExpand(t *testing.T) {
80+
affContent := `SET UTF-8
81+
FLAG num
82+
83+
SFX 100 N 1
84+
SFX 100 0 ler .
85+
86+
SFX 200 N 1
87+
SFX 200 0 in .
88+
`
89+
aff, err := newDictConfig(strings.NewReader(affContent))
90+
if err != nil {
91+
t.Fatalf("newDictConfig error: %v", err)
92+
}
93+
94+
// "belge/100,200" should expand to: belge, belgeler, belgein
95+
words, err := aff.expand("belge/100,200", nil)
96+
if err != nil {
97+
t.Fatalf("expand error: %v", err)
98+
}
99+
100+
expected := map[string]bool{"belge": true, "belgeler": true, "belgein": true}
101+
for _, w := range words {
102+
if !expected[w] {
103+
t.Errorf("unexpected word %q in expansion", w)
104+
}
105+
delete(expected, w)
106+
}
107+
for w := range expected {
108+
t.Errorf("missing expected word %q", w)
109+
}
110+
}
111+
112+
func TestFlagNumGoSpellReader(t *testing.T) {
113+
affContent := `SET UTF-8
114+
FLAG num
115+
116+
SFX 100 N 1
117+
SFX 100 0 ler .
118+
119+
SFX 200 N 1
120+
SFX 200 0 nin .
121+
`
122+
dicContent := `2
123+
belge/100,200
124+
sistem/100,200
125+
`
126+
127+
gs, err := newGoSpellReader(
128+
strings.NewReader(affContent),
129+
strings.NewReader(dicContent),
130+
)
131+
if err != nil {
132+
t.Fatalf("newGoSpellReader error: %v", err)
133+
}
134+
135+
tests := []struct {
136+
word string
137+
want bool
138+
}{
139+
{"belge", true},
140+
{"belgeler", true},
141+
{"belgenin", true},
142+
{"sistem", true},
143+
{"sistemler", true},
144+
{"sistemnin", true},
145+
{"bilinmeyen", false},
146+
}
147+
148+
for _, tt := range tests {
149+
got := gs.spell(tt.word)
150+
if got != tt.want {
151+
t.Errorf("spell(%q) = %v, want %v", tt.word, got, tt.want)
152+
}
153+
}
154+
}
155+
156+
func TestASCIFlagBackwardCompatibility(t *testing.T) {
157+
// Original ASCII flag format must still work
158+
affContent := `SET UTF-8
159+
160+
SFX A N 1
161+
SFX A 0 s .
162+
163+
SFX B N 1
164+
SFX B 0 ed .
165+
`
166+
dicContent := `1
167+
test/AB
168+
`
169+
170+
gs, err := newGoSpellReader(
171+
strings.NewReader(affContent),
172+
strings.NewReader(dicContent),
173+
)
174+
if err != nil {
175+
t.Fatalf("newGoSpellReader error: %v", err)
176+
}
177+
178+
tests := []struct {
179+
word string
180+
want bool
181+
}{
182+
{"test", true},
183+
{"tests", true},
184+
{"tested", true},
185+
{"testing", false},
186+
}
187+
188+
for _, tt := range tests {
189+
got := gs.spell(tt.word)
190+
if got != tt.want {
191+
t.Errorf("spell(%q) = %v, want %v", tt.word, got, tt.want)
192+
}
193+
}
194+
}

internal/spell/gospell.go

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -220,14 +220,17 @@ func newGoSpellReader(aff, dic io.Reader) (*goSpell, error) {
220220

221221
for _, compoundRule := range affix.CompoundRule {
222222
pattern := "^"
223-
for _, key := range compoundRule {
224-
switch key {
225-
case '(', ')', '+', '?', '*':
226-
pattern += regexp.QuoteMeta(string(key))
227-
default:
228-
groups := affix.compoundMap[key]
229-
pattern = pattern + "(" + strings.Join(groups, "|") + ")"
223+
for _, key := range affix.parseFlags(compoundRule) {
224+
if len(key) == 1 {
225+
r := rune(key[0])
226+
switch r {
227+
case '(', ')', '+', '?', '*':
228+
pattern += regexp.QuoteMeta(key)
229+
continue
230+
}
230231
}
232+
groups := affix.compoundMap[key]
233+
pattern = pattern + "(" + strings.Join(groups, "|") + ")"
231234
}
232235
pattern += "$"
233236

0 commit comments

Comments
 (0)