/
bleve_write.go
167 lines (148 loc) · 4.49 KB
/
bleve_write.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
package index
import (
"fmt"
"log"
"os"
"path/filepath"
"slices"
"strings"
"time"
"github.com/gosimple/slug"
"github.com/spf13/afero"
"github.com/svera/coreander/v3/internal/metadata"
)
// AddFile adds a single file to the index.
//
// The reader is selected by the lower-cased extension of file. An error is
// returned when no reader is registered for that extension, when metadata
// extraction fails, or when the index rejects the document. Underlying errors
// are wrapped, so callers can inspect them with errors.Is / errors.As.
func (b *BleveIndexer) AddFile(file string) error {
	ext := strings.ToLower(filepath.Ext(file))
	reader, ok := b.reader[ext]
	if !ok {
		return fmt.Errorf("file extension %s not supported", ext)
	}
	meta, err := reader.Metadata(file)
	if err != nil {
		return fmt.Errorf("error extracting metadata from file %s: %w", file, err)
	}
	// No batch is involved here, so there are no in-flight slugs to check against.
	document := b.createDocument(meta, file, nil)
	if err := b.idx.Index(document.ID, document); err != nil {
		return fmt.Errorf("error indexing file %s: %w", file, err)
	}
	return nil
}
// RemoveFile deletes the document associated with file from the index.
//
// The document ID is derived from file by dropping the first occurrence of the
// library path and then one leading path separator. Any error reported by the
// index is returned as is.
func (b *BleveIndexer) RemoveFile(file string) error {
	relative := strings.Replace(file, b.libraryPath, "", 1)
	id := strings.TrimPrefix(relative, string(filepath.Separator))
	return b.idx.Delete(id)
}
// AddLibrary scans <libraryPath> for documents and adds them to the index in batches of <batchSize>.
//
// Files whose extension has no registered reader are ignored. Problems that
// affect a single path (an error reported by the walk, a metadata extraction
// failure, or a failure adding the document to the batch) are logged and the
// scan continues. A failure executing a batch aborts the scan and is returned.
//
// b.indexStartTime and b.indexedDocuments are updated while the scan runs and
// reset to zero once it ends. The languages collected during the scan are
// stored under the internalLanguages internal key as part of the final batch.
//
// It returns the error produced by the walk or, when the walk succeeded, by
// the execution of the final batch.
func (b *BleveIndexer) AddLibrary(batchSize int) error {
	batch := b.idx.NewBatch()
	// The index is not updated until a batch is executed, so slugs handed out
	// within the current batch are tracked here to avoid duplicates.
	batchSlugs := make(map[string]struct{}, batchSize)
	var languages []string
	b.indexStartTime = float64(time.Now().UnixNano())
	e := afero.Walk(b.fs, b.libraryPath, func(fullPath string, f os.FileInfo, err error) error {
		if err != nil {
			// The walk could not access this path (for instance a missing root
			// or an unreadable directory). Report it and carry on with the rest.
			log.Printf("Error accessing %s: %s\n", fullPath, err)
			return nil
		}
		ext := strings.ToLower(filepath.Ext(fullPath))
		reader, ok := b.reader[ext]
		if !ok {
			return nil
		}
		meta, err := reader.Metadata(fullPath)
		if err != nil {
			log.Printf("Error extracting metadata from file %s: %s\n", fullPath, err)
			return nil
		}
		document := b.createDocument(meta, fullPath, batchSlugs)
		batchSlugs[document.Slug] = struct{}{}
		languages = addLanguage(meta.Language, languages)
		if err := batch.Index(document.ID, document); err != nil {
			log.Printf("Error indexing file %s: %s\n", fullPath, err)
			return nil
		}
		b.indexedDocuments++
		if batch.Size() == batchSize {
			if err := b.idx.Batch(batch); err != nil {
				// Stop the walk: resetting the batch now would silently drop
				// every document queued in it.
				return fmt.Errorf("error executing batch: %w", err)
			}
			batch.Reset()
			batchSlugs = make(map[string]struct{}, batchSize)
		}
		return nil
	})
	b.indexStartTime = 0
	b.indexedDocuments = 0
	batch.SetInternal(internalLanguages, []byte(strings.Join(languages, ",")))
	if err := b.idx.Batch(batch); err != nil {
		if e == nil {
			return fmt.Errorf("error executing final batch: %w", err)
		}
		// The walk error is the one returned; do not lose this one either.
		log.Printf("Error executing final batch: %s\n", err)
	}
	return e
}
// addLanguage returns languages extended, without duplicates, with the entry
// that corresponds to lang:
//   - an empty lang adds defaultAnalyzer when it is not already listed;
//   - a lang that has an entry in noStopWordsFilters is added when it is not
//     already listed;
//   - anything else leaves languages unchanged.
func addLanguage(lang string, languages []string) []string {
	if lang == "" && !slices.Contains(languages, defaultAnalyzer) {
		return append(languages, defaultAnalyzer)
	}
	_, supported := noStopWordsFilters[lang]
	if !supported || slices.Contains(languages, lang) {
		return languages
	}
	return append(languages, lang)
}
// createDocument builds the DocumentWrite to be indexed for the file at
// fullPath from its extracted metadata.
//
// The document ID comes from b.ID and its slug from b.Slug, which receives
// batchSlugs (may be nil). SeriesEq, AuthorsEq and SubjectsEq hold the
// slugified series, authors and subjects with every hyphen removed.
func (b *BleveIndexer) createDocument(meta metadata.Metadata, fullPath string, batchSlugs map[string]struct{}) DocumentWrite {
	// normalize slugifies value and strips the hyphens the slug introduces.
	normalize := func(value string) string {
		return strings.ReplaceAll(slug.Make(value), "-", "")
	}

	document := DocumentWrite{
		Document:   Document{Metadata: meta},
		SeriesEq:   normalize(meta.Series),
		AuthorsEq:  make([]string, len(meta.Authors)),
		SubjectsEq: make([]string, len(meta.Subjects)),
	}
	// The ID must be set before asking for the slug, as b.Slug compares it
	// against the ID of already indexed documents.
	document.ID = b.ID(document, fullPath)
	document.Slug = b.Slug(document, batchSlugs)

	for i, author := range meta.Authors {
		document.AuthorsEq[i] = normalize(author)
	}
	for i, subject := range meta.Subjects {
		document.SubjectsEq[i] = normalize(subject)
	}
	return document
}
// Slug returns a slug for document that is not used by any other document.
//
// The candidate starts as makeSlug(document). If it is taken, numeric suffixes
// are tried in order on that same base (base-2, base-3, ...) until a free
// candidate is found. A candidate already assigned in the index to a document
// with the same ID is returned as is, so a document keeps its slug when it is
// indexed again.
//
// As Bleve index is not updated until the batch is executed, we need to store the slugs
// processed in the current batch in memory to also compare the current doc slug against them.
// batchSlugs may be nil when no batch is in progress.
func (b *BleveIndexer) Slug(document DocumentWrite, batchSlugs map[string]struct{}) string {
	baseSlug := makeSlug(document)
	docSlug := baseSlug
	for i := 2; ; i++ {
		// NOTE(review): the lookup error is discarded; this presumably relies on
		// a failed lookup yielding a document with an empty Slug — confirm.
		doc, _ := b.Document(docSlug)
		if doc.Slug == docSlug && doc.ID == document.ID {
			return docSlug
		}
		// Reading from a nil map is safe and reports the key as absent.
		_, existsInBatch := batchSlugs[docSlug]
		if doc.Slug == "" && !existsInBatch {
			return docSlug
		}
		// Always derive the next candidate from the base slug, so suffixes do
		// not pile up (base-2-3-4...).
		docSlug = fmt.Sprintf("%s-%d", baseSlug, i)
	}
}
// ID returns the index identifier for file: its path with the leading library
// path and one leading path separator removed.
//
// Only the leading library path is stripped; a later path segment that happens
// to match it is kept, so distinct files never collapse into the same ID.
// meta is not used.
func (b *BleveIndexer) ID(meta DocumentWrite, file string) string {
	relative := strings.TrimPrefix(file, b.libraryPath)
	return strings.TrimPrefix(relative, string(filepath.Separator))
}
// makeSlug returns the base slug for a document, generated with
// slug.MakeLang for the document language. The slug source is the title,
// preceded by the comma-separated authors and a hyphen when the document has
// at least one author.
func makeSlug(meta DocumentWrite) string {
	if len(meta.Authors) == 0 {
		return slug.MakeLang(meta.Title, meta.Language)
	}
	authors := strings.Join(meta.Authors, ", ")
	return slug.MakeLang(authors+"-"+meta.Title, meta.Language)
}