-
-
Notifications
You must be signed in to change notification settings - Fork 537
/
csv.go
83 lines (70 loc) · 1.68 KB
/
csv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
package documentloaders
import (
"context"
"encoding/csv"
"errors"
"fmt"
"io"
"strings"
"github.com/tmc/langchaingo/schema"
"github.com/tmc/langchaingo/textsplitter"
"golang.org/x/exp/slices"
)
// CSV represents a CSV document loader.
//
// Load parses the data read from r with encoding/csv, treating the first
// record as the header row.
type CSV struct {
// r is the source of the raw CSV data.
r io.Reader
// columns optionally restricts the output to the header names listed here;
// when it is empty, every column is included.
columns []string
}
// Compile-time assertion that CSV satisfies the Loader interface.
var _ Loader = CSV{}
// NewCSV builds a CSV loader that reads its data from r. Any column names
// passed after the reader are stored on the loader and used to filter which
// columns end up in the loaded documents; passing none keeps every column.
func NewCSV(r io.Reader, columns ...string) CSV {
	var loader CSV
	loader.r = r
	loader.columns = columns
	return loader
}
// Load parses the CSV data from the loader's io.Reader and returns one
// document per data row.
//
// The first record is treated as the header. Every following record becomes a
// schema.Document whose PageContent holds one "header: value" line per column,
// joined with newlines, and whose Metadata stores the 1-based data row number
// under the "row" key. When the loader was created with column names, only
// columns whose header matches one of those names are included.
//
// The context argument is not consulted. Any error other than io.EOF reported
// by the CSV reader is returned unchanged together with a nil slice.
func (c CSV) Load(_ context.Context) ([]schema.Document, error) {
	var (
		header []string
		keep   []bool // keep[i] reports whether column i is part of the output
		docs   []schema.Document
		rown   int
	)

	// len of a nil slice is 0, so no separate nil check is needed.
	filter := len(c.columns) > 0

	rd := csv.NewReader(c.r)
	for {
		row, err := rd.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, err
		}

		if len(header) == 0 {
			header = append(header, row...)

			// Decide once per column whether it is selected instead of
			// searching c.columns again for every cell of every row.
			keep = make([]bool, len(header))
			for i, name := range header {
				keep[i] = !filter || slices.Contains(c.columns, name)
			}
			continue
		}

		// With the default FieldsPerRecord setting, csv.Reader rejects records
		// whose field count differs from the first record, so indexing header
		// and keep by i stays in range.
		content := make([]string, 0, len(row))
		for i, value := range row {
			if !keep[i] {
				continue
			}
			content = append(content, fmt.Sprintf("%s: %s", header[i], value))
		}

		rown++
		docs = append(docs, schema.Document{
			PageContent: strings.Join(content, "\n"),
			Metadata:    map[string]any{"row": rown},
		})
	}

	return docs, nil
}
// LoadAndSplit loads the documents via Load and then hands them to
// textsplitter.SplitDocuments together with the given splitter. If loading
// fails, the load error is returned and no splitting is attempted.
func (c CSV) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) {
	loaded, loadErr := c.Load(ctx)
	if loadErr == nil {
		return textsplitter.SplitDocuments(splitter, loaded)
	}
	return nil, loadErr
}