Skip to content

Commit

Permalink
Refactor lenFunc to use utf8.RuneCountInString
Browse files: browse the repository at this point in the history
  • Loading branch information
whyiug committed Feb 19, 2024
1 parent a63d3a9 commit ba73dea
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 7 deletions.
4 changes: 3 additions & 1 deletion textsplitter/options.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package textsplitter

+import "unicode/utf8"
+
// Options is a struct that contains options for a text splitter.
type Options struct {
ChunkSize int
@@ -118,5 +120,5 @@ func WithReferenceLinks(referenceLinks bool) Option {
}

func defaultLenFunc(s string) int {
-return len(s)
+return utf8.RuneCountInString(s)
}
6 changes: 0 additions & 6 deletions textsplitter/recursive_character_test.go
Original file line number Diff line number Diff line change
@@ -2,7 +2,6 @@ package textsplitter

import (
"testing"
-"unicode/utf8"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -18,15 +17,13 @@ func TestRecursiveCharacterSplitter(t *testing.T) {
chunkSize int
separators []string
expectedDocs []schema.Document
-lenFunc func(string) int
}
testCases := []testCase{
{
text: "哈里森\n很高兴遇见你\n欢迎来中国",
chunkOverlap: 0,
chunkSize: 10,
separators: []string{"\n\n", "\n", " "},
-lenFunc: utf8.RuneCountInString,
expectedDocs: []schema.Document{
{PageContent: "哈里森\n很高兴遇见你", Metadata: map[string]any{}},
{PageContent: "欢迎来中国", Metadata: map[string]any{}},
@@ -115,9 +112,6 @@ Bye!
splitter.ChunkOverlap = tc.chunkOverlap
splitter.ChunkSize = tc.chunkSize
splitter.Separators = tc.separators
-if tc.lenFunc != nil {
-splitter.LenFunc = tc.lenFunc
-}

docs, err := CreateDocuments(splitter, []string{tc.text}, nil)
require.NoError(t, err)
Expand Down

0 comments on commit ba73dea

Please sign in to comment.