Skip to content

Commit

Permalink
feat: support tokenizer encoding options
Browse files Browse the repository at this point in the history
Signed-off-by: Yaohui Wang <wangyaohuicn@gmail.com>
  • Loading branch information
wangyaohui committed Oct 4, 2023
1 parent 228bbf9 commit 9d4886e
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 23 deletions.
34 changes: 21 additions & 13 deletions cmd/ctoc/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"strings"

"github.com/jessevdk/go-flags"
"github.com/pkoukk/tiktoken-go"
"github.com/yaohui-wyh/ctoc"
)

Expand Down Expand Up @@ -41,19 +42,20 @@ var rowLen = 96
// CmdOptions holds the command-line options accepted by this command.
// Each field carries struct tags (long, default, description, choice) written
// in the notation expected by go-flags.
//
// NOTE(review): every field except TokenizerEncoding is listed twice below.
// This looks like the removed and added sides of a re-alignment diff shown
// together rather than real duplicate declarations — confirm against the file.
type CmdOptions struct {
ByFile bool `long:"by-file" description:"report results for every encountered source file"`
SortTag string `long:"sort" default:"code" description:"sort based on a certain column" choice:"name" choice:"files" choice:"blank" choice:"comment" choice:"code" choice:"tokens"`
OutputType string `long:"output-type" default:"default" description:"output type [values: default,cloc-xml,sloccount,json]"`
ExcludeExt string `long:"exclude-ext" description:"exclude file name extensions (separated commas)"`
IncludeLang string `long:"include-lang" description:"include language name (separated commas)"`
Match string `long:"match" description:"include file name (regex)"`
NotMatch string `long:"not-match" description:"exclude file name (regex)"`
MatchDir string `long:"match-d" description:"include dir name (regex)"`
NotMatchDir string `long:"not-match-d" description:"exclude dir name (regex)"`
Debug bool `long:"debug" description:"dump debug log for developer"`
SkipDuplicated bool `long:"skip-duplicated" description:"skip duplicated files"`
ShowLang bool `long:"show-lang" description:"print about all languages and extensions"`
ShowVersion bool `long:"version" description:"print version info"`
ByFile bool `long:"by-file" description:"report results for every encountered source file"`
SortTag string `long:"sort" default:"code" description:"sort based on a certain column" choice:"name" choice:"files" choice:"blank" choice:"comment" choice:"code" choice:"tokens"`
OutputType string `long:"output-type" default:"default" description:"output type [values: default,cloc-xml,sloccount,json]"`
ExcludeExt string `long:"exclude-ext" description:"exclude file name extensions (separated commas)"`
IncludeLang string `long:"include-lang" description:"include language name (separated commas)"`
Match string `long:"match" description:"include file name (regex)"`
NotMatch string `long:"not-match" description:"exclude file name (regex)"`
MatchDir string `long:"match-d" description:"include dir name (regex)"`
NotMatchDir string `long:"not-match-d" description:"exclude dir name (regex)"`
Debug bool `long:"debug" description:"dump debug log for developer"`
SkipDuplicated bool `long:"skip-duplicated" description:"skip duplicated files"`
ShowLang bool `long:"show-lang" description:"print about all languages and extensions"`
ShowVersion bool `long:"version" description:"print version info"`
// TokenizerEncoding is the encoding name given via --encoding. It defaults
// to "cl100k_base" and is restricted to the choices listed in the tag.
TokenizerEncoding string `long:"encoding" default:"cl100k_base" description:"specify tokenizer encoding" choice:"cl100k_base" choice:"p50k_base" choice:"p50k_edit" choice:"r50k_base"`
}

type outputBuilder struct {
Expand Down Expand Up @@ -293,6 +295,12 @@ func main() {

clocOpts.Debug = opts.Debug
clocOpts.SkipDuplicated = opts.SkipDuplicated
tke, err := tiktoken.GetEncoding(opts.TokenizerEncoding)
if err != nil {
fmt.Printf("failed to initialize tokenizer. error: %v\n", err)
return
}
clocOpts.Tokenizer = tke

processor := ctoc.NewProcessor(languages, clocOpts)
result, err := processor.Analyze(paths)
Expand Down
10 changes: 1 addition & 9 deletions file.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,8 @@ import (
"sort"
"strings"
"unicode"

"github.com/pkoukk/tiktoken-go"
)

// tke is the package-level tokenizer handle; it is assigned once in init.
var tke *tiktoken.Tiktoken

// init loads the "cl100k_base" encoding into tke.
// The error returned by tiktoken.GetEncoding is discarded here.
// NOTE(review): if loading fails, tke is presumably left nil and a later
// tke.Encode call would fail — confirm, and consider surfacing the error.
func init() {
tke, _ = tiktoken.GetEncoding("cl100k_base")
}

// ClocFile collects the line count results.
type ClocFile struct {
Code int32 `xml:"code,attr" json:"code"`
Expand Down Expand Up @@ -105,7 +97,7 @@ func AnalyzeReader(filename string, language *Language, file io.Reader, opts *Cl
scannerloop:
for scanner.Scan() {
lineOrg := scanner.Text()
tokens := tke.Encode(lineOrg, nil, nil)
tokens := opts.Tokenizer.Encode(lineOrg, nil, nil)
clocFile.Tokens += int32(len(tokens))
line := strings.TrimSpace(lineOrg)

Expand Down
9 changes: 8 additions & 1 deletion option.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package ctoc

import "regexp"
import (
"regexp"

"github.com/pkoukk/tiktoken-go"
)

// ClocOptions holds the gocloc processor options.
type ClocOptions struct {
Expand All @@ -12,6 +16,7 @@ type ClocOptions struct {
ReMatch *regexp.Regexp
ReNotMatchDir *regexp.Regexp
ReMatchDir *regexp.Regexp
Tokenizer *tiktoken.Tiktoken

// OnCode is triggered for each line of code.
OnCode func(line string)
Expand All @@ -23,10 +28,12 @@ type ClocOptions struct {

// NewClocOptions creates a ClocOptions populated with default values:
// Debug and SkipDuplicated are false, the extension and language filter sets
// are empty but non-nil, and Tokenizer is whatever tiktoken.GetEncoding
// returns for "cl100k_base".
func NewClocOptions() *ClocOptions {
	defaults := new(ClocOptions)

	// Debug and SkipDuplicated keep their zero value (false).
	defaults.ExcludeExts = map[string]struct{}{}
	defaults.IncludeLangs = map[string]struct{}{}

	// The error from GetEncoding is dropped; callers get the returned
	// tokenizer value as-is.
	defaults.Tokenizer, _ = tiktoken.GetEncoding("cl100k_base")

	return defaults
}

0 comments on commit 9d4886e

Please sign in to comment.