forked from Azure/draft-classic
-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyse.go
76 lines (67 loc) · 1.91 KB
/
analyse.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package linguist
import (
"bytes"
"log"
"math"
"github.com/Azure/draft/pkg/linguist/data"
"github.com/Azure/draft/pkg/linguist/tokenizer"
"github.com/jbrukh/bayesian"
)
// Package-level cache backing getClassifier.
var (
	// classifier is the shared classifier instance. It stays nil until
	// getClassifier has loaded it.
	classifier *bayesian.Classifier

	// classifierInitialized reports whether getClassifier has already
	// populated classifier.
	classifierInitialized bool
)
// getClassifier returns the package's shared bayesian.Classifier, which has
// been trained on programming language samples from
// github.com/github/linguist after running the generator.
//
// The classifier is decoded from the embedded "classifier" asset on the first
// call and cached in package-level state; later calls return the cached
// instance. If the asset cannot be read or decoded, the error is logged and
// the function panics.
//
// See also cmd/generate-classifier
func getClassifier() *bayesian.Classifier {
	// Fast path: a previous call already loaded the classifier.
	if classifierInitialized {
		return classifier
	}

	// NOTE(tso): this could probably go into an init() function instead,
	// but this lazy loading approach works, and it's conceivable that the
	// Analyse() function might not be invoked in an actual runtime anyway.
	raw, err := data.Asset("classifier")
	if err != nil {
		log.Panicln(err)
	}

	classifier, err = bayesian.NewClassifierFromReader(bytes.NewReader(raw))
	if err != nil {
		log.Panicln(err)
	}

	classifierInitialized = true
	return classifier
}
// Analyse returns the name of a programming language, or the empty string if
// one could not be determined.
//
// The contents are tokenized and scored with Naive Bayesian Classification.
// When hints is empty, the classifier's top-scoring class is returned. When
// hints is non-empty, only classes whose name appears in hints are considered
// and the best-scoring one wins; if no class matches any hint, the result is
// the empty string. On equal scores the class that appears later in the
// classifier's class list is chosen.
//
// It is recommended to use LanguageByContents() instead of this function
// directly.
//
// Obtain hints from LanguageHints()
//
// NOTE(tso): May yield inaccurate results
func Analyse(contents []byte, hints []string) (language string) {
	tokens := tokenizer.Tokenize(contents)
	clf := getClassifier()

	// The third result of LogScores is intentionally ignored.
	scores, top, _ := clf.LogScores(tokens)
	if len(hints) == 0 {
		return string(clf.Classes[top])
	}

	// Build a set of the hinted language names for constant-time lookups.
	allowed := make(map[string]struct{}, len(hints))
	for _, h := range hints {
		allowed[h] = struct{}{}
	}

	// Starting from -Inf guarantees that the first hinted class is accepted.
	best := math.Inf(-1)
	for i := range scores {
		name := string(clf.Classes[i])
		if _, hinted := allowed[name]; hinted && scores[i] >= best {
			best = scores[i]
			language = name
		}
	}
	return language
}