/
helper.go
81 lines (65 loc) · 1.74 KB
/
helper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package feature
import (
"strings"
)
// seqCombinations returns the distinct candidates derived from sequence that
// satisfy the length bounds checked by matchesLength.
//
// The candidates are, in order: sequence itself, followed by every run of
// consecutive segments obtained by splitting sequence on separator and joining
// the run back together with separator. Runs are enumerated by start segment
// and, for each start, by increasing number of segments. A candidate is emitted
// only the first time it is seen, so the result holds no duplicates and keeps
// first-occurrence order.
//
// minLength and maxLength are handed to matchesLength unchanged. The result is
// nil when no candidate qualifies.
func seqCombinations(sequence, separator string, minLength, maxLength int) []string {
	var combs []string

	// seen mirrors the content of combs so that the duplicate check is a map
	// lookup instead of a scan over everything collected so far.
	seen := make(map[string]struct{})

	add := func(comb string) {
		if _, dup := seen[comb]; dup {
			return
		}
		if !matchesLength(comb, minLength, maxLength) {
			return
		}
		seen[comb] = struct{}{}
		combs = append(combs, comb)
	}

	// The unsplit sequence always goes first when it qualifies.
	add(sequence)

	splitted := strings.Split(sequence, separator)
	for i := range splitted {
		// j is the exclusive end of the run. Starting at i+1 covers the single
		// segment splitted[i] before any longer run starting at i.
		for j := i + 1; j <= len(splitted); j++ {
			add(strings.Join(splitted[i:j], separator))
		}
	}

	return combs
}
// matchesLength reports whether the byte length of seq lies within the
// inclusive range [min, max]. A bound set to -1 is not enforced.
func matchesLength(seq string, min, max int) bool {
	n := len(seq)

	tooShort := min != -1 && n < min
	tooLong := max != -1 && n > max

	return !(tooShort || tooLong)
}
// seqPositions locates the occurrences of seq inside sequence and returns one
// {start, end} pair per occurrence. Both values are percentages of the byte
// length of sequence: start is the offset at which the occurrence begins and
// end is the offset just past it.
//
// Candidate offsets are the ones produced by ranging over sequence, i.e. the
// byte offset of each rune start. Overlapping occurrences are all reported. An
// empty seq matches at every visited offset and yields pairs whose start equals
// end. The result is nil when there is no occurrence, which includes the case
// of an empty sequence.
func seqPositions(sequence, seq string) [][]float64 {
	var positions [][]float64

	total := float64(len(sequence))
	width := len(seq)

	for i := range sequence {
		if !strings.HasPrefix(sequence[i:], seq) {
			continue
		}

		// Positions must be expressed relative to the sequence length. When
		// trying to find out whether a period is always at the end of a
		// sentence, the position has to reflect that regardless of whether the
		// sentence is 10 or 100 characters long: a feature detected at the very
		// end of a sequence needs an end position of exactly 100 percent.
		//
		// Each bound is therefore derived from its own byte offset with a
		// single multiply followed by a single divide. Adding a separately
		// rounded width to a separately rounded start drifts, e.g. to
		// 100.00000000000001 for the last byte of a 6 byte sequence.
		start := float64(i) * 100 / total
		end := float64(i+width) * 100 / total

		positions = append(positions, []float64{start, end})
	}

	return positions
}
// containsString reports whether comb is equal to at least one element of
// combs. A nil or empty combs never contains anything.
func containsString(combs []string, comb string) bool {
	found := false
	// Stop scanning as soon as the first equal element has been seen.
	for i := 0; i < len(combs) && !found; i++ {
		found = combs[i] == comb
	}
	return found
}