/
tokenizer.rb
117 lines (93 loc) · 2.88 KB
/
tokenizer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
module Tokenizers
class TokenRecognizer
  attr_reader :token_type, :regex, :content, :cleaner

  # token_type - symbol naming the kind of token this recognizer identifies.
  # regex      - pattern used to decide whether a piece of text is this token.
  # content    - optional pattern used by callers to extract inner content
  #              (e.g. for composite tokens).
  # block      - optional cleaner applied to matched text via #clean.
  def initialize(token_type, regex, content = nil, &block)
    @token_type = token_type
    @regex = regex
    @content = content
    @cleaner = block
  end

  # Returns true when +text+ matches this recognizer's pattern, false otherwise.
  def recognizes?(text)
    !(regex =~ text).nil?
  end

  # Runs the cleaner block over +val+ when one was supplied at construction;
  # otherwise hands the value back untouched.
  def clean(val)
    cleaner ? cleaner.call(val) : val
  end
end
class Tokenizer
  attr_reader :recognizers, :custom_splitter, :remove_empty_entries

  # Builds a tokenizer whose recognizer list is the concatenation of the
  # recognizers from every tokenizer in +tokenizers+. When a block is given,
  # only recognizers for which the block returns truthy are kept. If every
  # tokenizer has a custom splitter, the union's splitter is the alternation
  # of their sources; otherwise the derived default splitter is used.
  def self.union(*tokenizers)
    recognizers = tokenizers.inject([]) do |ret, tokenizer|
      # select states the filtering intent directly (the original built the
      # list with a conditional inject).
      ret + tokenizer.recognizers.select do |recognizer|
        !block_given? || yield(recognizer)
      end
    end

    splitter = if tokenizers.all?(&:custom_splitter)
      # NOTE: the original passed (nil, 'u') as the options/kcode arguments.
      # The kcode argument has been ignored since Ruby 1.9 and is deprecated
      # on modern Rubies, so it is dropped; the pattern's encoding now comes
      # from its source string, as usual.
      Regexp.new(
        tokenizers.map { |tokenizer| tokenizer.custom_splitter.source }.join("|")
      )
    end

    new(recognizers, splitter)
  end

  # recognizers          - ordered list of TokenRecognizer-like objects; the
  #                        first one that recognizes a fragment wins.
  # splitter             - optional Regexp overriding the derived splitter.
  # remove_empty_entries - when true (default), tokens whose cleaned text is
  #                        empty are discarded.
  def initialize(recognizers, splitter = nil, remove_empty_entries = true)
    @recognizers = recognizers
    @custom_splitter = splitter
    @remove_empty_entries = remove_empty_entries
  end

  # Returns the first recognizer registered for +token_type+, or nil.
  def recognizer_at(token_type)
    recognizers.find { |r| r.token_type == token_type }
  end

  # Inserts +new_recognizers+ immediately before the recognizer whose
  # token_type is +token_type+, and invalidates the memoized splitter.
  # Raises ArgumentError when no such recognizer exists (previously this
  # failed with an obscure TypeError from Array#insert(nil, ...)).
  # Returns nil.
  def insert_before(token_type, *new_recognizers)
    idx = recognizers.find_index { |rec| rec.token_type == token_type }
    unless idx
      raise ArgumentError, "no recognizer with token type #{token_type.inspect}"
    end
    recognizers.insert(idx, *new_recognizers)
    clear_splitter
    nil
  end

  # Splits +text+ with the splitter (whose capture group keeps the matched
  # fragments in the split output) and maps each fragment to a token via the
  # first recognizer that claims it. Fragments no recognizer claims are
  # dropped. A :composite recognizer extracts its content capture and
  # tokenizes it recursively into a CompositeToken.
  def tokenize(text)
    text.split(splitter).inject([]) do |ret, token_text|
      # Renamed block param: the original shadowed the outer local
      # `recognizer`, which triggers a shadowing warning.
      recognizer = recognizers.find { |rec| rec.recognizes?(token_text) }

      if recognizer
        if recognizer.token_type == :composite
          content = token_text.match(recognizer.content)[1]
          ret << CompositeToken.new(tokenize(content))
        else
          cleaned_text = recognizer.clean(token_text)
          if !remove_empty_entries || !cleaned_text.empty?
            ret << Token.new(
              value: cleaned_text,
              type: recognizer.token_type
            )
          end
        end
      end

      ret
    end
  end

  private

  # Memoized splitting pattern: the custom splitter when given, otherwise an
  # alternation of every recognizer's pattern wrapped in a capture group so
  # String#split also yields the matched fragments.
  def splitter
    @splitter ||= (@custom_splitter || begin
      sources = recognizers.map { |rec| rec.regex.source }
      Regexp.new("(" + sources.join("|") + ")")
    end)
  end

  # Drops the memoized splitter so it is rebuilt after recognizers change.
  def clear_splitter
    @splitter = nil
  end
end
end
end