-
Notifications
You must be signed in to change notification settings - Fork 209
/
tokenizer.h
134 lines (115 loc) · 3.5 KB
/
tokenizer.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
//
//
// Copyright(C) 2001-2011 Taku Kudo <taku@chasen.org>
// Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
#ifndef MECAB_TOKENIZER_H_
#define MECAB_TOKENIZER_H_
#include "mecab.h"
#include "freelist.h"
#include "dictionary.h"
#include "char_property.h"
#include "nbest_generator.h"
#include "scoped_ptr.h"
namespace MeCab {
// Forward declaration: within this header Param only appears as a
// const-reference parameter of Tokenizer::open(), so the complete type is
// not required here.
class Param;
// NOTE(review): nbest_generator.h is already included above and presumably
// supplies the complete NBestGenerator type that
// Allocator::nbest_generator() instantiates with `new` -- confirm whether
// this forward declaration is still needed.
class NBestGenerator;
// Allocator bundles the pooled storage used while producing nodes (N) and
// paths (P): a node free list (created eagerly), a path free list and a
// character chunk list (both created on first use), an NBestGenerator
// (created on first use), a resizable scratch buffer, and a fixed-size
// array of Dictionary::result_type entries.
//
// Pointers returned by newNode(), newPath(), alloc() and strdup() come from
// the internal free lists; callers must not delete them.
// NOTE(review): they are presumably invalidated/recycled by free() -- confirm
// against FreeList / ChunkFreeList.
template <typename N, typename P>
class Allocator {
 public:
  // Returns a zero-filled node taken from the node free list. Each node gets
  // the next value of an internal counter as its `id`; the counter restarts
  // at 0 after free().
  N *newNode() {
    N *node = node_freelist_->alloc();
    std::memset(node, 0, sizeof(N));
    node->id = id_++;
    return node;
  }

  // Returns a path object from the path free list, creating the list on the
  // first call. Unlike newNode(), the returned object is not zero-filled
  // here.
  P *newPath() {
    if (!path_freelist_.get()) {
      path_freelist_.reset(new FreeList<P>(PATH_FREELIST_SIZE));
    }
    return path_freelist_->alloc();
  }

  // Returns the internal result array; it holds results_size() entries.
  Dictionary::result_type *mutable_results() {
    return results_.get();
  }

  // Requests `size + 1` chars from the character chunk list (created on the
  // first call), i.e. one spare char beyond what the caller asked for.
  char *alloc(size_t size) {
    if (!char_freelist_.get()) {
      char_freelist_.reset(new ChunkFreeList<char>(BUF_SIZE));
    }
    return char_freelist_->alloc(size + 1);
  }

  // Copies at most `size` chars of `str` into pooled storage and always
  // NUL-terminates the copy at index `size`.
  //
  // `str` does not need to be NUL-terminated: only str[0 .. size-1] is read.
  // (Copying `size + 1` chars would read one byte past the range the caller
  // supplied and would leave the result unterminated for such input.) If
  // `str` ends before `size` chars, the remainder is zero-filled by
  // strncpy, exactly as before.
  char *strdup(const char *str, size_t size) {
    char *copy = alloc(size + 1);
    std::strncpy(copy, str, size);
    copy[size] = '\0';
    return copy;
  }

  // Returns the NBestGenerator owned by this allocator, creating it on the
  // first call.
  NBestGenerator *nbest_generator() {
    if (!nbest_generator_.get()) {
      nbest_generator_.reset(new NBestGenerator);
    }
    return nbest_generator_.get();
  }

  // Resizes the scratch buffer to exactly `size` chars and returns a pointer
  // to its first element. The pointer is only valid until the next call,
  // because resizing may reallocate the underlying vector.
  // For size == 0 there is no first element, so a null pointer is returned
  // instead of indexing an empty vector (undefined behaviour).
  char *partial_buffer(size_t size) {
    partial_buffer_.resize(size);
    return partial_buffer_.empty() ? 0 : &partial_buffer_[0];
  }

  // Number of entries in the array returned by mutable_results().
  size_t results_size() const {
    return kResultsSize;
  }

  // Restarts node ids at 0 and calls free() on every free list that has
  // been created so far. The NBestGenerator, the scratch buffer and the
  // result array are left untouched.
  void free() {
    id_ = 0;
    node_freelist_->free();
    if (path_freelist_.get()) {
      path_freelist_->free();
    }
    if (char_freelist_.get()) {
      char_freelist_->free();
    }
  }

  // Only the node free list and the result array are allocated up front;
  // everything else is created lazily by the corresponding accessor.
  Allocator()
      : id_(0),
        node_freelist_(new FreeList<N>(NODE_FREELIST_SIZE)),
        path_freelist_(0),
        char_freelist_(0),
        nbest_generator_(0),
        results_(new Dictionary::result_type[kResultsSize]) {}
  virtual ~Allocator() {}

 private:
  // Fixed capacity of results_.
  static const size_t kResultsSize = 512;
  // Next id handed out by newNode().
  size_t id_;
  scoped_ptr<FreeList<N> > node_freelist_;
  scoped_ptr<FreeList<P> > path_freelist_;          // null until newPath()
  scoped_ptr<ChunkFreeList<char> > char_freelist_;  // null until alloc()
  scoped_ptr<NBestGenerator> nbest_generator_;      // null until first use
  std::vector<char> partial_buffer_;
  scoped_array<Dictionary::result_type> results_;
};
// Tokenizer declares the interface that turns a character range into nodes
// of type N (with paths of type P), drawing all node storage from a
// caller-supplied Allocator<N, P>. Only declarations are visible in this
// header; the member function definitions live elsewhere.
template <typename N, typename P>
class Tokenizer {
 private:
  // NOTE(review): the member roles below are inferred from their names and
  // types only -- confirm against the implementation file.
  std::vector<Dictionary *> dic_;   // presumably the loaded dictionaries
  Dictionary unkdic_;               // presumably the unknown-word dictionary
  scoped_string bos_feature_;
  scoped_string unk_feature_;
  FreeList<DictionaryInfo> dictionary_info_freelist_;
  std::vector<std::pair<const Token *, size_t> > unk_tokens_;
  DictionaryInfo *dictionary_info_;
  CharInfo space_;
  CharProperty property_;
  size_t max_grouping_size_;
  whatlog what_;                    // backing store for what()

 public:
  // Return a node for the given allocator; by name these are presumably the
  // beginning-of-sentence and end-of-sentence nodes.
  N *getBOSNode(Allocator<N, P> *allocator) const;
  N *getEOSNode(Allocator<N, P> *allocator) const;

  // Looks up the range [begin, end) and returns a node. IsPartial is a
  // compile-time switch; NOTE(review): presumably it selects the
  // partial-analysis code path -- confirm in the definition.
  template <bool IsPartial> N *lookup(const char *begin, const char *end,
                                      Allocator<N, P> *allocator,
                                      Lattice *lattice) const;

  // Initializes the tokenizer from `param`; the bool result presumably
  // signals success (see what() for the message on failure -- confirm).
  bool open(const Param &param);
  void close();

  const DictionaryInfo *dictionary_info() const;

  // Returns the text currently held by the internal whatlog.
  const char *what() { return what_.str(); }

  explicit Tokenizer();
  // close() is invoked on destruction, so an explicit close() by the owner
  // is optional.
  virtual ~Tokenizer() { this->close(); }
};
}
#endif // MECAB_TOKENIZER_H_