Skip to content

Commit

Permalink
Update user dict loading to compute word weight from the given frequency, and add unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
yanyiwu committed Jul 22, 2016
1 parent e45ac01 commit 0984c9e
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 9 deletions.
1 change: 1 addition & 0 deletions dict/user.dict.utf8
@@ -1,3 +1,4 @@
云计算
韩玉鉴赏
蓝翔 nz
区块链 10 nz
27 changes: 19 additions & 8 deletions include/cppjieba/DictTrie.hpp
Expand Up @@ -72,7 +72,8 @@ class DictTrie {
private:
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
LoadDict(dict_path);
CalculateWeight(static_node_infos_);
freq_sum_ = CalcFreqSum(static_node_infos_);
CalculateWeight(static_node_infos_, freq_sum_);
SetStaticWordWeights(user_word_weight_opt);

if (user_dict_paths.size()) {
Expand Down Expand Up @@ -115,11 +116,16 @@ class DictTrie {
buf[0],
user_word_default_weight_,
UNKNOWN_TAG);
} else {
} else if (buf.size() == 2) {
MakeNodeInfo(node_info,
buf[0],
(buf.size() == 2 ? user_word_default_weight_ : atoi(buf[1].c_str())),
(buf.size() == 3 ? buf[2] : buf[1]));
user_word_default_weight_,
buf[1]);
} else if (buf.size() == 3) {
int freq = atoi(buf[1].c_str());
assert(freq_sum_ > 0.0);
double weight = log(1.0 * freq / freq_sum_);
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
}
static_node_infos_.push_back(node_info);
if (node_info.word.size() == 1) {
Expand Down Expand Up @@ -184,16 +190,20 @@ class DictTrie {
}
}

void CalculateWeight(vector<DictUnit>& node_infos) const {
// Returns the sum of the `weight` field over every entry of node_infos.
// Init() stores the result in freq_sum_, which is later used as the
// denominator when a frequency is converted into a log weight.
// NOTE(review): `weight` presumably still holds a raw frequency when this
// is called (i.e. before CalculateWeight runs) -- confirm against LoadDict.
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
double sum = 0.0;
for (size_t i = 0; i < node_infos.size(); i++) {
sum += node_infos[i].weight;
}
// A total of exactly zero would make the later division meaningless.
assert(sum);
return sum;
}

// Converts every entry's frequency into a log-probability, in place:
//   weight = log(weight / sum)
//
// node_infos: entries whose `weight` currently holds a positive frequency;
//             each one is overwritten with its log weight.
// sum:        total frequency used as the denominator; must be positive.
void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
  assert(sum > 0.0);
  for (size_t i = 0; i < node_infos.size(); i++) {
    DictUnit& node_info = node_infos[i];
    // Validate the frequency *before* taking the logarithm: afterwards the
    // value is <= 0, so the check must not be repeated on the result, and
    // the transform must be applied exactly once.
    assert(node_info.weight > 0.0);
    node_info.weight = log(double(node_info.weight) / sum);
  }
}

Expand All @@ -205,6 +215,7 @@ class DictTrie {
deque<DictUnit> active_node_infos_; // must not be vector
Trie * trie_;

double freq_sum_;
double min_weight_;
double max_weight_;
double median_weight_;
Expand Down
1 change: 1 addition & 0 deletions test/testdata/userdict.utf8
Expand Up @@ -5,3 +5,4 @@ B
iPhone6
蓝翔 nz
忽如一夜春风来
区块链 10 nz
16 changes: 15 additions & 1 deletion test/unittest/trie_test.cpp
Expand Up @@ -74,8 +74,22 @@ TEST(DictTrieTest, UserDict) {
cppjieba::RuneStrArray unicode;
ASSERT_TRUE(DecodeRunesInString(word, unicode));
const DictUnit * unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit);
ASSERT_TRUE(unit != NULL);
ASSERT_NEAR(unit->weight, -14.100, 0.001);

word = "蓝翔";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
ASSERT_NEAR(unit->weight, -14.100, 0.001);

word = "区块链";
ASSERT_TRUE(DecodeRunesInString(word, unicode));
unit = trie.Find(unicode.begin(), unicode.end());
ASSERT_TRUE(unit != NULL);
ASSERT_EQ(unit->tag, "nz");
ASSERT_NEAR(unit->weight, -15.6478, 0.001);
}

TEST(DictTrieTest, UserDictWithMaxWeight) {
Expand Down

0 comments on commit 0984c9e

Please sign in to comment.