
Commit

upgrade cppjieba -> v4.8.1 and support (word,freq,tag) format of user dict
yanyiwu committed Jul 23, 2016
1 parent d8ae0da commit fd7aae6
Showing 13 changed files with 344 additions and 54 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
@@ -1,5 +1,9 @@
# ChangeLog

## next version

+ upgrade cppjieba -> v4.8.1 and support (word,freq,tag) format of user dict

## v0.15.0

+ upgrade cppjieba -> v4.8.0, and make CutForSearch api behaves the same as jieba by Python
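To illustrate the new (word,freq,tag) support described in the ChangeLog entry above: per the LoadUserDict change further down, a user dictionary may now mix three line layouts — a bare word (default user-word weight, UNKNOWN_TAG), word plus tag, or word plus frequency plus tag. The entries below are an illustrative sample, not a file from the commit:

```
云计算
蓝翔 nz
区块链 2000 n
```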
38 changes: 29 additions & 9 deletions deps/cppjieba/DictTrie.hpp
@@ -4,7 +4,9 @@
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cstring>
#include <cstdlib>
#include <stdint.h>
#include <cmath>
#include <limits>
@@ -70,7 +72,8 @@ class DictTrie {
private:
void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
LoadDict(dict_path);
CalculateWeight(static_node_infos_);
freq_sum_ = CalcFreqSum(static_node_infos_);
CalculateWeight(static_node_infos_, freq_sum_);
SetStaticWordWeights(user_word_weight_opt);

if (user_dict_paths.size()) {
@@ -108,10 +111,22 @@ class DictTrie {
buf.clear();
Split(line, buf, " ");
DictUnit node_info;
MakeNodeInfo(node_info,
buf[0],
user_word_default_weight_,
(buf.size() == 2 ? buf[1] : UNKNOWN_TAG));
if(buf.size() == 1){
MakeNodeInfo(node_info,
buf[0],
user_word_default_weight_,
UNKNOWN_TAG);
} else if (buf.size() == 2) {
MakeNodeInfo(node_info,
buf[0],
user_word_default_weight_,
buf[1]);
} else if (buf.size() == 3) {
int freq = atoi(buf[1].c_str());
assert(freq_sum_ > 0.0);
double weight = log(1.0 * freq / freq_sum_);
MakeNodeInfo(node_info, buf[0], weight, buf[2]);
}
static_node_infos_.push_back(node_info);
if (node_info.word.size() == 1) {
user_dict_single_chinese_word_.insert(node_info.word[0]);
@@ -175,16 +190,20 @@ class DictTrie {
}
}

void CalculateWeight(vector<DictUnit>& node_infos) const {
double CalcFreqSum(const vector<DictUnit>& node_infos) const {
double sum = 0.0;
for (size_t i = 0; i < node_infos.size(); i++) {
sum += node_infos[i].weight;
}
assert(sum);
return sum;
}

void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
assert(sum > 0.0);
for (size_t i = 0; i < node_infos.size(); i++) {
DictUnit& node_info = node_infos[i];
assert(node_info.weight);
node_info.weight = log(double(node_info.weight)/double(sum));
assert(node_info.weight > 0.0);
node_info.weight = log(double(node_info.weight)/sum);
}
}

@@ -196,6 +215,7 @@ class DictTrie {
deque<DictUnit> active_node_infos_; // must not be vector
Trie * trie_;

double freq_sum_;
double min_weight_;
double max_weight_;
double median_weight_;
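The hunks above split weight computation into CalcFreqSum plus CalculateWeight, and LoadUserDict now turns an explicit frequency into log(freq / freq_sum_). A minimal worked example of that conversion, with made-up numbers standing in for a real dictionary's totals:

```cpp
// Illustrative only: mirrors the weight rule applied to a three-column
// user-dict line ("word freq tag"). freq_sum below is a hypothetical stand-in
// for freq_sum_, the total frequency computed from the system dictionary.
#include <cmath>
#include <cstdio>

int main() {
  double freq_sum = 60101967.0;               // assumed system-dict frequency total
  double freq = 5000.0;                       // the freq column of a user-dict entry
  double weight = std::log(freq / freq_sum);  // same log-ratio as in LoadUserDict
  std::printf("weight = %.4f\n", weight);     // about -9.39, a log-probability scale
  return 0;
}
```

Storing the sum in the new freq_sum_ member is what lets LoadUserDict reuse it later, so user words with an explicit frequency land on the same log scale as the system dictionary.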
23 changes: 16 additions & 7 deletions deps/cppjieba/Jieba.hpp
@@ -2,7 +2,6 @@
#define CPPJIEAB_JIEBA_H

#include "QuerySegment.hpp"
#include "PosTagger.hpp"
//#include "LevelSegment.hpp"

namespace cppjieba {
@@ -16,9 +15,9 @@ class Jieba {
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
query_seg_(&dict_trie_, &model_)
//level_seg_(&dict_trie_),
pos_tagger_(&dict_trie_, &model_) {
{
}
~Jieba() {
}
@@ -61,12 +60,24 @@ class Jieba {
}

void Tag(const string& sentence, vector<pair<string, string> >& words) const {
pos_tagger_.Tag(sentence, words);
mix_seg_.Tag(sentence, words);
}
string LookupTag(const string &str) const {
return mix_seg_.LookupTag(str);
}
bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
return dict_trie_.InsertUserWord(word, tag);
}

void ResetSeparators(const string& s) {
//TODO
mp_seg_.ResetSeparators(s);
hmm_seg_.ResetSeparators(s);
mix_seg_.ResetSeparators(s);
full_seg_.ResetSeparators(s);
query_seg_.ResetSeparators(s);
}

const DictTrie* GetDictTrie() const {
return &dict_trie_;
}
@@ -85,9 +96,7 @@ class Jieba {
FullSegment full_seg_;
QuerySegment query_seg_;
//LevelSegment level_seg_;

PosTagger pos_tagger_;


}; // class Jieba

} // namespace cppjieba
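A usage sketch for the Jieba-level API after this diff: Tag is now routed through mix_seg_, and LookupTag and ResetSeparators sit alongside InsertUserWord. The include path, dictionary paths, and the three-path constructor are assumptions about this repository's layout rather than something shown in the hunks above:

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include "cppjieba/Jieba.hpp"  // path assumed; vendored under deps/cppjieba in this repo

int main() {
  // Assumed constructor: main dict, HMM model, user dict.
  cppjieba::Jieba jieba("dict/jieba.dict.utf8",
                        "dict/hmm_model.utf8",
                        "dict/user.dict.utf8");

  std::vector<std::pair<std::string, std::string> > tagged;
  jieba.Tag("他来到了网易杭研大厦", tagged);  // word/POS pairs, now via MixSegment
  for (std::size_t i = 0; i < tagged.size(); i++) {
    std::cout << tagged[i].first << "/" << tagged[i].second << " ";
  }
  std::cout << "\n";

  std::cout << jieba.LookupTag("大厦") << "\n";  // tag lookup for a single word

  jieba.ResetSeparators(" \t\n");  // narrow the pre-split separator set on all segments
  return 0;
}
```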
2 changes: 1 addition & 1 deletion deps/cppjieba/KeywordExtractor.hpp
@@ -145,7 +145,7 @@ class KeywordExtractor {
double idfAverage_;

unordered_set<string> stopWords_;
}; // class Jieba
}; // class KeywordExtractor

inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
21 changes: 16 additions & 5 deletions deps/cppjieba/MPSegment.hpp
@@ -6,11 +6,12 @@
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "SegmentTagged.hpp"
#include "PosTagger.hpp"

namespace cppjieba {

class MPSegment: public SegmentBase {
class MPSegment: public SegmentTagged {
public:
MPSegment(const string& dictPath, const string& userDictPath = "")
: dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
@@ -25,9 +26,13 @@ class MPSegment: public SegmentBase {
}
}

void Cut(const string& sentence,
vector<string>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, MAX_WORD_LENGTH);
}

void Cut(const string& sentence,
vector<string>& words,
size_t max_word_len) const {
vector<Word> tmp;
Cut(sentence, tmp, max_word_len);
GetStringsFromWords(tmp, words);
@@ -64,6 +69,10 @@ class MPSegment: public SegmentBase {
return dictTrie_;
}

bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}

bool IsUserDictSingleChineseWord(const Rune& value) const {
return dictTrie_->IsUserDictSingleChineseWord(value);
}
@@ -119,6 +128,8 @@ class MPSegment: public SegmentBase {

const DictTrie* dictTrie_;
bool isNeedDestroy_;
PosTagger tagger_;

}; // class MPSegment

} // namespace cppjieba
18 changes: 16 additions & 2 deletions deps/cppjieba/MixSegment.hpp
@@ -5,9 +5,10 @@
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "PosTagger.hpp"

namespace cppjieba {
class MixSegment: public SegmentBase {
class MixSegment: public SegmentTagged {
public:
MixSegment(const string& mpSegDict, const string& hmmSegDict,
const string& userDict = "")
@@ -20,7 +21,10 @@ class MixSegment: public SegmentBase {
~MixSegment() {
}

void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);
@@ -84,9 +88,19 @@ class MixSegment: public SegmentBase {
const DictTrie* GetDictTrie() const {
return mpSeg_.GetDictTrie();
}

bool Tag(const string& src, vector<pair<string, string> >& res) const {
return tagger_.Tag(src, res, *this);
}

string LookupTag(const string &str) const {
return tagger_.LookupTag(str, *this);
}

private:
MPSegment mpSeg_;
HMMSegment hmmSeg_;
PosTagger tagger_;

}; // class MixSegment

36 changes: 17 additions & 19 deletions deps/cppjieba/PosTagger.hpp
@@ -1,8 +1,8 @@
#ifndef CPPJIEBA_POS_TAGGING_H
#define CPPJIEBA_POS_TAGGING_H

#include "MixSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "SegmentTagged.hpp"
#include "DictTrie.hpp"

namespace cppjieba {
@@ -14,39 +14,38 @@ static const char* const POS_X = "x";

class PosTagger {
public:
PosTagger(const string& dictPath,
const string& hmmFilePath,
const string& userDictPath = "")
: segment_(dictPath, hmmFilePath, userDictPath) {
}
PosTagger(const DictTrie* dictTrie, const HMMModel* model)
: segment_(dictTrie, model) {
PosTagger() {
}
~PosTagger() {
}

bool Tag(const string& src, vector<pair<string, string> >& res) const {
bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
vector<string> CutRes;
segment_.Cut(src, CutRes);
segment.Cut(src, CutRes);

for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
}
return !res.empty();
}

string LookupTag(const string &str, const SegmentTagged& segment) const {
const DictUnit *tmp = NULL;
RuneStrArray runes;
const DictTrie * dict = segment_.GetDictTrie();
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
if (!DecodeRunesInString(*itr, runes)) {
if (!DecodeRunesInString(str, runes)) {
XLOG(ERROR) << "Decode failed.";
return false;
return POS_X;
}
tmp = dict->Find(runes.begin(), runes.end());
if (tmp == NULL || tmp->tag.empty()) {
res.push_back(make_pair(*itr, SpecialRule(runes)));
return SpecialRule(runes);
} else {
res.push_back(make_pair(*itr, tmp->tag));
return tmp->tag;
}
}
return !res.empty();
}

private:
const char* SpecialRule(const RuneStrArray& unicode) const {
size_t m = 0;
Expand All @@ -71,7 +70,6 @@ class PosTagger {
return POS_ENG;
}

MixSegment segment_;
}; // class PosTagger

} // namespace cppjieba
6 changes: 5 additions & 1 deletion deps/cppjieba/QuerySegment.hpp
@@ -24,7 +24,11 @@ class QuerySegment: public SegmentBase {
}
~QuerySegment() {
}
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {

void Cut(const string& sentence, vector<string>& words) const {
Cut(sentence, words, true);
}
void Cut(const string& sentence, vector<string>& words, bool hmm) const {
vector<Word> tmp;
Cut(sentence, tmp, hmm);
GetStringsFromWords(tmp, words);
29 changes: 19 additions & 10 deletions deps/cppjieba/SegmentBase.hpp
@@ -8,27 +8,36 @@

namespace cppjieba {

//const char* const SPECIAL_CHARS = " \t\n,。";
const Rune SPECIAL_SYMBOL[] = {32u, 9u, 10u, 65292u, 12290u};
const char* const SPECIAL_SEPARATORS = " \t\n,。";

using namespace limonp;

class SegmentBase {
public:
SegmentBase() {
LoadSpecialSymbols();
XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
}
~SegmentBase() {
virtual ~SegmentBase() {
}

protected:
void LoadSpecialSymbols() {
size_t size = sizeof(SPECIAL_SYMBOL)/sizeof(*SPECIAL_SYMBOL);
for (size_t i = 0; i < size; i ++) {
symbols_.insert(SPECIAL_SYMBOL[i]);
virtual void Cut(const string& sentence, vector<string>& words) const = 0;

bool ResetSeparators(const string& s) {
symbols_.clear();
RuneStrArray runes;
if (!DecodeRunesInString(s, runes)) {
XLOG(ERROR) << "decode " << s << " failed";
return false;
}
assert(symbols_.size());
for (size_t i = 0; i < runes.size(); i++) {
if (!symbols_.insert(runes[i].rune).second) {
XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
return false;
}
}
return true;
}
protected:
unordered_set<Rune> symbols_;
}; // class SegmentBase
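A small sketch of the reworked separator handling above: ResetSeparators decodes its argument into runes, each rune becomes a pre-split symbol, and a repeated rune makes the call return false. The include path, the dictionary paths, and the assumption that MixSegment exposes this method publicly (via SegmentTagged, whose file is not shown on this page) are illustrative only:

```cpp
#include <cassert>
#include "cppjieba/MixSegment.hpp"  // path assumed; vendored under deps/cppjieba in this repo

int main() {
  // Assumed two-path constructor: main dict plus HMM model.
  cppjieba::MixSegment seg("dict/jieba.dict.utf8", "dict/hmm_model.utf8");
  assert(seg.ResetSeparators(" \t\n"));  // three distinct runes: accepted
  assert(!seg.ResetSeparators("  "));    // second space is a duplicate rune: rejected
  return 0;
}
```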
