Skip to content

Commit

Permalink
upgrade cppjieba -> v4.7.0
Browse files Browse the repository at this point in the history
  • Loading branch information
yanyiwu committed Apr 21, 2016
1 parent 7611255 commit e1a5851
Show file tree
Hide file tree
Showing 17 changed files with 459 additions and 330 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# ChangeLog

## next version

+ upgrade cppjieba -> v4.7.0

## v0.13.0

+ NewJieba(...string) support variable arguments
Expand Down
11 changes: 5 additions & 6 deletions deps/cppjieba/DictTrie.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <limits>
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "TransCode.hpp"
#include "Unicode.hpp"
#include "Trie.hpp"

namespace cppjieba {
Expand Down Expand Up @@ -48,12 +48,12 @@ class DictTrie {
return true;
}

const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const {
const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
return trie_->Find(begin, end);
}

void Find(Unicode::const_iterator begin,
Unicode::const_iterator end,
void Find(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<struct Dag>&res,
size_t max_word_len = MAX_WORD_LENGTH) const {
trie_->Find(begin, end, res, max_word_len);
Expand Down Expand Up @@ -118,14 +118,13 @@ class DictTrie {
}
}
}
XLOG(INFO) << "load userdicts " << filePaths << ", lines: " << lineno;
}

bool MakeNodeInfo(DictUnit& node_info,
const string& word,
double weight,
const string& tag) {
if (!TransCode::Decode(word, node_info.word)) {
if (!DecodeRunesInString(word, node_info.word)) {
XLOG(ERROR) << "Decode " << word << " failed.";
return false;
}
Expand Down
32 changes: 22 additions & 10 deletions deps/cppjieba/FullSegment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "TransCode.hpp"
#include "Unicode.hpp"

namespace cppjieba {
class FullSegment: public SegmentBase {
Expand All @@ -27,19 +27,27 @@ class FullSegment: public SegmentBase {
}
void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence,
vector<Word>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
vector<WordRange> wrs;
wrs.reserve(sentence.size()/2);
while (pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, uwords);
Cut(range.begin, range.end, wrs);
}
TransCode::Encode(uwords, words);
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(Unicode::const_iterator begin,
Unicode::const_iterator end,
vector<Unicode>& res) const {
void Cut(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& res) const {
//result of searching in trie tree
LocalVector<pair<size_t, const DictUnit*> > tRes;

Expand All @@ -56,15 +64,19 @@ class FullSegment: public SegmentBase {
dictTrie_->Find(begin, end, dags);
for (size_t i = 0; i < dags.size(); i++) {
for (size_t j = 0; j < dags[i].nexts.size(); j++) {
size_t nextoffset = dags[i].nexts[j].first;
assert(nextoffset < dags.size());
const DictUnit* du = dags[i].nexts[j].second;
if (du == NULL) {
if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
res.push_back(Unicode(1, dags[i].rune));
WordRange wr(begin + i, begin + nextoffset);
res.push_back(wr);
}
} else {
wordLen = du->word.size();
if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
res.push_back(du->word);
WordRange wr(begin + i, begin + nextoffset);
res.push_back(wr);
}
}
maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
Expand Down
2 changes: 1 addition & 1 deletion deps/cppjieba/HMMModel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ struct HMMModel {
XLOG(ERROR) << "emitProb illegal.";
return false;
}
if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) {
if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
XLOG(ERROR) << "TransCode failed.";
return false;
}
Expand Down
56 changes: 33 additions & 23 deletions deps/cppjieba/HMMSegment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,29 @@ class HMMSegment: public SegmentBase {

void Cut(const string& sentence,
vector<string>& words) const {
vector<Word> tmp;
Cut(sentence, tmp);
GetStringsFromWords(tmp, words);
}
void Cut(const string& sentence,
vector<Word>& words) const {
PreFilter pre_filter(symbols_, sentence);
PreFilter::Range range;
vector<Unicode> uwords;
uwords.reserve(sentence.size());
vector<WordRange> wrs;
wrs.reserve(sentence.size()/2);
while (pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, uwords);
Cut(range.begin, range.end, wrs);
}
TransCode::Encode(uwords, words);
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
Unicode::const_iterator left = begin;
Unicode::const_iterator right = begin;
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right = begin;
while (right != end) {
if (*right < 0x80) {
if (right->rune < 0x80) {
if (left != right) {
InternalCut(left, right, res);
}
Expand All @@ -55,7 +63,8 @@ class HMMSegment: public SegmentBase {
}
right ++;
} while (false);
res.push_back(Unicode(left, right));
WordRange wr(left, right - 1);
res.push_back(wr);
left = right;
} else {
right++;
Expand All @@ -67,15 +76,15 @@ class HMMSegment: public SegmentBase {
}
private:
// sequential letters rule
Unicode::const_iterator SequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Rune x = *begin;
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
Rune x = begin->rune;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
begin ++;
} else {
return begin;
}
while (begin != end) {
x = *begin;
x = begin->rune;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
begin ++;
} else {
Expand All @@ -85,15 +94,15 @@ class HMMSegment: public SegmentBase {
return begin;
}
//
Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const {
Rune x = *begin;
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
Rune x = begin->rune;
if ('0' <= x && x <= '9') {
begin ++;
} else {
return begin;
}
while (begin != end) {
x = *begin;
x = begin->rune;
if ( ('0' <= x && x <= '9') || x == '.') {
begin++;
} else {
Expand All @@ -102,23 +111,24 @@ class HMMSegment: public SegmentBase {
}
return begin;
}
void InternalCut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
vector<size_t> status;
Viterbi(begin, end, status);

Unicode::const_iterator left = begin;
Unicode::const_iterator right;
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right;
for (size_t i = 0; i < status.size(); i++) {
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
right = begin + i + 1;
res.push_back(Unicode(left, right));
WordRange wr(left, right - 1);
res.push_back(wr);
left = right;
}
}
}

void Viterbi(Unicode::const_iterator begin,
Unicode::const_iterator end,
void Viterbi(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<size_t>& status) const {
size_t Y = HMMModel::STATUS_SUM;
size_t X = end - begin;
Expand All @@ -132,7 +142,7 @@ class HMMSegment: public SegmentBase {

//start
for (size_t y = 0; y < Y; y++) {
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE);
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
path[0 + y * X] = -1;
}

Expand All @@ -143,7 +153,7 @@ class HMMSegment: public SegmentBase {
now = x + y*X;
weight[now] = MIN_DOUBLE;
path[now] = HMMModel::E; // warning
emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE);
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
for (size_t preY = 0; preY < Y; preY++) {
old = x - 1 + preY * X;
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
Expand Down
34 changes: 16 additions & 18 deletions deps/cppjieba/Jieba.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#include "QuerySegment.hpp"
#include "PosTagger.hpp"
#include "LevelSegment.hpp"
//#include "LevelSegment.hpp"

namespace cppjieba {

Expand All @@ -17,7 +17,7 @@ class Jieba {
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_),
level_seg_(&dict_trie_),
//level_seg_(&dict_trie_),
pos_tagger_(&dict_trie_, &model_) {
}
~Jieba() {
Expand All @@ -32,34 +32,32 @@ class Jieba {
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
mix_seg_.Cut(sentence, words, hmm);
}
void CutAll(const string& sentence, vector<string>& words) const {
full_seg_.Cut(sentence, words);
}
void CutAll(const string& sentence, vector<Word>& words) const {
full_seg_.Cut(sentence, words);
}
void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
query_seg_.Cut(sentence, words, hmm);
}
void CutHMM(const string& sentence, vector<string>& words) const {
hmm_seg_.Cut(sentence, words);
}
void CutLevel(const string& sentence, vector<string>& words) const {
level_seg_.Cut(sentence, words);
}
void CutLevel(const string& sentence, vector<pair<string, size_t> >& words) const {
level_seg_.Cut(sentence, words);
void CutHMM(const string& sentence, vector<Word>& words) const {
hmm_seg_.Cut(sentence, words);
}
void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
}
void Locate(const vector<string>& words, vector<LocWord>& loc_words) const {
loc_words.resize(words.size());
size_t begin = 0;
for (size_t i = 0; i < words.size(); i++) {
size_t len = TransCode::Decode(words[i]).size();
loc_words[i].word = words[i];
loc_words[i].begin = begin;
loc_words[i].end = loc_words[i].begin + len;
begin = loc_words[i].end;
}
void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
mp_seg_.Cut(sentence, words, max_word_len);
}

void Tag(const string& sentence, vector<pair<string, string> >& words) const {
Expand Down Expand Up @@ -89,7 +87,7 @@ class Jieba {
MixSegment mix_seg_;
FullSegment full_seg_;
QuerySegment query_seg_;
LevelSegment level_seg_;
//LevelSegment level_seg_;

PosTagger pos_tagger_;

Expand Down
Loading

0 comments on commit e1a5851

Please sign in to comment.