diff --git a/ChangeLog.md b/ChangeLog.md index a546025..b68ede3 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,9 @@ # ChangeLog +## next version + ++ upgrade cppjieba -> v4.7.0 + ## v0.13.0 + NewJieba(...string) support variable arguments diff --git a/deps/cppjieba/DictTrie.hpp b/deps/cppjieba/DictTrie.hpp index c0e75c4..82add4b 100644 --- a/deps/cppjieba/DictTrie.hpp +++ b/deps/cppjieba/DictTrie.hpp @@ -10,7 +10,7 @@ #include #include "limonp/StringUtil.hpp" #include "limonp/Logging.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" #include "Trie.hpp" namespace cppjieba { @@ -48,12 +48,12 @@ class DictTrie { return true; } - const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { return trie_->Find(begin, end); } - void Find(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { trie_->Find(begin, end, res, max_word_len); @@ -118,14 +118,13 @@ class DictTrie { } } } - XLOG(INFO) << "load userdicts " << filePaths << ", lines: " << lineno; } bool MakeNodeInfo(DictUnit& node_info, const string& word, double weight, const string& tag) { - if (!TransCode::Decode(word, node_info.word)) { + if (!DecodeRunesInString(word, node_info.word)) { XLOG(ERROR) << "Decode " << word << " failed."; return false; } diff --git a/deps/cppjieba/FullSegment.hpp b/deps/cppjieba/FullSegment.hpp index 7847b1b..fc7aab2 100644 --- a/deps/cppjieba/FullSegment.hpp +++ b/deps/cppjieba/FullSegment.hpp @@ -7,7 +7,7 @@ #include "limonp/Logging.hpp" #include "DictTrie.hpp" #include "SegmentBase.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" namespace cppjieba { class FullSegment: public SegmentBase { @@ -27,19 +27,27 @@ class FullSegment: public SegmentBase { } void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, wrs); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); } - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& res) const { + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& res) const { //resut of searching in trie tree LocalVector > tRes; @@ -56,15 +64,19 @@ class FullSegment: public SegmentBase { dictTrie_->Find(begin, end, dags); for (size_t i = 0; i < dags.size(); i++) { for (size_t j = 0; j < dags[i].nexts.size(); j++) { + size_t nextoffset = dags[i].nexts[j].first; + assert(nextoffset < dags.size()); const DictUnit* du = dags[i].nexts[j].second; if (du == NULL) { if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { - res.push_back(Unicode(1, dags[i].rune)); + WordRange wr(begin + i, begin + nextoffset); + res.push_back(wr); } } else { wordLen = du->word.size(); if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { - res.push_back(du->word); + WordRange wr(begin + i, begin + nextoffset); + res.push_back(wr); } } maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx; diff --git a/deps/cppjieba/HMMModel.hpp b/deps/cppjieba/HMMModel.hpp index d83a45a..27e6b66 100644 --- a/deps/cppjieba/HMMModel.hpp +++ b/deps/cppjieba/HMMModel.hpp @@ -105,7 +105,7 @@ struct HMMModel { XLOG(ERROR) << "emitProb illegal."; return false; } - if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) { + if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { XLOG(ERROR) << "TransCode failed."; return false; } diff --git a/deps/cppjieba/HMMSegment.hpp b/deps/cppjieba/HMMSegment.hpp index 7467f0d..d515c04 100644 --- a/deps/cppjieba/HMMSegment.hpp +++ b/deps/cppjieba/HMMSegment.hpp @@ -25,21 +25,29 @@ class HMMSegment: public SegmentBase { void Cut(const string& sentence, vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords); + Cut(range.begin, range.end, wrs); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { - Unicode::const_iterator left = begin; - Unicode::const_iterator right = begin; + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right = begin; while (right != end) { - if (*right < 0x80) { + if (right->rune < 0x80) { if (left != right) { InternalCut(left, right, res); } @@ -55,7 +63,8 @@ class HMMSegment: public SegmentBase { } right ++; } while (false); - res.push_back(Unicode(left, right)); + WordRange wr(left, right - 1); + res.push_back(wr); left = right; } else { right++; @@ -67,15 +76,15 @@ class HMMSegment: public SegmentBase { } private: // sequential letters rule - Unicode::const_iterator SequentialLetterRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { - Rune x = *begin; + RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + Rune x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { begin ++; } else { return begin; } while (begin != end) { - x = *begin; + x = begin->rune; if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { begin ++; } else { @@ -85,15 +94,15 @@ class HMMSegment: public SegmentBase { return begin; } // - Unicode::const_iterator NumbersRule(Unicode::const_iterator begin, Unicode::const_iterator end) const { - Rune x = *begin; + RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + Rune x = begin->rune; if ('0' <= x && x <= '9') { begin ++; } else { return begin; } while (begin != end) { - x = *begin; + x = begin->rune; if ( ('0' <= x && x <= '9') || x == '.') { begin++; } else { @@ -102,23 +111,24 @@ class HMMSegment: public SegmentBase { } return begin; } - void InternalCut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res) const { + void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { vector status; Viterbi(begin, end, status); - Unicode::const_iterator left = begin; - Unicode::const_iterator right; + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right; for (size_t i = 0; i < status.size(); i++) { if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) right = begin + i + 1; - res.push_back(Unicode(left, right)); + WordRange wr(left, right - 1); + res.push_back(wr); left = right; } } } - void Viterbi(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Viterbi(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector& status) const { size_t Y = HMMModel::STATUS_SUM; size_t X = end - begin; @@ -132,7 +142,7 @@ class HMMSegment: public SegmentBase { //start for (size_t y = 0; y < Y; y++) { - weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], *begin, MIN_DOUBLE); + weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE); path[0 + y * X] = -1; } @@ -143,7 +153,7 @@ class HMMSegment: public SegmentBase { now = x + y*X; weight[now] = MIN_DOUBLE; path[now] = HMMModel::E; // warning - emitProb = model_->GetEmitProb(model_->emitProbVec[y], *(begin+x), MIN_DOUBLE); + emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE); for (size_t preY = 0; preY < Y; preY++) { old = x - 1 + preY * X; tmp = weight[old] + model_->transProb[preY][y] + emitProb; diff --git a/deps/cppjieba/Jieba.hpp b/deps/cppjieba/Jieba.hpp index b931fad..12f4358 100644 --- a/deps/cppjieba/Jieba.hpp +++ b/deps/cppjieba/Jieba.hpp @@ -3,7 +3,7 @@ #include "QuerySegment.hpp" #include "PosTagger.hpp" -#include "LevelSegment.hpp" +//#include "LevelSegment.hpp" namespace cppjieba { @@ -17,7 +17,7 @@ class Jieba { mix_seg_(&dict_trie_, &model_), full_seg_(&dict_trie_), query_seg_(&dict_trie_, &model_), - level_seg_(&dict_trie_), + //level_seg_(&dict_trie_), pos_tagger_(&dict_trie_, &model_) { } ~Jieba() { @@ -32,34 +32,32 @@ class Jieba { void Cut(const string& sentence, vector& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } void CutAll(const string& sentence, vector& words) const { full_seg_.Cut(sentence, words); } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { query_seg_.Cut(sentence, words, hmm); } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } void CutHMM(const string& sentence, vector& words) const { hmm_seg_.Cut(sentence, words); } - void CutLevel(const string& sentence, vector& words) const { - level_seg_.Cut(sentence, words); - } - void CutLevel(const string& sentence, vector >& words) const { - level_seg_.Cut(sentence, words); + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); } void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } - void Locate(const vector& words, vector& loc_words) const { - loc_words.resize(words.size()); - size_t begin = 0; - for (size_t i = 0; i < words.size(); i++) { - size_t len = TransCode::Decode(words[i]).size(); - loc_words[i].word = words[i]; - loc_words[i].begin = begin; - loc_words[i].end = loc_words[i].begin + len; - begin = loc_words[i].end; - } + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); } void Tag(const string& sentence, vector >& words) const { @@ -89,7 +87,7 @@ class Jieba { MixSegment mix_seg_; FullSegment full_seg_; QuerySegment query_seg_; - LevelSegment level_seg_; + //LevelSegment level_seg_; PosTagger pos_tagger_; diff --git a/deps/cppjieba/KeywordExtractor.hpp b/deps/cppjieba/KeywordExtractor.hpp index e8cf18c..da67ea2 100644 --- a/deps/cppjieba/KeywordExtractor.hpp +++ b/deps/cppjieba/KeywordExtractor.hpp @@ -11,6 +11,12 @@ using namespace limonp; /*utf8*/ class KeywordExtractor { public: + struct Word { + string word; + vector offsets; + double weight; + }; // struct Word + KeywordExtractor(const string& dictPath, const string& hmmFilePath, const string& idfPath, @@ -38,50 +44,57 @@ class KeywordExtractor { ~KeywordExtractor() { } - bool Extract(const string& sentence, vector& keywords, size_t topN) const { - vector > topWords; - if (!Extract(sentence, topWords, topN)) { - return false; + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { - keywords.push_back(topWords[i].first); + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); } - return true; } - bool Extract(const string& sentence, vector >& keywords, size_t topN) const { + void Extract(const string& sentence, vector& keywords, size_t topN) const { vector words; segment_.Cut(sentence, words); - map wordmap; - for (vector::iterator iter = words.begin(); iter != words.end(); iter++) { - if (IsSingleWord(*iter)) { + map wordmap; + size_t offset = 0; + for (size_t i = 0; i < words.size(); ++i) { + size_t t = offset; + offset += words[i].size(); + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { continue; } - wordmap[*iter] += 1.0; + wordmap[words[i]].offsets.push_back(t); + wordmap[words[i]].weight += 1.0; + } + if (offset != sentence.size()) { + XLOG(ERROR) << "words illegal"; + return; } - for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { - if (stopWords_.end() != stopWords_.find(itr->first)) { - wordmap.erase(itr++); - continue; - } - + keywords.clear(); + keywords.reserve(wordmap.size()); + for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { unordered_map::const_iterator cit = idfMap_.find(itr->first); if (cit != idfMap_.end()) { - itr->second *= cit->second; + itr->second.weight *= cit->second; } else { - itr->second *= idfAverage_; + itr->second.weight *= idfAverage_; } - itr ++; + itr->second.word = itr->first; + keywords.push_back(itr->second); } - - keywords.clear(); - std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = min(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); keywords.resize(topN); - return true; } private: void LoadIdfDict(const string& idfPath) { @@ -123,16 +136,8 @@ class KeywordExtractor { assert(stopWords_.size()); } - bool IsSingleWord(const string& str) const { - Unicode unicode; - TransCode::Decode(str, unicode); - if (unicode.size() == 1) - return true; - return false; - } - - static bool Compare(const pair& lhs, const pair& rhs) { - return lhs.second > rhs.second; + static bool Compare(const Word& lhs, const Word& rhs) { + return lhs.weight > rhs.weight; } MixSegment segment_; @@ -141,6 +146,11 @@ class KeywordExtractor { unordered_set stopWords_; }; // class Jieba + +inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { + return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; +} + } // namespace cppjieba #endif diff --git a/deps/cppjieba/LevelSegment.hpp b/deps/cppjieba/LevelSegment.hpp deleted file mode 100644 index 7c1155d..0000000 --- a/deps/cppjieba/LevelSegment.hpp +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef CPPJIEBA_LEVELSEGMENT_H -#define CPPJIEBA_LEVELSEGMENT_H - -#include "MPSegment.hpp" - -namespace cppjieba { - -class LevelSegment: public SegmentBase{ - public: - LevelSegment(const string& dictPath, - const string& userDictPath = "") - : mpSeg_(dictPath, userDictPath) { - } - LevelSegment(const DictTrie* dictTrie) - : mpSeg_(dictTrie) { - } - ~LevelSegment() { - } - - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector >& res) const { - res.clear(); - vector words; - vector smallerWords; - words.reserve(end - begin); - mpSeg_.Cut(begin, end, words); - smallerWords.reserve(words.size()); - res.reserve(words.size()); - - size_t level = 0; - while (!words.empty()) { - smallerWords.clear(); - for (size_t i = 0; i < words.size(); i++) { - if (words[i].size() >= 3) { - size_t len = words[i].size() - 1; - mpSeg_.Cut(words[i].begin(), words[i].end(), smallerWords, len); // buffer.push_back without clear - } - if (words[i].size() > 1) { - res.push_back(pair(words[i], level)); - } - } - - words.swap(smallerWords); - level++; - } - } - - void Cut(const string& sentence, - vector >& words) const { - words.clear(); - Unicode unicode; - TransCode::Decode(sentence, unicode); - vector > unicodeWords; - Cut(unicode.begin(), unicode.end(), unicodeWords); - words.resize(unicodeWords.size()); - for (size_t i = 0; i < words.size(); i++) { - TransCode::Encode(unicodeWords[i].first, words[i].first); - words[i].second = unicodeWords[i].second; - } - } - - bool Cut(const string& sentence, - vector& res) const { - vector > words; - Cut(sentence, words); - res.clear(); - res.reserve(words.size()); - for (size_t i = 0; i < words.size(); i++) { - res.push_back(words[i].first); - } - return true; - } - - private: - MPSegment mpSeg_; -}; // class LevelSegment - -} // namespace cppjieba - -#endif // CPPJIEBA_LEVELSEGMENT_H diff --git a/deps/cppjieba/MPSegment.hpp b/deps/cppjieba/MPSegment.hpp index a9d2100..07e1223 100644 --- a/deps/cppjieba/MPSegment.hpp +++ b/deps/cppjieba/MPSegment.hpp @@ -28,19 +28,28 @@ class MPSegment: public SegmentBase { void Cut(const string& sentence, vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { + vector tmp; + Cut(sentence, tmp, max_word_len); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, max_word_len); + Cut(range.begin, range.end, wrs, max_word_len); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); } - void Cut(Unicode::const_iterator begin, - Unicode::const_iterator end, - vector& words, + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& words, size_t max_word_len = MAX_WORD_LENGTH) const { vector dags; dictTrie_->Find(begin, @@ -48,7 +57,7 @@ class MPSegment: public SegmentBase { dags, max_word_len); CalcDP(dags); - CutByDag(dags, words); + CutByDag(begin, end, dags, words); } const DictTrie* GetDictTrie() const { @@ -88,16 +97,21 @@ class MPSegment: public SegmentBase { } } } - void CutByDag(const vector& dags, - vector& words) const { + void CutByDag(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + const vector& dags, + vector& words) const { size_t i = 0; while (i < dags.size()) { const DictUnit* p = dags[i].pInfo; if (p) { - words.push_back(p->word); + assert(p->word.size() >= 1); + WordRange wr(begin + i, begin + i + p->word.size() - 1); + words.push_back(wr); i += p->word.size(); } else { //single chinese word - words.push_back(Unicode(1, dags[i].rune)); + WordRange wr(begin + i, begin + i); + words.push_back(wr); i++; } } diff --git a/deps/cppjieba/MixSegment.hpp b/deps/cppjieba/MixSegment.hpp index 6b69c3a..ced8849 100644 --- a/deps/cppjieba/MixSegment.hpp +++ b/deps/cppjieba/MixSegment.hpp @@ -21,54 +21,59 @@ class MixSegment: public SegmentBase { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size() / 2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, wrs, hmm); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { if (!hmm) { mpSeg_.Cut(begin, end, res); return; } - vector words; + vector words; + assert(end >= begin); words.reserve(end - begin); mpSeg_.Cut(begin, end, words); - vector hmmRes; + vector hmmRes; hmmRes.reserve(end - begin); - Unicode piece; - piece.reserve(end - begin); - for (size_t i = 0, j = 0; i < words.size(); i++) { + for (size_t i = 0; i < words.size(); i++) { //if mp Get a word, it's ok, put it into result - if (1 != words[i].size() || (words[i].size() == 1 && mpSeg_.IsUserDictSingleChineseWord(words[i][0]))) { + if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { res.push_back(words[i]); continue; } // if mp Get a single one and it is not in userdict, collect it in sequence - j = i; - while (j < words.size() && 1 == words[j].size() && !mpSeg_.IsUserDictSingleChineseWord(words[j][0])) { - piece.push_back(words[j][0]); + size_t j = i; + while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { j++; } // Cut the sequence with hmm - hmmSeg_.Cut(piece.begin(), piece.end(), hmmRes); - + assert(j - 1 >= i); + // TODO + hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } //clear tmp vars - piece.clear(); hmmRes.clear(); //let i jump over this piece diff --git a/deps/cppjieba/PosTagger.hpp b/deps/cppjieba/PosTagger.hpp index 26941da..863c07b 100644 --- a/deps/cppjieba/PosTagger.hpp +++ b/deps/cppjieba/PosTagger.hpp @@ -30,17 +30,17 @@ class PosTagger { segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; - Unicode unico; + RuneStrArray runes; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { - if (!TransCode::Decode(*itr, unico)) { + if (!DecodeRunesInString(*itr, runes)) { XLOG(ERROR) << "Decode failed."; return false; } - tmp = dict->Find(unico.begin(), unico.end()); + tmp = dict->Find(runes.begin(), runes.end()); if (tmp == NULL || tmp->tag.empty()) { - res.push_back(make_pair(*itr, SpecialRule(unico))); + res.push_back(make_pair(*itr, SpecialRule(runes))); } else { res.push_back(make_pair(*itr, tmp->tag)); } @@ -48,13 +48,13 @@ class PosTagger { return !res.empty(); } private: - const char* SpecialRule(const Unicode& unicode) const { + const char* SpecialRule(const RuneStrArray& unicode) const { size_t m = 0; size_t eng = 0; for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { - if (unicode[i] < 0x80) { + if (unicode[i].rune < 0x80) { eng ++; - if ('0' <= unicode[i] && unicode[i] <= '9') { + if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { m++; } } diff --git a/deps/cppjieba/PreFilter.hpp b/deps/cppjieba/PreFilter.hpp index c4c5661..ecb81c0 100644 --- a/deps/cppjieba/PreFilter.hpp +++ b/deps/cppjieba/PreFilter.hpp @@ -1,32 +1,25 @@ #ifndef CPPJIEBA_PRE_FILTER_H #define CPPJIEBA_PRE_FILTER_H -#include "TransCode.hpp" +#include "Trie.hpp" +#include "limonp/Logging.hpp" namespace cppjieba { -//class PreFilterIterator { -// public: -// PreFilterIterator() { -// } -// ~PreFilterIterator() { -// } -// -// private: -// const unordered_set& specialSymbols_; -//}; // PreFilterIterator - class PreFilter { public: + //TODO use WordRange instead of Range struct Range { - Unicode::const_iterator begin; - Unicode::const_iterator end; + RuneStrArray::const_iterator begin; + RuneStrArray::const_iterator end; }; // struct Range PreFilter(const unordered_set& symbols, const string& sentence) : symbols_(symbols) { - TransCode::Decode(sentence, sentence_); + if (!DecodeRunesInString(sentence, sentence_)) { + XLOG(ERROR) << "decode failed. "; + } cursor_ = sentence_.begin(); } ~PreFilter() { @@ -38,7 +31,7 @@ class PreFilter { Range range; range.begin = cursor_; while (cursor_ != sentence_.end()) { - if (IsIn(symbols_, *cursor_)) { + if (IsIn(symbols_, cursor_->rune)) { if (range.begin == cursor_) { cursor_ ++; } @@ -51,8 +44,8 @@ class PreFilter { return range; } private: - Unicode::const_iterator cursor_; - Unicode sentence_; + RuneStrArray::const_iterator cursor_; + RuneStrArray sentence_; const unordered_set& symbols_; }; // class PreFilter diff --git a/deps/cppjieba/QuerySegment.hpp b/deps/cppjieba/QuerySegment.hpp index d859e5d..6783bd9 100644 --- a/deps/cppjieba/QuerySegment.hpp +++ b/deps/cppjieba/QuerySegment.hpp @@ -9,7 +9,7 @@ #include "SegmentBase.hpp" #include "FullSegment.hpp" #include "MixSegment.hpp" -#include "TransCode.hpp" +#include "Unicode.hpp" #include "DictTrie.hpp" namespace cppjieba { @@ -27,27 +27,34 @@ class QuerySegment: public SegmentBase { ~QuerySegment() { } void Cut(const string& sentence, vector& words, bool hmm = true) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { PreFilter pre_filter(symbols_, sentence); PreFilter::Range range; - vector uwords; - uwords.reserve(sentence.size()); + vector wrs; + wrs.reserve(sentence.size()/2); while (pre_filter.HasNext()) { range = pre_filter.Next(); - Cut(range.begin, range.end, uwords, hmm); + Cut(range.begin, range.end, wrs, hmm); } - TransCode::Encode(uwords, words); + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); } - void Cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector& res, bool hmm) const { + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { //use mix Cut first - vector mixRes; + vector mixRes; mixSeg_.Cut(begin, end, mixRes, hmm); - vector fullRes; - for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { + vector fullRes; + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { // if it's too long, Cut with fullSeg_, put fullRes in res - if (mixResItr->size() > maxWordLen_ && !IsAllAscii(*mixResItr)) { - fullSeg_.Cut(mixResItr->begin(), mixResItr->end(), fullRes); - for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { + if (mixResItr->Length() > maxWordLen_ && !mixResItr->IsAllAscii()) { + fullSeg_.Cut(mixResItr->left, mixResItr->right + 1, fullRes); + for (vector::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } diff --git a/deps/cppjieba/TransCode.hpp b/deps/cppjieba/TransCode.hpp deleted file mode 100644 index 6320beb..0000000 --- a/deps/cppjieba/TransCode.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/************************************ - * file enc : utf-8 - * author : wuyanyi09@gmail.com - ************************************/ -#ifndef CPPJIEBA_TRANSCODE_H -#define CPPJIEBA_TRANSCODE_H - - -#include "limonp/StringUtil.hpp" -#include "limonp/LocalVector.hpp" - -namespace cppjieba { - -using namespace limonp; - -typedef uint32_t Rune; -typedef limonp::LocalVector Unicode; - -namespace TransCode { -inline bool Decode(const string& str, Unicode& res) { -#ifdef CPPJIEBA_GBK - return gbkTrans(str, res); -#else - return Utf8ToUnicode32(str, res); -#endif -} - -inline void Encode(Unicode::const_iterator begin, Unicode::const_iterator end, string& res) { -#ifdef CPPJIEBA_GBK - gbkTrans(begin, end, res); -#else - Unicode32ToUtf8(begin, end, res); -#endif -} - -inline void Encode(const Unicode& uni, string& res) { - Encode(uni.begin(), uni.end(), res); -} - -// compiler is expected to optimized this function to avoid return value copy -inline string Encode(Unicode::const_iterator begin, Unicode::const_iterator end) { - string res; - res.reserve(end - begin); - Encode(begin, end, res); - return res; -} - -inline string Encode(const Unicode& unicode) { - return Encode(unicode.begin(), unicode.end()); -} - -// compiler is expected to optimized this function to avoid return value copy -inline Unicode Decode(const string& str) { - Unicode unicode; - unicode.reserve(str.size()); - Decode(str, unicode); - return unicode; -} - -inline void Encode(const vector& input, vector& output) { - output.resize(input.size()); - for (size_t i = 0; i < output.size(); i++) { - Encode(input[i], output[i]); - } -} - -} // namespace TransCode -} // namespace cppjieba - -#endif diff --git a/deps/cppjieba/Trie.hpp b/deps/cppjieba/Trie.hpp index 6d1350a..fcd5e32 100644 --- a/deps/cppjieba/Trie.hpp +++ b/deps/cppjieba/Trie.hpp @@ -4,7 +4,7 @@ #include #include #include "limonp/StdExtension.hpp" -#include "Trie.hpp" +#include "Unicode.hpp" namespace cppjieba { @@ -16,24 +16,25 @@ struct DictUnit { Unicode word; double weight; string tag; -}; +}; // struct DictUnit // for debugging -inline ostream & operator << (ostream& os, const DictUnit& unit) { - string s; - s << unit.word; - return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); -} +// inline ostream & operator << (ostream& os, const DictUnit& unit) { +// string s; +// s << unit.word; +// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight); +// } struct Dag { - Rune rune; - LocalVector > nexts; + RuneStr runestr; + // [offset, nexts.first] + limonp::LocalVector > nexts; const DictUnit * pInfo; double weight; - size_t nextPos; - Dag():rune(0), pInfo(NULL), weight(0.0), nextPos(0) { + size_t nextPos; // TODO + Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { } -}; +}; // struct Dag typedef Rune TrieKey; @@ -57,18 +58,18 @@ class Trie { DeleteNode(root_); } - const DictUnit* Find(Unicode::const_iterator begin, Unicode::const_iterator end) const { + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { if (begin == end) { return NULL; } const TrieNode* ptNode = root_; TrieNode::NextMap::const_iterator citer; - for (Unicode::const_iterator it = begin; it != end; it++) { + for (RuneStrArray::const_iterator it = begin; it != end; it++) { if (NULL == ptNode->next) { return NULL; } - citer = ptNode->next->find(*it); + citer = ptNode->next->find(it->rune); if (ptNode->next->end() == citer) { return NULL; } @@ -77,8 +78,8 @@ class Trie { return ptNode->ptValue; } - void Find(Unicode::const_iterator begin, - Unicode::const_iterator end, + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, vector&res, size_t max_word_len = MAX_WORD_LENGTH) const { assert(root_ != NULL); @@ -87,10 +88,9 @@ class Trie { const TrieNode *ptNode = NULL; TrieNode::NextMap::const_iterator citer; for (size_t i = 0; i < size_t(end - begin); i++) { - Rune rune = *(begin + i); - res[i].rune = rune; + res[i].runestr = *(begin + i); - if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(rune))) { + if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) { ptNode = citer->second; } else { ptNode = NULL; @@ -105,7 +105,7 @@ class Trie { if (ptNode == NULL || ptNode->next == NULL) { break; } - citer = ptNode->next->find(*(begin + j)); + citer = ptNode->next->find((begin + j)->rune); if (ptNode->next->end() == citer) { break; } diff --git a/deps/cppjieba/Unicode.hpp b/deps/cppjieba/Unicode.hpp new file mode 100644 index 0000000..22a9d83 --- /dev/null +++ b/deps/cppjieba/Unicode.hpp @@ -0,0 +1,215 @@ +#ifndef CPPJIEBA_UNICODE_H +#define CPPJIEBA_UNICODE_H + +#include +#include +#include +#include +#include +#include "limonp/LocalVector.hpp" + +namespace cppjieba { + +using std::string; +using std::vector; + +typedef uint32_t Rune; + +struct Word { + string word; + uint32_t offset; + Word(const string& w, uint32_t o) + : word(w), offset(o) { + } +}; // struct Word + +inline std::ostream& operator << (std::ostream& os, const Word& w) { + return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; +} + +struct RuneStr { + Rune rune; + uint32_t offset; + uint32_t len; + RuneStr(): rune(0), offset(0), len(0) { + } + RuneStr(Rune r, uint32_t o, uint32_t l) + : rune(r), offset(o), len(l) { + } +}; // struct RuneStr + +inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { + return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; +} + +typedef limonp::LocalVector Unicode; +typedef limonp::LocalVector RuneStrArray; + +// [left, right] +struct WordRange { + RuneStrArray::const_iterator left; + RuneStrArray::const_iterator right; + WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) + : left(l), right(r) { + } + size_t Length() const { + return right - left + 1; + } + bool IsAllAscii() const { + for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) { + if (iter->rune >= 0x80) { + return false; + } + } + return true; + } +}; // struct WordRange + +struct RuneStrLite { + uint32_t rune; + uint32_t len; + RuneStrLite(): rune(0), len(0) { + } + RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) { + } +}; // struct RuneStrLite + +inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { + RuneStrLite rp(0, 0); + if (str == NULL || len == 0) { + return rp; + } + if (!(str[0] & 0x80)) { // 0xxxxxxx + // 7bit, total 7bit + rp.rune = (uint8_t)(str[0]) & 0x7f; + rp.len = 1; + } else if ((uint8_t)str[0] <= 0xdf && 1 < len) { + // 110xxxxxx + // 5bit, total 5bit + rp.rune = (uint8_t)(str[0]) & 0x1f; + + // 6bit, total 11bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + rp.len = 2; + } else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx + // 4bit, total 4bit + rp.rune = (uint8_t)(str[0]) & 0x0f; + + // 6bit, total 10bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 16bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + rp.len = 3; + } else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx + // 3bit, total 3bit + rp.rune = (uint8_t)(str[0]) & 0x07; + + // 6bit, total 9bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 15bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + // 6bit, total 21bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[3]) & 0x3f; + + rp.len = 4; + } else { + rp.rune = 0; + rp.len = 0; + } + return rp; +} + +inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { + runes.clear(); + runes.reserve(len / 2); + for (size_t i = 0; i < len;) { + RuneStrLite rp = DecodeRuneInString(s + i, len - i); + if (rp.len == 0) { + runes.clear(); + return false; + } + RuneStr x(rp.rune, i, rp.len); + runes.push_back(x); + i += rp.len; + } + return true; +} + +inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { + return DecodeRunesInString(s.c_str(), s.size(), runes); +} + +inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { + unicode.clear(); + RuneStrArray runes; + if (!DecodeRunesInString(s, len, runes)) { + return false; + } + unicode.reserve(runes.size()); + for (size_t i = 0; i < runes.size(); i++) { + unicode.push_back(runes[i].rune); + } + return true; +} + +inline bool IsSingleWord(const string& str) { + RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); + return rp.len == str.size(); +} + +inline bool DecodeRunesInString(const string& s, Unicode& unicode) { + return DecodeRunesInString(s.c_str(), s.size(), unicode); +} + +inline Unicode DecodeRunesInString(const string& s) { + Unicode result; + DecodeRunesInString(s, result); + return result; +} + + +// [left, right] +inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return Word(s.substr(left->offset, len), left->offset); +} + +inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return s.substr(left->offset, len); +} + +inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { + for (size_t i = 0; i < wrs.size(); i++) { + words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right)); + } +} + +inline vector GetWordsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetWordsFromWordRanges(s, wrs, result); + return result; +} + +inline void GetStringsFromWords(const vector& words, vector& strs) { + strs.resize(words.size()); + for (size_t i = 0; i < words.size(); ++i) { + strs[i] = words[i].word; + } +} + +} // namespace cppjieba + +#endif // CPPJIEBA_UNICODE_H diff --git a/deps/limonp/StdExtension.hpp b/deps/limonp/StdExtension.hpp index 62cfef8..098a268 100644 --- a/deps/limonp/StdExtension.hpp +++ b/deps/limonp/StdExtension.hpp @@ -35,6 +35,19 @@ namespace std { template ostream& operator << (ostream& os, const vector& v) { + if(v.empty()) { + return os << "[]"; + } + os<<"["< +inline ostream& operator << (ostream& os, const vector& v) { if(v.empty()) { return os << "[]"; }