From 74c70c70cd8205f8451466cba4dcdda57c7c941c Mon Sep 17 00:00:00 2001 From: yanyiwu Date: Sun, 11 Sep 2016 21:42:36 +0800 Subject: [PATCH] create keyword_extract in Jieba --- ChangeLog.md | 4 ++++ include/cppjieba/Jieba.hpp | 18 ++++++++++------- include/cppjieba/KeywordExtractor.hpp | 11 +++-------- test/demo.cpp | 10 ++++------ test/unittest/jieba_test.cpp | 28 +++++++++++++++++++++------ 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index 8e99418f..cd39135a 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,5 +1,9 @@ # CppJieba ChangeLog +## next version + ++ Notice (**API changed**): the Jieba class constructor now takes 5 arguments instead of 3 (adding the IDF and stop-word dictionary paths), and Jieba now contains a KeywordExtractor, exposed as the public member `extractor` + ## v4.8.1 + add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65) diff --git a/include/cppjieba/Jieba.hpp b/include/cppjieba/Jieba.hpp index 9c87ce9b..ef5cb456 100644 --- a/include/cppjieba/Jieba.hpp +++ b/include/cppjieba/Jieba.hpp @@ -2,22 +2,25 @@ #define CPPJIEAB_JIEBA_H #include "QuerySegment.hpp" -//#include "LevelSegment.hpp" +#include "KeywordExtractor.hpp" namespace cppjieba { class Jieba { public: - Jieba(const string& dict_path, const string& model_path, const string& user_dict_path) + Jieba(const string& dict_path, + const string& model_path, + const string& user_dict_path, + const string& idfPath, + const string& stopWordPath) : dict_trie_(dict_path, user_dict_path), model_(model_path), mp_seg_(&dict_trie_), hmm_seg_(&model_), mix_seg_(&dict_trie_, &model_), full_seg_(&dict_trie_), - query_seg_(&dict_trie_, &model_) - //level_seg_(&dict_trie_), - { + query_seg_(&dict_trie_, &model_), + extractor(&dict_trie_, &model_, idfPath, stopWordPath) { } ~Jieba() { } @@ -84,7 +87,7 @@ class Jieba { const HMMModel* GetHMMModel() const { return &model_; } - + private: DictTrie dict_trie_; HMMModel model_; @@ -95,8 +98,9 @@ class Jieba { MixSegment mix_seg_; FullSegment full_seg_; QuerySegment query_seg_; - 
//LevelSegment level_seg_; + public: + KeywordExtractor extractor; }; // class Jieba } // namespace cppjieba diff --git a/include/cppjieba/KeywordExtractor.hpp b/include/cppjieba/KeywordExtractor.hpp index 6d262233..319ce0ab 100644 --- a/include/cppjieba/KeywordExtractor.hpp +++ b/include/cppjieba/KeywordExtractor.hpp @@ -3,10 +3,12 @@ #include #include -#include "Jieba.hpp" +#include "MixSegment.hpp" namespace cppjieba { + using namespace limonp; +using namespace std; /*utf8*/ class KeywordExtractor { @@ -34,13 +36,6 @@ class KeywordExtractor { LoadIdfDict(idfPath); LoadStopWordDict(stopWordPath); } - KeywordExtractor(const Jieba& jieba, - const string& idfPath, - const string& stopWordPath) - : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { - LoadIdfDict(idfPath); - LoadStopWordDict(stopWordPath); - } ~KeywordExtractor() { } diff --git a/test/demo.cpp b/test/demo.cpp index a2fc0aee..97b6eee3 100644 --- a/test/demo.cpp +++ b/test/demo.cpp @@ -1,5 +1,4 @@ #include "cppjieba/Jieba.hpp" -#include "cppjieba/KeywordExtractor.hpp" using namespace std; @@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8"; int main(int argc, char** argv) { cppjieba::Jieba jieba(DICT_PATH, HMM_PATH, - USER_DICT_PATH); + USER_DICT_PATH, + IDF_PATH, + STOP_WORD_PATH); vector words; vector jiebawords; string s; @@ -69,13 +70,10 @@ int main(int argc, char** argv) { cout << s << endl; cout << tagres << endl;; - cppjieba::KeywordExtractor extractor(jieba, - IDF_PATH, - STOP_WORD_PATH); cout << "[demo] Keyword Extraction" << endl; const size_t topk = 5; vector keywordres; - extractor.Extract(s, keywordres, topk); + jieba.extractor.Extract(s, keywordres, topk); cout << s << endl; cout << keywordres << endl; return EXIT_SUCCESS; diff --git a/test/unittest/jieba_test.cpp b/test/unittest/jieba_test.cpp index bffedc9b..06218244 100644 --- a/test/unittest/jieba_test.cpp +++ b/test/unittest/jieba_test.cpp @@ -5,8 +5,10 @@ using namespace cppjieba; TEST(JiebaTest, Test1) { 
cppjieba::Jieba jieba("../dict/jieba.dict.utf8", - "../dict/hmm_model.utf8", - "../dict/user.dict.utf8"); + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8", + "../dict/idf.utf8", + "../dict/stop_words.utf8"); vector words; string result; @@ -40,8 +42,10 @@ TEST(JiebaTest, Test1) { } TEST(JiebaTest, WordTest) { cppjieba::Jieba jieba("../dict/jieba.dict.utf8", - "../dict/hmm_model.utf8", - "../dict/user.dict.utf8"); + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8", + "../dict/idf.utf8", + "../dict/stop_words.utf8"); vector words; string result; @@ -80,8 +84,10 @@ TEST(JiebaTest, WordTest) { TEST(JiebaTest, InsertUserWord) { cppjieba::Jieba jieba("../dict/jieba.dict.utf8", - "../dict/hmm_model.utf8", - "../dict/user.dict.utf8"); + "../dict/hmm_model.utf8", + "../dict/user.dict.utf8", + "../dict/idf.utf8", + "../dict/stop_words.utf8"); vector words; string result; @@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) { jieba.Cut("同一个世界,同一个梦想", words); result = Join(words.begin(), words.end(), "/"); ASSERT_EQ(result, "同一个世界,同一个梦想"); + + { + string s("一部iPhone6"); + string res; + vector wordweights; + size_t topN = 5; + jieba.extractor.Extract(s, wordweights, topN); + res << wordweights; + ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]"); + } }