Skip to content

Commit

Permalink
create keyword_extract in Jieba
Browse files: browse the repository at this point in the history
  • Loading branch information
yanyiwu committed Sep 11, 2016
1 parent 4a755df commit 74c70c7
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 27 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
@@ -1,5 +1,9 @@
# CppJieba ChangeLog

## next version

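+ Notice (**API changed**): the Jieba class constructor now takes 5 arguments instead of 3 (the IDF dictionary path and the stop-word dictionary path were added), and Jieba now contains a KeywordExtractor, exposed as the public member `extractor`.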

## v4.8.1

+ add TextRankExtractor by [@questionfish] in [pull request 65](https://github.com/yanyiwu/cppjieba/pull/65)
Expand Down
18 changes: 11 additions & 7 deletions include/cppjieba/Jieba.hpp
Expand Up @@ -2,22 +2,25 @@
#define CPPJIEAB_JIEBA_H

#include "QuerySegment.hpp"
//#include "LevelSegment.hpp"
#include "KeywordExtractor.hpp"

namespace cppjieba {

class Jieba {
public:
Jieba(const string& dict_path, const string& model_path, const string& user_dict_path)
Jieba(const string& dict_path,
const string& model_path,
const string& user_dict_path,
const string& idfPath,
const string& stopWordPath)
: dict_trie_(dict_path, user_dict_path),
model_(model_path),
mp_seg_(&dict_trie_),
hmm_seg_(&model_),
mix_seg_(&dict_trie_, &model_),
full_seg_(&dict_trie_),
query_seg_(&dict_trie_, &model_)
//level_seg_(&dict_trie_),
{
query_seg_(&dict_trie_, &model_),
extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
}
~Jieba() {
}
Expand Down Expand Up @@ -84,7 +87,7 @@ class Jieba {
const HMMModel* GetHMMModel() const {
return &model_;
}

private:
DictTrie dict_trie_;
HMMModel model_;
Expand All @@ -95,8 +98,9 @@ class Jieba {
MixSegment mix_seg_;
FullSegment full_seg_;
QuerySegment query_seg_;
//LevelSegment level_seg_;

public:
KeywordExtractor extractor;
}; // class Jieba

} // namespace cppjieba
Expand Down
11 changes: 3 additions & 8 deletions include/cppjieba/KeywordExtractor.hpp
Expand Up @@ -3,10 +3,12 @@

#include <cmath>
#include <set>
#include "Jieba.hpp"
#include "MixSegment.hpp"

namespace cppjieba {

using namespace limonp;
using namespace std;

/*utf8*/
class KeywordExtractor {
Expand Down Expand Up @@ -34,13 +36,6 @@ class KeywordExtractor {
LoadIdfDict(idfPath);
LoadStopWordDict(stopWordPath);
}
KeywordExtractor(const Jieba& jieba,
const string& idfPath,
const string& stopWordPath)
: segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
LoadIdfDict(idfPath);
LoadStopWordDict(stopWordPath);
}
~KeywordExtractor() {
}

Expand Down
10 changes: 4 additions & 6 deletions test/demo.cpp
@@ -1,5 +1,4 @@
#include "cppjieba/Jieba.hpp"
#include "cppjieba/KeywordExtractor.hpp"

using namespace std;

Expand All @@ -12,7 +11,9 @@ const char* const STOP_WORD_PATH = "../dict/stop_words.utf8";
int main(int argc, char** argv) {
cppjieba::Jieba jieba(DICT_PATH,
HMM_PATH,
USER_DICT_PATH);
USER_DICT_PATH,
IDF_PATH,
STOP_WORD_PATH);
vector<string> words;
vector<cppjieba::Word> jiebawords;
string s;
Expand Down Expand Up @@ -69,13 +70,10 @@ int main(int argc, char** argv) {
cout << s << endl;
cout << tagres << endl;;

cppjieba::KeywordExtractor extractor(jieba,
IDF_PATH,
STOP_WORD_PATH);
cout << "[demo] Keyword Extraction" << endl;
const size_t topk = 5;
vector<cppjieba::KeywordExtractor::Word> keywordres;
extractor.Extract(s, keywordres, topk);
jieba.extractor.Extract(s, keywordres, topk);
cout << s << endl;
cout << keywordres << endl;
return EXIT_SUCCESS;
Expand Down
28 changes: 22 additions & 6 deletions test/unittest/jieba_test.cpp
Expand Up @@ -5,8 +5,10 @@ using namespace cppjieba;

TEST(JiebaTest, Test1) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8");
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<string> words;
string result;

Expand Down Expand Up @@ -40,8 +42,10 @@ TEST(JiebaTest, Test1) {
}
TEST(JiebaTest, WordTest) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8");
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<Word> words;
string result;

Expand Down Expand Up @@ -80,8 +84,10 @@ TEST(JiebaTest, WordTest) {

TEST(JiebaTest, InsertUserWord) {
cppjieba::Jieba jieba("../dict/jieba.dict.utf8",
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8");
"../dict/hmm_model.utf8",
"../dict/user.dict.utf8",
"../dict/idf.utf8",
"../dict/stop_words.utf8");
vector<string> words;
string result;

Expand Down Expand Up @@ -114,4 +120,14 @@ TEST(JiebaTest, InsertUserWord) {
jieba.Cut("同一个世界,同一个梦想", words);
result = Join(words.begin(), words.end(), "/");
ASSERT_EQ(result, "同一个世界,同一个梦想");

{
string s("一部iPhone6");
string res;
vector<KeywordExtractor::Word> wordweights;
size_t topN = 5;
jieba.extractor.Extract(s, wordweights, topN);
res << wordweights;
ASSERT_EQ(res, "[{\"word\": \"iPhone6\", \"offset\": [6], \"weight\": 11.7392}, {\"word\": \"\xE4\xB8\x80\xE9\x83\xA8\", \"offset\": [0], \"weight\": 6.47592}]");
}
}

0 comments on commit 74c70c7

Please sign in to comment.