Skip to content

Commit

Permalink
Merge pull request #35 from wangfenjin/jieba
Browse files Browse the repository at this point in the history
support jieba_query
  • Loading branch information
wangfenjin committed Feb 20, 2021
2 parents c04304f + 5de0ba0 commit 988de24
Show file tree
Hide file tree
Showing 15 changed files with 306 additions and 53 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Expand Up @@ -6,4 +6,5 @@ libsimple.*
build/
*.gch
bin/
output/
output/
output-no-jieba/
8 changes: 7 additions & 1 deletion CMakeLists.txt
Expand Up @@ -27,7 +27,6 @@ cmrc_add_resource_library(PINYIN_TEXT NAMESPACE pinyin_text contrib/pinyin.txt)
# https://github.com/vector-of-bool/cmrc/issues/17#issuecomment-659501280
set_property(TARGET PINYIN_TEXT PROPERTY POSITION_INDEPENDENT_CODE ON)


# Code Coverage Configuration
if(NOT TARGET coverage_config)
add_library(coverage_config INTERFACE)
Expand All @@ -49,7 +48,14 @@ if(CODE_COVERAGE)
endif(CODE_COVERAGE)
# endif(CODE_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")

# https://stackoverflow.com/a/15212881/1203241
# Build switch for cppjieba support (ON by default). When enabled, the
# preprocessor symbol USE_JIEBA=1 is added at directory scope, so the
# subdirectories added after this point compile with it defined.
option(SIMPLE_WITH_JIEBA "Option to build with cppjieba" ON)
if(SIMPLE_WITH_JIEBA)
  add_definitions(-DUSE_JIEBA=1)
endif()

add_subdirectory(src)

add_subdirectory(examples/cpp)
enable_testing()
add_subdirectory(test)
Expand Down
3 changes: 3 additions & 0 deletions README.md
Expand Up @@ -8,6 +8,8 @@ simple 是一个支持中文和拼音的 [sqlite3 fts5](https://www.sqlite.org/f

实现相关介绍:https://www.wangfenjin.com/posts/simple-tokenizer/

在此基础上,我们还支持通过 [cppjieba](https://github.com/yanyiwu/cppjieba) 实现更精准的词组匹配。

## 用法

首先需要确认你用到的 sqlite 版本支持 fts5 拓展,确认方法是:
Expand All @@ -23,6 +25,7 @@ select fts5(?1);
3. simple_highlight() 实现连续高亮 match 的词汇,与 sqlite 自带的 highlight 类似,但是 simple_highlight 实现了连续 match 的词汇分到同一组的逻辑,理论上用户更需要这样
4. simple_highlight_pos() 实现返回 match 的词汇位置,用户可以自行决定怎么使用
5. simple_snippet() 实现截取 match 片段的功能,与 sqlite 自带的 snippet 功能类似,同样是增强连续 match 的词汇分到同一组的逻辑
6. jieba_query() 实现jieba分词的效果,在索引不变的情况下,可以实现更精准的匹配。

## 开发

Expand Down
3 changes: 2 additions & 1 deletion build-and-run
Expand Up @@ -51,6 +51,7 @@ simple.clean() {
simple.build() {
hl.subtle "build..."
run "cd build/run"
find . -name "*.gcda" -print0 | xargs -0 rm
run "cmake -DCODE_COVERAGE=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output ../.."
run.set-next show-output-on
run "make -j 12"
Expand All @@ -73,7 +74,7 @@ simple.example() {
}
hl.subtle "run example..."
run "cd output/bin/"
run "./sqlite3 < ${ProjectRoot}/example.sql"
run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3"
run "./simple_cpp_example"
run "cd ${ProjectRoot}"

Expand Down
91 changes: 91 additions & 0 deletions build-and-run-no-jieba
@@ -0,0 +1,91 @@
#!/usr/bin/env bash
#
# © 2018-2019 Konstantin Gredeskoul, All Rights Reserved.
# MIT License
#
# WARNING: This BASH script is completely optional. You don't need it to build this project.
#
# If you choose to run this script to build the project without cppjieba, run:
#
# $ ./build-and-run-no-jieba
#
# It will build the project, run the tests and then run the example.
#

# Abort early when git is not available. `command -v` is the portable,
# shell-builtin lookup; `which` is an external tool that may be missing.
if ! command -v git > /dev/null 2>&1; then
  echo "You need git installed. Please run 'xcode-select --install' first."
  exit 1
fi

# Install Bashmatic on first use, then load it into this shell.
# NOTE(review): the helpers used below (run, hl.subtle, h1, error, ...) are
# presumably provided by Bashmatic's init.sh - confirm.
export BashMatic="${HOME}/.bashmatic"
if [[ ! -f "${BashMatic}/init.sh" ]]; then
  bash -c "$(curl -fsSL https://bashmatic.re1.re); bashmatic-install"
fi
source "${BashMatic}/init.sh"

# Assign and export separately so a failing `pwd` is not masked by `export`.
ProjectRoot="$(pwd)"
export ProjectRoot
# Every step of this script works in build/run-no-jieba, so BuildDir points
# there as well (build/run belongs to the regular jieba-enabled build).
export BuildDir="${ProjectRoot}/build/run-no-jieba"
export BashLibRoot="${ProjectRoot}/bin/lib-bash"
export LibBashRepo="https://github.com/kigster/lib-bash"

# Print a banner followed by the GCC, Git and CMake versions in use.
simple.header() {
  h1.purple "Simple Tokenizer no jieba"
  # Turn the multi-line `gcc --version` output into an array, one line per
  # element. The IFS assignment is a prefix of `read`, so it only applies to
  # that single command and the shell's IFS needs no save/restore.
  local -a gcc_info
  IFS="|" read -r -a gcc_info <<< "$(gcc --version 2>&1 | tr '\n' '|')"
  # Only elements 1-4 are shown; element 0 (the first output line) is left out.
  # NOTE(review): confirm skipping the first line is desired on every toolchain.
  h1 "${bldylw}GCC" "${gcc_info[1]}" "${gcc_info[2]}" "${gcc_info[3]}" "${gcc_info[4]}"
  h1 "${bldylw}GIT: ${bldblu}$(git --version)"
  h1 "${bldylw}CMAKE: ${bldblu}$(cmake --version | tr '\n' ' ')"
}

# Create the out-of-source build directory used by the no-jieba build.
simple.setup() {
  local build_dir="build/run-no-jieba"
  hl.subtle "Creating Build Folder..."
  run "mkdir -p ${build_dir}"
}

# Remove the contents of the output folders. Not invoked by main().
# NOTE(review): the install prefix used by simple.build is output-no-jieba,
# while this removes bin-no-jieba/* and all of build/* (which also holds the
# regular build tree) - confirm the intended set of folders.
simple.clean() {
  local targets='bin-no-jieba/* include/* lib/* build/*'
  hl.subtle "Cleaning output folders..."
  # The globs stay unexpanded inside the quoted string handed to `run`.
  run "rm -rf ${targets}"
}

# Configure, build and install the project with cppjieba disabled
# (-DSIMPLE_WITH_JIEBA=OFF) into ${ProjectRoot}/output-no-jieba.
simple.build() {
  hl.subtle "build..."
  run "cd build/run-no-jieba"
  # Delete .gcda files left over from earlier coverage-enabled runs.
  # `-exec ... {} +` runs nothing when there is no match, whereas
  # `find | xargs rm` makes GNU xargs call `rm` without operands and fail.
  find . -name "*.gcda" -exec rm -f {} +
  run "cmake -DCODE_COVERAGE=ON -DSIMPLE_WITH_JIEBA=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output-no-jieba ../.."
  run.set-next show-output-on
  run "make -j 12"
  # Hide the gmock/gtest install lines; `grep -E` replaces the obsolescent egrep.
  run "make install | grep -E -v 'gmock|gtest'"
  run "cd ${ProjectRoot}"
}

# Run the CTest suite verbosely from the no-jieba build directory, then
# return to the project root.
simple.tests() {
  local build_dir="build/run-no-jieba"
  hl.subtle "testing..."
  run.set-all show-output-on
  run "cd ${build_dir}"
  run "ctest . -V"
  run "cd ${ProjectRoot}"
}

# Run the SQL example and the C++ example against the installed no-jieba
# binaries. Exits with status 3 when the sqlite3 binary has not been
# installed to ./output-no-jieba/bin yet.
simple.example() {
  if [[ ! -f ./output-no-jieba/bin/sqlite3 ]]; then
    # The period belongs inside the quotes; outside, it was passed to `error`
    # as a separate second argument.
    error "You don't have the compiled sqlite3 binary yet."
    exit 3
  fi
  hl.subtle "run example..."
  run "cd output-no-jieba/bin/"
  # Only example.sql is fed in here; example-jieba.sql calls jieba_query(),
  # which is compiled in only when USE_JIEBA is defined.
  run "./sqlite3 < ${ProjectRoot}/example.sql"
  run "./simple_cpp_example"
  run "cd ${ProjectRoot}"
}

# Entry point: run every stage of the no-jieba pipeline in order
# (header, setup, build, tests, example). simple.clean is not part of it.
main() {
  local _simple_stage
  for _simple_stage in header setup build tests example; do
    "simple.${_simple_stage}"
  done
}

# Run main unless $_s_ evaluates to a non-zero number.
# NOTE(review): _s_ is presumably a flag set when the script is sourced - confirm.
(( $_s_ )) || main
7 changes: 7 additions & 0 deletions example-jieba.sql
@@ -0,0 +1,7 @@
-- jieba_query() examples. Table t1 must already exist: the build-and-run
-- script pipes example.sql into sqlite3 ahead of this file.
SELECT '使用jieba分词:';

-- will match
SELECT ' ', simple_highlight(t1, 0, '[', ']')
  FROM t1
 WHERE x MATCH simple_query('国中woai');

SELECT ' ', simple_highlight(t1, 0, '[', ']')
  FROM t1
 WHERE x MATCH jieba_query('中国woai');

SELECT ' ', simple_highlight(t1, 0, '[', ']')
  FROM t1
 WHERE x MATCH jieba_query('中国woai', 0);

-- will not match, in jieba_query, the order matters
SELECT ' ', simple_highlight(t1, 0, '[', ']')
  FROM t1
 WHERE x MATCH jieba_query('国中woai');
1 change: 0 additions & 1 deletion example.sql
Expand Up @@ -49,7 +49,6 @@ select '搜索 love zg:';
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match simple_query('love zg');
select ' ', simple_highlight_pos(t1, 0) from t1 where x match simple_query('love zg');


select '';
select '';
select '--------------------------------------------------------------------------------';
Expand Down
10 changes: 10 additions & 0 deletions examples/cpp/main.cc
Expand Up @@ -74,6 +74,16 @@ int main() {
"simple_query('@\"._''-&%')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
#ifdef USE_JIEBA
// case 4: jieba, no match
sql = "select simple_highlight(t1, 0, '[', ']') as no_matched_jieba from t1 where x match jieba_query('国中')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
// case 5: jieba, match
sql = "select simple_highlight(t1, 0, '[', ']') as matched_jieba from t1 where x match jieba_query('中国')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
#endif

// Close the connection
sqlite3_close(db);
Expand Down
35 changes: 33 additions & 2 deletions src/CMakeLists.txt
@@ -1,6 +1,20 @@
cmake_minimum_required(VERSION 3.2)
project(simple CXX)

# Fetch cppjieba when jieba support is enabled. The checkout is only
# downloaded: the configure step is empty and the build/install steps just
# print a message. Its location is exposed through `source_dir` so the headers
# and dictionaries can be used further down in this file.
# NOTE(review): no GIT_TAG is given, so the repository's default branch is
# tracked; consider pinning a release tag for reproducible builds.
if(SIMPLE_WITH_JIEBA)
    include(ExternalProject)
    ExternalProject_Add(
        cppjieba
        PREFIX ${CMAKE_BINARY_DIR}/cppjieba
        GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
        CONFIGURE_COMMAND ""
        # ${CMAKE_COMMAND} is the cmake binary running this configure step; a
        # bare `cmake` depends on PATH and may be absent or another version.
        BUILD_COMMAND ${CMAKE_COMMAND} -E echo "Skipping build cppjieba."
        INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install cppjieba."
        LOG_DOWNLOAD ON
    )
    ExternalProject_Get_Property(cppjieba source_dir)
endif()

set(SOURCE_FILES
pinyin.h
simple_highlight.h
Expand All @@ -11,8 +25,25 @@ set(SOURCE_FILES
entry.cc
)

# The shared library that SQLite loads as the `simple` extension.
add_library(simple SHARED ${SOURCE_FILES})

if(SIMPLE_WITH_JIEBA)
    # The cppjieba checkout must exist before any source of `simple` is
    # compiled; without this edge a parallel build can start compiling first.
    add_dependencies(simple cppjieba)

    # PUBLIC: needed to compile `simple` itself and by targets that link it.
    target_include_directories(simple PUBLIC
        ${SQLITE3_HEADERS_DIR}
        ${source_dir}/include
        ${source_dir}/deps)

    # Install the *.utf8 dictionaries from the cppjieba checkout to bin/dict.
    install(DIRECTORY ${source_dir}/dict/ DESTINATION bin/dict FILES_MATCHING PATTERN "*.utf8")

    # for tests only
    add_custom_command(TARGET simple PRE_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy_directory
            ${source_dir}/dict/ $<TARGET_FILE_DIR:simple>/../test/dict/
        VERBATIM)
else()
    target_include_directories(simple PUBLIC ${SQLITE3_HEADERS_DIR})
endif()

target_link_libraries(simple PUBLIC coverage_config PRIVATE PINYIN_TEXT SQLite3)

install(TARGETS simple DESTINATION bin)
23 changes: 23 additions & 0 deletions src/entry.cc
Expand Up @@ -44,6 +44,25 @@ static int fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
return rc;
}

#ifdef USE_JIEBA
// SQL scalar function jieba_query(text [, flags]).
//
// Passes `text` and `flags` to SimpleTokenizer::tokenize_jieba_query and
// returns the resulting string as the SQL result. The result is SQL NULL when
// no argument is given or when the first argument has no text value.
//
// pCtx  - SQLite function context that receives the result.
// nVal  - number of arguments supplied by the SQL caller.
// apVal - argument values; apVal[0] is the text, apVal[1] the optional flags.
static void jieba_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
  if (nVal < 1) {
    sqlite3_result_null(pCtx);
    return;
  }
  const char *text = (const char *)sqlite3_value_text(apVal[0]);
  if (!text) {
    sqlite3_result_null(pCtx);
    return;
  }
  // flags defaults to 1. It is read with sqlite3_value_int() rather than
  // atoi(sqlite3_value_text()): the text pointer is NULL for an SQL NULL
  // argument and atoi(NULL) is undefined behaviour. An SQL NULL flags
  // argument keeps the default.
  int flags = 1;
  if (nVal >= 2 && sqlite3_value_type(apVal[1]) != SQLITE_NULL) {
    flags = sqlite3_value_int(apVal[1]);
  }
  std::string result = simple_tokenizer::SimpleTokenizer::tokenize_jieba_query(text, std::strlen(text), flags);
  // SQLITE_TRANSIENT makes SQLite copy the buffer before `result` is destroyed.
  sqlite3_result_text(pCtx, result.c_str(), -1, SQLITE_TRANSIENT);
}
#endif

static void simple_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
int rc;
if (nVal >= 1) {
Expand All @@ -67,6 +86,10 @@ int sqlite3_simple_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines

rc = sqlite3_create_function(db, "simple_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &simple_query, NULL,
NULL);
#ifdef USE_JIEBA
rc = sqlite3_create_function(db, "jieba_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &jieba_query, NULL,
NULL);
#endif

// fts5_tokenizer tokenizer = {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize };
fts5_tokenizer tokenizer = {fts5_simple_xCreate, fts5_simple_xDelete, fts5_simple_xTokenize};
Expand Down

0 comments on commit 988de24

Please sign in to comment.