Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support jieba_query #35

Merged
merged 1 commit into from Feb 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Expand Up @@ -6,4 +6,5 @@ libsimple.*
build/
*.gch
bin/
output/
output/
output-no-jieba/
8 changes: 7 additions & 1 deletion CMakeLists.txt
Expand Up @@ -27,7 +27,6 @@ cmrc_add_resource_library(PINYIN_TEXT NAMESPACE pinyin_text contrib/pinyin.txt)
# https://github.com/vector-of-bool/cmrc/issues/17#issuecomment-659501280
set_property(TARGET PINYIN_TEXT PROPERTY POSITION_INDEPENDENT_CODE ON)


# Code Coverage Configuration
if(NOT TARGET coverage_config)
add_library(coverage_config INTERFACE)
Expand All @@ -49,7 +48,14 @@ if(CODE_COVERAGE)
endif(CODE_COVERAGE)
# endif(CODE_COVERAGE AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")

# Build-time switch for cppjieba support (ON by default).
# When enabled, USE_JIEBA=1 is defined for everything configured after this
# point so the sources can guard jieba-only code with #ifdef USE_JIEBA.
# Background: https://stackoverflow.com/a/15212881/1203241
option(SIMPLE_WITH_JIEBA "Option to build with cppjieba" ON)
if(SIMPLE_WITH_JIEBA)
  add_definitions(-DUSE_JIEBA=1)
endif()

add_subdirectory(src)

add_subdirectory(examples/cpp)
enable_testing()
add_subdirectory(test)
Expand Down
3 changes: 3 additions & 0 deletions README.md
Expand Up @@ -8,6 +8,8 @@ simple 是一个支持中文和拼音的 [sqlite3 fts5](https://www.sqlite.org/f

实现相关介绍:https://www.wangfenjin.com/posts/simple-tokenizer/

在此基础上,我们还支持通过 [cppjieba](https://github.com/yanyiwu/cppjieba) 实现更精准的词组匹配。

## 用法

首先需要确认你用到的 sqlite 版本支持 fts5 拓展,确认方法是:
Expand All @@ -23,6 +25,7 @@ select fts5(?1);
3. simple_highlight() 实现连续高亮 match 的词汇,与 sqlite 自带的 highlight 类似,但是 simple_highlight 实现了连续 match 的词汇分到同一组的逻辑,理论上用户更需要这样
4. simple_highlight_pos() 实现返回 match 的词汇位置,用户可以自行决定怎么使用
5. simple_snippet() 实现截取 match 片段的功能,与 sqlite 自带的 snippet 功能类似,同样是增强连续 match 的词汇分到同一组的逻辑
6. jieba_query() 实现jieba分词的效果,在索引不变的情况下,可以实现更精准的匹配。

## 开发

Expand Down
3 changes: 2 additions & 1 deletion build-and-run
Expand Up @@ -51,6 +51,7 @@ simple.clean() {
simple.build() {
hl.subtle "build..."
run "cd build/run"
find . -name "*.gcda" -print0 | xargs -0 rm
run "cmake -DCODE_COVERAGE=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output ../.."
run.set-next show-output-on
run "make -j 12"
Expand All @@ -73,7 +74,7 @@ simple.example() {
}
hl.subtle "run example..."
run "cd output/bin/"
run "./sqlite3 < ${ProjectRoot}/example.sql"
run "cat ${ProjectRoot}/example.sql ${ProjectRoot}/example-jieba.sql | ./sqlite3"
run "./simple_cpp_example"
run "cd ${ProjectRoot}"

Expand Down
91 changes: 91 additions & 0 deletions build-and-run-no-jieba
@@ -0,0 +1,91 @@
#!/usr/bin/env bash
#
# © 2018-2019 Konstantin Gredeskoul, All Rights Reserved.
# MIT License
#
# WARNING: This BASH script is completely optional. You don't need it to build this project.
#
# If you choose to run this script to build the project, run:
#
# $ ./build-and-run-no-jieba
#
# It will clean, build and run the tests.
#

# Abort early when git is not on PATH. `command -v` is a shell builtin, so the
# check does not depend on an external `which` binary being installed.
command -v git >/dev/null 2>&1 || {
  echo "You need git installed. Please run 'xcode-select --install' first."
  exit 1
}

# Install bashmatic on first use, then load it.
# NOTE(review): presumably bashmatic supplies the run/h1/hl.subtle/error
# helpers used by the functions below -- confirm.
export BashMatic="${HOME}/.bashmatic"
[[ ! -f "${BashMatic}/init.sh" ]] && {
  bash -c "$(curl -fsSL https://bashmatic.re1.re); bashmatic-install"
}
source "${BashMatic}/init.sh"

# Project locations. BuildDir names the directory this script actually builds
# in (build/run-no-jieba), not the build/run directory of ./build-and-run.
export ProjectRoot="$(pwd)"
export BuildDir="${ProjectRoot}/build/run-no-jieba"
export BashLibRoot="${ProjectRoot}/bin/lib-bash"
export LibBashRepo="https://github.com/kigster/lib-bash"

# Print a banner with the GCC, git and CMake versions in use.
simple.header() {
  h1.purple "Simple Tokenizer no jieba"
  # Split the `gcc --version` output into one array element per line.
  # The IFS assignment is a prefix of `read`, so it only applies to that one
  # command; no save/restore of IFS is needed (the previous code tried to do
  # that through a misspelled `IFC` variable and exported it by accident).
  IFS="|" read -r -a gcc_info <<< "$(gcc --version 2>&1 | tr '\n' '|')"
  h1 "${bldylw}GCC" "${gcc_info[1]}" "${gcc_info[2]}" "${gcc_info[3]}" "${gcc_info[4]}"
  h1 "${bldylw}GIT: ${bldblu}$(git --version)"
  h1 "${bldylw}CMAKE: ${bldblu}$(cmake --version | tr '\n' ' ')"
}

# Create the out-of-source build directory used by the no-jieba build.
simple.setup() {
  local build_dir="build/run-no-jieba"
  hl.subtle "Creating Build Folder..."
  run "mkdir -p ${build_dir}"
}

# Remove the contents of the generated folders (not invoked by main).
# NOTE(review): build/* also contains build/run, the build tree of
# ./build-and-run, so this wipes the jieba build as well -- confirm intended.
# NOTE(review): output-no-jieba/ (the install prefix) is not cleaned here.
simple.clean() {
  # Kept as a plain string so the globs are expanded by `run`, not here.
  local targets='bin-no-jieba/* include/* lib/* build/*'
  hl.subtle "Cleaning output folders..."
  run "rm -rf ${targets}"
}

# Configure, build and install the project with cppjieba disabled.
# Installs into ${ProjectRoot}/output-no-jieba.
simple.build() {
  hl.subtle "build..."
  run "cd build/run-no-jieba"
  # Drop coverage counters left over from a previous run. `-delete` is a no-op
  # when nothing matches, whereas piping an empty list into `xargs rm` makes
  # GNU xargs run `rm` with no operands and print an error.
  find . -name "*.gcda" -delete
  run "cmake -DCODE_COVERAGE=ON -DSIMPLE_WITH_JIEBA=OFF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_INSTALL_PREFIX=${ProjectRoot}/output-no-jieba ../.."
  run.set-next show-output-on
  run "make -j 12"
  # Hide the gmock/gtest install lines; `grep -E` replaces the obsolescent egrep.
  run "make install | grep -E -v 'gmock|gtest'"
  run "cd ${ProjectRoot}"
}

# Run the CTest suite verbosely from the no-jieba build tree, then return to
# the project root.
simple.tests() {
  local test_dir="build/run-no-jieba"
  hl.subtle "testing..."
  run.set-all show-output-on
  run "cd ${test_dir}"
  run "ctest . -V"
  run "cd ${ProjectRoot}"
}

# Run the SQL and C++ examples against the installed no-jieba binaries.
# Exits with status 3 when the install step has not produced sqlite3 yet.
simple.example() {
  [[ ! -f ./output-no-jieba/bin/sqlite3 ]] && {
    error "You don't have the compiled sqlite3 binary yet."
    exit 3
  }
  hl.subtle "run example..."
  run "cd output-no-jieba/bin/"
  # Only example.sql is fed in here; the jieba examples need jieba_query().
  run "./sqlite3 < ${ProjectRoot}/example.sql"
  run "./simple_cpp_example"
  run "cd ${ProjectRoot}"
}

# Entry point: run every stage in order (header, setup, build, tests, example).
main() {
  local stage
  for stage in header setup build tests example; do
    "simple.${stage}"
  done
}

# Run main unless $_s_ evaluates to a non-zero number.
(( $_s_ )) || main
7 changes: 7 additions & 0 deletions example-jieba.sql
@@ -0,0 +1,7 @@
-- jieba_query() examples. build-and-run pipes this file into sqlite3 right
-- after example.sql, in the same session.
-- NOTE(review): presumably the FTS table t1 (column x) and its rows are
-- created by example.sql -- confirm.
select '使用jieba分词:';
-- will match
-- (simple_query is listed here with the reversed characters for comparison)
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match simple_query('国中woai');
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('中国woai');
-- the optional second argument of jieba_query is a numeric flag
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('中国woai', 0);
-- will not match, in jieba_query, the order matters
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match jieba_query('国中woai');
1 change: 0 additions & 1 deletion example.sql
Expand Up @@ -49,7 +49,6 @@ select '搜索 love zg:';
select ' ', simple_highlight(t1, 0, '[', ']') from t1 where x match simple_query('love zg');
select ' ', simple_highlight_pos(t1, 0) from t1 where x match simple_query('love zg');


select '';
select '';
select '--------------------------------------------------------------------------------';
Expand Down
10 changes: 10 additions & 0 deletions examples/cpp/main.cc
Expand Up @@ -74,6 +74,16 @@ int main() {
"simple_query('@\"._''-&%')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
#ifdef USE_JIEBA
// case 4: jieba, no match
sql = "select simple_highlight(t1, 0, '[', ']') as no_matched_jieba from t1 where x match jieba_query('国中')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
// case 5: jieba, match
sql = "select simple_highlight(t1, 0, '[', ']') as matched_jieba from t1 where x match jieba_query('中国')";
rc = sqlite3_exec(db, sql.c_str(), callback, 0, &zErrMsg);
handle_rc(db, rc);
#endif

// Close the connection
sqlite3_close(db);
Expand Down
35 changes: 33 additions & 2 deletions src/CMakeLists.txt
@@ -1,6 +1,20 @@
cmake_minimum_required(VERSION 3.2)
project(simple CXX)

# Fetch cppjieba at build time. Only its checked-out sources are used (include
# directories and dictionary files), so the configure step is empty and the
# build/install steps just print a message.
# ${CMAKE_COMMAND} is used instead of a bare `cmake` so the steps run with the
# same CMake that generated the build, even when `cmake` is not on PATH.
# NOTE(review): no GIT_TAG is given, so the checkout follows the repository's
# default branch and is not reproducible -- consider pinning a release tag.
if(SIMPLE_WITH_JIEBA)
  include(ExternalProject)
  ExternalProject_Add(
    cppjieba
    PREFIX ${CMAKE_BINARY_DIR}/cppjieba
    GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git
    CONFIGURE_COMMAND ""
    BUILD_COMMAND ${CMAKE_COMMAND} -E echo "Skipping build cppjieba."
    INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping install cppjieba."
    LOG_DOWNLOAD ON
  )
  # Exposes the checkout location as ${source_dir} for the commands below.
  ExternalProject_Get_Property(cppjieba source_dir)
endif()

set(SOURCE_FILES
pinyin.h
simple_highlight.h
Expand All @@ -11,8 +25,25 @@ set(SOURCE_FILES
entry.cc
)

include_directories(${SQLITE3_HEADERS_DIR})
# SQLite headers are always needed; the cppjieba headers and the installed
# *.utf8 dictionaries are added only for jieba-enabled builds.
include_directories(${SQLITE3_HEADERS_DIR})
if(SIMPLE_WITH_JIEBA)
  include_directories(${source_dir}/include ${source_dir}/deps)
  install(DIRECTORY ${source_dir}/dict/
          DESTINATION bin/dict
          FILES_MATCHING PATTERN "*.utf8")
endif()

add_library(simple SHARED ${SOURCE_FILES})
target_link_libraries(simple PUBLIC coverage_config PRIVATE PINYIN_TEXT SQLite3)

# Usage requirements for consumers of `simple`, plus jieba-specific wiring.
if(SIMPLE_WITH_JIEBA)
  # The cppjieba headers and dictionaries only exist once the external project
  # has been downloaded. Without this ordering a parallel build can start
  # compiling `simple` before the checkout is there.
  add_dependencies(simple cppjieba)
  target_include_directories(simple INTERFACE
    ${SQLITE3_HEADERS_DIR}
    ${source_dir}/include
    ${source_dir}/deps)
  # for tests only: copy the dictionaries into ../test/dict relative to the
  # directory that holds the built library.
  add_custom_command(TARGET simple PRE_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy_directory
            ${source_dir}/dict/ $<TARGET_FILE_DIR:simple>/../test/dict/
    VERBATIM)
else()
  target_include_directories(simple INTERFACE ${SQLITE3_HEADERS_DIR})
endif()

target_link_libraries(simple PUBLIC coverage_config PRIVATE PINYIN_TEXT SQLite3)

install(TARGETS simple DESTINATION bin)
23 changes: 23 additions & 0 deletions src/entry.cc
Expand Up @@ -44,6 +44,25 @@ static int fts5_api_from_db(sqlite3 *db, fts5_api **ppApi) {
return rc;
}

#ifdef USE_JIEBA
// SQL scalar function: jieba_query(text [, flags]).
//
// Passes `text` to SimpleTokenizer::tokenize_jieba_query() and returns the
// resulting string as the SQL result. `flags` is parsed from the optional
// second argument and defaults to 1 when that argument is absent or NULL.
// The result is SQL NULL when no argument is given or `text` is NULL.
static void jieba_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
  if (nVal >= 1) {
    const char *text = (const char *)sqlite3_value_text(apVal[0]);
    if (text) {
      int flags = 1;
      if (nVal >= 2) {
        // sqlite3_value_text() yields NULL for an SQL NULL argument, and
        // atoi(NULL) is undefined behaviour, so only parse a real string.
        const char *flags_text = (const char *)sqlite3_value_text(apVal[1]);
        if (flags_text) {
          flags = atoi(flags_text);
        }
      }
      std::string result = simple_tokenizer::SimpleTokenizer::tokenize_jieba_query(text, std::strlen(text), flags);
      // SQLITE_TRANSIENT makes SQLite copy the buffer before `result` is destroyed.
      sqlite3_result_text(pCtx, result.c_str(), -1, SQLITE_TRANSIENT);
      return;
    }
  }
  sqlite3_result_null(pCtx);
}
#endif

static void simple_query(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal) {
int rc;
if (nVal >= 1) {
Expand All @@ -67,6 +86,10 @@ int sqlite3_simple_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines

rc = sqlite3_create_function(db, "simple_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &simple_query, NULL,
NULL);
#ifdef USE_JIEBA
rc = sqlite3_create_function(db, "jieba_query", -1, SQLITE_UTF8 | SQLITE_DETERMINISTIC, NULL, &jieba_query, NULL,
NULL);
#endif

// fts5_tokenizer tokenizer = {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize };
fts5_tokenizer tokenizer = {fts5_simple_xCreate, fts5_simple_xDelete, fts5_simple_xTokenize};
Expand Down