From 6f0b044191ee5c49ca1dcab4ec5422f0b2e97c98 Mon Sep 17 00:00:00 2001
From: tadashi-aikawa
Date: Sun, 13 Feb 2022 20:33:33 +0900
Subject: [PATCH] Create a logic as original for `English only tokenizer` (#86)

---
 .../tokenizers/EnglishOnlyTokenizer.test.ts | 23 +++----
 .../tokenizers/EnglishOnlyTokenizer.ts      | 67 +++++++++++++++++--
 2 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts b/src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts
index 1c06da4..d4862a4 100644
--- a/src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts
+++ b/src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts
@@ -4,11 +4,11 @@ import { TokenizeStrategy } from "../TokenizeStrategy";
 describe.each`
   content | raw | expected
   ${"aa bb cc"} | ${false} | ${["aa", "bb", "cc"]}
-  ${"Edith旧市街"} | ${false} | ${["Edith"]}
-  ${"Edith旧city"} | ${false} | ${["Edith", "city"]}
-  ${"イーディスold city"} | ${false} | ${["old", "city"]}
-  ${"イーディスold市街"} | ${false} | ${["old"]}
-  ${"イーディス旧市街"} | ${false} | ${[]}
+  ${"Edith旧市街"} | ${false} | ${["Edith", "旧市街"]}
+  ${"Edith旧city"} | ${false} | ${["Edith", "旧", "city"]}
+  ${"イーディスold city"} | ${false} | ${["イーディス", "old", "city"]}
+  ${"イーディスold市街"} | ${false} | ${["イーディス", "old", "市街"]}
+  ${"イーディス旧市街"} | ${false} | ${["イーディス旧市街"]}
 `("tokenize", ({ content, raw, expected }) => {
   test(`tokenize(${content}, ${raw}) = ${expected}`, () => {
     expect(
@@ -17,17 +17,16 @@ describe.each`
   });
 });
 
-// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
 describe.each`
   content | expected
   ${"aa bb cc"} | ${[{ word: "aa bb cc", offset: 0 }, { word: "bb cc", offset: 3 }, { word: "cc", offset: 6 }]}
   ${"aa:bb:cc"} | ${[{ word: "aa:bb:cc", offset: 0 }, { word: "bb:cc", offset: 3 }, { word: "cc", offset: 6 }]}
-  ${"## @smi"} | ${[{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }]}
-  ${"Edith旧市街"} | ${[{ word: "Edith旧市街", offset: 0 }, { word: "市街", offset: 6 }, { word: "街", offset: 7 }, { word: "", offset: 8 }]}
-  ${"Edith旧city"} | ${[{ word: "Edith旧city", offset: 0 }, { word: "city", offset: 6 }]}
-  ${"ヒナold city"} | ${[{ word: "ヒナold city", offset: 0 }, { word: "ナold city", offset: 1 }, { word: "old city", offset: 2 }, { word: "city", offset: 6 }]}
-  ${"ヒナold市街"} | ${[{ word: "ヒナold市街", offset: 0 }, { word: "ナold市街", offset: 1 }, { word: "old市街", offset: 2 }, { word: "街", offset: 6 }, { word: "", offset: 7 }]}
-  ${"ヒナ旧市街"} | ${[{ word: "ヒナ旧市街", offset: 0 }, { word: "ナ旧市街", offset: 1 }, { word: "旧市街", offset: 2 }, { word: "市街", offset: 3 }, { word: "街", offset: 4 }, { word: "", offset: 5 }]}
+  ${"## @smi"} | ${[{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }, { word: "smi", offset: 4 }]}
+  ${"Edith旧市街"} | ${[{ word: "Edith旧市街", offset: 0 }, { word: "旧市街", offset: 5 }]}
+  ${"Edith旧city"} | ${[{ word: "Edith旧city", offset: 0 }, { word: "旧city", offset: 5 }, { word: "city", offset: 6 }]}
+  ${"ヒナold city"} | ${[{ word: "ヒナold city", offset: 0 }, { word: "old city", offset: 2 }, { word: "city", offset: 6 }]}
+  ${"ヒナold市街"} | ${[{ word: "ヒナold市街", offset: 0 }, { word: "old市街", offset: 2 }, { word: "市街", offset: 5 }]}
+  ${"ヒナ旧市街"} | ${[{ word: "ヒナ旧市街", offset: 0 }]}
 `("recursiveTokenize", ({ content, expected }) => {
   test(`recursiveTokenize(${content}) = ${expected}`, () => {
     expect(
diff --git a/src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts b/src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts
index eeca693..846d1b5 100644
--- a/src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts
+++ b/src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts
@@ -1,9 +1,68 @@
 import { DefaultTokenizer } from "./DefaultTokenizer";
 
-// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
-export const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_\-#@]/g;
+type PreviousType = "none" | "trim" | "english" | "others";
+const ENGLISH_PATTERN = /[a-zA-Z0-9_\-]/;
 export class EnglishOnlyTokenizer extends DefaultTokenizer {
-  getTrimPattern(): RegExp {
-    return ENGLISH_ONLY_TRIM_CHAR_PATTERN;
+  tokenize(content: string, raw?: boolean): string[] {
+    const tokenized = Array.from(this._tokenize(content));
+    return raw
+      ? tokenized.map((x) => x.word)
+      : tokenized
+          .map((x) => x.word)
+          .filter((x) => !x.match(this.getTrimPattern()));
+  }
+
+  recursiveTokenize(content: string): { word: string; offset: number }[] {
+    const offsets = Array.from(this._tokenize(content))
+      .filter((x) => !x.word.match(this.getTrimPattern()))
+      .map((x) => x.offset);
+    return [
+      ...offsets.map((i) => ({
+        word: content.slice(i),
+        offset: i,
+      })),
+    ];
+  }
+
+  private *_tokenize(
+    content: string
+  ): Iterable<{ word: string; offset: number }> {
+    let startIndex = 0;
+    let previousType: PreviousType = "none";
+
+    for (let i = 0; i < content.length; i++) {
+      if (content[i].match(super.getTrimPattern())) {
+        yield { word: content.slice(startIndex, i), offset: startIndex };
+        previousType = "trim";
+        startIndex = i;
+        continue;
+      }
+
+      if (content[i].match(ENGLISH_PATTERN)) {
+        if (previousType === "english" || previousType === "none") {
+          previousType = "english";
+          continue;
+        }
+
+        yield { word: content.slice(startIndex, i), offset: startIndex };
+        previousType = "english";
+        startIndex = i;
+        continue;
+      }
+
+      if (previousType === "others" || previousType === "none") {
+        previousType = "others";
+        continue;
+      }
+
+      yield { word: content.slice(startIndex, i), offset: startIndex };
+      previousType = "others";
+      startIndex = i;
+    }
+
+    yield {
+      word: content.slice(startIndex, content.length),
+      offset: startIndex,
+    };
   }
 }
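
Note on the approach: instead of trimming everything that is not ASCII, the new tokenizer walks the content once and classifies each character as a trim character (the DefaultTokenizer delimiter set), an English character (ENGLISH_PATTERN), or anything else (e.g. CJK), and emits a token whenever the character class changes. The sketch below is a minimal standalone illustration of that character-class segmentation, not code from this patch: splitByCharClass is a hypothetical helper, and TRIM_PATTERN is an assumed whitespace/punctuation stand-in for DefaultTokenizer.getTrimPattern().

// Minimal sketch of character-class segmentation (assumptions noted above).
const TRIM_PATTERN = /[\s.,!?;:]/;        // assumed delimiter set, not the plugin's actual pattern
const ENGLISH_PATTERN = /[a-zA-Z0-9_\-]/; // same character class as in the patch

type CharClass = "trim" | "english" | "others";

function classOf(ch: string): CharClass {
  if (TRIM_PATTERN.test(ch)) return "trim";
  if (ENGLISH_PATTERN.test(ch)) return "english";
  return "others";
}

// Hypothetical helper: split a string into runs of same-class characters,
// then drop the delimiter runs (mirrors tokenize() with raw = false).
function splitByCharClass(content: string): string[] {
  const words: string[] = [];
  let start = 0;
  for (let i = 1; i <= content.length; i++) {
    // Emit the current run whenever the character class changes (or at the end).
    if (i === content.length || classOf(content[i]) !== classOf(content[i - 1])) {
      words.push(content.slice(start, i));
      start = i;
    }
  }
  return words.filter((w) => classOf(w[0]) !== "trim");
}

console.log(splitByCharClass("Edith旧city"));        // ["Edith", "旧", "city"]
console.log(splitByCharClass("イーディスold city"));  // ["イーディス", "old", "city"]

Under these assumptions the sketch reproduces the non-raw tokenize expectations in the updated tests; recursiveTokenize additionally keeps, for each non-delimiter run, the suffix of the content starting at that run's offset.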