Create a logic as original for English only tokenizer (#86)
tadashi-aikawa committed Feb 13, 2022
1 parent 52f2bc9 commit 6f0b044
Showing 2 changed files with 74 additions and 16 deletions.
23 changes: 11 additions & 12 deletions src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts
@@ -4,11 +4,11 @@ import { TokenizeStrategy } from "../TokenizeStrategy";
describe.each`
content | raw | expected
${"aa bb cc"} | ${false} | ${["aa", "bb", "cc"]}
${"Edith旧市街"} | ${false} | ${["Edith"]}
${"Edith旧city"} | ${false} | ${["Edith", "city"]}
${"イーディスold city"} | ${false} | ${["old", "city"]}
${"イーディスold市街"} | ${false} | ${["old"]}
${"イーディス旧市街"} | ${false} | ${[]}
${"Edith旧市街"} | ${false} | ${["Edith", "旧市街"]}
${"Edith旧city"} | ${false} | ${["Edith", "旧", "city"]}
${"イーディスold city"} | ${false} | ${["イーディス", "old", "city"]}
${"イーディスold市街"} | ${false} | ${["イーディス", "old", "市街"]}
${"イーディス旧市街"} | ${false} | ${["イーディス旧市街"]}
`("tokenize", ({ content, raw, expected }) => {
test(`tokenize(${content}, ${raw}) = ${expected}`, () => {
expect(
@@ -17,17 +17,16 @@ describe.each`
});
});

-// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
describe.each`
content | expected
${"aa bb cc"} | ${[{ word: "aa bb cc", offset: 0 }, { word: "bb cc", offset: 3 }, { word: "cc", offset: 6 }]}
${"aa:bb:cc"} | ${[{ word: "aa:bb:cc", offset: 0 }, { word: "bb:cc", offset: 3 }, { word: "cc", offset: 6 }]}
${"## @smi"} | ${[{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }]}
${"Edith旧市街"} | ${[{ word: "Edith旧市街", offset: 0 }, { word: "市街", offset: 6 }, { word: "街", offset: 7 }, { word: "", offset: 8 }]}
${"Edith旧city"} | ${[{ word: "Edith旧city", offset: 0 }, { word: "city", offset: 6 }]}
${"ヒナold city"} | ${[{ word: "ヒナold city", offset: 0 }, { word: "ナold city", offset: 1 }, { word: "old city", offset: 2 }, { word: "city", offset: 6 }]}
${"ヒナold市街"} | ${[{ word: "ヒナold市街", offset: 0 }, { word: "ナold市街", offset: 1 }, { word: "old市街", offset: 2 }, { word: "", offset: 6 }, { word: "", offset: 7 }]}
${"ヒナ旧市街"} | ${[{ word: "ヒナ旧市街", offset: 0 }, { word: "ナ旧市街", offset: 1 }, { word: "旧市街", offset: 2 }, { word: "市街", offset: 3 }, { word: "街", offset: 4 }, { word: "", offset: 5 }]}
${"## @smi"} | ${[{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }, { word: "smi", offset: 4 }]}
${"Edith旧市街"} | ${[{ word: "Edith旧市街", offset: 0 }, { word: "旧市街", offset: 5 }]}
${"Edith旧city"} | ${[{ word: "Edith旧city", offset: 0 }, { word: "旧city", offset: 5 }, { word: "city", offset: 6 }]}
${"ヒナold city"} | ${[{ word: "ヒナold city", offset: 0 }, { word: "old city", offset: 2 }, { word: "city", offset: 6 }]}
${"ヒナold市街"} | ${[{ word: "ヒナold市街", offset: 0 }, { word: "old市街", offset: 2 }, { word: "市街", offset: 5 }]}
${"ヒナ旧市街"} | ${[{ word: "ヒナ旧市街", offset: 0 }]}
`("recursiveTokenize", ({ content, expected }) => {
test(`recursiveTokenize(${content}) = ${expected}`, () => {
expect(
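The replaced tokenize rows capture the behavioral change in this commit: the old tokenizer trimmed away every non-English character, while the new logic keeps non-English runs as tokens of their own. A rough sketch using values from the table above (the call syntax is illustrative only; the collapsed test setup builds the tokenizer from TokenizeStrategy and is not shown in this diff):

// Sketch: `tokenizer` stands in for the EnglishOnlyTokenizer instance built by the hidden test setup.
tokenizer.tokenize("イーディスold市街", false);
// before this commit: ["old"]
// after this commit:  ["イーディス", "old", "市街"]
tokenizer.tokenize("イーディス旧市街", false);
// before this commit: []
// after this commit:  ["イーディス旧市街"]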
67 changes: 63 additions & 4 deletions src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts
@@ -1,9 +1,68 @@
import { DefaultTokenizer } from "./DefaultTokenizer";

-// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
-export const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_\-#@]/g;
+type PreviousType = "none" | "trim" | "english" | "others";
+const ENGLISH_PATTERN = /[a-zA-Z0-9_\-]/;
export class EnglishOnlyTokenizer extends DefaultTokenizer {
-getTrimPattern(): RegExp {
-return ENGLISH_ONLY_TRIM_CHAR_PATTERN;
tokenize(content: string, raw?: boolean): string[] {
const tokenized = Array.from(this._tokenize(content));
return raw
? tokenized.map((x) => x.word)
: tokenized
.map((x) => x.word)
.filter((x) => !x.match(this.getTrimPattern()));
}

recursiveTokenize(content: string): { word: string; offset: number }[] {
const offsets = Array.from(this._tokenize(content))
.filter((x) => !x.word.match(this.getTrimPattern()))
.map((x) => x.offset);
return [
...offsets.map((i) => ({
word: content.slice(i),
offset: i,
})),
];
}

private *_tokenize(
content: string
): Iterable<{ word: string; offset: number }> {
let startIndex = 0;
let previousType: PreviousType = "none";

for (let i = 0; i < content.length; i++) {
if (content[i].match(super.getTrimPattern())) {
yield { word: content.slice(startIndex, i), offset: startIndex };
previousType = "trim";
startIndex = i;
continue;
}

if (content[i].match(ENGLISH_PATTERN)) {
if (previousType === "english" || previousType === "none") {
previousType = "english";
continue;
}

yield { word: content.slice(startIndex, i), offset: startIndex };
previousType = "english";
startIndex = i;
continue;
}

if (previousType === "others" || previousType === "none") {
previousType = "others";
continue;
}

yield { word: content.slice(startIndex, i), offset: startIndex };
previousType = "others";
startIndex = i;
}

yield {
word: content.slice(startIndex, content.length),
offset: startIndex,
};
}
}
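For reference, a sketch of how the character walk above plays out on one of the test inputs (assuming, as the updated tests imply, that the trim pattern inherited from DefaultTokenizer does not match CJK characters):

// Trace of _tokenize("Edith旧city"):
//   i = 0..4  "Edith" matches ENGLISH_PATTERN; previousType becomes "english"
//   i = 5     "旧" is neither trim nor english; yields { word: "Edith", offset: 0 }, previousType = "others"
//   i = 6     "c" is english again; yields { word: "旧", offset: 5 }, previousType = "english"
//   i = 7..9  "ity" stays english; no yield
//   end       final yield emits { word: "city", offset: 6 }
//
// tokenize("Edith旧city", false) therefore returns ["Edith", "旧", "city"], and
// recursiveTokenize("Edith旧city") keeps the offsets [0, 5, 6] and returns the suffixes
// [{ word: "Edith旧city", offset: 0 }, { word: "旧city", offset: 5 }, { word: "city", offset: 6 }],
// matching the updated test expectations.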
