Add tests for EnglishOnlyTokenizer
tadashi-aikawa committed Feb 13, 2022
1 parent eed5154 commit 52f2bc9
Showing 3 changed files with 40 additions and 2 deletions.
src/tokenizer/tokenizers/DefaultTokenizer.ts (2 changes: 1 addition & 1 deletion)
@@ -16,7 +16,7 @@ export class DefaultTokenizer implements Tokenizer {
   }

   recursiveTokenize(content: string): { word: string; offset: number }[] {
-    const trimIndexes = Array.from(content.matchAll(TRIM_CHAR_PATTERN))
+    const trimIndexes = Array.from(content.matchAll(this.getTrimPattern()))
       .sort((a, b) => a.index! - b.index!)
       .map((x) => x.index!);
     return [
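Why this one-line change matters: recursiveTokenize previously read the module-level TRIM_CHAR_PATTERN directly, so a subclass overriding getTrimPattern() changed tokenize but not recursiveTokenize. Dispatching through this.getTrimPattern() lets EnglishOnlyTokenizer supply its own pattern for both. A minimal sketch of the mechanism follows; the class shapes are simplified stand-ins, the base pattern is hypothetical, and the tail after `return [` is reconstructed to match the test expectations below rather than copied from the source:

const TRIM_CHAR_PATTERN = /[\s.,!?;:]/g; // hypothetical base pattern, for the sketch only

class DefaultTokenizerSketch {
  getTrimPattern(): RegExp {
    return TRIM_CHAR_PATTERN;
  }

  recursiveTokenize(content: string): { word: string; offset: number }[] {
    // this.getTrimPattern() instead of the module constant: a subclass
    // override now changes where the split points land.
    const trimIndexes = Array.from(content.matchAll(this.getTrimPattern()))
      .sort((a, b) => a.index! - b.index!)
      .map((x) => x.index!);
    // Reconstructed tail (assumption): the whole string, then one suffix
    // starting just past each trim character.
    return [
      { word: content, offset: 0 },
      ...trimIndexes.map((i) => ({
        word: content.slice(i + 1),
        offset: i + 1,
      })),
    ];
  }
}

class EnglishOnlyTokenizerSketch extends DefaultTokenizerSketch {
  getTrimPattern(): RegExp {
    return /[^a-zA-Z0-9_\-#@]/g; // the pattern this commit lands on
  }
}

new EnglishOnlyTokenizerSketch().recursiveTokenize("## @smi");
// => [{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }]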
src/tokenizer/tokenizers/EnglishOnlyTokenizer.test.ts (37 changes: 37 additions & 0 deletions)
@@ -0,0 +1,37 @@
import { createTokenizer } from "../tokenizer";
import { TokenizeStrategy } from "../TokenizeStrategy";

describe.each`
  content                 | raw      | expected
  ${"aa bb cc"}           | ${false} | ${["aa", "bb", "cc"]}
  ${"Edith旧市街"}        | ${false} | ${["Edith"]}
  ${"Edith旧city"}        | ${false} | ${["Edith", "city"]}
  ${"イーディスold city"} | ${false} | ${["old", "city"]}
  ${"イーディスold市街"}  | ${false} | ${["old"]}
  ${"イーディス旧市街"}   | ${false} | ${[]}
`("tokenize", ({ content, raw, expected }) => {
  test(`tokenize(${content}, ${raw}) = ${expected}`, () => {
    expect(
      createTokenizer(TokenizeStrategy.ENGLISH_ONLY).tokenize(content, raw)
    ).toStrictEqual(expected);
  });
});

// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
describe.each`
  content | expected
  ${"aa bb cc"} | ${[{ word: "aa bb cc", offset: 0 }, { word: "bb cc", offset: 3 }, { word: "cc", offset: 6 }]}
  ${"aa:bb:cc"} | ${[{ word: "aa:bb:cc", offset: 0 }, { word: "bb:cc", offset: 3 }, { word: "cc", offset: 6 }]}
  ${"## @smi"} | ${[{ word: "## @smi", offset: 0 }, { word: "@smi", offset: 3 }]}
  ${"Edith旧市街"} | ${[{ word: "Edith旧市街", offset: 0 }, { word: "市街", offset: 6 }, { word: "街", offset: 7 }, { word: "", offset: 8 }]}
  ${"Edith旧city"} | ${[{ word: "Edith旧city", offset: 0 }, { word: "city", offset: 6 }]}
  ${"ヒナold city"} | ${[{ word: "ヒナold city", offset: 0 }, { word: "ナold city", offset: 1 }, { word: "old city", offset: 2 }, { word: "city", offset: 6 }]}
  ${"ヒナold市街"} | ${[{ word: "ヒナold市街", offset: 0 }, { word: "ナold市街", offset: 1 }, { word: "old市街", offset: 2 }, { word: "街", offset: 6 }, { word: "", offset: 7 }]}
  ${"ヒナ旧市街"} | ${[{ word: "ヒナ旧市街", offset: 0 }, { word: "ナ旧市街", offset: 1 }, { word: "旧市街", offset: 2 }, { word: "市街", offset: 3 }, { word: "街", offset: 4 }, { word: "", offset: 5 }]}
`("recursiveTokenize", ({ content, expected }) => {
  test(`recursiveTokenize(${content}) = ${expected}`, () => {
    expect(
      createTokenizer(TokenizeStrategy.ENGLISH_ONLY).recursiveTokenize(content)
    ).toStrictEqual(expected);
  });
});
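The tokenize expectations in the first table follow directly from the trim pattern: every character outside [a-zA-Z0-9_\-#@] is a separator, so CJK runs dissolve entirely. A quick way to sanity-check the rows is a plain split on the pattern, dropping empty pieces; this reproduces every row above, but it is an assumption, not the literal implementation, and it ignores the raw flag (all rows use false):

const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_\-#@]/g;

// Hypothetical helper, not part of the plugin.
const tokenizeSketch = (content: string): string[] =>
  content.split(ENGLISH_ONLY_TRIM_CHAR_PATTERN).filter((w) => w !== "");

tokenizeSketch("Edith旧city");       // ["Edith", "city"]
tokenizeSketch("イーディスold市街"); // ["old"]
tokenizeSketch("イーディス旧市街");  // []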
src/tokenizer/tokenizers/EnglishOnlyTokenizer.ts (3 changes: 2 additions & 1 deletion)
@@ -1,6 +1,7 @@
 import { DefaultTokenizer } from "./DefaultTokenizer";

-export const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_-]/g;
+// XXX: Hack implementation. There are some problems especially in recursiveTokenize.
+export const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_\-#@]/g;
 export class EnglishOnlyTokenizer extends DefaultTokenizer {
   getTrimPattern(): RegExp {
     return ENGLISH_ONLY_TRIM_CHAR_PATTERN;
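The pattern change widens the keep-set from [a-zA-Z0-9_-] to also include # and @, which is what lets the "## @smi" case above keep "@smi" as a token instead of trimming it away. Demonstrated with plain RegExp.replace, no plugin code needed:

const OLD_PATTERN = /[^a-zA-Z0-9_-]/g;    // before this commit
const NEW_PATTERN = /[^a-zA-Z0-9_\-#@]/g; // after this commit

"## @smi".replace(OLD_PATTERN, " "); // "    smi"  -> '#' and '@' treated as separators
"## @smi".replace(NEW_PATTERN, " "); // "## @smi"  -> both survive as word characters

Note that escaping the hyphen (\-) became necessary once # was appended: unescaped, _-# would be parsed as a character range, and since # sorts below _ it would be a syntax error.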
