generated from obsidianmd/obsidian-sample-plugin
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create logic equivalent to the original for the
English-only tokenizer
(#86)
- Loading branch information
1 parent
52f2bc9
commit 6f0b044
Showing
2 changed files
with
74 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,68 @@ | ||
import { DefaultTokenizer } from "./DefaultTokenizer"; | ||
|
||
// XXX: Hack implementation. There are some problems especially in recursiveTokenize. | ||
export const ENGLISH_ONLY_TRIM_CHAR_PATTERN = /[^a-zA-Z0-9_\-#@]/g; | ||
type PreviousType = "none" | "trim" | "english" | "others"; | ||
const ENGLISH_PATTERN = /[a-zA-Z0-9_\-]/; | ||
export class EnglishOnlyTokenizer extends DefaultTokenizer { | ||
getTrimPattern(): RegExp { | ||
return ENGLISH_ONLY_TRIM_CHAR_PATTERN; | ||
tokenize(content: string, raw?: boolean): string[] { | ||
const tokenized = Array.from(this._tokenize(content)); | ||
return raw | ||
? tokenized.map((x) => x.word) | ||
: tokenized | ||
.map((x) => x.word) | ||
.filter((x) => !x.match(this.getTrimPattern())); | ||
} | ||
|
||
recursiveTokenize(content: string): { word: string; offset: number }[] { | ||
const offsets = Array.from(this._tokenize(content)) | ||
.filter((x) => !x.word.match(this.getTrimPattern())) | ||
.map((x) => x.offset); | ||
return [ | ||
...offsets.map((i) => ({ | ||
word: content.slice(i), | ||
offset: i, | ||
})), | ||
]; | ||
} | ||
|
||
private *_tokenize( | ||
content: string | ||
): Iterable<{ word: string; offset: number }> { | ||
let startIndex = 0; | ||
let previousType: PreviousType = "none"; | ||
|
||
for (let i = 0; i < content.length; i++) { | ||
if (content[i].match(super.getTrimPattern())) { | ||
yield { word: content.slice(startIndex, i), offset: startIndex }; | ||
previousType = "trim"; | ||
startIndex = i; | ||
continue; | ||
} | ||
|
||
if (content[i].match(ENGLISH_PATTERN)) { | ||
if (previousType === "english" || previousType === "none") { | ||
previousType = "english"; | ||
continue; | ||
} | ||
|
||
yield { word: content.slice(startIndex, i), offset: startIndex }; | ||
previousType = "english"; | ||
startIndex = i; | ||
continue; | ||
} | ||
|
||
if (previousType === "others" || previousType === "none") { | ||
previousType = "others"; | ||
continue; | ||
} | ||
|
||
yield { word: content.slice(startIndex, i), offset: startIndex }; | ||
previousType = "others"; | ||
startIndex = i; | ||
} | ||
|
||
yield { | ||
word: content.slice(startIndex, content.length), | ||
offset: startIndex, | ||
}; | ||
} | ||
} |