-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #48 from daomtthuan/master
Add Typescript Definition
- Loading branch information
Showing
6 changed files
with
221 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import CrfsuiteCore from "crfsuite"; | ||
|
||
declare namespace VNTK {
  /** Tokenizer exposing an array-returning and a string-returning variant. */
  interface Tokenizer {
    /** Tokenizes `text` and returns the tokens as an array of strings. */
    tokenize(text: string): string[];
    /** Tokenizes `text` and returns the result as a single string. */
    stokenize(text: string): string;
  }

  /**
   * Word tokenizer.
   *
   * The overloads tie the return type to `mode`: without `mode` the result is
   * an array, with `"text"` it is a string. The last signature keeps the
   * original union form so callers passing a possibly-undefined `mode`
   * still type-check.
   */
  interface WordTokenizer {
    tag(text: string): string[];
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): string[] | string;
  }

  /**
   * Part-of-speech tagger.
   *
   * Array results are typed as 2-element string tuples. The previous
   * `string[2][]` was an indexed-access type that collapsed to `string[]`.
   */
  interface PosTag {
    tag(text: string): [string, string][];
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string][] | string;
  }

  /** Chunker; array results are typed as 3-element string tuples. */
  interface Chunking {
    tag(text: string): [string, string, string][];
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string, string][] | string;
  }

  /** Named-entity recognizer; array results are typed as 4-element string tuples. */
  interface NamedEntityRecognition {
    tag(text: string): [string, string, string, string][];
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string, string, string][] | string;
  }

  namespace Utility {
    /** One sense entry returned by `Dictionary.lookup`. */
    interface DictionarySense {
      example: string;
      sub_pos: string;
      definition: string;
      pos: string;
    }

    /** Dictionary with a membership test and a sense lookup. */
    interface Dictionary {
      has(word: string): boolean;
      lookup(word: string): DictionarySense[];
    }

    /** Miscellaneous helpers. */
    interface Util {
      /** Takes an HTML string and returns a string. */
      clean_html(html: string): string;
    }

    /** One label/value pair passed to the `FastTextClassifier.predict` callback. */
    interface FastTextClassifierResult {
      label: string;
      value: number;
    }
  }

  /** One entry of the list resolved by `LanguageIdentification.getLanguages`. */
  interface LanguageIdentificationResult {
    label: string;
  }

  /** Promise-based language identification. */
  interface LanguageIdentification {
    /** Resolves to a single string for `document`. */
    detect(document: string): Promise<string>;
    /** Resolves to a list of results; the second argument is a number of languages. */
    getLanguages(document: string, numberLanguages: number): Promise<LanguageIdentificationResult[]>;
  }

  /**
   * Handle on the `crfsuite` package.
   *
   * NOTE(review): if these properties hold the classes themselves rather than
   * instances, the types should be `typeof CrfsuiteCore.Tagger` /
   * `typeof CrfsuiteCore.Trainer` - confirm against the crfsuite typings.
   */
  interface Crfsuite {
    Tagger: CrfsuiteCore.Tagger;
    Trainer: CrfsuiteCore.Trainer;
  }
}
|
||
declare module "vntk" {
  // Factory functions. Each returns a value typed by the matching VNTK
  // interface. `newModelPath` is optional wherever it appears (presumably the
  // location of a replacement model file - confirm against the implementation).

  function tokenizer(): VNTK.Tokenizer;
  function wordTokenizer(newModelPath?: string): VNTK.WordTokenizer;
  function posTag(newModelPath?: string): VNTK.PosTag;
  function chunking(newModelPath?: string): VNTK.Chunking;
  function ner(newModelPath?: string): VNTK.NamedEntityRecognition;
  function langid(newModelPath?: string): VNTK.LanguageIdentification;
  function dictionary(): VNTK.Utility.Dictionary;
  function util(): VNTK.Utility.Util;
  function crfsuite(): VNTK.Crfsuite;

  // Classes. Members are public by default, so the modifier is omitted.

  /** Accepts documents, then reports a measure per document index for a word. */
  class TfIdf {
    constructor();
    addDocument(document: string): void;
    /** Invokes `callback` with an index `i` and a numeric `measure`. */
    tfidfs(word: string, callback: (i: number, measure: number) => void): void;
  }

  /** Classifier that takes (document, label) pairs, is trained, then classifies. */
  class BayesClassifier {
    constructor();
    addDocument(document: string, baye: string): void;
    train(): void;
    classify(document: string): string;
  }

  /** Classifier constructed from a model path; results are delivered via callback. */
  class FastTextClassifier {
    constructor(modelPath: string);
    predict(
      document: string,
      numberExamples: number,
      callback: (error: Error, result: VNTK.Utility.FastTextClassifierResult[]) => void
    ): void;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,37 +1,37 @@ | ||
/** | ||
* Natural Language Toolkit: Utility functions | ||
* | ||
* Copyright (C) 2016 VNTK Project | ||
* Author: Nhu Bao Vu <nhubaovu@gmail.com> | ||
* Homepage: https://vntk.github.io/ | ||
*/ | ||
|
||
var vm = module.exports; | ||
|
||
/** | ||
* Remove HTML markup from the given string. | ||
* | ||
* @html: the HTML string to be cleaned | ||
* @return: string | ||
*/ | ||
vm.clean_html = function (html) {
    // A missing document cleans to the empty string instead of throwing on `.replace`.
    if (html === null || html === undefined) {
        return "";
    }
    var cleaned = html;
    // First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
    cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
    cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
    // Then we remove html comments; [\s\S] lets a comment span several lines.
    cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, "");
    // Next we remove the remaining tags. [^>] also matches newlines, so a tag
    // whose attributes wrap onto the next line is removed as well.
    cleaned = cleaned.replace(/<[^>]*>/g, " ");
    // Finally, we deal with whitespace: decode non-breaking-space entities,
    // then collapse every run of spaces into a single space.
    cleaned = cleaned.replace(/&nbsp;/gi, " ");
    cleaned = cleaned.replace(/ {2,}/g, " ");
    // Drop blank lines, strip whitespace before each newline, and leave
    // exactly one space after each newline.
    cleaned = cleaned.replace(/\n\s*\n/g, "\n");
    cleaned = cleaned.replace(/\s*\n/g, "\n");
    cleaned = cleaned.replace(/\n\s*/g, "\n ");
    return cleaned.trim();
};
|
||
// | ||
/** | ||
* Natural Language Toolkit: Utility functions | ||
* | ||
* Copyright (C) 2016 VNTK Project | ||
* Author: Nhu Bao Vu <nhubaovu@gmail.com> | ||
* Homepage: https://vntk.github.io/ | ||
*/ | ||
|
||
var vm = module.exports; | ||
|
||
/** | ||
* Remove HTML markup from the given string. | ||
* | ||
* @html: the HTML string to be cleaned | ||
* @return: string | ||
*/ | ||
vm.clean_html = function (html) {
    // A missing document cleans to the empty string instead of throwing on `.replace`.
    if (html === null || html === undefined) {
        return "";
    }
    var cleaned = html;
    // First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
    cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
    cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
    // Then we remove html comments; [\s\S] lets a comment span several lines.
    cleaned = cleaned.replace(/<!--[\s\S]*?-->/g, "");
    // Next we remove the remaining tags. [^>] also matches newlines, so a tag
    // whose attributes wrap onto the next line is removed as well.
    cleaned = cleaned.replace(/<[^>]*>/g, " ");
    // Finally, we deal with whitespace: decode non-breaking-space entities,
    // then collapse every run of spaces into a single space.
    cleaned = cleaned.replace(/&nbsp;/gi, " ");
    cleaned = cleaned.replace(/ {2,}/g, " ");
    // Drop blank lines, strip whitespace before each newline, and leave
    // exactly one space after each newline.
    cleaned = cleaned.replace(/\n\s*\n/g, "\n");
    cleaned = cleaned.replace(/\s*\n/g, "\n");
    cleaned = cleaned.replace(/\n\s*/g, "\n ");
    return cleaned.trim();
};
|
||
// | ||
vm.replacer = require('./replacer'); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,18 @@ | ||
'use strict'; | ||
var test = require('tape'), | ||
vntk = require('../../../lib/vntk'), | ||
ws = vntk.wordTokenizer(); | ||
|
||
test('wordTokenizer simple case', function (t) { | ||
t.plan(9); | ||
|
||
t.equal(ws.tag('Thương mại và các sản phẩm cũng vậy.', 'text'), 'Thương_mại và các sản_phẩm cũng vậy .'); | ||
t.equal(ws.tag('Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.', 'text'), 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .'); | ||
t.equal(ws.tag(' qua bộ đồ da thú ', 'text'), 'qua bộ đồ_da thú', 'multiple spaces'); | ||
t.equal(ws.tag('con', 'text'), 'con'); | ||
t.equal(ws.tag('Phải', 'text'), 'Phải'); | ||
t.equal(ws.tag('Không', 'text'), 'Không'); | ||
t.equal(ws.tag('Được không', 'text'), 'Được không'); | ||
t.equal(ws.tag('', 'text'), '', 'empty string'); | ||
t.equal(ws.tag('Tên?', 'text'), 'Tên ?', 'question mark'); | ||
'use strict'; | ||
var test = require('tape'), | ||
vntk = require('../../../lib/vntk'), | ||
ws = vntk.wordTokenizer(); | ||
|
||
test('wordTokenizer simple case', function (t) {
    // Each row: [input, expected 'text'-mode output, optional assertion label].
    var cases = [
        ['Thương mại và các sản phẩm cũng vậy.', 'Thương_mại và các sản_phẩm cũng vậy .'],
        ['Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.', 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .'],
        [' qua bộ đồ da thú ', 'qua bộ đồ_da thú', 'multiple spaces'],
        ['con', 'con'],
        ['Phải', 'Phải'],
        ['Không', 'Không'],
        ['Được không', 'Được không'],
        ['', '', 'empty string'],
        ['Tên?', 'Tên ?', 'question mark']
    ];

    // One assertion per row, so the plan equals the number of rows (9).
    t.plan(cases.length);

    cases.forEach(function (row) {
        t.equal(ws.tag(row[0], 'text'), row[1], row[2]);
    });
});
Oops, something went wrong.