Skip to content

Commit

Permalink
Merge pull request #48 from daomtthuan/master
Browse files Browse the repository at this point in the history
Add Typescript Definition
  • Loading branch information
vunb committed Apr 24, 2020
2 parents 145b7f3 + f3ef88c commit 45f8a41
Show file tree
Hide file tree
Showing 6 changed files with 221 additions and 144 deletions.
35 changes: 9 additions & 26 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@
#
# Files which match the splat patterns below will
# be ignored by git. This keeps random crap and
# sensitive credentials from being uploaded to
# sensitive credentials from being uploaded to
# your repository. It allows you to configure your
# app for your machine without accidentally
# committing settings which will smash the local
# settings of other developers on your team.
# committing settings which will smash the local
# settings of other developers on your team.
#
# Some reasonable defaults are included below,
# but, of course, you should modify/extend/prune
# to fit your needs!
################################################




################################################
# Local Configuration
#
Expand All @@ -31,24 +28,20 @@
#
# 2. Environment-specific configuration
# Basically, anything that would be annoying
# to have to change every time you do a
# to have to change every time you do a
# `git pull`
# e.g., your local development database, or
# the S3 bucket you're using for file uploads
# development.
#
#
################################################

#config/local.js





################################################
# Dependencies
#
# When releasing a production app, you may
# When releasing a production app, you may
# consider including your node_modules and
# bower_components directory in your git repo,
# but during development, it's best to exclude it,
Expand All @@ -63,16 +56,13 @@
# About bower_components dir, you can see this:
# http://addyosmani.com/blog/checking-in-front-end-dependencies/
# (credit Addy Osmani, @addyosmani)
#
#
################################################

www
node_modules
bower_components




################################################
# Sails.js / Waterline / Grunt
#
Expand All @@ -82,10 +72,6 @@ bower_components
.tmp
dump.rdb





################################################
# Node.js / NPM
#
Expand All @@ -99,10 +85,6 @@ lib-cov
*.pid
npm-debug.log





################################################
# VNTK - Vietnamese language toolkit
#
Expand All @@ -122,5 +104,6 @@ obj
# build
deps
package-lock.json
yarn.lock
test.js
test2.js
test2.js
93 changes: 93 additions & 0 deletions @types/index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import CrfsuiteCore from "crfsuite";

/**
 * Shapes of the objects handed out by the `vntk` factory functions.
 *
 * The `tag` methods are overloaded on `mode`: passing `"text"` yields one
 * formatted string, omitting it yields the structured array. The last
 * overload of each method keeps the original loose signature so calls that
 * forward an optional `mode` variable still type-check.
 */
declare namespace VNTK {
  /** Splits raw text into tokens. */
  interface Tokenizer {
    /** Returns the tokens of `text` as an array. */
    tokenize(text: string): string[];
    /** Returns the tokenized `text` as a single string. */
    stokenize(text: string): string;
  }

  /** Word segmenter. */
  interface WordTokenizer {
    /** Returns the segmented words as an array. */
    tag(text: string): string[];
    /** Returns the segmented words as one string. */
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): string[] | string;
  }

  /** Part-of-speech tagger. */
  interface PosTag {
    /**
     * Returns one 2-element tuple per token.
     * NOTE(review): presumably `[word, posTag]` — confirm against the library.
     */
    tag(text: string): [string, string][];
    /** Returns the tagged sentence as one string. */
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string][] | string;
  }

  /** Chunker. */
  interface Chunking {
    /**
     * Returns one 3-element tuple per token.
     * NOTE(review): presumably `[word, posTag, chunkTag]` — confirm.
     */
    tag(text: string): [string, string, string][];
    /** Returns the chunked sentence as one string. */
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string, string][] | string;
  }

  /** Named-entity recognizer. */
  interface NamedEntityRecognition {
    /**
     * Returns one 4-element tuple per token.
     * NOTE(review): presumably `[word, posTag, chunkTag, nerTag]` — confirm.
     */
    tag(text: string): [string, string, string, string][];
    /** Returns the annotated sentence as one string. */
    tag(text: string, mode: "text"): string;
    tag(text: string, mode?: "text"): [string, string, string, string][] | string;
  }

  namespace Utility {
    /** One entry returned by `Dictionary.lookup`. */
    interface DictionarySense {
      example: string;
      // NOTE(review): `pos` / `sub_pos` look like part-of-speech labels — confirm.
      sub_pos: string;
      definition: string;
      pos: string;
    }

    /** Word dictionary. */
    interface Dictionary {
      /** Tells whether `word` is present in the dictionary. */
      has(word: string): boolean;
      /** Returns the senses recorded for `word`. */
      lookup(word: string): DictionarySense[];
    }

    /** Miscellaneous text helpers. */
    interface Util {
      /** Takes an HTML string and returns a string. */
      clean_html(html: string): string;
    }

    /** One prediction passed to the `FastTextClassifier.predict` callback. */
    interface FastTextClassifierResult {
      label: string;
      value: number;
    }
  }

  /** One candidate returned by `LanguageIdentification.getLanguages`. */
  interface LanguageIdentificationResult {
    label: string;
  }

  /** Language identifier; both methods are asynchronous. */
  interface LanguageIdentification {
    /** Resolves with a single string for `document`. */
    detect(document: string): Promise<string>;
    /**
     * Resolves with a list of candidates for `document`.
     * NOTE(review): `numberLanguages` presumably caps the list length — confirm.
     */
    getLanguages(document: string, numberLanguages: number): Promise<LanguageIdentificationResult[]>;
  }

  /** CRFsuite bindings exposed by the toolkit. */
  interface Crfsuite {
    // NOTE(review): if these properties hold the classes themselves rather than
    // instances, `typeof CrfsuiteCore.Tagger` / `typeof CrfsuiteCore.Trainer`
    // would be the accurate types — confirm before changing.
    Tagger: CrfsuiteCore.Tagger;
    Trainer: CrfsuiteCore.Trainer;
  }
}

/**
 * Public surface of the `vntk` package: factory functions that hand out the
 * `VNTK.*` helpers, followed by the classifier classes.
 *
 * NOTE(review): every `newModelPath` argument is optional; presumably it
 * points at a replacement model file — confirm against the implementation.
 */
declare module "vntk" {
  /** Returns a `VNTK.Tokenizer`. */
  function tokenizer(): VNTK.Tokenizer;

  /** Returns a `VNTK.WordTokenizer`. */
  function wordTokenizer(newModelPath?: string): VNTK.WordTokenizer;

  /** Returns a `VNTK.PosTag`. */
  function posTag(newModelPath?: string): VNTK.PosTag;

  /** Returns a `VNTK.Chunking`. */
  function chunking(newModelPath?: string): VNTK.Chunking;

  /** Returns a `VNTK.NamedEntityRecognition`. */
  function ner(newModelPath?: string): VNTK.NamedEntityRecognition;

  /** Returns a `VNTK.LanguageIdentification`. */
  function langid(newModelPath?: string): VNTK.LanguageIdentification;

  /** Returns the `VNTK.Utility.Dictionary` helper. */
  function dictionary(): VNTK.Utility.Dictionary;

  /** Returns the `VNTK.Utility.Util` helper. */
  function util(): VNTK.Utility.Util;

  /** Returns the `VNTK.Crfsuite` bindings. */
  function crfsuite(): VNTK.Crfsuite;

  /** TF-IDF helper: documents are added one at a time, then queried per word. */
  class TfIdf {
    constructor();
    addDocument(document: string): void;
    /**
     * Invokes `callback` with a number `i` and a `measure`.
     * NOTE(review): `i` is presumably the document index — confirm.
     */
    tfidfs(word: string, callback: (i: number, measure: number) => void): void;
  }

  /** Bayes classifier: add documents, call `train()`, then `classify()`. */
  class BayesClassifier {
    constructor();
    // NOTE(review): `baye` is presumably the class label for `document` — confirm.
    addDocument(document: string, baye: string): void;
    train(): void;
    classify(document: string): string;
  }

  /** fastText classifier constructed from a model file path. */
  class FastTextClassifier {
    constructor(modelPath: string);
    /** Reports predictions for `document` through a Node-style callback. */
    predict(document: string, numberExamples: number, callback: (error: Error, result: VNTK.Utility.FastTextClassifierResult[]) => void): void;
  }
}
72 changes: 36 additions & 36 deletions lib/util/index.js
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
/**
* Natural Language Toolkit: Utility functions
*
* Copyright (C) 2016 VNTK Project
* Author: Nhu Bao Vu <nhubaovu@gmail.com>
* Homepage: https://vntk.github.io/
*/

// `vm` aliases this module's exports object; everything attached to it is public.
var vm = module.exports;

/**
* Remove HTML markup from the given string and normalise the leftover whitespace.
*
* The replacements run in a fixed order: script/style elements, HTML comments,
* remaining tags, the `&nbsp;` entity, then newline-related whitespace.
*
* @param {string} html the HTML string to be cleaned
* @returns {string} the cleaned, trimmed text
*/
vm.clean_html = function (html) {
var cleaned = html;
// First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
// Quoted strings inside the element are matched as whole units by the alternation.
cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
// Then we remove html comments (https://regex101.com/r/gB9iY8/3)
cleaned = cleaned.replace(/<!--((.|[\r\n])*?)-->/g, "");
// Next we can remove the remaining tags
// Each tag becomes a single space; `.` does not match line terminators, so a
// tag that spans several lines is left in place.
cleaned = cleaned.replace(/<.*?>/gi, " ");
// Finally, we deal with whitespace
cleaned = cleaned.replace(/&nbsp;/gi, " ");
// NOTE(review): as written, the next two calls replace one space with one space
// (a no-op); possibly meant to collapse double spaces — confirm the intent.
cleaned = cleaned.replace(/ /g, " ");
cleaned = cleaned.replace(/ /g, " ");
// Collapse blank lines, strip whitespace before a newline, then replace the
// whitespace after each newline with a single space.
cleaned = cleaned.replace(/\n\s*\n/g, "\n");
cleaned = cleaned.replace(/\s*\n/g, "\n");
cleaned = cleaned.replace(/\n\s*/g, "\n ");
return cleaned.trim();
}

//
/**
* Natural Language Toolkit: Utility functions
*
* Copyright (C) 2016 VNTK Project
* Author: Nhu Bao Vu <nhubaovu@gmail.com>
* Homepage: https://vntk.github.io/
*/

var vm = module.exports;

/**
* Remove HTML markup from the given string.
*
* @html: the HTML string to be cleaned
* @return: string
*/
vm.clean_html = function (html) {
var cleaned = html;
// First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
// Then we remove html comments (https://regex101.com/r/gB9iY8/3)
cleaned = cleaned.replace(/<!--((.|[\r\n])*?)-->/g, "");
// Next we can remove the remaining tags
cleaned = cleaned.replace(/<.*?>/gi, " ");
// Finally, we deal with whitespace
cleaned = cleaned.replace(/&nbsp;/gi, " ");
cleaned = cleaned.replace(/ /g, " ");
cleaned = cleaned.replace(/ /g, " ");
cleaned = cleaned.replace(/\n\s*\n/g, "\n");
cleaned = cleaned.replace(/\s*\n/g, "\n");
cleaned = cleaned.replace(/\n\s*/g, "\n ");
return cleaned.trim();
}

// Expose the sibling `./replacer` module as `replacer` on this module's exports.
vm.replacer = require('./replacer');
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"version": "1.4.3",
"description": "Vietnamese NLP Toolkit for Node",
"main": "index.js",
"types": "./@types/index.d.ts",
"bin": {
"vntk": "./bin/vntk.js"
},
Expand Down
34 changes: 17 additions & 17 deletions test/specs/tokenizer/word_tokenizer.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
'use strict';
var test = require('tape'),
vntk = require('../../../lib/vntk'),
ws = vntk.wordTokenizer();

test('wordTokenizer simple case', function (t) {
t.plan(9);

t.equal(ws.tag('Thương mại và các sản phẩm cũng vậy.', 'text'), 'Thương_mại và các sản_phẩm cũng vậy .');
t.equal(ws.tag('Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.', 'text'), 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .');
t.equal(ws.tag(' qua bộ đồ da thú ', 'text'), 'qua bộ đồ_da thú', 'multiple spaces');
t.equal(ws.tag('con', 'text'), 'con');
t.equal(ws.tag('Phải', 'text'), 'Phải');
t.equal(ws.tag('Không', 'text'), 'Không');
t.equal(ws.tag('Được không', 'text'), 'Được không');
t.equal(ws.tag('', 'text'), '', 'empty string');
t.equal(ws.tag('Tên?', 'text'), 'Tên ?', 'question mark');
'use strict';
var test = require('tape');
var vntk = require('../../../lib/vntk');
var ws = vntk.wordTokenizer();

// Checks the 'text' output mode of the word tokenizer against fixed sentences.
test('wordTokenizer simple case', function (t) {
  // One entry per assertion: input sentence, expected 'text' output, optional label.
  var cases = [
    { input: 'Thương mại và các sản phẩm cũng vậy.', expected: 'Thương_mại và các sản_phẩm cũng vậy .' },
    { input: 'Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.', expected: 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .' },
    // NOTE(review): labelled 'multiple spaces', yet the input shows only single spaces here — confirm.
    { input: ' qua bộ đồ da thú ', expected: 'qua bộ đồ_da thú', label: 'multiple spaces' },
    { input: 'con', expected: 'con' },
    { input: 'Phải', expected: 'Phải' },
    { input: 'Không', expected: 'Không' },
    { input: 'Được không', expected: 'Được không' },
    { input: '', expected: '', label: 'empty string' },
    { input: 'Tên?', expected: 'Tên ?', label: 'question mark' }
  ];

  t.plan(cases.length);

  cases.forEach(function (c) {
    var args = [ws.tag(c.input, 'text'), c.expected];
    // Only pass a message when the case has one, so unlabelled cases call t.equal with two arguments.
    if (c.label !== undefined) {
      args.push(c.label);
    }
    t.equal.apply(t, args);
  });
});
Loading

0 comments on commit 45f8a41

Please sign in to comment.