Merge pull request #48 from daomtthuan/master

Add Typescript Definition
vunb · Apr 24, 2020 · 45f8a41 · 45f8a41
2 parents 145b7f3 + f3ef88c
commit 45f8a41
Show file tree

Hide file tree

Showing 6 changed files with 221 additions and 144 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,20 +6,17 @@
 #
 # Files which match the splat patterns below will
 # be ignored by git.  This keeps random crap and
-# sensitive credentials from being uploaded to 
+# sensitive credentials from being uploaded to
 # your repository.  It allows you to configure your
 # app for your machine without accidentally
-# committing settings which will smash the local 
-# settings of  other developers on your team. 
+# committing settings which will smash the local
+# settings of  other developers on your team.
 #
 # Some reasonable defaults are included below,
 # but, of course, you should modify/extend/prune
 # to fit your needs!
 ################################################
 
-
-
-
 ################################################
 # Local Configuration
 #
@@ -31,24 +28,20 @@
 #
 # 2. Environment-specific configuration
 #    Basically, anything that would be annoying
-#    to have to change every time you do a 
+#    to have to change every time you do a
 #    `git pull`
 #    e.g., your local development database, or
 #    the S3 bucket you're using for file uploads
 #    development.
-# 
+#
 ################################################
 
 #config/local.js
 
-
-
-
-
 ################################################
 # Dependencies
 #
-# When releasing a production app, you may 
+# When releasing a production app, you may
 # consider including your node_modules and
 # bower_components directory in your git repo,
 # but during development, its best to exclude it,
@@ -63,16 +56,13 @@
 # About bower_components dir, you can see this:
 # http://addyosmani.com/blog/checking-in-front-end-dependencies/
 # (credit Addy Osmani, @addyosmani)
-# 
+#
 ################################################
 
 www
 node_modules
 bower_components
 
-
-
-
 ################################################
 # Sails.js / Waterline / Grunt
 #
@@ -82,10 +72,6 @@ bower_components
 .tmp
 dump.rdb
 
-
-
-
-
 ################################################
 # Node.js / NPM
 #
@@ -99,10 +85,6 @@ lib-cov
 *.pid
 npm-debug.log
 
-
-
-
-
 ################################################
 # VNTK - Vietnamese language toolkit
 #
@@ -122,5 +104,6 @@ obj
 # build
 deps
 package-lock.json
+yarn.lock
 test.js
-test2.js
+test2.js
diff --git a/@types/index.d.ts b/@types/index.d.ts
@@ -0,0 +1,126 @@
+import CrfsuiteCore from "crfsuite";
+
+// NOTE(review): the top-level import makes this file a module, so the
+// `declare module "vntk"` block below acts as a module augmentation
+// rather than a plain ambient declaration; confirm consumers resolve it.
+
+/** Types of the objects returned by the "vntk" module's factories. */
+declare namespace VNTK {
+  /** Returned by `tokenizer()`. */
+  interface Tokenizer {
+    /** Returns the tokens of `text` as an array of strings. */
+    tokenize(text: string): string[];
+    /** Returns one string; presumably the tokens joined (TODO confirm). */
+    stokenize(text: string): string;
+  }
+
+  /** Returned by `wordTokenizer()`. */
+  interface WordTokenizer {
+    /**
+     * With `mode` "text" the result is one string in which the syllables of a
+     * word are joined by `_` (e.g. "Thương_mại"); presumably `string[]` otherwise.
+     */
+    tag(text: string, mode?: "text"): string[] | string;
+  }
+
+  /** Returned by `posTag()`. */
+  interface PosTag {
+    /**
+     * Array form is a list of 2-string tuples. (`string[2][]` is an indexed
+     * access type that collapses to `string[]`, so it described no tuple.)
+     */
+    tag(text: string, mode?: "text"): [string, string][] | string;
+  }
+
+  /** Returned by `chunking()`. */
+  interface Chunking {
+    /** Array form is a list of 3-string tuples. */
+    tag(text: string, mode?: "text"): [string, string, string][] | string;
+  }
+
+  /** Returned by `ner()`. */
+  interface NamedEntityRecognition {
+    /** Array form is a list of 4-string tuples. */
+    tag(text: string, mode?: "text"): [string, string, string, string][] | string;
+  }
+
+  namespace Utility {
+    /** One entry of the array returned by `Dictionary.lookup`. */
+    interface DictionarySense {
+      example: string;
+      sub_pos: string;
+      definition: string;
+      pos: string;
+    }
+
+    /** Returned by `dictionary()`. */
+    interface Dictionary {
+      has(word: string): boolean;
+      lookup(word: string): DictionarySense[];
+    }
+
+    /** Returned by `util()`. */
+    interface Util {
+      /** Removes HTML markup from the given string. */
+      clean_html(html: string): string;
+    }
+
+    /** One entry of the result passed to the `FastTextClassifier.predict` callback. */
+    interface FastTextClassifierResult {
+      label: string;
+      value: number;
+    }
+  }
+
+  /** One entry of the array resolved by `LanguageIdentification.getLanguages`. */
+  interface LanguageIdentificationResult {
+    label: string;
+  }
+
+  /** Returned by `langid()`. */
+  interface LanguageIdentification {
+    detect(document: string): Promise<string>;
+    getLanguages(document: string, numberLanguages: number): Promise<LanguageIdentificationResult[]>;
+  }
+
+  /** Returned by `crfsuite()`. */
+  interface Crfsuite {
+    // NOTE(review): if `Tagger`/`Trainer` are classes, these are their instance
+    // types; exposing the classes themselves needs `typeof CrfsuiteCore.Tagger`.
+    Tagger: CrfsuiteCore.Tagger;
+    Trainer: CrfsuiteCore.Trainer;
+  }
+}
+
+/** Declarations for the "vntk" module. */
+declare module "vntk" {
+  function tokenizer(): VNTK.Tokenizer;
+  function wordTokenizer(newModelPath?: string): VNTK.WordTokenizer;
+  function posTag(newModelPath?: string): VNTK.PosTag;
+  function chunking(newModelPath?: string): VNTK.Chunking;
+  function ner(newModelPath?: string): VNTK.NamedEntityRecognition;
+  function dictionary(): VNTK.Utility.Dictionary;
+  function util(): VNTK.Utility.Util;
+
+  class TfIdf {
+    constructor();
+    addDocument(document: string): void;
+    tfidfs(word: string, callback: (i: number, measure: number) => void): void;
+  }
+
+  class BayesClassifier {
+    constructor();
+    addDocument(document: string, baye: string): void;
+    train(): void;
+    classify(document: string): string;
+  }
+
+  class FastTextClassifier {
+    constructor(modelPath: string);
+    predict(document: string, numberExamples: number, callback: (error: Error, result: VNTK.Utility.FastTextClassifierResult[]) => void): void;
+  }
+
+  function langid(newModelPath?: string): VNTK.LanguageIdentification;
+
+  function crfsuite(): VNTK.Crfsuite;
+}
diff --git a/lib/util/index.js b/lib/util/index.js
@@ -1,37 +1,37 @@
-/**
- * Natural Language Toolkit: Utility functions
- * 
- * Copyright (C) 2016 VNTK Project
- * Author: Nhu Bao Vu <nhubaovu@gmail.com>
- * Homepage: https://vntk.github.io/
- */
-
-var vm = module.exports;
-
-/**
- * Remove HTML markup from the given string.
- * 
- * @html: the HTML string to be cleaned
- * @return: string
- */
-vm.clean_html = function (html) {
-    var cleaned = html;
-    // First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
-    cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
-    cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
-    // Then we remove html comments (https://regex101.com/r/gB9iY8/3)
-    cleaned = cleaned.replace(/<!--((.|[\r\n])*?)-->/g, "");
-    // Next we can remove the remaining tags
-    cleaned = cleaned.replace(/<.*?>/gi, " ");
-    // Finally, we deal with whitespace
-    cleaned = cleaned.replace(/&nbsp;/gi, " ");
-    cleaned = cleaned.replace(/  /g, " ");
-    cleaned = cleaned.replace(/  /g, " ");
-    cleaned = cleaned.replace(/\n\s*\n/g, "\n");
-    cleaned = cleaned.replace(/\s*\n/g, "\n");
-    cleaned = cleaned.replace(/\n\s*/g, "\n ");
-    return cleaned.trim();
-}
-
-// 
+/**
+ * Natural Language Toolkit: Utility functions
+ * 
+ * Copyright (C) 2016 VNTK Project
+ * Author: Nhu Bao Vu <nhubaovu@gmail.com>
+ * Homepage: https://vntk.github.io/
+ */
+
+var vm = module.exports;
+
+/**
+ * Strip HTML markup from a string and normalise the remaining whitespace.
+ *
+ * @param {string} html the HTML string to be cleaned
+ * @returns {string} the remaining text, trimmed
+ */
+vm.clean_html = function (html) {
+    var cleaned = html;
+    // First we remove inline JavaScript/CSS (http://stackoverflow.com/a/18052486/1896897)
+    cleaned = cleaned.replace(/<script([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/script>/gim, "");
+    cleaned = cleaned.replace(/<style([^'"]|"(\\.|[^"\\])*"|'(\\.|[^'\\])*')*?<\/style>/gim, "");
+    // Then we remove html comments (https://regex101.com/r/gB9iY8/3)
+    cleaned = cleaned.replace(/<!--((.|[\r\n])*?)-->/g, "");
+    // Next we remove the remaining tags; [^>] (unlike .) also spans line breaks
+    cleaned = cleaned.replace(/<[^>]*>/g, " ");
+    // Finally, we deal with whitespace
+    cleaned = cleaned.replace(/&nbsp;/gi, " ");
+    // Collapse whole runs at once: two passes of /  / left 5 spaces as 2
+    cleaned = cleaned.replace(/ {2,}/g, " ");
+    cleaned = cleaned.replace(/\n\s*\n/g, "\n");
+    cleaned = cleaned.replace(/\s*\n/g, "\n");
+    cleaned = cleaned.replace(/\n\s*/g, "\n ");
+    return cleaned.trim();
+}
+
+// 
 vm.replacer = require('./replacer');
diff --git a/package.json b/package.json
@@ -3,6 +3,7 @@
   "version": "1.4.3",
   "description": "Vietnamese NLP Toolkit for Node",
   "main": "index.js",
+  "types": "./@types/index.d.ts",
   "bin": {
     "vntk": "./bin/vntk.js"
   },

diff --git a/test/specs/tokenizer/word_tokenizer.js b/test/specs/tokenizer/word_tokenizer.js
@@ -1,18 +1,18 @@
-'use strict';
-var test = require('tape'),
-    vntk = require('../../../lib/vntk'),
-    ws = vntk.wordTokenizer();
-
-test('wordTokenizer simple case', function (t) {
-    t.plan(9);
-
-    t.equal(ws.tag('Thương mại và các sản phẩm cũng vậy.', 'text'), 'Thương_mại và các sản_phẩm cũng vậy .');
-    t.equal(ws.tag('Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.', 'text'), 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .');
-    t.equal(ws.tag(' qua  bộ đồ  da  thú  ', 'text'), 'qua bộ đồ_da thú', 'multiple spaces');
-    t.equal(ws.tag('con', 'text'), 'con');
-    t.equal(ws.tag('Phải', 'text'), 'Phải');
-    t.equal(ws.tag('Không', 'text'), 'Không');
-    t.equal(ws.tag('Được không', 'text'), 'Được không');
-    t.equal(ws.tag('', 'text'), '', 'empty string');
-    t.equal(ws.tag('Tên?', 'text'), 'Tên ?', 'question mark');
+'use strict';
+var test = require('tape'),
+    vntk = require('../../../lib/vntk'),
+    ws = vntk.wordTokenizer();
+
+test('wordTokenizer simple case', function (assert) {
+    var tagText = function (input) { return ws.tag(input, 'text'); }; // every case uses the 'text' mode
+    assert.plan(9);
+    assert.equal(tagText('Thương mại và các sản phẩm cũng vậy.'), 'Thương_mại và các sản_phẩm cũng vậy .');
+    assert.equal(tagText('Nhờ đó, chúng ta có thể kiềm chế căng thẳng và các xung đột tiếm năng không dẫn tới xung đột quân sự.'), 'Nhờ đó , chúng_ta có_thể kiềm_chế căng_thẳng và các xung_đột tiếm năng không dẫn tới xung_đột quân_sự .');
+    assert.equal(tagText(' qua  bộ đồ  da  thú  '), 'qua bộ đồ_da thú', 'multiple spaces');
+    assert.equal(tagText('con'), 'con');
+    assert.equal(tagText('Phải'), 'Phải');
+    assert.equal(tagText('Không'), 'Không');
+    assert.equal(tagText('Được không'), 'Được không');
+    assert.equal(tagText(''), '', 'empty string');
+    assert.equal(tagText('Tên?'), 'Tên ?', 'question mark');
 });