Skip to content
Browse files

Initial commit.

  • Loading branch information...
0 parents commit 63d85f3d8db32cc55cc79d265d5b4ac4e64b9fc8 @walling committed Nov 15, 2011
3 .npmignore
@@ -0,0 +1,3 @@
+./src/
+./download-source.sh
+./package.sh
59 README.markdown
@@ -0,0 +1,59 @@
+This is [Unicode Normalizer] in a Common JS module. I'm not affiliated with
+Matsuza, the author of Unicode Normalizer.
+
+Installation
+------------
+
+ npm install unorm
+
+
+Usage example
+-------------
+
+For a longer example, see `example.js`.
+
+ unorm = require('unorm');
+
+ text =
+ 'The \u212B symbol invented by A. J. \u00C5ngstr\u00F6m ' +
+ '(1814, L\u00F6gd\u00F6, \u2013 1874) denotes the length ' +
+ '10\u207B\u00B9\u2070 m.';
+
+ combining = /[\u0300-\u036F]/g; // Use XRegExp('\\p{M}', 'g'); see example.js.
+
+ console.log('Regular: ' + text);
+ console.log('NFC: ' + unorm.nfc(text));
+ console.log('NFD: ' + unorm.nfd(text));
+ console.log('NFKC: ' + unorm.nfkc(text));
+ console.log('NFKD: * ' + unorm.nfkd(text).replace(combining, ''));
+ console.log(' * = Combining characters removed from decomposed form.');
+
+
+License
+-------
+
+This project includes the software package **Unicode Normalizer 1.0.0**. The
+software is dual licensed under the MIT and GPL licenses. Here is the MIT license:
+
+ Copyright (c) 2008 Matsuza <matsuza@gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to
+ deal in the Software without restriction, including without limitation the
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ sell copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ IN THE SOFTWARE.
+
+
+[Unicode Normalizer]: http://coderepos.org/share/browser/lang/javascript/UnicodeNormalizer
3 download-source.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+test -d src/ && rm -r src/
+svn export http://svn.coderepos.org/share/lang/javascript/UnicodeNormalizer/ src/
39 example.js
@@ -0,0 +1,39 @@
+unorm = require('unorm');
+
+// Function to display Unicode codepoints of a string.
+function codepoints(string) {
+ return string.split('').map(function(chr) {
+ var codepoint = chr.charCodeAt(0);
+ return (codepoint >= 33 && codepoint <= 126) ?
+ JSON.stringify(chr) :
+ 'U+' + codepoint.toString(16).toUpperCase();
+ }).join(' ');
+}
+
+// Scientific Ångström symbol is converted to Scandinavian letter Å.
+angstrom = '\u212B';
+console.log('- Example 1 -');
+console.log(codepoints(angstrom));
+console.log(codepoints(unorm.nfc(angstrom)));
+
+// German ä and ü decomposed into a and u with Combining Diaeresis character.
+letters = '\u00E4\u00FC'
+console.log('- Example 2 -');
+console.log(codepoints(letters));
+console.log(codepoints(unorm.nfd(letters)));
+
+// String optimized for compatibility, ie. CO₂ becomes CO2.
+scientific = 'CO\u2082 and E=mc\u00B2'
+console.log('- Example 3 -');
+console.log(scientific)
+console.log(unorm.nfkc(scientific));
+
+// NOTE: Rest of the example requires XRegExp: npm install xregexp
+
+// Remove combining characters / marks from Swedish name, ie. ö becomes o.
+// This is useful for indexing and searching internationalized text.
+XRegExp = require('xregexp');
+name = '\u00C5ngstr\u00F6m';
+console.log('- Example 4 -');
+console.log(unorm.nfkd(name));
+console.log(unorm.nfkd(name).replace(XRegExp('\\p{M}', 'g'), ''));
404 index.js
404 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
17 package.json
@@ -0,0 +1,17 @@
+{
+ "name" : "unorm",
+ "version" : "1.0.0",
+ "description" : "JavaScript Unicode Normalization - NFC, NFD, NFKC, NFKD. Read <http://unicode.org/reports/tr15/> UAX #15 Unicode Normalization Forms.",
+ "author": "Matsuza <matsuza@gmail.com>",
+ "contributors": [
+ { "name": "Bjarke Walling", "email": "bwp@bwp.dk" }
+ ],
+ "repository" : {
+ "type" : "git",
+ "url" : "http://github.com/walling/unorm.git"
+ },
+ "main": "./index.js",
+ "engines" : {
+ "node" : ">= 0.4.0"
+ }
+}
21 package.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+test -f index.js && rm index.js
+
+echo '' >> index.js
+echo '/***** unorm.js *****/' >> index.js
+echo '' >> index.js
+cat src/unorm.js | sed -e 's/^})();$/}).call(this);/' >> index.js
+echo 'var UNorm = this.UNorm; // Small hack :-)' >> index.js
+echo '' >> index.js
+echo '/***** unormdata.js *****/' >> index.js
+echo '' >> index.js
+cat src/unormdata.js >> index.js
+echo '' >> index.js
+echo '/***** Export as Common JS module *****/' >> index.js
+echo '' >> index.js
+echo '// The easy conversion functions are exported.' >> index.js
+echo '' >> index.js
+echo 'exports.nfc = UNorm.nfc;' >> index.js
+echo 'exports.nfd = UNorm.nfd;' >> index.js
+echo 'exports.nfkc = UNorm.nfkc;' >> index.js
+echo 'exports.nfkd = UNorm.nfkd;' >> index.js
6 src/data/.classpath
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+ <classpathentry kind="src" path="src"/>
+ <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+ <classpathentry kind="output" path="bin"/>
+</classpath>
17 src/data/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>UnicodeNormalizer</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ <buildCommand>
+ <name>org.eclipse.jdt.core.javabuilder</name>
+ <arguments>
+ </arguments>
+ </buildCommand>
+ </buildSpec>
+ <natures>
+ <nature>org.eclipse.jdt.core.javanature</nature>
+ </natures>
+</projectDescription>
197 src/data/CompositionExclusions.txt
@@ -0,0 +1,197 @@
+# CompositionExclusions-5.1.0.txt
+# Date: 2008-03-20, 17:45:00 PDT [KW]
+#
+# This file lists the characters for the Composition Exclusion Table
+# defined in UAX #15, Unicode Normalization Forms.
+#
+# This file is a normative contributory data file in the
+# Unicode Character Database.
+#
+# Copyright (c) 1991-2008 Unicode, Inc.
+# For terms of use, see http://www.unicode.org/terms_of_use.html
+#
+# For more information, see
+# http://www.unicode.org/unicode/reports/tr15/#Primary Exclusion List Table
+#
+# For a full derivation of composition exclusions, see the derived property
+# Full_Composition_Exclusion in DerivedNormalizationProps.txt
+#
+
+# ================================================
+# (1) Script Specifics
+#
+# This list of characters cannot be derived from the UnicodeData.txt file.
+# ================================================
+
+0958 # DEVANAGARI LETTER QA
+0959 # DEVANAGARI LETTER KHHA
+095A # DEVANAGARI LETTER GHHA
+095B # DEVANAGARI LETTER ZA
+095C # DEVANAGARI LETTER DDDHA
+095D # DEVANAGARI LETTER RHA
+095E # DEVANAGARI LETTER FA
+095F # DEVANAGARI LETTER YYA
+09DC # BENGALI LETTER RRA
+09DD # BENGALI LETTER RHA
+09DF # BENGALI LETTER YYA
+0A33 # GURMUKHI LETTER LLA
+0A36 # GURMUKHI LETTER SHA
+0A59 # GURMUKHI LETTER KHHA
+0A5A # GURMUKHI LETTER GHHA
+0A5B # GURMUKHI LETTER ZA
+0A5E # GURMUKHI LETTER FA
+0B5C # ORIYA LETTER RRA
+0B5D # ORIYA LETTER RHA
+0F43 # TIBETAN LETTER GHA
+0F4D # TIBETAN LETTER DDHA
+0F52 # TIBETAN LETTER DHA
+0F57 # TIBETAN LETTER BHA
+0F5C # TIBETAN LETTER DZHA
+0F69 # TIBETAN LETTER KSSA
+0F76 # TIBETAN VOWEL SIGN VOCALIC R
+0F78 # TIBETAN VOWEL SIGN VOCALIC L
+0F93 # TIBETAN SUBJOINED LETTER GHA
+0F9D # TIBETAN SUBJOINED LETTER DDHA
+0FA2 # TIBETAN SUBJOINED LETTER DHA
+0FA7 # TIBETAN SUBJOINED LETTER BHA
+0FAC # TIBETAN SUBJOINED LETTER DZHA
+0FB9 # TIBETAN SUBJOINED LETTER KSSA
+FB1D # HEBREW LETTER YOD WITH HIRIQ
+FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
+FB2A # HEBREW LETTER SHIN WITH SHIN DOT
+FB2B # HEBREW LETTER SHIN WITH SIN DOT
+FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
+FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
+FB2E # HEBREW LETTER ALEF WITH PATAH
+FB2F # HEBREW LETTER ALEF WITH QAMATS
+FB30 # HEBREW LETTER ALEF WITH MAPIQ
+FB31 # HEBREW LETTER BET WITH DAGESH
+FB32 # HEBREW LETTER GIMEL WITH DAGESH
+FB33 # HEBREW LETTER DALET WITH DAGESH
+FB34 # HEBREW LETTER HE WITH MAPIQ
+FB35 # HEBREW LETTER VAV WITH DAGESH
+FB36 # HEBREW LETTER ZAYIN WITH DAGESH
+FB38 # HEBREW LETTER TET WITH DAGESH
+FB39 # HEBREW LETTER YOD WITH DAGESH
+FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
+FB3B # HEBREW LETTER KAF WITH DAGESH
+FB3C # HEBREW LETTER LAMED WITH DAGESH
+FB3E # HEBREW LETTER MEM WITH DAGESH
+FB40 # HEBREW LETTER NUN WITH DAGESH
+FB41 # HEBREW LETTER SAMEKH WITH DAGESH
+FB43 # HEBREW LETTER FINAL PE WITH DAGESH
+FB44 # HEBREW LETTER PE WITH DAGESH
+FB46 # HEBREW LETTER TSADI WITH DAGESH
+FB47 # HEBREW LETTER QOF WITH DAGESH
+FB48 # HEBREW LETTER RESH WITH DAGESH
+FB49 # HEBREW LETTER SHIN WITH DAGESH
+FB4A # HEBREW LETTER TAV WITH DAGESH
+FB4B # HEBREW LETTER VAV WITH HOLAM
+FB4C # HEBREW LETTER BET WITH RAFE
+FB4D # HEBREW LETTER KAF WITH RAFE
+FB4E # HEBREW LETTER PE WITH RAFE
+
+# Total code points: 67
+
+# ================================================
+# (2) Post Composition Version precomposed characters
+#
+# These characters cannot be derived solely from the UnicodeData.txt file
+# in this version of Unicode.
+#
+# Note that characters added to the standard after the
+# Composition Version and which have canonical decomposition mappings
+# are not automatically added to this list of Post Composition
+# Version precomposed characters.
+# ================================================
+
+2ADC # FORKING
+1D15E # MUSICAL SYMBOL HALF NOTE
+1D15F # MUSICAL SYMBOL QUARTER NOTE
+1D160 # MUSICAL SYMBOL EIGHTH NOTE
+1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
+1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
+1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
+1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
+1D1BB # MUSICAL SYMBOL MINIMA
+1D1BC # MUSICAL SYMBOL MINIMA BLACK
+1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
+1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
+1D1BF # MUSICAL SYMBOL FUSA WHITE
+1D1C0 # MUSICAL SYMBOL FUSA BLACK
+
+# Total code points: 14
+
+# ================================================
+# (3) Singleton Decompositions
+#
+# These characters can be derived from the UnicodeData.txt file
+# by including all characters whose canonical decomposition
+# consists of a single character.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0340..0341 [2] COMBINING GRAVE TONE MARK..COMBINING ACUTE TONE MARK
+# 0343 COMBINING GREEK KORONIS
+# 0374 GREEK NUMERAL SIGN
+# 037E GREEK QUESTION MARK
+# 0387 GREEK ANO TELEIA
+# 1F71 GREEK SMALL LETTER ALPHA WITH OXIA
+# 1F73 GREEK SMALL LETTER EPSILON WITH OXIA
+# 1F75 GREEK SMALL LETTER ETA WITH OXIA
+# 1F77 GREEK SMALL LETTER IOTA WITH OXIA
+# 1F79 GREEK SMALL LETTER OMICRON WITH OXIA
+# 1F7B GREEK SMALL LETTER UPSILON WITH OXIA
+# 1F7D GREEK SMALL LETTER OMEGA WITH OXIA
+# 1FBB GREEK CAPITAL LETTER ALPHA WITH OXIA
+# 1FBE GREEK PROSGEGRAMMENI
+# 1FC9 GREEK CAPITAL LETTER EPSILON WITH OXIA
+# 1FCB GREEK CAPITAL LETTER ETA WITH OXIA
+# 1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
+# 1FDB GREEK CAPITAL LETTER IOTA WITH OXIA
+# 1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
+# 1FEB GREEK CAPITAL LETTER UPSILON WITH OXIA
+# 1FEE..1FEF [2] GREEK DIALYTIKA AND OXIA..GREEK VARIA
+# 1FF9 GREEK CAPITAL LETTER OMICRON WITH OXIA
+# 1FFB GREEK CAPITAL LETTER OMEGA WITH OXIA
+# 1FFD GREEK OXIA
+# 2000..2001 [2] EN QUAD..EM QUAD
+# 2126 OHM SIGN
+# 212A..212B [2] KELVIN SIGN..ANGSTROM SIGN
+# 2329 LEFT-POINTING ANGLE BRACKET
+# 232A RIGHT-POINTING ANGLE BRACKET
+# F900..FA0D [270] CJK COMPATIBILITY IDEOGRAPH-F900..CJK COMPATIBILITY IDEOGRAPH-FA0D
+# FA10 CJK COMPATIBILITY IDEOGRAPH-FA10
+# FA12 CJK COMPATIBILITY IDEOGRAPH-FA12
+# FA15..FA1E [10] CJK COMPATIBILITY IDEOGRAPH-FA15..CJK COMPATIBILITY IDEOGRAPH-FA1E
+# FA20 CJK COMPATIBILITY IDEOGRAPH-FA20
+# FA22 CJK COMPATIBILITY IDEOGRAPH-FA22
+# FA25..FA26 [2] CJK COMPATIBILITY IDEOGRAPH-FA25..CJK COMPATIBILITY IDEOGRAPH-FA26
+# FA2A..FA2D [4] CJK COMPATIBILITY IDEOGRAPH-FA2A..CJK COMPATIBILITY IDEOGRAPH-FA2D
+# FA30..FA6A [59] CJK COMPATIBILITY IDEOGRAPH-FA30..CJK COMPATIBILITY IDEOGRAPH-FA6A
+# FA70..FAD9 [106] CJK COMPATIBILITY IDEOGRAPH-FA70..CJK COMPATIBILITY IDEOGRAPH-FAD9
+# 2F800..2FA1D [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
+
+# Total code points: 1030
+
+# ================================================
+# (4) Non-Starter Decompositions
+#
+# These characters can be derived from the UnicodeData file
+# by including all characters whose canonical decomposition consists
+# of a sequence of characters, the first of which has a non-zero
+# combining class.
+#
+# These characters are simply quoted here for reference.
+# See also Full_Composition_Exclusion in DerivedNormalizationProps.txt
+# ================================================
+
+# 0344 COMBINING GREEK DIALYTIKA TONOS
+# 0F73 TIBETAN VOWEL SIGN II
+# 0F75 TIBETAN VOWEL SIGN UU
+# 0F81 TIBETAN VOWEL SIGN REVERSED II
+
+# Total code points: 4
+
17,819 src/data/NormalizationTest.txt
17,819 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
19,336 src/data/UnicodeData.txt
19,336 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
5 src/data/readme.txt
@@ -0,0 +1,5 @@
+This directory contains
+- Unicode data from Unicode.org
+- JS Code generator (written in Java)
+
+
416 src/data/src/UnormNormalizerBuilder.java
@@ -0,0 +1,416 @@
+import java.io.BufferedReader;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.Formatter;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Builds the normalization tables. This is a separate class so that it can be
+ * unloaded once not needed.<br>
+ * Copyright (c) 1991-2005 Unicode, Inc. For terms of use, see
+ * http://www.unicode.org/terms_of_use.html For documentation, see UAX#15.<br>
+ *
+ * @author Mark Davis
+ */
+class UnormNormalizerBuilder {
+	static final String copyright = "Copyright © 1998-1999 Unicode, Inc.";
+
+ /**
+ * Testing flags
+ */
+
+ private static final boolean DEBUG = false;
+
+ /**
+ * Constants for the data file version to use.
+ */
+ static final boolean NEW_VERSION = true;
+ private static final String DIR = "C:/Documents and Settings/matsuza/My Documents/eclipseWorkspace/UnicodeNormalizer/";
+
+ public static final String UNICODE_DATA = DIR + "UnicodeData.txt";
+ public static final String COMPOSITION_EXCLUSIONS = DIR + "CompositionExclusions.txt";
+
+ /**
+ * Called exactly once by NormalizerData to build the static data
+ */
+
+ static class UChar {
+ final int codepoint;
+ boolean isCompatibility = false;
+ boolean isExcluded = false;
+ Integer[] decompose;
+ int canonicalClass;
+ Map<Integer, Integer> composeTrie = new TreeMap<Integer, Integer>();
+ public UChar(int cp) {
+ this.codepoint = cp;
+ }
+
+ private String toJSON_flag() {
+ return Long.toString(canonicalClass | (isCompatibility ? 1 << 8 : 0) | (isExcluded ? 1 << 9 : 0));
+ }
+
+ private String toJSON_decomp() {
+ StringBuilder sb = new StringBuilder();
+ Formatter f = new Formatter(sb);
+ /*
+ * sb.append("'"); for (int i = 0; decompose != null && i <
+ * decompose.length; ++i) { String s = cpconv(decompose[i]);
+ * f.format("%s", s); } sb.append("'");
+ */
+ sb.append("[");
+ for (int i = 0; decompose != null && i < decompose.length; ++i) {
+ f.format("%d,", decompose[i]);
+ }
+ sb.deleteCharAt(sb.length() - 1);
+ sb.append("]");
+
+ return sb.toString();
+ }
+ private String toJSON_comp() {
+ StringBuilder sb = new StringBuilder();
+ Formatter f = new Formatter(sb);
+ sb.append("{");
+ for (Iterator<Map.Entry<Integer, Integer>> iterator = composeTrie.entrySet().iterator(); iterator.hasNext();) {
+ Map.Entry<Integer, Integer> i = iterator.next();
+ f.format("%d:%d,", i.getKey(), i.getValue());
+ }
+ sb.deleteCharAt(sb.length() - 1);
+ sb.append("}");
+ return sb.toString();
+ }
+
+ public String toJSON() {
+ StringBuilder sb = new StringBuilder();
+ Formatter f = new Formatter(sb);
+ f.format("%d:[", codepoint);
+
+ String flagStr = toJSON_flag();
+ String decompStr = toJSON_decomp();
+ String compStr = toJSON_comp();
+ if (decompose != null) {
+ sb.append(decompStr);
+ }
+ if (flagStr.equals("0") && composeTrie.size() == 0) {
+ sb.append("]");
+ return sb.toString();
+ }
+ sb.append(",");
+ if (!flagStr.equals("0")) {
+ sb.append(flagStr);
+ }
+ if (composeTrie.size() == 0) {
+ f.format("]");
+ return sb.toString();
+ }
+ sb.append(",").append(compStr).append("]");
+ return sb.toString();
+ }
+
+ static TreeMap<Integer, UChar> cmap = new TreeMap<Integer, UChar>();
+ public static UChar getUChar(int cp) {
+ UChar ret = cmap.get(cp);
+ if (ret == null) {
+ ret = new UChar(cp);
+ cmap.put(cp, ret);
+ }
+ return ret;
+ }
+ public static String toJSONAll() {
+ StringBuilder sb = new StringBuilder();
+ UChar uc = null;
+ sb.append("if(!this.UNorm || !this.UNorm.UChar){throw 'must include unorm.js prior to unormdata.js';} UNorm.UChar.udata={\n");
+ Map<Integer, StringBuilder> res = new HashMap<Integer, StringBuilder>();
+ for (int i = 0; i < 256; ++i) {
+ res.put(i, new StringBuilder());
+ }
+ for (Iterator<UChar> iterator = cmap.values().iterator(); iterator.hasNext();) {
+ uc = iterator.next();
+ if (uc.canonicalClass == 0 && !uc.isCompatibility && !uc.isExcluded && uc.decompose == null
+ && uc.composeTrie.size() == 0) {
+ // do nothing
+ } else {
+ res.get((uc.codepoint >> 8) & 0xff).append(uc.toJSON()).append(",");
+ }
+ }
+ for (int i = 0; i < 256; ++i) {
+ StringBuilder sbout = (StringBuilder) res.get(i);
+ if (sbout.length() == 0) {
+ continue;
+ }
+ sbout.deleteCharAt(sbout.length() - 1);
+ sb.append(i << 8).append(":\"{").append(sbout).append("}\",\n");
+ }
+ sb.delete(sb.length() - 2, sb.length() - 1);
+ sb.append("\n};");
+ /*
+ *
+ *
+ *
+ * for (Iterator<UChar> iterator = cmap.values().iterator();
+ * iterator.hasNext();) { uc = iterator.next(); if
+ * (uc.canonicalClass == 0 && !uc.isCompatibility && !uc.isExcluded &&
+ * uc.decompose == null && uc.composeTrie.size() == 0) { } else {
+ * unitsb.append(uc.toJSON()).append(","); ++cnt; } if (cnt ==
+ * cntUnit) { cnt = 0; unitsb.deleteCharAt(unitsb.length() - 1);
+ * sb.append(uc.codepoint).append(":\"{").append(unitsb).append("}\",\n");
+ * unitsb = new StringBuilder(); } }
+ * unitsb.deleteCharAt(unitsb.length() - 1);
+ * sb.append(uc.codepoint).append(":\"{").append(unitsb).append("}\"");
+ * sb.append("\n};");
+ */
+ return sb.toString();
+ }
+ }
+
+ private static void write(Writer w) throws IOException {
+ w.append(UChar.toJSONAll());
+ }
+
+ public static void main(String[] args) {
+ try {
+ readExclusionList();
+ buildDecompositionTables();
+ OutputStreamWriter w = new OutputStreamWriter(new FileOutputStream("unormdata.js"), "utf-8");
+ write(w);
+ w.close();
+ } catch (java.io.IOException e) {
+ System.err.println("Can't load data file." + e + ", " + e.getMessage());
+ }
+ }
+
+ // =============================================================
+ // Building Decomposition Tables
+ // =============================================================
+
+ /**
+ * Reads exclusion list and stores the data
+ */
+ private static void readExclusionList() throws java.io.IOException {
+ if (DEBUG)
+ System.out.println("Reading Exclusions");
+ BufferedReader in = new BufferedReader(new FileReader(COMPOSITION_EXCLUSIONS), 5 * 1024);
+ while (true) {
+
+ // read a line, discarding comments and blank lines
+
+ String line = in.readLine();
+ if (line == null)
+ break;
+ int comment = line.indexOf('#'); // strip comments
+ if (comment != -1)
+ line = line.substring(0, comment);
+ if (line.length() == 0)
+ continue; // ignore blanks
+
+ // store -1 in the excluded table for each character hit
+
+ int value = Integer.parseInt(line.split("[^\\da-fA-F]")[0], 16);
+ UChar.getUChar(value).isExcluded = true;
+ System.out.println("Excluding " + hex(value));
+ }
+ in.close();
+ if (DEBUG)
+ System.out.println("Done reading Exclusions");
+
+ // workaround
+ UChar.getUChar(0x0F81).isExcluded = true;
+ UChar.getUChar(0x0F73).isExcluded = true;
+ UChar.getUChar(0x0F75).isExcluded = true;
+ }
+
+ /**
+ * Builds a decomposition table from a UnicodeData file
+ */
+ private static void buildDecompositionTables() throws java.io.IOException {
+ if (DEBUG)
+ System.out.println("Reading Unicode Character Database");
+ BufferedReader in = new BufferedReader(new FileReader(UNICODE_DATA), 64 * 1024);
+ int value;
+ int counter = 0;
+ while (true) {
+
+ // read a line, discarding comments and blank lines
+
+ String line = in.readLine();
+ if (line == null)
+ break;
+ int comment = line.indexOf('#'); // strip comments
+ if (comment != -1)
+ line = line.substring(0, comment);
+ if (line.length() == 0)
+ continue;
+ if (DEBUG) {
+ counter++;
+ if ((counter & 0xFF) == 0)
+ System.out.println("At: " + line);
+ }
+
+ // find the values of the particular fields that we need
+ // Sample line: 00C0;LATIN ...A GRAVE;Lu;0;L;0041 0300;;;;N;LATIN
+ // ... GRAVE;;;00E0;
+
+ int start = 0;
+ int end = line.indexOf(';'); // code
+ value = Integer.parseInt(line.substring(start, end), 16);
+ UChar uchar = UChar.getUChar(value);
+ if (true && value == '\u00c0') {
+ System.out.println("debug: " + line);
+ }
+ end = line.indexOf(';', start = end + 1); // name
+ end = line.indexOf(';', start = end + 1); // general category
+ end = line.indexOf(';', start = end + 1); // canonical class
+
+ // check consistency: canonical classes must be from 0 to 255
+
+ int cc = Integer.parseInt(line.substring(start, end));
+ if (cc != (cc & 0xFF))
+ System.err.println("Bad canonical class at: " + line);
+
+ // canonicalClass.put(value, cc);
+ uchar.canonicalClass = cc;
+ end = line.indexOf(';', start = end + 1); // BIDI
+ end = line.indexOf(';', start = end + 1); // decomp
+
+ // decomp requires more processing.
+ // store whether it is canonical or compatibility.
+ // store the decomp in one table, and the reverse mapping (from
+ // pairs) in another
+
+ if (start != end) {
+ String segment = line.substring(start, end);
+ boolean compat = segment.charAt(0) == '<';
+ if (compat) {
+ // isCompatibility.set(value);
+ uchar.isCompatibility = true;
+ }
+ Integer[] decomp = fromHex(segment);
+
+ // check consistency: all canon decomps must be singles or
+ // pairs!
+
+ if (decomp.length < 1 || decomp.length > 2 && !compat) {
+ System.err.println("Bad decomp at: " + line);
+ }
+ // decompose.put(value, decomp);
+ uchar.decompose = decomp;
+
+ // only compositions are canonical pairs
+ // skip if script exclusion
+
+ if (!compat && !uchar.isExcluded && decomp.length != 1) {
+ // <decomp>とかの表記がない && 除外指定されていない && singletonでない
+ UChar.getUChar(decomp[0]).composeTrie.put(decomp[1], value);
+ } else if (DEBUG) {
+ System.out.println("Excluding: " + decomp);
+ }
+ }
+ }
+ in.close();
+ if (DEBUG)
+ System.out.println("Done reading Unicode Character Database");
+
+ // add algorithmic Hangul decompositions
+ // this is more compact if done at runtime, but for simplicity we
+ // do it this way.
+ /*
+ * if (DEBUG) System.out.println("Adding Hangul");
+ *
+ * for (int SIndex = 0; SIndex < SCount; ++SIndex) { int TIndex = SIndex %
+ * TCount; int first, second; if (TIndex != 0) { // triple first =
+ * (SBase + SIndex - TIndex); second = (TBase + TIndex); } else { first =
+ * (LBase + SIndex / NCount); second = (VBase + (SIndex % NCount) /
+ * TCount); } value = SIndex + SBase;
+ *
+ * UChar uchar = UChar.getUChar(value); uchar.decompose = new
+ * Integer[]{first, second};
+ * UChar.getUChar(first).composeTrie.put(second, value); } if (DEBUG)
+ * System.out.println("Done adding Hangul");
+ */
+ }
+
+ /**
+ * Hangul composition constants
+ */
+ static final int SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28,
+ NCount = VCount * TCount, // 588
+ SCount = LCount * NCount; // 11172
+
+ /**
+ * Utility: Parses a sequence of hex Unicode characters separated by spaces
+ */
+ static public Integer[] fromHex(String source) {
+ ArrayList<Integer> result = new ArrayList<Integer>();
+ for (int i = 0; i < source.length(); ++i) {
+ char c = source.charAt(i);
+ switch (c) {
+ case ' ' :
+ break; // ignore
+ case '0' :
+ case '1' :
+ case '2' :
+ case '3' :
+ case '4' :
+ case '5' :
+ case '6' :
+ case '7' :
+ case '8' :
+ case '9' :
+ case 'A' :
+ case 'B' :
+ case 'C' :
+ case 'D' :
+ case 'E' :
+ case 'F' :
+ case 'a' :
+ case 'b' :
+ case 'c' :
+ case 'd' :
+ case 'e' :
+ case 'f' :
+ String num = source.substring(i).split("[^\\dA-Fa-f]")[0];
+ result.add(Integer.parseInt(num, 16));
+ i += num.length() - 1; // skip rest of number
+ break;
+ case '<' :
+ int j = source.indexOf('>', i); // skip <...>
+ if (j > 0) {
+ i = j;
+ break;
+ } // else fall through--error
+ default :
+ throw new IllegalArgumentException("Bad hex value in " + source);
+ }
+ }
+ return result.toArray(new Integer[result.size()]);
+ }
+
+ /**
+ * Utility: Supplies a zero-padded hex representation of an integer (without
+ * 0x)
+ */
+ static public String hex(int i) {
+ String result = Long.toString(i & 0xFFFFFFFFL, 16).toUpperCase();
+ return "00000000".substring(result.length(), 8) + result;
+ }
+
+ /**
+ * Utility: Supplies a zero-padded hex representation of a Unicode character
+ * (without 0x, \\u)
+ */
+ public static String hex(String s, String sep) {
+ StringBuffer result = new StringBuffer();
+ for (int i = 0; i < s.length(); ++i) {
+ if (i != 0)
+ result.append(sep);
+ result.append(hex(s.charAt(i)));
+ }
+ return result.toString();
+ }
+}
133 src/index.html
@@ -0,0 +1,133 @@
+<html>
+<head>
+<meta http-equiv="Content-type" content="text/html; charset=utf-8">
+<script charset="utf-8" src="http://jqueryjs.googlecode.com/files/jquery-1.2.6.min.js"></script>
+<script charset="utf-8" src="unorm.js"></script>
+<script charset="utf-8" src="unormdata.js"></script>
+</head>
+<body>
+
+
+<h1>Unicode Normalizer</h1>
+author: matsuza (matsuza gmail.com, <a href="http://d.hatena.ne.jp/matsuza">http://d.hatena.ne.jp/matsuza</a>)<p>
+Unicode規格で規定されているNormalize処理を行います。<br>
+<a href="http://unicode.org/reports/tr15/">UAX #15: Unicode Normalization Forms:</a><br>
+Unicode.orgで提供されているテストケースに準拠しています。
+<br>
+MITとGPLのデュアルライセンスとします。
+
+<script>
+function onchangesample(e){
+ var t = e.srcElement ? e.srcElement.value : e.target.value;
+ var tgt = [[UNorm.nfd, "#nfd"], [UNorm.nfkd, "#nfkd"], [UNorm.nfc, "#nfc"], [UNorm.nfkc, "#nfkc"]];
+ for(var i = 0; i < tgt.length; ++i){
+ var cnvd = tgt[i][0](t);
+ $(tgt[i][1] + "_res")[0].value = cnvd;
+ $(tgt[i][1] + "_esc")[0].innerHTML = (function(str){
+ var ret = "";
+ for(var i = 0; i < str.length; ++i){
+ ret +="&amp;#x" + str.charCodeAt(i).toString(16) + ";";
+ }
+ return ret;
+ })(cnvd);
+ $(tgt[i][1] + "_len")[0].innerHTML = cnvd.length;
+ }
+}
+</script>
+<style>
+
+</style>
+
+<h2>デモ</h2>
+↓に何か文字列を入れると、NFD, NFKD, NFC, NFKCの各モードでノーマライズした結果が表示されます。<br>
+入れると面白い文字列一覧 ; ㈱㍍ガガ①&#xAC00;&#519;<br>
+文字列長や、delキーで文字を削除するときの振る舞いなんか面白いかも知れません。<br>
+
+
+<textarea name="sample" id="sample" onkeyup="onchangesample(event);"></textarea>
+<br>
+<div>
+NFD(length=<span id="nfd_len"></span>)<br>
+<textarea id="nfd_res" class="ntest"></textarea>
+<div id="nfd_esc" class="esc"></div>
+</div>
+<div>
+NFKD(length=<span id="nfkd_len"></span>)<br>
+<textarea id="nfkd_res" class="ntest"></textarea>
+<div id="nfkd_esc" class="esc"></div>
+</div>
+<div>
+NFC(length=<span id="nfc_len"></span>)<br>
+<textarea id="nfc_res" class="ntest"></textarea>
+<div id="nfc_esc" class="esc"></div>
+</div>
+<div>
+NFKC(length=<span id="nfkc_len"></span>)<br>
+<textarea id="nfkc_res" class="ntest"></textarea>
+<div id="nfkc_esc" class="esc"></div>
+</div>
+
+<h2>利用方法</h2>
+<h3>読み込み方法</h3>
+unorm.jsとunormdata.jsを、この順に読み込んでください。
+<h3>名前空間</h3>
+UNorm名前空間を利用します。以下の説明ではこの名前空間が暗黙的に使われているものとします。
+<h3>API</h3>
+
+<h4>normalize(mode, str)</h4>
+ノーマライズを行います。
+<h5>引数</h5>
+<ul>
+<li>mode : ノーマライズのモードを指定します。"NFD", "NFKD", "NFC", "NFKC"が指定できます。
+<li>str : ノーマライズ処理を行う対象のStringです。
+<li>返値 : String
+</ul>
+<h5>備考</h5>
+ノーマライズはそれなりに重い処理なので、大きな文字列を処理する場合は後述するイテレータインタフェースを用いて適度にsetTimeout()することを推奨します。<br>
+<h5>サンプル</h5>
+<pre>
+var s = "㈱㍍";
+document.write(UNorm.normalize("NFKC", s));
+</pre>
+
+<h4>nfd(str)</h4>
+<h4>nfkd(str)</h4>
+<h4>nfc(str)</h4>
+<h4>nfkc(str)</h4>
+関数名に示される種類のノーマライズを行います。
+nfd(str)はnormalize("NFD", str)と厳密に等価です。
+<h5>引数</h5>
+<ul>
+<li>str : ノーマライズ処理を行う対象のStringです。
+<li>返値 : String
+</ul>
+<h5>サンプル</h5>
+<pre>
+var s = "㈱㍍";
+document.write(UNorm.nfkc(s));
+</pre>
+
+<h4>createIterator(mode, str)</h4>
+ノーマライザのイテレータインタフェースを返します。
+<h5>引数</h5>
+<ul>
+<li>mode : ノーマライズのモードを指定します。"NFD", "NFKD", "NFC", "NFKC"が指定できます。
+<li>str : ノーマライズ処理を行う対象のStringです。
+<li>返値 : イテレータを返します。
+</ul>
+<h5>備考</h5>
+返値はイテレータオブジェクトです。<br>
+ノーマライズ処理を行う際、簡便なのはnormalize関数を使って処理することですが、巨大な文字列を処理する場合はイテレータインタフェースを使うことも考慮してください。<br>
+イテレータオブジェクトはnext()メソッドを備えており、next()を呼ぶたびにUCharオブジェクトを返します。返すべきデータが無くなった以降は、next()を呼ぶたびにnullが帰ります。<br>
+UCharオブジェクトはtoString()を備えており、これを用いて文字列に変換できます。<br>
+<h5>サンプル</h5>
+<pre>
+var it = createIterator("NFC", "①ガギグゲゴ");
+var ret = "";
+var uchar;
+while(!!(uchar = it.next())){
+ ret += uchar.toString();
+}
+document.write(ret);
+</pre>
+</body>
14 src/test.html
@@ -0,0 +1,14 @@
+<script charset="utf-8" src="http://jqueryjs.googlecode.com/files/jquery-1.2.6.min.js"></script>
+<script charset="utf-8" src="unorm.js"></script>
+<script charset="utf-8" src="unormdata.js"></script>
+<script charset="utf-8" src="unittest.js"></script>
+
+<input type="button" value="Batch test (takes about 10 minutes)" onclick="testTrigger();">
+<br>
+
+<input type="text" name="testcase" id="testcase">
+<input type="button" value="Single test" onclick="trigger(document.getElementById('testcase').value);">
+<br>
+
+<div id="test_result">
+</div>
116 src/unittest.js
@@ -0,0 +1,116 @@
// Raw contents of the Unicode NormalizationTest data file (or, for a
// single-record run via trigger(), one raw test line).
var utdata;
// Matches one test record: five semicolon-terminated code-point fields
// (c1;c2;c3;c4;c5;). Lines beginning with @, # or ; (headers/comments)
// fail the first field's character class and are skipped.
var testpattern=/^([^@#;]*);([^;]*);([^;]*);([^;]*);([^;]*);/mg;

var match;
var nfd, nfkd;
var testCntMax;      // total number of test records in utdata
var testCnt;         // records processed so far
var testUnitMax = 5; // records per synchronous batch before yielding to the UI
var testUnit;        // records processed since the last setTimeout yield
var timer;           // start timestamp (ms) for elapsed-time reporting
// Fetch the full NormalizationTest data file and run every record in it.
function testTrigger(){
    $.get("data/NormalizationTest.txt", function(data){
        utdata = data;
        // Count the records up front for progress reporting.
        testpattern.lastIndex = 0;
        var records = data.match(testpattern);
        testCntMax = records.length;
        testCnt = 0;
        testUnit = 0;
        timer = new Date().getTime();
        // Rewind the stateful /g regex so doTest() scans from the start.
        testpattern.lastIndex = 0;
        doTest();
    });
}
// Run a single test record supplied as one raw NormalizationTest line.
function trigger(str){
    utdata = str;
    testCnt = 0;
    testCntMax = 1;
    timer = new Date().getTime();
    // Rewind the stateful /g regex before scanning the new input.
    testpattern.lastIndex = 0;
    doTest();
}
// Process one NormalizationTest record per call, then schedule the next.
// Yields to the event loop every testUnitMax records (via setTimeout) so
// the browser stays responsive during the long batch run.
// The `match` parameter is never passed by callers (always doTest()); it
// only serves as a local for the regex match.
function doTest(match){
    if((match = testpattern.exec(utdata)) == null) {
        // No more records: report total elapsed time.
        log((new Date().getTime() - timer) + "ms");
        return;
    }
    // raw[1..5] hold columns c1..c5 of the record, decoded to strings
    // (raw[0] is the decoded full match).
    var raw = getTestString(match);
    var nfd = jQuery.map(raw, function(i){
        return UNorm.normalize("NFD", i);
    });
    var nfkd = jQuery.map(raw, function(i){
        return UNorm.normalize("NFKD", i);
    });
    var nfc = jQuery.map(raw, function(i){
        return UNorm.normalize("NFC", i);
    });
    var nfkc = jQuery.map(raw, function(i){
        return UNorm.normalize("NFKC", i);
    });
    // Conformance invariants defined by the NormalizationTest.txt header:
    // c3 == NFD(c1..c3), c5 == NFD(c4..c5); c5 == NFKD(c1..c5);
    // c2 == NFC(c1..c3), c4 == NFC(c4..c5); c4 == NFKC(c1..c5).
    try{
        //NFD
        assert(raw[3], nfd[1], "c3 == NFD(c1)");
        assert(raw[3], nfd[2], "c3 == NFD(c2)");
        assert(raw[3], nfd[3], "c3 == NFD(c3)");
        assert(raw[5], nfd[4], "c5 == NFD(c4)");
        assert(raw[5], nfd[5], "c5 == NFD(c5)");
        //NFKD
        assert(raw[5], nfkd[1], "c5 == NFKD(c1)");
        assert(raw[5], nfkd[2], "c5 == NFKD(c2)");
        assert(raw[5], nfkd[3], "c5 == NFKD(c3)");
        assert(raw[5], nfkd[4], "c5 == NFKD(c4)");
        assert(raw[5], nfkd[5], "c5 == NFKD(c5)");
        //NFC
        assert(raw[2], nfc[1], "c2 == NFC(c1)");
        assert(raw[2], nfc[2], "c2 == NFC(c2)");
        assert(raw[2], nfc[3], "c2 == NFC(c3)");
        assert(raw[4], nfc[4], "c4 == NFC(c4)");
        assert(raw[4], nfc[5], "c4 == NFC(c5)");
        //NFKC (labels fixed: the expected column here is c4, not c5)
        assert(raw[4], nfkc[1], "c4 == NFKC(c1)");
        assert(raw[4], nfkc[2], "c4 == NFKC(c2)");
        assert(raw[4], nfkc[3], "c4 == NFKC(c3)");
        assert(raw[4], nfkc[4], "c4 == NFKC(c4)");
        assert(raw[4], nfkc[5], "c4 == NFKC(c5)");
    } catch(e){
        log(match[0] + ":" + e.toString());
    }
    if(++testCnt % 100 == 0){
        log(testCnt + "/" + testCntMax);
    }
    // Defer the next record every testUnitMax iterations so the page can
    // repaint; otherwise recurse synchronously for speed.
    if(++testUnit % testUnitMax == 0){
        setTimeout(doTest, 0);
    } else {
        doTest();
    }
}
+
// Append one line of output to the on-page result area.
function log(s){
    var resultArea = $("#test_result")[0];
    resultArea.innerHTML += s + "<br>";
}
+function assert(l, r, msg){
+ function toReadable(s){
+ var ret = [];
+ for(var i = 0; i < s.length; ++i){
+ ret.push(s.charCodeAt(i));
+ }
+ return ret.join(" ");
+ }
+ if(l != r){
+ throw (msg + "(" + toReadable(l) + " vs " + toReadable(r) + ")");
+ }
+}
// Decode one regex match of a NormalizationTest record into an array of
// strings: for each captured field, every run of hex digits is parsed as a
// code point and converted to its character(s). ret[0] decodes the full
// match; ret[1..5] decode columns c1..c5.
function getTestString(line){
    var ret = [];
    var s;
    var splitpattern = /[0-9a-fA-F]+/g;
    for(var i = 0; i < line.length; ++i){
        s = "";
        var match;
        // exec() resets splitpattern.lastIndex to 0 when it returns null,
        // so the /g regex can safely be reused for the next field.
        while ((match = splitpattern.exec(line[i])) != null) {
            // parseInt with radix 16 replaces the old eval("0x...") hack.
            s += UNorm.UChar.fromCharCode(parseInt(match[0], 16)).toString();
        }
        ret.push(s);
    }
    return ret;
}
+
320 src/unorm.js
@@ -0,0 +1,320 @@
+/*
+ * UnicodeNormalizer 1.0.0
+ * Copyright (c) 2008 Matsuza
+ * Dual licensed under the MIT (MIT-LICENSE.txt) and GPL (GPL-LICENSE.txt) licenses.
+ * $Date: 2008-06-05 16:44:17 +0200 (Thu, 05 Jun 2008) $
+ * $Rev: 13309 $
+ */
+
+
+
+(function(){
    // Feature triple used for code points with no normalization data:
    // [decomposition, flags/canonical-class word, composition map].
    var DEFAULT_FEATURE = [null, 0, {}];
    // A 256-codepoint page must see this many feature-bearing lookups
    // before individual results start being memoized (see fromCache).
    var CACHE_THRESHOLD = 10;
    // Hangul syllable constants from the Unicode conjoining-jamo algorithm.
    var SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, LCount = 19, VCount = 21, TCount = 28;
    var NCount = VCount * TCount; // 588
    var SCount = LCount * NCount; // 11172
+
    // A single Unicode code point plus its normalization feature triple.
    // feature may be null (lazily loaded later via prepFeature) or
    // [decomposition array, flags/canonical-class word, composition map].
    var UChar = function(cp, feature){
        this.codepoint = cp;
        this.feature = feature;
    };
+
    // Factory for UChar instances. cp is run through a chain of strategies
    // (fromCpFilter -> fromCache -> fromCpOnly -> fromRuleBasedJamo ->
    // fromData) that is built lazily on first call and memoized, along with
    // the cache, on the function object itself.
    // needFeature: when falsy the data lookup may be skipped entirely and
    // the UChar is created with feature == null (see prepFeature).
    UChar.fromCharCode = function(cp, /* option */ needFeature){
        // NOTE(review): arguments.callee is deprecated and throws in strict
        // mode; a direct reference to UChar.fromCharCode would be safer.
        var ctx = arguments.callee;
        // Serve from / populate the cache. A result is only memoized once
        // its 256-codepoint page has exceeded CACHE_THRESHOLD
        // feature-bearing lookups.
        function fromCache(next, cp, needFeature){
            if(!ctx.cache){
                ctx.cache = {};
                ctx.counter = [];
                for(var i = 0; i <= 0xFF; ++i){
                    ctx.counter[i] = 0;
                }
            }
            var ret = ctx.cache[cp];
            if(!ret){
                ret = next(cp, needFeature);
                if(!!ret.feature && ++ctx.counter[(cp >> 8) & 0xFF] > CACHE_THRESHOLD){
                    ctx.cache[cp] = ret;
                }
            }
            return ret;
        }
        // Look the code point up in the generated data table (UChar.udata,
        // loaded from unormdata.js). Each 256-codepoint page is stored as a
        // string and lazily eval'ed into an object on first access.
        function fromData(next, cp, needFeature){
            var hash = cp & 0xFF00;
            var dunit = UChar.udata[hash];
            if(dunit == null){
                dunit = UChar.udata[hash] = {};
            } else if(typeof(dunit) == "string"){
                // NOTE(review): eval of generated data; acceptable only as
                // long as unormdata.js is trusted build output.
                dunit = UChar.udata[hash] = eval("(" + dunit + ")");
            }
            var f = dunit[cp];
            return f ? new UChar(cp, f) : new UChar(cp, DEFAULT_FEATURE);
        }
        // If the caller does not need the feature triple, stop here with a
        // bare UChar; the feature is filled in lazily by prepFeature.
        function fromCpOnly(next, cp, needFeature){
            return !!needFeature ? next(cp, needFeature) : new UChar(cp, null);
        }
        // Hangul jamo/syllables are decomposed and composed algorithmically
        // (conjoining-jamo rules) instead of via data tables.
        function fromRuleBasedJamo(next, cp, needFeature){
            if(cp < LBase || (LBase + LCount <= cp && cp < SBase) || (SBase + SCount < cp)){
                return next(cp, needFeature);
            }
            if(LBase <= cp && cp < LBase + LCount){
                // Leading consonant: composition map from each vowel to the
                // corresponding LV syllable.
                var c = {};
                var base = (cp - LBase) * VCount;
                for(var i = 0; i < VCount; ++i){
                    c[VBase + i] = SBase + TCount * (i + base);
                }
                return new UChar(cp, [,,c]);
            }

            var SIndex = cp - SBase;
            var TIndex = SIndex % TCount;
            var feature = [];
            if(TIndex != 0){
                // LVT syllable: decomposes into LV syllable + trailing jamo.
                feature[0] = [SBase + SIndex - TIndex, TBase + TIndex];
            } else {
                // LV syllable: decomposes into L + V, and composes with any
                // trailing consonant into the corresponding LVT syllable.
                feature[0] = [LBase + Math.floor(SIndex / NCount), VBase + Math.floor((SIndex % NCount) / TCount)];
                feature[2] = {};
                for(var i = 1; i < TCount; ++i){
                    feature[2][TBase + i] = cp + i;
                }
            }
            return new UChar(cp, feature);
        }
        // Fast path: code-point ranges that carry no normalization data at
        // all get DEFAULT_FEATURE without touching the later strategies.
        function fromCpFilter(next, cp, needFeature){
            return cp < 60 || 13311 < cp && cp < 42607 ? new UChar(cp, DEFAULT_FEATURE) : next(cp, needFeature);
        }
        if(!ctx.strategy){
            //first call: chain the strategies so each one's `next` argument
            //is the strategy after it in the list above.
            var strategies = [fromCpFilter, fromCache, fromCpOnly, fromRuleBasedJamo, fromData];
            UChar.fromCharCode.strategy = null;
            while(strategies.length > 0){
                ctx.strategy = (function(next, strategy, cp, needFeature){
                    return function(cp, needFeature){
                        return strategy(next, cp, needFeature);
                    };
                })(ctx.strategy, strategies.pop(), cp, needFeature);
            }
        }
        return ctx.strategy(cp, needFeature);
    };
+
+ UChar.isHighSurrogate = function(cp){
+ return cp >= 0xD800 && cp <= 0xDBFF;
+ }
+ UChar.isLowSurrogate = function(cp){
+ return cp >= 0xDC00 && cp <= 0xDFFF;
+ }
+
    // Lazily load the feature triple: fromCpOnly may have created this
    // UChar with feature == null to avoid the data-table lookup, so fetch
    // the full feature on first need.
    UChar.prototype.prepFeature = function(){
        if(!this.feature){
            this.feature = UChar.fromCharCode(this.codepoint, true).feature;
        }
    };
+
+ UChar.prototype.toString = function(){
+ if(this.codepoint < 0x10000){
+ return String.fromCharCode(this.codepoint);
+ } else {
+ var x = this.codepoint - 0x10000;
+ return String.fromCharCode(Math.floor(x / 0x400) + 0xD800, x % 0x400 + 0xDC00);
+ }
+ };
+
+ UChar.prototype.getDecomp = function(){
+ this.prepFeature();
+ return this.feature[0] || null;
+ };
+
+ UChar.prototype.isCompatibility = function(){
+ this.prepFeature();
+ return !!this.feature[1] && (this.feature[1] & (1 << 8));
+ }
+ UChar.prototype.isExclude = function(){
+ this.prepFeature();
+ return !!this.feature[1] && (this.feature[1] & (1 << 9));
+ }
+ UChar.prototype.getCanonicalClass = function(){
+ this.prepFeature();
+ return !!this.feature[1] ? (this.feature[1] & 0xff) : 0;
+ }
+ UChar.prototype.getComposite = function(following){
+ this.prepFeature();
+ if(!this.feature[2]){
+ return null;
+ }
+ var cp = this.feature[2][following.codepoint];
+ return (cp != null) ? UChar.fromCharCode(cp) : null;
+ }
+
+ var UCharIterator = function(str){
+ this.str = str;
+ this.cursor = 0;
+ }
+ UCharIterator.prototype.next = function(){
+ if(!!this.str && this.cursor < this.str.length){
+ var cp = this.str.charCodeAt(this.cursor++);
+ var d;
+ if(UChar.isHighSurrogate(cp) && this.cursor < this.str.length && UChar.isLowSurrogate((d = this.str.charCodeAt(this.cursor)))){
+ cp = (cp - 0xD800) * 0x400 + (d -0xDC00) + 0x10000;
+ ++this.cursor;
+ }
+ return UChar.fromCharCode(cp);
+ } else {
+ this.str = null;
+ return null;
+ }
+ }
+
+ var RecursDecompIterator = function(it, cano){
+ this.it = it;
+ this.canonical = cano;
+ this.resBuf = [];
+ };
+
+ RecursDecompIterator.prototype.next = function(){
+ function recursiveDecomp(cano, uchar){
+ var decomp = uchar.getDecomp();
+ if(!!decomp && !(cano && uchar.isCompatibility())){
+ var ret = [];
+ for(var i = 0; i < decomp.length; ++i){
+ var a = recursiveDecomp(cano, UChar.fromCharCode(decomp[i]));
+ //ret.concat(a); //<-why does not this work?
+ //following block is a workaround.
+ for(var j = 0; j < a.length; ++j){
+ ret.push(a[j]);
+ }
+ }
+ return ret;
+ } else {
+ return [uchar];
+ }
+ }
+ if(this.resBuf.length == 0){
+ var uchar = this.it.next();
+ if(!uchar){
+ return null;
+ }
+ this.resBuf = recursiveDecomp(this.canonical, uchar);
+ }
+ return this.resBuf.shift();
+ };
+
+ var DecompIterator = function(it){
+ this.it = it;
+ this.resBuf = [];
+ };
+
+ DecompIterator.prototype.next = function(){
+ var cc;
+ if(this.resBuf.length == 0){
+ do{
+ var uchar = this.it.next();
+ if(!uchar){
+ break;
+ }
+ cc = uchar.getCanonicalClass();
+ var inspt = this.resBuf.length;
+ if(cc != 0){
+ for(; inspt > 0; --inspt){
+ var uchar2 = this.resBuf[inspt - 1];
+ var cc2 = uchar2.getCanonicalClass();
+ if(cc2 <= cc){
+ break;
+ }
+ }
+ }
+ this.resBuf.splice(inspt, 0, uchar);
+ } while(cc != 0);
+ }
+ return this.resBuf.shift();
+ };
+
    // Applies canonical composition to a stream that is already decomposed
    // and canonically ordered: each starter is combined with following
    // characters wherever a composite exists and is not blocked.
    var CompIterator = function(it){
        this.it = it;
        this.procBuf = [];      // current run: starter + pending characters
        this.resBuf = [];       // finished characters ready to emit
        this.lastClass = null;  // canonical class of the last char kept in procBuf
    };

    CompIterator.prototype.next = function(){
        while(this.resBuf.length == 0){
            var uchar = this.it.next();
            if(!uchar){
                // Source exhausted: flush whatever run is still pending.
                this.resBuf = this.procBuf;
                this.procBuf = [];
                break;
            }
            if(this.procBuf.length == 0){
                // Start a new run with this character.
                this.lastClass = uchar.getCanonicalClass();
                this.procBuf.push(uchar);
            } else {
                var starter = this.procBuf[0];
                var composite = starter.getComposite(uchar);
                var cc = uchar.getCanonicalClass();
                // Compose when a composite exists and uchar is not blocked
                // from the starter (last kept char has a lower class, or
                // the run so far is just the starter itself).
                if(!!composite && (this.lastClass < cc || this.lastClass == 0)){
                    this.procBuf[0] = composite;
                } else {
                    if(cc == 0){
                        // A new starter ends the run: emit it and restart.
                        this.resBuf = this.procBuf;
                        this.procBuf = [];
                    }
                    this.lastClass = cc;
                    this.procBuf.push(uchar);
                }
            }
        }
        return this.resBuf.shift();
    };
+
+ var createIterator = function(mode, str){
+ switch(mode){
+ case "NFD":
+ return new DecompIterator(new RecursDecompIterator(new UCharIterator(str), true));
+ case "NFKD":
+ return new DecompIterator(new RecursDecompIterator(new UCharIterator(str), false));
+ case "NFC":
+ return new CompIterator(new DecompIterator(new RecursDecompIterator(new UCharIterator(str), true)));
+ case "NFKC":
+ return new CompIterator(new DecompIterator(new RecursDecompIterator(new UCharIterator(str), false)));
+ }
+ throw mode + " is invalid";
+ };
+ var normalize = function(mode, str){
+ var it = createIterator(mode, str);
+ var ret = "";
+ var uchar;
+ while(!!(uchar = it.next())){
+ ret += uchar.toString();
+ }
+ return ret;
+ };
+
+ var nfd = function(str){
+ return normalize("NFD", str);
+ };
+
+ var nfkd = function(str){
+ return normalize("NFKD", str);
+ };
+
+ var nfc = function(str){
+ return normalize("NFC", str);
+ };
+
+ var nfkc = function(str){
+ return normalize("NFKC", str);
+ };
+
    // exports: publish the public API on the global UNorm namespace
    // (in this non-strict IIFE, `this` is the global object), reusing an
    // existing UNorm object rather than clobbering it.
    this.UNorm = this.UNorm || {};
    var ns = this.UNorm;
    ns.UChar = UChar;
    ns.normalize = normalize;
    ns.createIterator = createIterator;
    ns.nfd = nfd;
    ns.nfkd = nfkd;
    ns.nfc = nfc;
    ns.nfkc = nfkc;
})();
+
+
68 src/unormdata.js
68 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.

0 comments on commit 63d85f3

Please sign in to comment.
Something went wrong with that request. Please try again.