Skip to content

Commit

Permalink
No commit message
Browse files Browse the repository at this point in the history
  • Loading branch information
takscape committed Aug 5, 2006
1 parent 439c0ac commit d562377
Show file tree
Hide file tree
Showing 120 changed files with 9,886 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse JDT build path for this project. -->
<classpath>
<!-- Source folder: the empty path, i.e. the project root itself. -->
<classpathentry kind="src" path=""/>
<!-- Container entry for the JRE provided by the Eclipse launching support. -->
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<!-- Output folder: also the empty path, so class files land next to the sources. -->
<classpathentry kind="output" path=""/>
</classpath>
17 changes: 17 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse project descriptor for the juniversalchardet project. -->
<projectDescription>
<name>juniversalchardet</name>
<comment></comment>
<!-- No referenced projects are declared. -->
<projects>
</projects>
<!-- A single build command: the JDT Java builder, with no arguments. -->
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<!-- A single project nature: the JDT Java nature. -->
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
Binary file added org/mozilla/universalchardet/CharsetListener.class
Binary file not shown.
43 changes: 43 additions & 0 deletions org/mozilla/universalchardet/CharsetListener.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA <k-tak@void.in> (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */

package org.mozilla.universalchardet;

/**
 * Callback used to hand a charset name to interested code.
 */
public interface CharsetListener {

    /**
     * Receives a charset name.
     *
     * @param charset the charset name being reported
     */
    void report(String charset);
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
299 changes: 299 additions & 0 deletions org/mozilla/universalchardet/UniversalDetector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom <shooshX@gmail.com>
* Kohei TAKETA <k-tak@void.in> (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */

package org.mozilla.universalchardet;

import org.mozilla.universalchardet.prober.CharsetProber;
import org.mozilla.universalchardet.prober.MBCSGroupProber;
import org.mozilla.universalchardet.prober.SBCSGroupProber;
import org.mozilla.universalchardet.prober.EscCharsetProber;
import org.mozilla.universalchardet.prober.Latin1Prober;

/**
 * Incremental charset detector.
 *
 * <p>Typical use: create an instance with a {@link CharsetListener}, feed the
 * input in one or more chunks through {@link #handleData(byte[], int, int)}
 * (optionally stopping early once {@link #isDone()} returns {@code true}),
 * then call {@link #dataEnd()} so that the result, if any, is passed to the
 * listener. Call {@link #reset()} before reusing the instance for another
 * input.
 *
 * <p>Instances keep mutable state between calls and contain no
 * synchronization.
 */
public class UniversalDetector
{
    ////////////////////////////////////////////////////////////////
    // constants
    ////////////////////////////////////////////////////////////////
    /** Not referenced inside this class; presumably consumed by the probers (TODO confirm). */
    public static final float SHORTCUT_THRESHOLD = 0.95f;
    /** Confidence the best prober must exceed in {@link #dataEnd()} for a result to be reported. */
    public static final float MINIMUM_THRESHOLD = 0.20f;


    ////////////////////////////////////////////////////////////////
    // inner types
    ////////////////////////////////////////////////////////////////
    /** Coarse classification of the bytes seen so far. */
    public enum InputState
    {
        /** No escape sequence and no high byte has been seen. */
        PURE_ASCII,
        /** An ESC byte (0x1B) or the sequence "~{" has been seen, but no high byte. */
        ESC_ASCII,
        /** At least one byte with the top bit set (other than 0xA0) has been seen. */
        HIGHBYTE
    }


    ////////////////////////////////////////////////////////////////
    // fields
    ////////////////////////////////////////////////////////////////
    private InputState inputState;
    private boolean done;
    /** True until the first non-empty chunk has been checked for a byte order mark. */
    private boolean start;
    private boolean gotData;
    /** Most recent byte that was not classified as a high byte. */
    private byte lastChar;
    private String detectedCharset;

    /** Probers used in the HIGHBYTE state; created lazily, then kept across resets. */
    private CharsetProber[] probers;
    private CharsetProber escCharsetProber;

    private CharsetListener listener;


    ////////////////////////////////////////////////////////////////
    // methods
    ////////////////////////////////////////////////////////////////
    /**
     * Creates a detector in its initial state.
     *
     * @param listener receiver of the detection result; may be {@code null},
     *                 in which case nothing is reported
     */
    public UniversalDetector(CharsetListener listener)
    {
        this.listener = listener;
        this.escCharsetProber = null;
        // Array slots start out null; the probers are created on demand.
        this.probers = new CharsetProber[3];

        reset();
    }

    /**
     * Tells whether a charset has already been determined.
     *
     * @return {@code true} once further calls to
     *         {@link #handleData(byte[], int, int)} would be ignored
     */
    public boolean isDone()
    {
        return this.done;
    }

    /**
     * Feeds the next chunk of input to the detector.
     *
     * <p>The first non-empty chunk is checked for a byte order mark. After
     * that every byte is scanned to update the input state, and the chunk is
     * handed to the prober(s) matching that state. The call is ignored once
     * {@link #isDone()} is {@code true}.
     *
     * @param buf    buffer holding the data
     * @param offset index of the first byte to examine
     * @param length number of bytes to examine
     */
    public void handleData(final byte[] buf, int offset, int length)
    {
        if (this.done) {
            return;
        }

        if (length > 0) {
            this.gotData = true;
        }

        // The BOM check is one-shot, so it is only spent on a chunk that
        // actually carries bytes; an empty first chunk must not disable it.
        if (this.start && length > 0) {
            this.start = false;
            this.detectedCharset = detectBom(buf, offset, length);
            if (this.detectedCharset != null) {
                this.done = true;
                return;
            }
        }

        updateInputState(buf, offset, length);

        CharsetProber.ProbingState st;
        if (this.inputState == InputState.ESC_ASCII) {
            if (this.escCharsetProber == null) {
                this.escCharsetProber = new EscCharsetProber();
            }
            st = this.escCharsetProber.handleData(buf, offset, length);
            if (st == CharsetProber.ProbingState.FOUND_IT) {
                this.done = true;
                this.detectedCharset = this.escCharsetProber.getCharSetName();
            }
        } else if (this.inputState == InputState.HIGHBYTE) {
            // The first prober that is certain wins; the rest are not consulted.
            for (int i=0; i<this.probers.length; ++i) {
                st = this.probers[i].handleData(buf, offset, length);
                if (st == CharsetProber.ProbingState.FOUND_IT) {
                    this.done = true;
                    this.detectedCharset = this.probers[i].getCharSetName();
                    return;
                }
            }
        } else { // pure ascii
            // do nothing
        }
    }

    /**
     * Looks for a byte order mark at the beginning of a chunk.
     *
     * <p>The UTF-8 mark is recognised as soon as three bytes are available.
     * The remaining marks need four bytes, because the two-byte UTF-16
     * signatures are prefixes of four-byte ones and cannot be told apart
     * with less data.
     *
     * @return the charset name implied by the mark, or {@code null} if none
     *         was recognised
     */
    private static String detectBom(byte[] buf, int offset, int length)
    {
        if (length < 3) {
            return null;
        }

        int b1 = buf[offset] & 0xFF;
        int b2 = buf[offset+1] & 0xFF;
        int b3 = buf[offset+2] & 0xFF;

        if (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) {
            return "UTF-8";
        }

        if (length < 4) {
            return null;
        }

        int b4 = buf[offset+3] & 0xFF;

        switch (b1) {
        case 0xFE:
            if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) {
                return "X-ISO-10646-UCS-4-3412";
            } else if (b2 == 0xFF) {
                return "UTF-16BE";
            }
            break;
        case 0x00:
            if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) {
                return "UTF-32BE";
            } else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) {
                return "X-ISO-10646-UCS-4-2143";
            }
            break;
        case 0xFF:
            if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) {
                return "UTF-32LE";
            } else if (b2 == 0xFE) {
                return "UTF-16LE";
            }
            break;
        } // switch end

        return null;
    }

    /**
     * Scans a chunk byte by byte and moves {@link #inputState} forward:
     * PURE_ASCII may become ESC_ASCII, and either may become HIGHBYTE.
     * Entering HIGHBYTE drops the escape prober and creates any of the three
     * high-byte probers that do not exist yet.
     */
    private void updateInputState(byte[] buf, int offset, int length)
    {
        int maxPos = offset + length;
        for (int i=offset; i<maxPos; ++i) {
            int c = buf[i] & 0xFF;
            // 0xA0 is deliberately not counted as a high byte here
            // (presumably because it is a lone NBSP in many otherwise
            // ASCII texts; confirm against the upstream detector).
            if ((c & 0x80) != 0 && c != 0xA0) {
                if (this.inputState != InputState.HIGHBYTE) {
                    this.inputState = InputState.HIGHBYTE;

                    if (this.escCharsetProber != null) {
                        this.escCharsetProber = null;
                    }

                    if (this.probers[0] == null) {
                        this.probers[0] = new MBCSGroupProber();
                    }
                    if (this.probers[1] == null) {
                        this.probers[1] = new SBCSGroupProber();
                    }
                    if (this.probers[2] == null) {
                        this.probers[2] = new Latin1Prober();
                    }
                }
            } else {
                // ESC (0x1B), or '{' (0x7B) directly after '~' (0x7E).
                if (this.inputState == InputState.PURE_ASCII &&
                    (c == 0x1B || (c == 0x7B && this.lastChar == 0x7E))) {
                    this.inputState = InputState.ESC_ASCII;
                }
                this.lastChar = buf[i];
            }
        } // for end
    }

    /**
     * Signals the end of the input and reports the result to the listener.
     *
     * <p>Nothing happens if no data was ever received. If a charset has
     * already been determined it is reported. Otherwise, in the HIGHBYTE
     * state, the prober with the highest confidence is chosen and its charset
     * is reported only if that confidence exceeds
     * {@link #MINIMUM_THRESHOLD}. In the other states nothing is reported.
     */
    public void dataEnd()
    {
        if (!this.gotData) {
            return;
        }

        if (this.detectedCharset != null) {
            this.done = true;
            if (this.listener != null) {
                this.listener.report(this.detectedCharset);
            }
            return;
        }

        if (this.inputState == InputState.HIGHBYTE) {
            float proberConfidence;
            float maxProberConfidence = 0.0f;
            int maxProber = 0;

            // Strict comparison: on a tie the earlier prober is kept.
            for (int i=0; i<this.probers.length; ++i) {
                proberConfidence = this.probers[i].getConfidence();
                if (proberConfidence > maxProberConfidence) {
                    maxProberConfidence = proberConfidence;
                    maxProber = i;
                }
            }

            if (maxProberConfidence > MINIMUM_THRESHOLD) {
                if (this.listener != null) {
                    this.listener.report(this.probers[maxProber].getCharSetName());
                }
            }
        } else if (this.inputState == InputState.ESC_ASCII) {
            // do nothing
        } else {
            // do nothing
        }
    }

    /**
     * Returns the detector to its initial state so that it can process a new
     * input. Probers that already exist are reset and kept. Also invoked by
     * the constructor.
     */
    public void reset()
    {
        this.done = false;
        this.start = true;
        this.detectedCharset = null;
        this.gotData = false;
        this.inputState = InputState.PURE_ASCII;
        this.lastChar = 0;

        if (this.escCharsetProber != null) {
            this.escCharsetProber.reset();
        }

        for (int i=0; i<this.probers.length; ++i) {
            if (this.probers[i] != null) {
                this.probers[i].reset();
            }
        }
    }


    ////////////////////////////////////////////////////////////////
    // testing
    ////////////////////////////////////////////////////////////////
    /**
     * Command-line test driver: runs the detector over the file named by the
     * single argument and prints the reported charset, if any.
     *
     * @param args exactly one element, the file name
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length != 1) {
            System.out.println("USAGE: java UniversalDetector filename");
            return;
        }

        UniversalDetector detector = new UniversalDetector(
            new CharsetListener() {
                public void report(String name)
                {
                    System.out.println("charset = " + name);
                }
            }
        );

        byte[] buf = new byte[4096];
        java.io.FileInputStream fis = new java.io.FileInputStream(args[0]);
        try {
            int nread;
            while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
        } finally {
            // Release the file handle even when reading or detection fails.
            fis.close();
        }
        detector.dataEnd();
    }
}
Binary file not shown.
Loading

0 comments on commit d562377

Please sign in to comment.