Skip to content

Commit

Permalink
Add multi-byte character ratio detection in UTF-8 prober confidence function
Browse files Browse the repository at this point in the history

see also aadsm/jschardet#57
  • Loading branch information
yinyue200 committed Feb 19, 2021
1 parent aef61a2 commit dcea8d5
Showing 1 changed file with 22 additions and 4 deletions.
26 changes: 22 additions & 4 deletions src/Core/Probers/MultiByte/UTF8Prober.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
*
* ***** END LICENSE BLOCK ***** */

using System;
using System.Text;

using UtfUnknown.Core.Models;
Expand All @@ -48,6 +49,9 @@ public class UTF8Prober : CharsetProber
private static float ONE_CHAR_PROB = 0.50f;
private CodingStateMachine codingSM;
private int numOfMBChar;
private int mbCharLen;
private int fullLen;
private int basicAsciiLen;

public UTF8Prober()
{
Expand All @@ -70,12 +74,13 @@ public override void Reset()

public override ProbingState HandleData(byte[] buf, int offset, int len)
{
fullLen += buf.Length;
int max = offset + len;

for (int i = offset; i < max; i++)
{

var codingState = codingSM.NextState(buf[i]);
var c = buf[i];
var codingState = codingSM.NextState(c);

if (codingState == StateMachineModel.ERROR)
{
Expand All @@ -92,7 +97,14 @@ public override ProbingState HandleData(byte[] buf, int offset, int len)
if (codingState == StateMachineModel.START)
{
if (codingSM.CurrentCharLen >= 2)
{
numOfMBChar++;
mbCharLen += codingSM.CurrentCharLen;
}
else if(c < 128)// codes higher than 127 are extended ASCII
{
basicAsciiLen++;
}
}
}

Expand All @@ -107,11 +119,17 @@ public override float GetConfidence(StringBuilder status = null)
{
float unlike = 0.99f;
float confidence;
var mbCharRatio = 0;
var nonBasciAsciiLen = fullLen - basicAsciiLen;
if (nonBasciAsciiLen > 0)
{
mbCharRatio = mbCharLen / nonBasciAsciiLen;
}

if (numOfMBChar < 6)
if (numOfMBChar < 6 && mbCharRatio <= 0.6)
{
for (int i = 0; i < numOfMBChar; i++)
unlike *= ONE_CHAR_PROB;
unlike *= (float)Math.Pow(ONE_CHAR_PROB, numOfMBChar);

confidence = 1.0f - unlike;
}
Expand Down

0 comments on commit dcea8d5

Please sign in to comment.