Skip to content

Commit

Permalink
[+]: fixed "UTF8::is_binary()"
Browse files Browse the repository at this point in the history
[+]: added some more UnitTests ...
  • Loading branch information
Lars Moelleken committed Apr 23, 2015
1 parent 36bf442 commit c45de08
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 39 deletions.
72 changes: 36 additions & 36 deletions src/voku/helper/UTF8.php
Original file line number Diff line number Diff line change
Expand Up @@ -1223,7 +1223,9 @@ public static function substr($str, $start = 0, $length = null, $cleanUtf8 = fal
}

if ($length === null) {
$length = self::strlen($str);
$length = (int) self::strlen($str);
} else {
$length = (int) $length;
}

if (self::$support['mbstring'] === true) {
Expand All @@ -1237,9 +1239,9 @@ public static function substr($str, $start = 0, $length = null, $cleanUtf8 = fal
}

if ($bug62759) {
return Intl::grapheme_substr_workaround62759($str, $start, $length);
return (string) Intl::grapheme_substr_workaround62759($str, $start, $length);
} else {
return grapheme_substr($str, $start, $length);
return (string) grapheme_substr($str, $start, $length);
}
}

Expand Down Expand Up @@ -1462,17 +1464,19 @@ public static function strwidth($s)
*
* @return int|null
*/
public static function strcspn($s, $charlist, $start = 0, $len = 2147483647)
public static function strcspn($str, $charlist, $start = 0, $len = 2147483647)
{
if ('' === $charlist .= '') {
return null;
}

if ($start || 2147483647 != $len) {
$s = self::substr($s, $start, $len);
$str = (string) self::substr($str, $start, $len);
} else {
$str = (string) $str;
}

return preg_match('/^(.*?)' . self::rxClass($charlist) . '/us', $s, $len) ? self::strlen($len[1]) : self::strlen($s);
return preg_match('/^(.*?)' . self::rxClass($charlist) . '/us', $str, $len) ? self::strlen($len[1]) : self::strlen($str);
}

/**
Expand Down Expand Up @@ -1503,7 +1507,7 @@ public static function remove_invisible_characters($str, $url_encoded = true)

do {
$str = preg_replace($non_displayables, '', $str, -1, $count);
} while ($count);
} while ($count !== 0);

return $str;
}
Expand Down Expand Up @@ -1973,7 +1977,7 @@ public static function file_get_contents($filename, $flags = null, $context = nu
);
}

if ($maxlen) {
if (is_int($maxlen)) {
$data = file_get_contents($filename, $flags, $context, $offset, $maxlen);
} else {
$data = file_get_contents($filename, $flags, $context, $offset);
Expand All @@ -1985,6 +1989,7 @@ public static function file_get_contents($filename, $flags = null, $context = nu
}

self::checkForSupport();

$encoding = self::str_detect_encoding($data);
if ($encoding != 'UTF-8') {
$data = mb_convert_encoding($data, 'UTF-8', $encoding);
Expand All @@ -1997,8 +2002,8 @@ public static function file_get_contents($filename, $flags = null, $context = nu
/**
* is_binary_file
*
* @param $file
* @return mixed
* @param string $file
* @return boolean
*/
public static function is_binary_file($file) {
try {
Expand All @@ -2009,32 +2014,27 @@ public static function is_binary_file($file) {
catch(\Exception $e) {
$block = "";
}

return self::is_binary($block);
}

/**
* is_binary
* Check if the input is binary (this looks like a hack).
*
* @param mixed $input
*
* @param $block
* @param bool $utf
* @return bool
*/
public static function is_binary($block, $utf = true) {
$testLength = strlen($block);
$test = (
0
or
($testLength ? substr_count($block, "^ -~") / $testLength > 0.3 : 1 == 0)
or
substr_count($block, "\x00") > 0
);
public static function is_binary($input) {

$testLength = strlen($input);

if (
$test
&&
!($utf && self::is_utf16($block))
&&
!($utf && self::is_utf32($block))
preg_match('~^[01]+$~', $input)
||
substr_count($input, "\x00") > 0
||
($testLength ? substr_count($input, "^ -~") / $testLength > 0.3 : 1 == 0)
) {
return true;
} else {
Expand All @@ -2049,12 +2049,12 @@ public static function is_binary($block, $utf = true) {
* @return int|false false if it's not UTF-32, 1 for UTF-32LE, 2 for UTF-32BE
*/
public static function is_utf32($string) {
if (self::is_binary($string, false)) {
if (self::is_binary($string)) {
self::checkForSupport();

$maybeUTF32LE = 0;
$test = mb_convert_encoding($string, 'UTF-8', 'UTF-32LE');
if (strlen($test) > 1) {
if ($test !== false && strlen($test) > 1) {
$test2 = mb_convert_encoding($test, 'UTF-32LE', 'UTF-8');
$test3 = mb_convert_encoding($test2, 'UTF-8', 'UTF-32LE');
if ($test3 == $test) {
Expand All @@ -2069,7 +2069,7 @@ public static function is_utf32($string) {

$maybeUTF32BE = 0;
$test = mb_convert_encoding($string, 'UTF-8', 'UTF-32BE');
if (strlen($test) > 1) {
if ($test !== false && strlen($test) > 1) {
$test2 = mb_convert_encoding($test, 'UTF-32BE', 'UTF-8');
$test3 = mb_convert_encoding($test2, 'UTF-8', 'UTF-32BE');
if ($test3 == $test) {
Expand Down Expand Up @@ -2102,12 +2102,12 @@ public static function is_utf32($string) {
* @return int|false false if it's not UTF-16, 1 for UTF-16LE, 2 for UTF-16BE
*/
public static function is_utf16($string) {
if (self::is_binary($string, false)) {
if (self::is_binary($string)) {
self::checkForSupport();

$maybeUTF16LE = 0;
$test = mb_convert_encoding($string, 'UTF-8', 'UTF-16LE');
if (strlen($test) > 1) {
if ($test !== false && strlen($test) > 1) {
$test2 = mb_convert_encoding($test, 'UTF-16LE', 'UTF-8');
$test3 = mb_convert_encoding($test2, 'UTF-8', 'UTF-16LE');
if ($test3 == $test) {
Expand All @@ -2122,7 +2122,7 @@ public static function is_utf16($string) {

$maybeUTF16BE = 0;
$test = mb_convert_encoding($string, 'UTF-8', 'UTF-16BE');
if (strlen($test) > 1) {
if ($test !== false && strlen($test) > 1) {
$test2 = mb_convert_encoding($test, 'UTF-16BE', 'UTF-8');
$test3 = mb_convert_encoding($test2, 'UTF-8', 'UTF-16BE');
if ($test3 == $test) {
Expand Down Expand Up @@ -2547,13 +2547,13 @@ public static function strcmp($a, $b)
*
* @param string $str The original Unicode string
*
* @return array An array of byte lengths of each character.
* @return int The largest byte length of any single character in the string (0 if there are no characters).
*/
public static function max_chr_width($str)
{
$bytes = self::chr_size_list($str);
if (count($bytes) > 0) {
return max($bytes);
return (int) max($bytes);
} else {
return 0;
}
Expand Down Expand Up @@ -2713,7 +2713,7 @@ public static function str_detect_encoding($str)
$encoding = mb_detect_encoding($str, $detectOrder, true);
}

if (self::is_binary($str, false)) {
if (self::is_binary($str)) {
if (self::is_utf16($str) == 1) {
return 'UTF-16LE';
}
Expand Down
42 changes: 39 additions & 3 deletions tests/UTF8Test.php
Original file line number Diff line number Diff line change
Expand Up @@ -903,17 +903,20 @@ public function testIsBinary()
{
$tests = array(
"öäü" => false,
"" => false
"" => false,
"1" => false,
decbin(324546) => true,
01 => true
);

foreach ($tests as $before => $after) {
$this->assertEquals($after, UTF8::is_binary($before));
$this->assertEquals($after, UTF8::is_binary($before), 'value: ' . $before);
}
}

public function testFileGetContents()
{
// TODO: UTF-8 shim only works for UTF-8 :P
// INFO: UTF-8 shim only works for UTF-8
if (UTF8::mbstring_loaded() === true) {

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Utf16pe.txt');
Expand All @@ -930,6 +933,39 @@ public function testFileGetContents()

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Iso8859-7.txt');
$this->assertContains('Iñtërnâtiônàlizætiøn', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Utf16pe.txt', FILE_TEXT);
$this->assertContains('<p>Today’s Internet users are not the same users who were online a decade ago. There are better connections.', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Utf16le.txt', null, null, 0);
$this->assertContains('<p>Today’s Internet users are not the same users who were online a decade ago. There are better connections.', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Utf16le.txt', null, null, 5);
$this->assertContains('There are better connections.', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Utf8.txt', null, null, 7, 11);
$this->assertContains('Iñtërnât', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Latin.txt', null, null, 7, 10, 15);
$this->assertContains('ñtërnâtiôn', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Iso8859-7.txt', null, null, 7, null, 10);
$this->assertContains('Iñtërnâtiônàlizætiøn', $testString);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Iso8859-7.txt', null, null, null, 10, 10);
$this->assertContains('Hírek', $testString);

$context = stream_context_create(
array(
'http' =>
array(
'timeout' => 10
)
)
);

$testString = UTF8::file_get_contents(dirname(__FILE__) . '/test1Iso8859-7.txt', null, $context, null, 10, 10);
$this->assertContains('Hírek', $testString);
}
}

Expand Down

0 comments on commit c45de08

Please sign in to comment.