Skip to content

Commit

Permalink
[+]: optimized "UTF8::str_detect_encoding()"
Browse files Browse the repository at this point in the history
[*]: added more API doc
  • Loading branch information
Lars Moelleken committed Jul 26, 2016
1 parent 5b1b348 commit d3676a4
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 17 deletions.
61 changes: 60 additions & 1 deletion API.md
Original file line number Diff line number Diff line change
Expand Up @@ -539,12 +539,71 @@ Create an array containing a range of UTF-8 characters.
UTF8::range('κ', 'ζ'); // array('κ', 'ι', 'θ', 'η', 'ζ',)
```

##### remove_bom(mixed $var1, mixed $var2) : array
##### remove_bom(string $str) : string

Remove the BOM from UTF-8 / UTF-16 / UTF-32 strings.

```php
UTF8::remove_bom("\xEF\xBB\xBFΜπορώ να"); // 'Μπορώ να'
```

##### remove_duplicates(string $str, string|array $what = ' ') : string

Removes duplicate occurrences of a string in another string.

```php
UTF8::remove_duplicates('öäü-κόσμεκόσμε-äöü', 'κόσμε'); // 'öäü-κόσμε-äöü'
```

##### remove_invisible_characters(string $str, bool $url_encoded = true, string $replacement = '') : string

Remove invisible characters from a string.

```php
UTF8::remove_invisible_characters("κόσ\0με"); // 'κόσμε'
```

##### replace_diamond_question_mark(string $str, string $unknown = '?') : string

Replace the diamond question mark (�) with the replacement.

```php
UTF8::replace_diamond_question_mark('中文空白�', ''); // '中文空白'
```

##### rtrim(string $str = '', string $chars = INF) : string

Strip whitespace or other characters from end of a UTF-8 string.

```php
UTF8::rtrim('-ABC-中文空白- '); // '-ABC-中文空白-'
```

##### single_chr_html_encode(string $char, bool $keepAsciiChars = false) : string

Converts a UTF-8 character to HTML Numbered Entity like "&#123;".

```php
UTF8::single_chr_html_encode('κ'); // '&#954;'
```

##### split(string $str, int $length = 1, bool $cleanUtf8 = false) : array

Convert a string to an array of Unicode characters.

```php
UTF8::split('中文空白'); // array('中', '文', '空', '白')
```

##### str_detect_encoding(string $str) : string

Optimized "\mb_detect_encoding()"-function -> with support for UTF-16 and UTF-32.

```php
UTF8::str_detect_encoding('中文空白'); // 'UTF-8'
```




... TODO
17 changes: 8 additions & 9 deletions src/voku/helper/UTF8.php
Original file line number Diff line number Diff line change
Expand Up @@ -4317,10 +4317,9 @@ public static function remove_duplicates($str, $what = ' ')
}

/**
* Remove Invisible Characters
* Remove invisible characters from a string.
*
* This prevents sandwiching null characters
* between ascii characters, like Java\0script.
* e.g.: This prevents sandwiching null characters between ascii characters, like Java\0script.
*
* copy&paste from https://github.com/bcit-ci/CodeIgniter/blob/develop/system/core/Common.php
*
Expand Down Expand Up @@ -4352,7 +4351,7 @@ public static function remove_invisible_characters($str, $url_encoded = true, $r
}

/**
* replace diamond question mark (�)
* Replace the diamond question mark (�) with the replacement.
*
* @param string $str
* @param string $unknown
Expand Down Expand Up @@ -4612,14 +4611,14 @@ public static function str_detect_encoding($str)
// INFO: UTF-16, UTF-32, UCS2 and UCS4, encoding detection will fail always with "\mb_detect_encoding()"

$detectOrder = array(
'windows-1251',
'ISO-8859-1',
'ASCII',
'UTF-8',
'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4', 'ISO-8859-5',
'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9', 'ISO-8859-10',
'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
'WINDOWS-1251', 'WINDOWS-1252', 'WINDOWS-1254',
'ISO-2022-JP', 'JIS', 'EUC-JP',
);

self::checkForSupport();

$encoding = \mb_detect_encoding($str, $detectOrder, true);
if ($encoding) {
return $encoding;
Expand Down
22 changes: 15 additions & 7 deletions tests/Utf8GlobalTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ public function testEncode()
$tests = array(
' -ABC-中文空白- ' => ' -ABC-????- ',
' - ÖÄÜ- ' => ' - ÖÄÜ- ',
'öäü' => '???',
'öäü' => 'öäü',
'' => '',
'abc' => 'abc',
'Berbée' => 'Berbée',
Expand Down Expand Up @@ -1528,7 +1528,7 @@ public function testNormalizeWhitespace()
" foo\t foo " => ' foo foo ',
);

for ($i = 0; $i < 2; $i++) {
for ($i = 0; $i < 2; $i++) { // keep this loop for simple performance tests
foreach ($tests as $before => $after) {
self::assertSame($after, UTF8::normalize_whitespace($before));
}
Expand Down Expand Up @@ -1857,9 +1857,17 @@ public function testStrDetectEncoding()
01 => 'ASCII', // ASCII
);

foreach ($tests as $before => $after) {
self::assertSame($after, UTF8::str_detect_encoding($before), 'value: ' . $before);
for ($i = 0; $i <= 2; $i++) { // keep this loop for simple performance tests
foreach ($tests as $before => $after) {
self::assertSame($after, UTF8::str_detect_encoding($before), 'value: ' . $before);
}
}

$testString = file_get_contents(__DIR__ . '/fixtures/latin.txt');
self::assertContains('ISO-8859-1', UTF8::str_detect_encoding($testString));

$testString = file_get_contents(__DIR__ . '/fixtures/iso-8859-7.txt');
self::assertContains('ISO-8859-1', UTF8::str_detect_encoding($testString)); // ?
}

public function testStrLimit()
Expand Down Expand Up @@ -2095,7 +2103,7 @@ public function testStripTags()

public function testStripos()
{
for ($i = 0; $i <= 5; $i++) {
for ($i = 0; $i <= 2; $i++) { // keep this loop for simple performance tests
self::assertSame(3, UTF8::stripos('DÉJÀ', 'à'));
self::assertSame(1, UTF8::stripos('aςσb', 'ΣΣ'));
self::assertSame(16, UTF8::stripos('der Straße nach Paris', 'Paris'));
Expand Down Expand Up @@ -2210,7 +2218,7 @@ public function testStrpbrk()

public function testStrpos()
{
for ($i = 0; $i <= 3; $i++) { // keep this loop for simple performance tests
for ($i = 0; $i <= 2; $i++) { // keep this loop for simple performance tests

// php compatible tests

Expand Down Expand Up @@ -2993,7 +3001,7 @@ public function testToUtf8_v3()
*/
public function testTrim($input, $output)
{
for ($i = 0; $i <= 10; $i++) {
for ($i = 0; $i <= 2; $i++) { // keep this loop for simple performance tests
self::assertSame($output, UTF8::trim($input));
}
}
Expand Down

0 comments on commit d3676a4

Please sign in to comment.