Skip to content

Commit

Permalink
Add mb_trim polyfills
Browse files Browse the repository at this point in the history
  • Loading branch information
Fan2Shrek committed Mar 30, 2024
1 parent 67d131a commit 170efba
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 69 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ Polyfills are provided for:
- the `str_increment` and `str_decrement` functions introduced in PHP 8.3;
- the `Date*Exception/Error` classes introduced in PHP 8.3;
- the `SQLite3Exception` class introduced in PHP 8.3;
- the `mb_trim`, `mb_ltrim` and `mb_rtrim` functions introduced in PHP 8.4;

It is strongly recommended to upgrade your PHP version and/or install the missing
extensions whenever possible. This polyfill should be used only when there is no
Expand Down
63 changes: 63 additions & 0 deletions src/Mbstring/Mbstring.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
* - mb_strstr - Finds first occurrence of a string within another
* - mb_strwidth - Return width of string
* - mb_substr_count - Count the number of substring occurrences
* - mb_trim - Strip whitespace (or other characters) from the beginning and end of a string
* - mb_ltrim - Strip whitespace (or other characters) from the beginning of a string
* - mb_rtrim - Strip whitespace (or other characters) from the end of a string
*
* Not implemented:
* - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more)
Expand Down Expand Up @@ -77,6 +80,8 @@ final class Mbstring
private static $encodingList = ['ASCII', 'UTF-8'];
private static $language = 'neutral';
private static $internalEncoding = 'UTF-8';
private const CHARACTERS = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}";


public static function mb_convert_encoding($s, $toEncoding, $fromEncoding = null)
{
Expand Down Expand Up @@ -944,4 +949,62 @@ private static function getEncoding($encoding)

return $encoding;
}

/**
 * Strips the given characters (self::CHARACTERS by default) from both the
 * beginning and the end of a string.
 *
 * @param string      $string     the string to trim
 * @param string      $characters the characters to strip
 * @param string|null $encoding   forwarded as-is to the shared trim helper
 *
 * @return string the result produced by mb_internal_trim()
 */
public static function mb_trim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
    // Template with two "%s" slots: one for the leading class, one for the trailing class.
    $bothEndsTemplate = '^[%s]+|[%s]+$';

    return self::mb_internal_trim($bothEndsTemplate, $string, $characters, $encoding);
}

/**
 * Strips the given characters (self::CHARACTERS by default) from the
 * beginning of a string.
 *
 * @param string      $string     the string to trim
 * @param string      $characters the characters to strip
 * @param string|null $encoding   forwarded as-is to the shared trim helper
 *
 * @return string the result produced by mb_internal_trim()
 */
public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
    // Template anchored at the start of the subject only.
    $leadingTemplate = '^[%s]+';

    return self::mb_internal_trim($leadingTemplate, $string, $characters, $encoding);
}

/**
 * Strips the given characters (self::CHARACTERS by default) from the end of
 * a string.
 *
 * @param string      $string     the string to trim
 * @param string      $characters the characters to strip
 * @param string|null $encoding   forwarded as-is to the shared trim helper
 *
 * @return string the result produced by mb_internal_trim()
 */
public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
    // Template anchored at the end of the subject only.
    $trailingTemplate = '[%s]+$';

    return self::mb_internal_trim($trailingTemplate, $string, $characters, $encoding);
}

/**
 * Shared implementation behind mb_trim(), mb_ltrim() and mb_rtrim().
 *
 * @param string      $regex      sprintf() template holding one or two "%s" placeholders; each
 *                                receives the regex-quoted character list
 * @param string      $string     the string to trim
 * @param string      $characters the characters to strip; an empty list means "strip nothing"
 * @param string|null $encoding   encoding to validate; null selects the internal encoding
 *
 * @return string the trimmed string
 *
 * @throws \ValueError when $encoding is not a valid encoding
 */
private static function mb_internal_trim(string $regex, string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
    if (null === $encoding) {
        $encoding = self::mb_internal_encoding();
    }

    try {
        $validEncoding = @self::mb_check_encoding('', $encoding);
    } catch (\ValueError $e) {
        $validEncoding = false;
    }

    // BC for PHP 7.3 and lower: an unknown encoding is reported with a false return value instead of an exception
    if (!$validEncoding) {
        // Frame 1 is the public wrapper (mb_trim, mb_ltrim or mb_rtrim) that called this helper.
        $caller = debug_backtrace(\DEBUG_BACKTRACE_IGNORE_ARGS, 2)[1]['function'];

        throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given.', $caller, $encoding));
    }

    if ('' === $characters) {
        // Nothing to strip. The input is already expressed in $encoding, so converting it
        // again from the internal encoding would corrupt it whenever the two differ.
        return $string;
    }

    $regexCharacter = preg_quote($characters, '/');
    $regex = sprintf($regex, $regexCharacter, $regexCharacter);

    // NOTE(review): "g" selects the GNU regex syntax of mb_ereg_replace(). It is only enabled
    // for pure-ASCII input that contains at least one default trim character; presumably this
    // keeps "$" from matching before a trailing newline - confirm against the mbregex docs.
    if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string) && !empty(array_intersect(str_split(self::CHARACTERS), str_split($string)))) {
        $options = 'g';
    } else {
        $options = '';
    }

    // mb_ereg_replace() is unavailable when the mbstring extension (or its regex support) is
    // missing. Calling it anyway raises an \Error that a catch (\Exception) cannot intercept,
    // so probe for it explicitly instead of relying on exceptions for control flow.
    $trimmed = null;
    if (\function_exists('mb_ereg_replace')) {
        $trimmed = mb_ereg_replace($regex, '', $string, $options);
    }

    // Both false and null signal a failure (e.g. input that is invalid for the regex encoding).
    if (\is_string($trimmed)) {
        return $trimmed;
    }

    // PCRE fallback. "D" makes "$" match at the very end only, so a trailing newline is not
    // skipped over. "u" is added when the data is declared and verified as UTF-8, so that
    // multibyte characters are matched as a whole instead of byte by byte.
    $modifiers = 'D';
    if (\in_array(strtoupper($encoding), ['UTF-8', 'UTF8'], true) && 1 === preg_match('//u', $string) && 1 === preg_match('//u', $characters)) {
        $modifiers .= 'u';
    }

    $trimmed = preg_replace(sprintf('/%s/%s', $regex, $modifiers), '', $string);

    // preg_replace() returns null on failure; hand back the untouched input rather than
    // violating the declared string return type.
    return $trimmed ?? $string;
}
}
13 changes: 13 additions & 0 deletions src/Mbstring/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,19 @@ function mb_str_split($string, $length = 1, $encoding = null) { return p\Mbstrin
function mb_str_pad(string $string, int $length, string $pad_string = ' ', int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string { return p\Mbstring::mb_str_pad($string, $length, $pad_string, $pad_type, $encoding); }
}

// Declare the global mb_trim() only when no implementation is registered yet.
if (!function_exists('mb_trim')) {
    function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
    {
        return p\Mbstring::mb_trim($string, $characters, $encoding);
    }
}

// Declare the global mb_ltrim() only when no implementation is registered yet.
if (!function_exists('mb_ltrim')) {
    function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
    {
        return p\Mbstring::mb_ltrim($string, $characters, $encoding);
    }
}

// Declare the global mb_rtrim() only when no implementation is registered yet.
if (!function_exists('mb_rtrim')) {
    function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
    {
        return p\Mbstring::mb_rtrim($string, $characters, $encoding);
    }
}


if (extension_loaded('mbstring')) {
return;
}
Expand Down
81 changes: 32 additions & 49 deletions src/Php84/Php84.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,76 +23,59 @@ final class Php84

public static function mb_trim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
try {
@mb_check_encoding('', $encoding);
} catch (\ValueError $e) {
throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
}

if ('' === $characters) {
return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
}

if ($encoding !== null && $encoding !== 'UTF-8') {
$string = mb_convert_encoding($string, "UTF-8", $encoding);
$characters = mb_convert_encoding($characters, "UTF-8", $encoding);
}

$regex = preg_quote($characters, '/');
$regex = sprintf('^[%s]+|[%s]+$', $regex, $regex);
return self::mb_internal_trim('^[%s]+|[%s]+$', $string, $characters, $encoding);
}

if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string) && !empty(array_intersect(str_split(self::CHARACTERS), str_split($string)))) {
$options = 'g';
} else {
$options = '';
}
public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
return self::mb_internal_trim('^[%s]+', $string, $characters, $encoding);
}

return mb_ereg_replace($regex, "", $string, $options);
public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
return self::mb_internal_trim('[%s]+$', $string, $characters, $encoding);
}

public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
private static function mb_internal_trim(string $regex, string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
if (null === $encoding) {
$encoding = mb_internal_encoding();
}

try {
@mb_check_encoding('', $encoding);
$validEncoding = @mb_check_encoding('', $encoding);
} catch (\ValueError $e) {
throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given.', debug_backtrace()[1]['function'], $encoding));
}

// BC for PHP 7.3 and lower
if (!$validEncoding) {
throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given.', debug_backtrace()[1]['function'], $encoding));
}

if ('' === $characters) {
return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
}

$regex = sprintf('^[%s]+', preg_quote($characters, '/'));
$regexCharacter = preg_quote($characters, '/');
$regex = sprintf($regex, $regexCharacter, $regexCharacter);

if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string)) {
if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string) && !empty(array_intersect(str_split(self::CHARACTERS), str_split($string)))) {
$options = 'g';
} else {
$options = '';
}

return mb_ereg_replace($regex, "", $string, $options);
}

public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
{
try {
@mb_check_encoding('', $encoding);
} catch (\ValueError $e) {
throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
}

if ('' === $characters) {
return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
}
$a = mb_ereg_replace($regex, "", $string, $options);

$regex = sprintf('[%s]+$', preg_quote($characters, '/'));
if (null === $a) {
throw new \Exception();
}

if ('ASCII' === mb_detect_encoding($characters)) {
$options = 'g';
} else {
$options = '';
return $a;
} catch (\Exception $e) {
return preg_replace(sprintf('/%s/', $regex), "", $string);
}

return mb_ereg_replace($regex, "", $string, $options);
}
}
}
15 changes: 3 additions & 12 deletions src/Php84/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,13 @@
}

if (!function_exists('mb_trim')) {
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
{
return p\Php84::mb_trim($string, $characters, $encoding);
}
function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Php84::mb_trim($string, $characters, $encoding); }
}

if (!function_exists('mb_ltrim')) {
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
{
return p\Php84::mb_ltrim($string, $characters, $encoding);
}
function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Php84::mb_ltrim($string, $characters, $encoding); }
}

if (!function_exists('mb_rtrim')) {
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
{
return p\Php84::mb_rtrim($string, $characters, $encoding);
}
function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Php84::mb_rtrim($string, $characters, $encoding); }
}
107 changes: 107 additions & 0 deletions tests/Mbstring/MbstringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -727,4 +727,111 @@ public static function mbStrPadInvalidArgumentsProvider(): iterable
yield ['mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH', '▶▶', 6, ' ', 123456];
yield ['mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given', '▶▶', 6, ' ', \STR_PAD_BOTH, 'unexisting'];
}

/**
 * Checks mb_trim() against every dataset of mbTrimProvider().
 *
 * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_trim
 *
 * @dataProvider mbTrimProvider
 */
public function testMbTrim(string $expected, string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void
{
    $actual = mb_trim($string, $characters, $encoding);

    $this->assertSame($expected, $actual);
}

/**
 * Checks mb_ltrim() against every dataset of mbLTrimProvider().
 *
 * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_ltrim
 *
 * @dataProvider mbLTrimProvider
 */
public function testMbLTrim(string $expected, string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void
{
    // Strict comparison, like the mb_trim() and mb_rtrim() tests: assertEquals() compares
    // loosely and would accept a result of the wrong type.
    $this->assertSame($expected, mb_ltrim($string, $characters, $encoding));
}

/**
 * Checks mb_rtrim() against every dataset of mbRTrimProvider().
 *
 * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_rtrim
 *
 * @dataProvider mbRTrimProvider
 */
public function testMbRTrim(string $expected, string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void
{
    $actual = mb_rtrim($string, $characters, $encoding);

    $this->assertSame($expected, $actual);
}

/**
 * mb_trim() must raise a ValueError when given the encoding name "NULL".
 */
public function testMbTrimException(): void
{
    $subject = "\u{180F}";
    $characters = "";
    $encoding = "NULL";

    $this->expectException(\ValueError::class);

    mb_trim($subject, $characters, $encoding);
}

/**
 * Exercises mb_trim() and mb_ltrim() with SJIS, UTF-16LE and UTF-16BE byte strings.
 */
public function testMbTrimEncoding(): void
{
    // SJIS: the character list is the byte pair wrapped around the middle character.
    $sjisTrimmed = mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS");
    $this->assertSame('あ', mb_convert_encoding($sjisTrimmed, "UTF-8", "SJIS"));

    // UTF-16LE: the result is compared through its hexadecimal byte dump.
    $leSubject = mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8");
    $leCharacters = mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8");
    $this->assertSame('226f575b', bin2hex(mb_ltrim($leSubject, $leCharacters, "UTF-16LE")));

    // UTF-16BE: same check with the opposite byte order.
    $beSubject = mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8");
    $beCharacters = mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8");
    $this->assertSame('6f225b57', bin2hex(mb_ltrim($beSubject, $beCharacters, "UTF-16BE")));
}

/**
 * Datasets for testMbTrim().
 *
 * Each entry is [expected, string] optionally followed by the characters to
 * strip and the encoding; omitted values fall back to the defaults declared
 * on testMbTrim().
 */
public static function mbTrimProvider(): iterable
{
yield ['ABC', 'ABC'];
yield ['ABC', "\0\t\nABC \0\t\n"];
yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", ''];

yield ['', ''];

yield ["あいうえおあお", " あいうえおあお ", " ", "UTF-8"];
yield ["foo BAR Spa", "foo BAR Spaß", "ß", "UTF-8"];
yield ["oo BAR Spaß", "oo BAR Spaß", "f", "UTF-8"];

yield ["oo BAR Spa", "foo BAR Spaß", "ßf", "UTF-8"];
yield ["oo BAR Spa", "foo BAR Spaß", "", "UTF-8"];
yield ["いうおえお", " あいうおえお あ", "", "UTF-8"];
yield ["いうおえお", " あいうおえお あ", "", "UTF-8"];
yield [" あいうおえお ", " あいうおえお a", "あa", "UTF-8"];
yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"];

yield ["", str_repeat(" ", 129)];
yield ["a", str_repeat(" ", 129) . "a"];

yield ["", " \f\n\r\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"];

yield [' abcd ', ' abcd ', ''];

yield ['f', 'foo', 'oo'];

yield ["foo\n", "foo\n", 'o'];
}

/**
 * Datasets for testMbLTrim().
 *
 * Each entry is [expected, string] optionally followed by the characters to
 * strip; omitted values fall back to the defaults declared on testMbLTrim().
 */
public static function mbLTrimProvider(): iterable
{
yield ['ABC', 'ABC'];
yield ["ABC \0\t\n", "\0\t\nABC \0\t\n"];
yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", ''];

yield ['', ''];

yield [' test ', ' test ', ''];

yield ['いああああ', 'あああああああああああああああああああああああああああああああああいああああ', 'あ'];

yield ["漢字", "\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"];
yield [' abcd ', ' abcd ', ''];
}

/**
 * Datasets for testMbRTrim().
 *
 * Each entry is [expected, string] optionally followed by the characters to
 * strip; omitted values fall back to the defaults declared on testMbRTrim().
 */
public static function mbRTrimProvider(): iterable
{
yield ['ABC', 'ABC'];
yield ["ABC", "ABC \0\t\n"];
yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", ''];

yield ['', ''];

yield [" a", str_repeat(" ", 129) . "a"];

yield ['あああああああああああああああああああああああああああああああああい', 'あああああああああああああああああああああああああああああああああいああああ', 'あ'];

yield [' abcd ', ' abcd ', ''];

yield ["foo\n", "foo\n", 'o'];
}
}
15 changes: 7 additions & 8 deletions tests/Php84/Php84Test.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ public function testMbTrimException(): void
mb_trim("\u{180F}", "", "NULL");
}

public function testMbTrimEncoding(): void
{
$this->assertSame('あ', mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS"));
$this->assertSame('226f575b', bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE")));
$this->assertSame('6f225b57', bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE")));
}

public static function mbTrimProvider(): iterable
{
Expand All @@ -69,7 +75,7 @@ public static function mbTrimProvider(): iterable
yield ["いうおえお", " あいうおえお あ", "", "UTF-8"];
yield ["いうおえお", " あいうおえお あ", "", "UTF-8"];
yield [" あいうおえお ", " あいうおえお a", "あa", "UTF-8"];
// yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"];
yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"];

yield ["", str_repeat(" ", 129)];
yield ["a", str_repeat(" ", 129) . "a"];
Expand All @@ -78,9 +84,6 @@ public static function mbTrimProvider(): iterable

yield [' abcd ', ' abcd ', ''];

// May not work
// yield ['あ', mb_convert_encoding('', "UTF-8", "SJIS"), "\x81\x40", "SJIS"];

yield ['f', 'foo', 'oo'];

yield ["foo\n", "foo\n", 'o'];
Expand All @@ -99,10 +102,6 @@ public static function mbLTrimProvider(): iterable
yield ['いああああ', 'あああああああああああああああああああああああああああああああああいああああ', 'あ'];

yield ["漢字", "\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"];
// May does not work
// yield ['226f575b', \bin2hex(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8")), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE"];
// yield ['漢字', \bin2hex(mb_convert_encoding("\u{FFFE}漢字", "UTF-16BE", "UTF-8")), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16BE"];

yield [' abcd ', ' abcd ', ''];
}

Expand Down

0 comments on commit 170efba

Please sign in to comment.