From 170efba299de89600fd8086e769881476b65ef92 Mon Sep 17 00:00:00 2001 From: Fan2Shrek Date: Sat, 30 Mar 2024 15:01:08 +0100 Subject: [PATCH] Add mb_trim polyfills --- README.md | 1 + src/Mbstring/Mbstring.php | 63 +++++++++++++++++++ src/Mbstring/bootstrap.php | 13 ++++ src/Php84/Php84.php | 81 ++++++++++-------------- src/Php84/bootstrap.php | 15 +---- tests/Mbstring/MbstringTest.php | 107 ++++++++++++++++++++++++++++++++ tests/Php84/Php84Test.php | 15 +++-- 7 files changed, 226 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index c5485051..dc6e034e 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Polyfills are provided for: - the `str_increment` and `str_decrement` functions introduced in PHP 8.3; - the `Date*Exception/Error` classes introduced in PHP 8.3; - the `SQLite3Exception` class introduced in PHP 8.3; +- the `mb_trim`, `mb_ltrim` and `mb_rtrim` functions introduced in PHP 8.4; It is strongly recommended to upgrade your PHP version and/or install the missing extensions whenever possible. 
This polyfill should be used only when there is no
diff --git a/src/Mbstring/Mbstring.php b/src/Mbstring/Mbstring.php
index 04b35cb8..6ffabedd 100644
--- a/src/Mbstring/Mbstring.php
+++ b/src/Mbstring/Mbstring.php
@@ -48,6 +48,9 @@
  * - mb_strstr - Finds first occurrence of a string within another
  * - mb_strwidth - Return width of string
  * - mb_substr_count - Count the number of substring occurrences
+ * - mb_trim - Strip whitespace (or other characters) from the beginning and end of a string
+ * - mb_ltrim - Strip whitespace (or other characters) from the beginning of a string
+ * - mb_rtrim - Strip whitespace (or other characters) from the end of a string
  *
  * Not implemented:
  * - mb_convert_kana - Convert "kana" one from another ("zen-kaku", "han-kaku" and more)
@@ -77,6 +80,8 @@ final class Mbstring
     private static $encodingList = ['ASCII', 'UTF-8'];
     private static $language = 'neutral';
     private static $internalEncoding = 'UTF-8';
+    private const CHARACTERS = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}";
+
 
     public static function mb_convert_encoding($s, $toEncoding, $fromEncoding = null)
     {
@@ -944,4 +949,97 @@ private static function getEncoding($encoding)
 
         return $encoding;
     }
+
+    public static function mb_trim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
+    {
+        return self::mb_internal_trim('\A%1$s|%1$s\z', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
+    {
+        return self::mb_internal_trim('\A%1$s', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
+    {
+        return self::mb_internal_trim('%1$s\z', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    /**
+     * Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim().
+     *
+     * Both strings are converted to UTF-8 and trimmed with PCRE, so mb_ereg_replace() is not required.
+     *
+     * @param string      $regex      PCRE template in which "%1$s" stands for a run of characters to strip
+     * @param string      $string     the string to trim, encoded in $encoding
+     * @param string      $characters the characters to strip, encoded in $encoding
+     * @param string|null $encoding   the encoding of both strings, null for the internal encoding
+     * @param string      $function   the public function name reported in the error message
+     *
+     * @throws \ValueError when $encoding is not a valid encoding
+     */
+    private static function mb_internal_trim(string $regex, string $string, string $characters, ?string $encoding, string $function): string
+    {
+        if (null === $encoding) {
+            $encoding = self::mb_internal_encoding();
+        }
+
+        try {
+            $validEncoding = @self::mb_check_encoding('', $encoding);
+        } catch (\ValueError $e) {
+            $validEncoding = false;
+        }
+
+        // An unknown encoding is reported either by a ValueError or by a false return value
+        if (!$validEncoding) {
+            throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', $function, $encoding));
+        }
+
+        // Nothing to strip: return the input as is rather than re-encoding it
+        if ('' === $string || '' === $characters) {
+            return $string;
+        }
+
+        $isUtf8 = \in_array(strtolower($encoding), ['utf-8', 'utf8'], true);
+        $subject = $isUtf8 ? $string : self::mb_convert_encoding($string, 'UTF-8', $encoding);
+        $list = $isUtf8 ? $characters : self::mb_convert_encoding($characters, 'UTF-8', $encoding);
+
+        if (!\is_string($subject) || !\is_string($list)) {
+            return $string;
+        }
+
+        // Split the list into well-formed UTF-8 characters; stray bytes are dropped so they never match part of a character
+        preg_match_all('/[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3}/', $list, $matches);
+        $quoted = [];
+        foreach (array_unique($matches[0]) as $char) {
+            if (1 === preg_match('//u', $char)) {
+                $quoted[] = preg_quote($char, '/');
+            }
+        }
+
+        if ([] === $quoted) {
+            return $string;
+        }
+
+        // Unicode-aware character class; the templates use \A and \z because "$" also matches before a trailing newline
+        $trimmed = preg_replace('/'.sprintf($regex, '['.implode('', $quoted).']+').'/u', '', $subject);
+
+        if (null === $trimmed) {
+            // $subject is not well-formed UTF-8: match whole characters byte-wise so the valid parts are still trimmed
+            $trimmed = preg_replace('/'.sprintf($regex, '(?:'.implode('|', $quoted).')++').'/', '', $subject);
+        }
+
+        // Unchanged input is returned verbatim to avoid a lossy encoding round trip
+        if (null === $trimmed || $trimmed === $subject) {
+            return $string;
+        }
+
+        if ($isUtf8) {
+            return $trimmed;
+        }
+
+        $result = self::mb_convert_encoding($trimmed, $encoding, 'UTF-8');
+
+        return \is_string($result) ? $result : $string;
+    }
 }
diff --git a/src/Mbstring/bootstrap.php b/src/Mbstring/bootstrap.php
index ecf1a035..3878b3dd 100644
--- a/src/Mbstring/bootstrap.php
+++ b/src/Mbstring/bootstrap.php
@@ -136,6 +136,19 @@ function mb_str_split($string, $length = 1, $encoding = null) { return p\Mbstrin
     function mb_str_pad(string $string, int $length, string $pad_string = ' ', int $pad_type = STR_PAD_RIGHT, ?string $encoding = null): string { return p\Mbstring::mb_str_pad($string, $length, $pad_string, $pad_type, $encoding); }
 }
 
+if (!function_exists('mb_trim')) {
+    function mb_trim(string $string, string $characters = " 
\f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Mbstring::mb_trim($string, $characters, $encoding); }
+}
+
+if (!function_exists('mb_ltrim')) {
+    function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Mbstring::mb_ltrim($string, $characters, $encoding); }
+}
+
+if (!function_exists('mb_rtrim')) {
+    function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Mbstring::mb_rtrim($string, $characters, $encoding); }
+}
+
+
 if (extension_loaded('mbstring')) {
     return;
 }
diff --git a/src/Php84/Php84.php b/src/Php84/Php84.php
index 6d22e97e..e43f04b3 100644
--- a/src/Php84/Php84.php
+++ b/src/Php84/Php84.php
@@ -23,76 +23,94 @@ final class Php84
 
     public static function mb_trim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
     {
-        try {
-            @mb_check_encoding('', $encoding);
-        } catch (\ValueError $e) {
-            throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
-        }
-
-        if ('' === $characters) {
-            return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
-        }
-
-        if ($encoding !== null && $encoding !== 'UTF-8') {
-            $string = mb_convert_encoding($string, "UTF-8", $encoding);
-            $characters = mb_convert_encoding($characters, "UTF-8", $encoding);
-        }
-
-        $regex = preg_quote($characters, '/');
-        $regex = sprintf('^[%s]+|[%s]+$', $regex, $regex);
-
-        if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string) && !empty(array_intersect(str_split(self::CHARACTERS), str_split($string)))) {
-            $options = 'g';
-        } else {
-            $options = '';
-        }
-
-        return mb_ereg_replace($regex, "", $string, $options);
-    }
-
-    public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
-    {
-        try {
-            @mb_check_encoding('', $encoding);
-        } catch (\ValueError $e) {
-            throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
-        }
-
-        if ('' === $characters) {
-            return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
-        }
-
-        $regex = sprintf('^[%s]+', preg_quote($characters, '/'));
-
-        if ('ASCII' === mb_detect_encoding($characters) && 'ASCII' === mb_detect_encoding($string)) {
-            $options = 'g';
-        } else {
-            $options = '';
-        }
-
-        return mb_ereg_replace($regex, "", $string, $options);
-    }
-
-    public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
-    {
-        try {
-            @mb_check_encoding('', $encoding);
-        } catch (\ValueError $e) {
-            throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', __METHOD__, $encoding));
-        }
-
-        if ('' === $characters) {
-            return null === $encoding ? $string : mb_convert_encoding($string, $encoding);
-        }
-
-        $regex = sprintf('[%s]+$', preg_quote($characters, '/'));
-
-        if ('ASCII' === mb_detect_encoding($characters)) {
-            $options = 'g';
-        } else {
-            $options = '';
-        }
-
-        return mb_ereg_replace($regex, "", $string, $options);
-    }
+        return self::mb_internal_trim('\A%1$s|%1$s\z', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    public static function mb_ltrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
+    {
+        return self::mb_internal_trim('\A%1$s', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    public static function mb_rtrim(string $string, string $characters = self::CHARACTERS, ?string $encoding = null): string
+    {
+        return self::mb_internal_trim('%1$s\z', $string, $characters, $encoding, __FUNCTION__);
+    }
+
+    /**
+     * Shared implementation of mb_trim(), mb_ltrim() and mb_rtrim().
+     *
+     * Both strings are converted to UTF-8 and trimmed with PCRE, so mb_ereg_replace() is not required.
+     *
+     * @param string      $regex      PCRE template in which "%1$s" stands for a run of characters to strip
+     * @param string      $string     the string to trim, encoded in $encoding
+     * @param string      $characters the characters to strip, encoded in $encoding
+     * @param string|null $encoding   the encoding of both strings, null for the internal encoding
+     * @param string      $function   the public function name reported in the error message
+     *
+     * @throws \ValueError when $encoding is not a valid encoding
+     */
+    private static function mb_internal_trim(string $regex, string $string, string $characters, ?string $encoding, string $function): string
+    {
+        if (null === $encoding) {
+            $encoding = mb_internal_encoding();
+        }
+
+        try {
+            $validEncoding = @mb_check_encoding('', $encoding);
+        } catch (\ValueError $e) {
+            $validEncoding = false;
+        }
+
+        // An unknown encoding is reported either by a ValueError or by a false return value
+        if (!$validEncoding) {
+            throw new \ValueError(sprintf('%s(): Argument #3 ($encoding) must be a valid encoding, "%s" given', $function, $encoding));
+        }
+
+        // Nothing to strip: return the input as is rather than re-encoding it
+        if ('' === $string || '' === $characters) {
+            return $string;
+        }
+
+        $isUtf8 = \in_array(strtolower($encoding), ['utf-8', 'utf8'], true);
+        $subject = $isUtf8 ? $string : mb_convert_encoding($string, 'UTF-8', $encoding);
+        $list = $isUtf8 ? $characters : mb_convert_encoding($characters, 'UTF-8', $encoding);
+
+        if (!\is_string($subject) || !\is_string($list)) {
+            return $string;
+        }
+
+        // Split the list into well-formed UTF-8 characters; stray bytes are dropped so they never match part of a character
+        preg_match_all('/[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3}/', $list, $matches);
+        $quoted = [];
+        foreach (array_unique($matches[0]) as $char) {
+            if (1 === preg_match('//u', $char)) {
+                $quoted[] = preg_quote($char, '/');
+            }
+        }
+
+        if ([] === $quoted) {
+            return $string;
+        }
+
+        // Unicode-aware character class; the templates use \A and \z because "$" also matches before a trailing newline
+        $trimmed = preg_replace('/'.sprintf($regex, '['.implode('', $quoted).']+').'/u', '', $subject);
+
+        if (null === $trimmed) {
+            // $subject is not well-formed UTF-8: match whole characters byte-wise so the valid parts are still trimmed
+            $trimmed = preg_replace('/'.sprintf($regex, '(?:'.implode('|', $quoted).')++').'/', '', $subject);
+        }
+
+        // Unchanged input is returned verbatim to avoid a lossy encoding round trip
+        if (null === $trimmed || $trimmed === $subject) {
+            return $string;
+        }
+
+        if ($isUtf8) {
+            return $trimmed;
+        }
+
+        $result = mb_convert_encoding($trimmed, $encoding, 'UTF-8');
+
+        return \is_string($result) ? $result : $string;
+    }
 }
diff --git a/src/Php84/bootstrap.php b/src/Php84/bootstrap.php
index 60a2d075..2910678a 100644
--- a/src/Php84/bootstrap.php
+++ b/src/Php84/bootstrap.php
@@ -16,22 +16,13 @@
 }
 
 if (!function_exists('mb_trim')) {
-    function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
-    {
-        return p\Php84::mb_trim($string, $characters, $encoding);
-    }
+    function mb_trim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Php84::mb_trim($string, $characters, $encoding); }
 }
 
 if (!function_exists('mb_ltrim')) {
-    function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string
-    {
-        return p\Php84::mb_ltrim($string, $characters, $encoding);
-    }
+    function mb_ltrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string
$encoding = null): string { return p\Php84::mb_ltrim($string, $characters, $encoding); } } if (!function_exists('mb_rtrim')) { - function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string - { - return p\Php84::mb_rtrim($string, $characters, $encoding); - } + function mb_rtrim(string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): string { return p\Php84::mb_rtrim($string, $characters, $encoding); } } diff --git a/tests/Mbstring/MbstringTest.php b/tests/Mbstring/MbstringTest.php index 88cfc326..01f18db8 100644 --- a/tests/Mbstring/MbstringTest.php +++ b/tests/Mbstring/MbstringTest.php @@ -727,4 +727,111 @@ public static function mbStrPadInvalidArgumentsProvider(): iterable yield ['mb_str_pad(): Argument #4 ($pad_type) must be STR_PAD_LEFT, STR_PAD_RIGHT, or STR_PAD_BOTH', '▶▶', 6, ' ', 123456]; yield ['mb_str_pad(): Argument #5 ($encoding) must be a valid encoding, "unexisting" given', '▶▶', 6, ' ', \STR_PAD_BOTH, 'unexisting']; } + + /** + * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_trim + * + * @dataProvider mbTrimProvider + */ + public function testMbTrim(string $expected, string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void + { + $this->assertSame($expected, mb_trim($string, $characters, $encoding)); + } + + /** + * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_ltrim + * + * @dataProvider mbLTrimProvider + */ + public function testMbLTrim(string $expected, string $string, 
string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void + { + $this->assertEquals($expected, mb_ltrim($string, $characters, $encoding)); + } + + /** + * @covers \Symfony\Polyfill\Mbstring\Mbstring::mb_rtrim + * + * @dataProvider mbRTrimProvider + */ + public function testMbRTrim(string $expected, string $string, string $characters = " \f\n\r\t\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}", ?string $encoding = null): void + { + $this->assertSame($expected, mb_rtrim($string, $characters, $encoding)); + } + + public function testMbTrimException(): void + { + $this->expectException(\ValueError::class); + mb_trim("\u{180F}", "", "NULL"); + } + + public function testMbTrimEncoding(): void + { + $this->assertSame('あ', mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS")); + $this->assertSame('226f575b', bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE"))); + $this->assertSame('6f225b57', bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE"))); + } + + public static function mbTrimProvider(): iterable + { + yield ['ABC', 'ABC']; + yield ['ABC', "\0\t\nABC \0\t\n"]; + yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", '']; + + yield ['', '']; + + yield ["あいうえおあお", " あいうえおあお ", " ", "UTF-8"]; + yield ["foo BAR Spa", "foo BAR Spaß", "ß", "UTF-8"]; + yield ["oo BAR Spaß", "oo BAR Spaß", "f", "UTF-8"]; + + yield ["oo BAR Spa", "foo BAR Spaß", "ßf", "UTF-8"]; + yield ["oo BAR Spa", "foo BAR Spaß", "fß", "UTF-8"]; + yield ["いうおえお", " あいうおえお あ", " あ", "UTF-8"]; + 
yield ["いうおえお", " あいうおえお あ", "あ ", "UTF-8"]; + yield [" あいうおえお ", " あいうおえお a", "あa", "UTF-8"]; + yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"]; + + yield ["", str_repeat(" ", 129)]; + yield ["a", str_repeat(" ", 129) . "a"]; + + yield ["", " \f\n\r\v\x00\u{00A0}\u{1680}\u{2000}\u{2001}\u{2002}\u{2003}\u{2004}\u{2005}\u{2006}\u{2007}\u{2008}\u{2009}\u{200A}\u{2028}\u{2029}\u{202F}\u{205F}\u{3000}\u{0085}\u{180E}"]; + + yield [' abcd ', ' abcd ', '']; + + yield ['f', 'foo', 'oo']; + + yield ["foo\n", "foo\n", 'o']; + } + + public static function mbLTrimProvider(): iterable + { + yield ['ABC', 'ABC']; + yield ["ABC \0\t\n", "\0\t\nABC \0\t\n"]; + yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", '']; + + yield ['', '']; + + yield [' test ', ' test ', '']; + + yield ['いああああ', 'あああああああああああああああああああああああああああああああああいああああ', 'あ']; + + yield ["漢字", "\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"]; + yield [' abcd ', ' abcd ', '']; + } + + public static function mbRTrimProvider(): iterable + { + yield ['ABC', 'ABC']; + yield ["ABC", "ABC \0\t\n"]; + yield ["\0\t\nABC \0\t\n", "\0\t\nABC \0\t\n", '']; + + yield ['', '']; + + yield [" a", str_repeat(" ", 129) . 
"a"]; + + yield ['あああああああああああああああああああああああああああああああああい', 'あああああああああああああああああああああああああああああああああいああああ', 'あ']; + + yield [' abcd ', ' abcd ', '']; + + yield ["foo\n", "foo\n", 'o']; + } } diff --git a/tests/Php84/Php84Test.php b/tests/Php84/Php84Test.php index 72b0c48a..0dcf64da 100644 --- a/tests/Php84/Php84Test.php +++ b/tests/Php84/Php84Test.php @@ -51,6 +51,12 @@ public function testMbTrimException(): void mb_trim("\u{180F}", "", "NULL"); } + public function testMbTrimEncoding(): void + { + $this->assertSame('あ', mb_convert_encoding(mb_trim("\x81\x40\x82\xa0\x81\x40", "\x81\x40", "SJIS"), "UTF-8", "SJIS")); + $this->assertSame('226f575b', bin2hex(mb_ltrim(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE"))); + $this->assertSame('6f225b57', bin2hex(mb_ltrim(mb_convert_encoding("\u{FEFF}漢字", "UTF-16BE", "UTF-8"), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16BE", "UTF-8"), "UTF-16BE"))); + } public static function mbTrimProvider(): iterable { @@ -69,7 +75,7 @@ public static function mbTrimProvider(): iterable yield ["いうおえお", " あいうおえお あ", " あ", "UTF-8"]; yield ["いうおえお", " あいうおえお あ", "あ ", "UTF-8"]; yield [" あいうおえお ", " あいうおえお a", "あa", "UTF-8"]; - // yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"]; + yield [" あいうおえお a", " あいうおえお a", "\xe3", "UTF-8"]; yield ["", str_repeat(" ", 129)]; yield ["a", str_repeat(" ", 129) . 
"a"]; @@ -78,9 +84,6 @@ public static function mbTrimProvider(): iterable yield [' abcd ', ' abcd ', '']; - // May not work - // yield ['あ', mb_convert_encoding('', "UTF-8", "SJIS"), "\x81\x40", "SJIS"]; - yield ['f', 'foo', 'oo']; yield ["foo\n", "foo\n", 'o']; @@ -99,10 +102,6 @@ public static function mbLTrimProvider(): iterable yield ['いああああ', 'あああああああああああああああああああああああああああああああああいああああ', 'あ']; yield ["漢字", "\u{FFFE}漢字", "\u{FFFE}\u{FEFF}"]; - // May does not work - // yield ['226f575b', \bin2hex(mb_convert_encoding("\u{FFFE}漢字", "UTF-16LE", "UTF-8")), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16LE"]; - // yield ['漢字', \bin2hex(mb_convert_encoding("\u{FFFE}漢字", "UTF-16BE", "UTF-8")), mb_convert_encoding("\u{FFFE}\u{FEFF}", "UTF-16LE", "UTF-8"), "UTF-16BE"]; - yield [' abcd ', ' abcd ', '']; }