Merge pull request #364 from thephpleague/text-encoding

Drop support for non-UTF-8-compatible encoding (fixes #361)
thephpleague · May 25, 2019 · 290a7fd · 290a7fd
2 parents c30c991 + 2d90443
commit 290a7fd
Show file tree

Hide file tree

Showing 11 changed files with 25 additions and 55 deletions.
diff --git a/.phpstorm.meta.php b/.phpstorm.meta.php
@@ -11,8 +11,8 @@
 
 namespace PHPSTORM_META
 {
-    expectedArguments(\League\CommonMark\Context::setEncoding(), 0, 'UTF-8', 'ASCII', 'ISO-8859-1');
-    expectedArguments(\League\CommonMark\Cursor::__construct(), 1, 'UTF-8', 'ASCII', 'ISO-8859-1');
+    expectedArguments(\League\CommonMark\Context::setEncoding(), 0, 'UTF-8', 'ASCII');
+    expectedArguments(\League\CommonMark\Cursor::__construct(), 1, 'UTF-8', 'ASCII');
     expectedArguments(\League\CommonMark\HtmlElement::__construct(), 0, 'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio', 'b', 'base', 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'cite', 'code', 'col', 'colgroup', 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div', 'dl', 'dt', 'em', 'embed', 'fieldset', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'i', 'iframe', 'img', 'input', 'ins', 'kdb', 'keygen', 'label', 'legend', 'li', 'link', 'main', 'map', 'mark', 'menu', 'menuitem', 'meta', 'meter', 'nav', 'noscript', 'object', 'ol', 'optgroup', 'option', 'output', 'p', 'param', 'pre', 'progress', 'q', 's', 'samp', 'script', 'section', 'select', 'small', 'source', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'tr', 'track', 'u', 'ul', 'var', 'video', 'wbr');
     expectedArguments(\League\CommonMark\Block\Element\Heading::__construct(), 0, 1, 2, 3, 4, 5, 6);
     expectedReturnValues(\League\CommonMark\Block\Element\Heading::getLevel(), 1, 2, 3, 4, 5, 6);

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -51,6 +51,10 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi
    - Removed `EmphasisProcessor`
    - Removed `InlineProcessorInterface`
  - Removed `EmphasisParser` now that we have proper delimiter support
+ - Removed support for non-UTF-8-compatible encodings
+    - Removed `getEncoding()` from `ContextInterface`
+    - Removed `getEncoding()`, `setEncoding()`, and `$encoding` from `Context`
+    - Removed `getEncoding()` and the second `$encoding` constructor param from `Cursor`
  - Removed now-unused methods
    - Removed `DelimiterStack::getTop()` (no replacement)
    - Removed `DelimiterStack::iterateByCharacters()` (use the new `processDelimiters()` method instead)

diff --git a/README.md b/README.md
@@ -46,6 +46,8 @@ echo $converter->convertToHtml('# Hello World!');
 
 :warning: **Security warning:** If you will be parsing untrusted input from users, please consider setting the `html_input` and `allow_unsafe_links` options. See <https://commonmark.thephpleague.com/security/> for more details. If you also do choose to allow raw HTML input from untrusted users, consider using a library (like [HTML Purifier](https://github.com/ezyang/htmlpurifier)) to provide additional HTML filtering.
 
+Please note that only UTF-8 and ASCII encodings are supported.  If your Markdown uses a different encoding, please convert it to UTF-8 before running it through this library.
+
 ## Documentation
 
 Full documentation on advanced usage, configuration, and customization can be found at [commonmark.thephpleague.com][docs].

diff --git a/UPGRADE.md b/UPGRADE.md
@@ -4,6 +4,12 @@
 
 ## UNRELEASED
 
+### Text Encoding
+
+This library used to claim it supported ISO-8859-1 encoding but that never truly worked - everything assumed the text was encoded as UTF-8 or ASCII. We've therefore dropped support for ISO-8859-1 and any other unexpected encodings. If you were using some other encoding, you'll now need to convert your Markdown to UTF-8 prior to running it through this library.
+
+Additionally, all public `getEncoding()` and `setEncoding()` methods have been removed, so assume that you're working with UTF-8.
+
 ### Inline Processors
 
 The "inline processor" functionality has been removed and replaced with a proper "delimiter processor" feature geared specifically towards dealing with delimiters (which is what the previous implementation tried to do - poorly).

diff --git a/docs/0.20/basic-usage.md b/docs/0.20/basic-usage.md
@@ -50,3 +50,5 @@ echo $htmlRenderer->renderBlock($document);
 ~~~
 
 [Additional customization](/0.20/customization/overview/) is also possible.
+
+Please note that only UTF-8 and ASCII encodings are supported.  If your Markdown uses a different encoding, please convert it to UTF-8 before running it through this library.
diff --git a/docs/0.20/customization/cursor.md b/docs/0.20/customization/cursor.md
@@ -10,13 +10,7 @@ At its core, a `Cursor` is just a fancy string that remembers your current posi
 
 ## Supported Encodings
 
-As of now, only ASCII-compatible encodings are supported.  These include:
-
- - UTF-8 (preferred)
- - ISO-8859-1
- - ASCII
-
-Other encodings are not fully supported or tested so consider using UTF-8.
+As of now, only UTF-8 (and, by extension, ASCII) encoding is supported.
 
 ## Usage
 

diff --git a/src/Block/Parser/SetExtHeadingParser.php b/src/Block/Parser/SetExtHeadingParser.php
@@ -47,7 +47,7 @@ public function parse(ContextInterface $context, Cursor $cursor): bool
         $level = $match[0][0] === '=' ? 1 : 2;
         $strings = $context->getContainer()->getStrings();
 
-        $strings = $this->resolveReferenceLinkDefinitions($strings, $cursor->getEncoding(), $context->getReferenceParser());
+        $strings = $this->resolveReferenceLinkDefinitions($strings, $context->getReferenceParser());
         if (empty($strings)) {
             return false;
         }
@@ -63,15 +63,14 @@ public function parse(ContextInterface $context, Cursor $cursor): bool
      * @see https://github.com/commonmark/commonmark.js/commit/993bbe335931af847460effa99b2411eb643577d
      *
      * @param string[]        $strings
-     * @param string          $encoding
      * @param ReferenceParser $referenceParser
      *
      * @return string[]
      */
-    private function resolveReferenceLinkDefinitions(array $strings, string $encoding, ReferenceParser $referenceParser): array
+    private function resolveReferenceLinkDefinitions(array $strings, ReferenceParser $referenceParser): array
     {
         foreach ($strings as &$string) {
-            $cursor = new Cursor($string, $encoding);
+            $cursor = new Cursor($string);
             while ($cursor->getCharacter() === '[' && $referenceParser->parse($cursor)) {
                 $string = $cursor->getRemainder();
             }

diff --git a/src/Context.php b/src/Context.php
@@ -62,11 +62,6 @@ class Context implements ContextInterface
      */
     protected $blocksParsed = false;
 
-    /**
-     * @var string
-     */
-    protected $encoding = 'UTF-8';
-
     /**
      * @var ReferenceParser
      */
@@ -231,24 +226,4 @@ public function getReferenceParser(): ReferenceParser
     {
         return $this->referenceParser;
     }
-
-    /**
-     * @return string
-     */
-    public function getEncoding(): string
-    {
-        return $this->encoding;
-    }
-
-    /**
-     * @param string $encoding
-     *
-     * @return $this
-     */
-    public function setEncoding(string $encoding): self
-    {
-        $this->encoding = $encoding;
-
-        return $this;
-    }
 }
diff --git a/src/ContextInterface.php b/src/ContextInterface.php
@@ -87,9 +87,4 @@ public function setBlocksParsed(bool $bool);
      * @return ReferenceParser
      */
     public function getReferenceParser(): ReferenceParser;
-
-    /**
-     * @return string
-     */
-    public function getEncoding(): string;
 }
diff --git a/src/Cursor.php b/src/Cursor.php
@@ -79,23 +79,17 @@ class Cursor
     private $charCache = [];
 
     /**
-     * @param string $line     The line being parsed
-     * @param string $encoding The encoding of that line
+     * @param string $line The line being parsed (ASCII or UTF-8)
      */
-    public function __construct(string $line, string $encoding = 'UTF-8')
+    public function __construct(string $line)
     {
         $this->line = $line;
-        $this->encoding = $encoding;
-        $this->length = \mb_strlen($line, $this->encoding) ?: 0;
+        $this->length = \mb_strlen($line, 'UTF-8') ?: 0;
         $this->isMultibyte = $this->length !== \strlen($line);
+        $this->encoding = $this->isMultibyte ? 'UTF-8' : 'ASCII';
         $this->lineContainsTabs = \preg_match('/\t/', $line) > 0;
     }
 
-    public function getEncoding(): string
-    {
-        return $this->encoding;
-    }
-
     /**
      * Returns the position of the next character which is not a space (or tab)
      *

diff --git a/src/DocParser.php b/src/DocParser.php
@@ -86,7 +86,6 @@ private function preProcessInput(string $input): array
     public function parse(string $input): Document
     {
         $context = new Context(new Document(), $this->environment);
-        $context->setEncoding(\mb_detect_encoding($input, 'ASCII,UTF-8', true) ?: 'ISO-8859-1');
 
         $lines = $this->preProcessInput($input);
         foreach ($lines as $line) {
@@ -111,7 +110,7 @@ private function incorporateLine(ContextInterface $context)
         $context->getBlockCloser()->resetTip();
         $context->setBlocksParsed(false);
 
-        $cursor = new Cursor($context->getLine(), $context->getEncoding());
+        $cursor = new Cursor($context->getLine());
 
         $this->resetContainer($context, $cursor);
         $context->getBlockCloser()->setLastMatchedContainer($context->getContainer());