src/Wt2Html/Grammar.pegphp

/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 */
{
	use Wikimedia\Assert\UnreachableException;
	use Wikimedia\JsonCodec\JsonCodec;
	use Wikimedia\Parsoid\Config\Env;
	use Wikimedia\Parsoid\Config\SiteConfig;
	use Wikimedia\Parsoid\Core\DomSourceRange;
	use Wikimedia\Parsoid\NodeData\DataMw;
	use Wikimedia\Parsoid\NodeData\DataParsoid;
	use Wikimedia\Parsoid\NodeData\TempData;
	use Wikimedia\Parsoid\Tokens\CommentTk;
	use Wikimedia\Parsoid\Tokens\EOFTk;
	use Wikimedia\Parsoid\Tokens\EndTagTk;
	use Wikimedia\Parsoid\Tokens\KV;
	use Wikimedia\Parsoid\Tokens\KVSourceRange;
	use Wikimedia\Parsoid\Tokens\NlTk;
	use Wikimedia\Parsoid\Tokens\SelfclosingTagTk;
	use Wikimedia\Parsoid\Tokens\SourceRange;
	use Wikimedia\Parsoid\Tokens\TagTk;
	use Wikimedia\Parsoid\Tokens\Token;
	use Wikimedia\Parsoid\Utils\DOMDataUtils;
	use Wikimedia\Parsoid\Utils\PHPUtils;
	use Wikimedia\Parsoid\Utils\TokenUtils;
	use Wikimedia\Parsoid\Utils\Utils;
	use Wikimedia\Parsoid\Utils\WTUtils;
	use Wikimedia\Parsoid\Wikitext\Consts;
}
{
	/** @var Env Parsing environment, read from the 'env' option in initialize() */
	private $env;

	/** @var SiteConfig Site configuration, obtained from $env in initialize() */
	private $siteConfig;

	/** @var array Options of the owning tokenizer (the 'pegTokenizer' option); 'inTemplate' is read from it */
	private $pipelineOpts;

	/** @var int Offset by which emitChunk() shifts the TSR of emitted tokens; defaults to 0 */
	private $pipelineOffset;

	/** Result of SiteConfig::getExtensionTagNameMap(); looked up by lowercased tag name */
	private $extTags;

	/** microtime( true ) timestamp set by the start_async rule while profiling, null otherwise */
	private $startTime;

	/** @var string Anchored lookahead regex for url text, built in initialize() */
	private $reUrltextLookahead;

	/**
	 * @var string
	 * NOTE(review): not referenced in this part of the file; presumably
	 * scratch state for the urltext rule -- confirm.
	 */
	private $urltextPlainSegment = '';

	/**
	 * @var bool
	 * NOTE(review): not referenced in this part of the file; presumably
	 * scratch state for the urltext rule -- confirm.
	 */
	private $urltextFoundAutolink = false;

	/**
	 * Set up per-parser state from the parser options: environment, site
	 * config, pipeline options/offset, the extension tag map and the
	 * urltext lookahead regex.
	 */
	protected function initialize() {
		$this->env = $this->options['env'];
		$this->siteConfig = $this->env->getSiteConfig();

		$this->pipelineOpts = $this->options['pegTokenizer']->getOptions();
		// FIXME: inTemplate option may not always be set in
		// standalone tokenizers used by some pipeline handlers.
		$this->pipelineOffset = $this->options['pipelineOffset'] ?? 0;
		$this->extTags = $this->siteConfig->getExtensionTagNameMap();

		// Characters at which a plain url-text segment stops; shared by the
		// negated class (segment body) and the positive class (terminator).
		$stopChars = '-\'<[{\n\r:;\]}|\!=&';
		$protocols = $this->siteConfig->getProtocolsRegex( true );

		// Non-greedy text_char sequence: stop at ampersand, double-underscore,
		// magic link prefix or protocol
		$this->reUrltextLookahead = '!(?:' .
			'([^' . $stopChars . ']*?)' .
			'(?:__|$|[' . $stopChars . ']|(RFC|PMID|ISBN|' .
			'(?i)' . $protocols .
			')))!A';
	}

	/** Cache entries below this input offset have already been dropped by the tlb rule */
	private $prevOffset = 0;
	/** Running count of headings tokenized so far; incremented by the heading rule */
	private $headingIndex = 0;

	/**
	 * Reset the per-document tokenizer state (cache-pruning offset and
	 * heading counter) back to zero.
	 */
	public function resetState() {
		$this->headingIndex = 0;
		$this->prevOffset = 0;
	}

	/**
	 * Throw a RuntimeException carrying $text unless $condition is truthy.
	 *
	 * @param mixed $condition
	 * @param string $text
	 */
	private function assert( $condition, $text ) {
		if ( $condition ) {
			return;
		}
		throw new \RuntimeException( "Grammar.pegphp assertion failure: $text" );
	}

	/**
	 * Marker for code paths that must never execute; always throws.
	 *
	 * @throws UnreachableException
	 */
	private function unreachable() {
		$message = "Grammar.pegphp: this should be unreachable";
		throw new UnreachableException( $message );
	}

	// Some shorthands for legibility

	/**
	 * Shorthand for the parser's `savedPos`, which tsrOffsets() uses as the
	 * start of the current source range.
	 */
	private function startOffset() {
		return $this->savedPos;
	}

	/**
	 * Shorthand for the parser's `currPos`, which tsrOffsets() uses as the
	 * end of the current source range.
	 */
	private function endOffset() {
		return $this->currPos;
	}

	/**
	 * Build a SourceRange for the current match.
	 *
	 * @param string $flag 'start' for a zero-width range at savedPos,
	 *   'end' for a zero-width range at currPos, anything else for the
	 *   full savedPos..currPos range.
	 * @return SourceRange
	 */
	private function tsrOffsets( $flag = 'default' ): SourceRange {
		// Loose comparison on purpose: it mirrors `switch` semantics.
		if ( $flag == 'start' ) {
			$from = $this->savedPos;
			$to = $this->savedPos;
		} elseif ( $flag == 'end' ) {
			$from = $this->currPos;
			$to = $this->currPos;
		} else {
			$from = $this->savedPos;
			$to = $this->currPos;
		}
		return new SourceRange( $from, $to );
	}

	/**
	 * Prepare a chunk of tokens for our consumers: shift the TSR of all
	 * tokens by the pipeline offset, trace-log the chunk, and enforce the
	 * parser resource limits on every token.
	 *
	 * @param array|null $tokens
	 * @return array the processed tokens, or [] when $tokens is empty/null
	 */
	private function emitChunk( $tokens ) {
		// FIXME: We don't expect nulls here, but looks like
		// hack from I1c695ab6cdd3655e98877c175ddbabdee9dc44b7
		// introduces them. Work around it for now!
		if ( !$tokens ) {
			return [];
		}

		// Shift tsr of all tokens by the pipeline offset
		TokenUtils::shiftTokenTSR( $tokens, $this->pipelineOffset );
		$this->env->log( 'trace/peg', $this->options['pipelineId'] ?? '0', '---->   ', $tokens );

		// Enforce parsing resource limits. foreach visits every element
		// regardless of how the array happens to be keyed.
		foreach ( $tokens as $token ) {
			TokenizerUtils::enforceParserResourceLimits( $this->env, $token );
		}

		return $tokens;
	}

	/* ------------------------------------------------------------------------
	 * Extension tags should be parsed with higher priority than anything else.
	 *
	 * The trick we use is to strip out the content inside a matching tag-pair
	 * and not tokenize it. The content, if it needs to parsed (for example,
	 * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context
	 * which means any error correction that needs to happen is restricted to
	 * the scope of the extension content and doesn't spill over to the higher
	 * level.  Ex: <math><!--foo</math>.
	 *
	 * IGNORE: {{ this just balances the blocks in this comment for pegjs
	 *
	 * This trick also lets us prevent extension content (that don't accept WT)
	 * from being parsed as wikitext (Ex: <math>\frac{foo\frac{bar}}</math>)
	 * We don't want the "}}" being treated as a template closing tag and
	 * closing outer templates.
	 * --------------------------------------------------------------------- */

	/**
	 * Whether $name (case-insensitively) is listed in either the HTML5 or
	 * the older-HTML tag tables of Consts::$HTML.
	 *
	 * @param string $name
	 * @return bool
	 */
	private function isXMLTag( string $name ): bool {
		$key = mb_strtolower( $name );
		if ( isset( Consts::$HTML['HTML5Tags'][$key] ) ) {
			return true;
		}
		return isset( Consts::$HTML['OlderHTMLTags'][$key] );
	}

	/**
	 * Decide how an xmlish tag token is to be treated: as an annotation
	 * marker, as an extension or include tag, or as an ordinary tag.
	 *
	 * - Annotation tags (SiteConfig::isAnnotationTag(), also accepted in the
	 *   `tagname|value` form) yield a single `meta` token typed
	 *   `mw:Annotation/<name>[/End]`, or an empty list when tokenizing with
	 *   the 'inTemplate' pipeline option set.
	 * - Tags that are neither installed extension tags nor include tags are
	 *   returned unchanged.
	 * - For installed extension tags, the source up to and including the
	 *   matching end tag is captured untokenized, `currPos` is moved past
	 *   it, and an `extension` token is returned.
	 * - For include tags, the content between the tags is tokenized with a
	 *   fresh PegTokenizer and returned with $t prepended; `currPos` is
	 *   moved to the start of the end tag.
	 *
	 * Side effects: may set `$this->env->hasAnnotations`, mutates
	 * `$t->dataParsoid` and may advance `$this->currPos`.
	 *
	 * @param Token $t the tag token just matched
	 * @param ?bool $end truthy for an end tag; only used to build the
	 *   annotation `typeof`
	 * @param array $attribs attributes (objects with k/v/ksrc/vsrc); only
	 *   used for annotation tags
	 * @param SourceRange $tsr source range stored on the annotation meta token
	 * @return Token|array
	 */
	private function maybeAnnotationOrExtensionTag( Token $t, ?bool $end, array $attribs, SourceRange $tsr ) {
		$tagName = mb_strtolower( $t->getName() );

		$isAnnotationTag = $this->siteConfig->isAnnotationTag( $tagName );
		if ( !$isAnnotationTag ) {
			// Also accept "<annotation>|<value>": the part after the pipe
			// becomes the sole "name" attribute. A pipe at position 0 is
			// falsy here and therefore ignored.
			$pipepos = strpos( $tagName, '|' );
			if ( $pipepos ) {
				$strBeforePipe = substr( $tagName, 0, $pipepos );
				$isAnnotationTag = $this->siteConfig->isAnnotationTag( $strBeforePipe );
				if ( $isAnnotationTag ) {
					$attribs = [ new KV( "name", substr( $tagName, $pipepos + 1, strlen( $tagName ) - $pipepos - 1 ) ) ];
					$tagName = $strBeforePipe;
				}
			}
		}

		if ( $isAnnotationTag ) {
			$metaAttrs = [ new KV( 'typeof', 'mw:Annotation/' . $tagName . ($end ? '/End' : '') ) ];
			$datamw = null;
			if ( count( $attribs ) > 0 ) {
				$attrMap = [];
				foreach ( $attribs as $attr ) {
					// If the key or the value is not a string, we replace it by the thing that generated it and
					// consider that wikitext as a raw string instead.
					$k = is_string( $attr->k ) ? $attr->k : $attr->ksrc;
					$v = is_string( $attr->v ) ? $attr->v : $attr->vsrc;
					$attrMap[$k] = $v;
				}
				// Possible follow-up in T295168 for attribute sanitation
				// T367616: 'attrs' should be renamed to 'extAttrs'
				$datamw = new DataMw( [ 'attrs' => (object)$attrMap ] );
			}
			$dp = new DataParsoid();
			$dp->tsr = $tsr;
			$this->env->hasAnnotations = true;
			// FIXME: Suppress annotation meta tokens from template pipelines
			// since they may not have TSR values and won't get recognized as
			// annotation ranges. Without TSR, they might end up stuck in
			// fosterable positions and cause havoc on edits by breaking selser.
			if ( empty( $this->pipelineOpts['inTemplate'] ) ) {
				return [ new SelfclosingTagTk ( 'meta', $metaAttrs, $dp, $datamw ) ];
			} else {
				return [];
			}
		}

		$isInstalledExt = isset( $this->extTags[$tagName] );
		$isIncludeTag = WTUtils::isIncludeTag( $tagName );

		// Extensions have higher precedence when they shadow html tags.
		if ( !( $isInstalledExt || $isIncludeTag ) ) {
			return $t;
		}

		$dp = $t->dataParsoid;
		// Position the parser is moved to after the switch; by default
		// the current position, i.e. nothing extra is skipped.
		$skipPos = $this->currPos;

		switch ( get_class( $t ) ) {
			case EndTagTk::class:
				// NOTE(review): both paths below return $t, so this
				// include-tag check is redundant.
				if ( $isIncludeTag ) {
					return $t;
				}
				// Similar to TagTk, we rely on the sanitizer to convert to text
				// where necessary and emit tokens to ease the wikitext escaping
				// code.  However, extension tags that shadow html tags will see
				// their unmatched end tags dropped while tree building, since
				// the sanitizer will let them through.
				return $t; // not text()

			case SelfclosingTagTk::class:
				// No content and no end tag: the whole extent is the open tag.
				$dp->src = $dp->tsr->substr( $this->input );
				$dp->extTagOffsets = new DomSourceRange(
					$dp->tsr->start, $dp->tsr->end,
					$dp->tsr->length(), 0
				);
				if ( $isIncludeTag ) {
					return $t;
				}
				break;

			case TagTk::class:
				// Anchored at the tag start: lazily match everything up to
				// the first closing tag; group 1 is the closing tag itself.
				$endTagRE = '~.*?(</' . preg_quote( $tagName, '~' ) . '\s*>)~iusA';
				$tagContentFound = preg_match( $endTagRE, $this->input, $tagContent, 0, $dp->tsr->start );

				if ( !$tagContentFound ) {
					$dp->src = $dp->tsr->substr( $this->input );
					$dp->extTagOffsets = new DomSourceRange(
						$dp->tsr->start, $dp->tsr->end,
						$dp->tsr->length(), 0
					);
					if ( $isIncludeTag ) {
						return $t;
					} else {
						// This is undefined behaviour.  The old parser currently
						// returns text here (see core commit 674e8388cba),
						// whereas this results in unclosed
						// extension tags that shadow html tags falling back to
						// their html equivalent.  The sanitizer will take care
						// of converting to text where necessary.  We do this to
						// simplify `hasWikitextTokens` when escaping wikitext,
						// which wants these as tokens because it's otherwise
						// lacking in context.
						return $t; // not text()
					}
				}

				// Byte-based bookkeeping: full source from the open tag
				// through the end tag, its end offset, and the end tag width.
				$extSrc = $tagContent[0];
				$extEndOffset = $dp->tsr->start + strlen( $extSrc );
				$extEndTagWidth = strlen( $tagContent[1] );

				if ( !empty( $this->pipelineOpts['inTemplate'] ) ) {
					// Support nesting in extensions tags while tokenizing in templates
					// to support the #tag parser function.
					//
					// It's necessary to permit this broadly in templates because
					// there's no way to distinguish whether the nesting happened
					// while expanding the #tag parser function, or just a general
					// syntax errors.  In other words,
					//
					//   hi<ref>ho<ref>hi</ref>ho</ref>
					//
					// and
					//
					//   hi{{#tag:ref|ho<ref>hi</ref>ho}}
					//
					// found in template are returned indistinguishably after a
					// preprocessing request, though the old parser renders them
					// differently.  #tag in template is probably a common enough
					// use case that we want to accept these false positives,
					// though another approach could be to drop this code here, and
					// invoke a native #tag handler and forgo those in templates.
					//
					// Expand `extSrc` as long as there is a <tagName> found in the
					// extension source body.
					$startTagRE = '~<' . preg_quote( $tagName, '~' ) . '(?:[^/>]|/(?!>))*>~i';
					// Start with the text after the opening tag; each later
					// round inspects only the newly appended segment.
					$s = substr( $extSrc, $dp->tsr->end - $dp->tsr->start );
					$openTags = 0;
					while ( true ) {
						if ( preg_match_all( $startTagRE, $s, $matches ) ) {
							$openTags += count( $matches[0] );
						}
						if ( !$openTags ) {
							break;
						}
						// Each unmatched nested open tag needs one more end
						// tag; stop if the input has none left.
						if ( !preg_match( $endTagRE, $this->input, $tagContent, 0, $extEndOffset ) ) {
							break;
						}
						$openTags -= 1;
						$s = $tagContent[0];
						$extEndOffset += strlen( $s );
						$extEndTagWidth = strlen( $tagContent[1] );
						$extSrc .= $s;
					}
				}

				// Extension content source
				$dp->src = $extSrc;
				$dp->extTagOffsets = new DomSourceRange(
					$dp->tsr->start, $extEndOffset,
					$dp->tsr->length(), $extEndTagWidth
				);

				$skipPos = $dp->extTagOffsets->innerEnd();

				// If the xml-tag is a known installed (not native) extension,
				// skip the end-tag as well.
				if ( $isInstalledExt ) {
					$skipPos = $dp->extTagOffsets->end;
				}
				break;

			default:
				$this->unreachable();
		}

		// Move the parser past the content consumed above.
		$this->currPos = $skipPos;

		if ( $isInstalledExt ) {
			// update tsr->end to span the start and end tags.
			$dp->tsr->end = $this->endOffset(); // was just modified above
			return new SelfclosingTagTk( 'extension', [
					new KV( 'typeof', 'mw:Extension' ),
					new KV( 'name', $tagName ),
					new KV( 'about', $this->env->newAboutId() ),
					new KV( 'source', $dp->src ),
					new KV( 'options', $t->attribs )
				], $dp
			);
		} elseif ( $isIncludeTag ) {
			// Parse ext-content, strip eof, and shift tsr
			$extContent = $dp->extTagOffsets->stripTags( $dp->src );
			$tokenizer = new PegTokenizer( $this->env );
			$tokenizer->setSourceOffsets( new SourceRange( $dp->extTagOffsets->innerStart(), $dp->extTagOffsets->innerEnd() ) );
			$extContentToks = $tokenizer->tokenizeSync( $extContent );
			// The EOF token is stripped only when an end tag was found.
			if ( $dp->extTagOffsets->closeWidth > 0 ) {
				TokenUtils::stripEOFTkFromTokens( $extContentToks );
			}
			array_unshift( $extContentToks, $t );
			return $extContentToks;
		} else {
			$this->unreachable();
		}
	}
}

/* ********************************************************
 * "start" is the top-level start rule
 *
 * Other allowed start rules:
 * -  table_start_tag
 * -  url
 * -  row_syntax_table_args
 * -  table_attributes
 * -  generic_newline_attributes
 * -  tplarg_or_template_or_bust
 * -  extlink
 * -  list_item
 *
 * These start rules are listed in lib/wt2html/tokenizer.js
 * and are used during PEG grammar compilation.
 * ******************************************************** */

start "start" =
	t:tlb* n:newlineToken* {
		// Flattened tokens of all top-level blocks, followed by any
		// trailing newline tokens and a final EOF token.
		$tokens = count( $t ) ? TokenizerUtils::flattenIfArray( $t ) : [];
		if ( count( $n ) ) {
			PHPUtils::pushArray($tokens, $n);
		}
		$tokens[] = new EOFTk();
		return $tokens;
	}

// Start rule: zero or more generic_newline_attribute matches.
generic_newline_attributes "generic_newline_attributes" = generic_newline_attribute*

// Start rule: zero or more items, each either a table_attribute or
// optional whitespace followed by a broken_table_attribute_name_char
// (in which case only that character is kept).
table_attributes "table_attributes" =
	(table_attribute / optionalSpaceToken b:broken_table_attribute_name_char { return $b; })*

/*
 * This rule exists to support tokenizing the document in chunks.
 * The parser's streaming interface will stop tokenization after each iteration
 * of the starred subexpression, and yield to the node.js event-loop to
 * schedule other pending event handlers.
 */
start_async =
	(
		(
			// Always-true predicate: records the start time when profiling.
			// NOTE(review): $profile is assigned but never used in this
			// predicate.
			& {
				$this->startTime = null;
				if ( $this->env->profiling() ) {
					$profile = $this->env->getCurrentProfile();
					$this->startTime = microtime( true );
				}
				return true;
			}
			t:tlb
			// Always-true predicate: reports the time spent matching this
			// tlb (in milliseconds) to the current profile.
			& {
				if ( $this->env->profiling() ) {
					$profile = $this->env->getCurrentProfile();
					$profile->bumpTimeUse(
						'PEG', 1000 * ( microtime( true ) - $this->startTime ), 'PEG' );
				}
				return true;
			}
		) { return $t; }
		/ newlineToken* &{
			// "tlb" matches "block" matches "sol" matches "newlineToken"
			// But, "tlb" is prefixed with a !eof clause, so, we should only
			// get here on eof. So, safe to unconditionally terminate the
			// generator loop here.
			return false;
		}
	)*

/*
 * A document (start rule) is a sequence of toplevelblocks. Tokens are
 * emitted in chunks per toplevelblock to avoid buffering the full document.
 */
tlb "tlb" =
	!eof b:block {
		// A document is just a sequence of toplevelblocks and there is no
		// backtracking across them, so cache entries for offsets before
		// this block's start will never be needed again: drop them.
		$limit = $this->startOffset();
		while ( $this->prevOffset < $limit ) {
			unset( $this->cache[$this->prevOffset] );
			$this->prevOffset++;
		}

		// Normalize the block result to a token list (or null if empty).
		if ( is_string( $b ) ) {
			$chunk = [ $b ];
		} elseif ( is_array( $b ) && count( $b ) ) {
			$chunk = TokenizerUtils::flattenIfArray( $b );
		} else {
			$chunk = null;
		}

		// Feed this toplevelblock's tokens to the parser pipeline.
		return $this->emitChunk( $chunk );
	}

/*
 * Redirects can only occur as the first thing in a document.  See
 * WikitextContent::getRedirectTarget()
 */
redirect =
	rw:redirect_word
	sp:$space_or_newline*
	c:$(":" space_or_newline*)?
	wl:wikilink
	& {
		return count( $wl ) === 1 && $wl[0] instanceof Token;
	}
	{
		// Source of everything before the link: the magic word plus any
		// whitespace and optional colon that followed it.
		$src = $rw . ( $sp ?: '' ) . ( $c ?: '' );
		$linkTk = $wl[0];

		// Build a redirect token
		$dp = new DataParsoid();
		$dp->src = $src;
		$dp->tsr = $this->tsrOffsets();
		$dp->linkTk = $linkTk;

		// Put 'href' into attributes so it gets template-expanded
		$attribs = [ $linkTk->getAttributeKV( 'href' ) ];
		return new SelfclosingTagTk( 'mw:redirect', $attribs, $dp );
	}

/* The 'redirect' magic word.
 * The leading whitespace allowed is due to the PHP trim() function.
 */
// Matches optional leading whitespace followed by a run of characters
// (no whitespace, ':' or '[') that the site's 'redirect' magic word
// matcher accepts. Returns the matched text, leading whitespace included.
redirect_word =
	$(
		[ \t\n\r\0\x0b]*
		rw:$(!space_or_newline ![:\[] .)+
		& {
			// Use the site config cached in initialize(), as the rest of
			// this grammar does.
			return preg_match( $this->siteConfig->getMagicWordMatcher( 'redirect' ), $rw );
		}
	)

/*
 * The actual contents of each block.
 */
// Ordered choice: redirect (only at start of file), then line-based
// block constructs, then inline content, then a bare start-of-line.
block =
	// Redirect has to be the first alternative; otherwise it gets parsed as a <ol>
	// In practice, WikiContent::getRedirectTargetAndText() in MediaWiki strips
	// the redirect from the source so it never sees it.
	&sof r:redirect cil:sol_transparent* bl:block_line? {
		// $bl is optional; fall back to an empty list when absent.
		return array_merge( [ $r ], $cil, $bl ?: [] );
	}
	/ block_lines
	// Inlineline includes generic tags; wrapped into paragraphs in token
	// transform and DOM postprocessor
	/ inlineline
	/ s:sol !sof !inline_breaks { return $s; }

/*
 * A block nested in other constructs. Avoid eating end delimiters for other
 * constructs by checking against inline_breaks first: the rule fails if
 * inline_breaks matches at the current position, and otherwise returns the
 * result of block.
 */
nested_block = !inline_breaks b:block { return $b; }

/*
 * The same, but suitable for use inside a table construct.
 * Doesn't match table_heading_tag, table_row_tag, table_data_tag,
 * table_caption tag, or table_end_tag, although it does allow
 * table_start_tag (for nested tables).
 */
nested_block_in_table =
	// XXX: don't rely on a lame look-ahead like this; use syntax stops
	// instead, so that multi-line th content followed by a line prefixed with
	// a comment is also handled. Alternatively, implement a sol look-behind
	// assertion accepting spaces and comments.
	// The look-ahead rejects a start-of-line (optionally followed by one
	// whitespace-only line) whose first non-space character is a pipe or "!".
	!(sol (space* sol)? space* (pipe / "!"))

	// avoid recursion via nested_block_in_table, as that can lead to stack
	// overflow in large tables
	// See https://phabricator.wikimedia.org/T59670
	b:nested_block<tableDataBlock> {
		return $b;
	}

/*
 * Line-based block constructs.
 */
// A start-of-line, an optional additional empty line, then a block_line;
// returns the tokens of all three parts concatenated.
block_lines =
	s:sol
	// eat an empty line before the block
	s2:(os:optionalSpaceToken so:sol { return array_merge( $os, $so ); })?
	bl:block_line {
		// $s2 is optional; fall back to an empty list when absent.
		return array_merge( $s, $s2 ?: [], $bl );
	}

// Horizontal rules
hr =
	"----" d:$"-"*
	// Check if a newline or content follows
	lineContent:( &sol "" { return null; } / "" { return true; } ) {
		$dp = new DataParsoid();
		$dp->tsr = $this->tsrOffsets();
		// lineContent is true when something other than a start-of-line
		// follows the dashes, and null otherwise.
		if ( $lineContent !== null ) {
			$dp->lineContent = $lineContent;
		}
		// Record dashes beyond the mandatory four.
		$extraDashes = strlen( $d );
		if ( $extraDashes > 0 ) {
			$dp->extra_dashes = $extraDashes;
		}
		return [ new SelfclosingTagTk( 'hr', [], $dp ) ];
	}

/*
 * Block structures with start-of-line wiki syntax
 */
// Ordered choice of start-of-line constructs: heading, list item,
// horizontal rule, or (after optional whitespace) a table line.
block_line =
	heading
	/ list_item
	/ hr
	// The look-ahead restricts table_line attempts to lines whose next
	// character is one of: space, '<', '{', '}', '|', '!'.
	/ st:optionalSpaceToken &[ <{}|!] tl:table_line {
		return array_merge( $st, $tl );
	}

// Optional whitespace immediately followed by a newline (not consumed):
// returns the whitespace tokens plus a `br` token.
br =
	s:optionalSpaceToken &newline {
		$dp = new DataParsoid();
		$dp->tsr = $this->tsrOffsets();
		$brTk = new SelfclosingTagTk( 'br', [], $dp );
		return array_merge( $s, [ $brTk ] );
	}

// Zero-width check for whether inline content has to stop at the current
// position. Both parts are look-aheads, so no input is consumed.
inline_breaks =
	// Cheap pre-filter: the rule fails unless the next character is one of these.
	& [=|!{}:;\r\n[\]\-]
	(
		// Read the current values of the rule parameters ...
		annOrExtTag: <annOrExtTag>
		h: <h>
		extlink: <extlink>
		intemplate: <intemplate>
		preproc: <preproc>
		equal: <equal>
		table: <table>
		templateArg: <templateArg>
		tableCellArg: <tableCellArg>
		semicolon: <semicolon>
		arrow: <arrow>
		linkdesc: <linkdesc>
		colon: <colon>
		th: <th>
		// ... and let TokenizerUtils::inlineBreaks() decide, given the
		// input, the current offset and that parameter state.
		& {
			return TokenizerUtils::inlineBreaks( $this->input, $this->endOffset(), [
				'annOrExtTag' => $annOrExtTag,
				'h' => $h,
				'extlink' => $extlink,
				'intemplate' => $intemplate,
				'preproc' => $preproc,
				'equal' => $equal,
				'table' => $table,
				'templateArg' => $templateArg,
				'tableCellArg' => $tableCellArg,
				'semicolon' => $semicolon,
				'arrow' => $arrow,
				'linkdesc' => $linkdesc,
				'colon' => $colon,
				'th' => $th
			], $this->env );
		}
	)

// One or more pieces of inline content on a single line. Each piece is
// either urltext, or -- provided inline_breaks does not match here -- an
// inline_element or any single non-newline character. The pieces are
// combined with TokenizerUtils::flattenStringlist().
inlineline =
	c:(
		urltext
		/ !inline_breaks
		r:(inline_element / !newline s:. { return $s; }) { return $r; }
	)+ {
		return TokenizerUtils::flattenStringlist( $c );
	}

// "<" followed by an xmlish tag, tried first with the annOrExtTag
// parameter set and then with it cleared, falling back to
// tvar_old_syntax_closing_HACK. Returns the result of whichever matched.
inline_xmlish_tag =
	"<" tag:(xmlish_tag<annOrExtTag> / xmlish_tag<annOrExtTag=false> / tvar_old_syntax_closing_HACK )
	{ return $tag; }

// Dispatch on the next character(s) to the matching inline construct.
// Each alternative starts with a cheap look-ahead guard so that the more
// expensive sub-rule is only attempted when its first character is present.
inline_element =
	& '<' r:( inline_xmlish_tag / comment ) { return $r; }
	/ & '{' r:tplarg_or_template { return $r; }
	/ & "-{" r:lang_variant_or_tpl { return $r; }
	// FIXME: The old parser's handleInternalLinks2 splits on [[, resulting
	// in sequences with odd number of brackets parsing as text, and sequences
	// with even number of brackets having its innermost pair parse as a
	// wikilink.  For now, we faithfully reproduce what's found there but
	// wikitext, the language, shouldn't be defined by odd tokenizing behaviour
	// in the old parser.  Flagging this for a future cleanup.
	/ $('[[' &'[')+
	/ & '[' r:( wikilink / extlink ) { return $r; }
	/ & "'" r:quote { return $r; }

/* Headings  */

heading =
	& "=" // guard, to make sure '='+ will match.
	// XXX: Also check to end to avoid inline parsing?
	r:(
		s:$'='+ // moved in here to make s accessible to inner action
		ce:(
			(ill:inlineline<h>? { return $ill ?: []; })
			$'='+
		)?
		& { return $ce || strlen( $s ) > 2; }
		endTPos:("" { return $this->endOffset(); })
		spc:( space / sol_transparent )*
		&eolf
		{
			if ( $ce ) {
				// $ce is [ heading content, closing run of '=' ]
				[ $c, $e ] = $ce;
				$level = min( strlen( $s ), strlen( $e ) );
			} else {
				// Only equal signs on the line: split them up into two
				// equal parts, with at least one character in the middle.
				$level = intdiv( strlen( $s ) - 1, 2 );
				$c = [ str_repeat( '=', strlen( $s ) - 2 * $level ) ];
				$e = str_repeat( '=', $level );
				$s = $e;
			}
			// There is no heading level above 6
			$level = min( 6, $level );

			// Surplus equals on either side become text: prepended to /
			// appended onto the content.
			$surplusOpen = strlen( $s ) - $level;
			if ( $surplusOpen > 0 ) {
				$extras1 = substr( $s, 0, $surplusOpen );
				if ( is_string( $c[0] ) ) {
					$c[0] = $extras1 . $c[0];
				} else {
					array_unshift( $c, $extras1 );
				}
			}
			$surplusClose = strlen( $e ) - $level;
			if ( $surplusClose > 0 ) {
				$extras2 = substr( $e, 0, $surplusClose );
				if ( is_string( PHPUtils::lastItem( $c ) ) ) {
					$c[count( $c ) - 1] .= $extras2;
				} else {
					$c[] = $extras2;
				}
			}

			// Opening tag covers the first $level characters of the match
			$openDP = new DataParsoid();
			$openDP->tsr = $this->tsrOffsets( 'start' );
			$openDP->tsr->end += $level;
			// Match the old parser's behavior by
			// (a) making headingIndex part of tokenizer state
			//   (don't reuse pipeline! see $this->resetState above)
			// (b) assigning the index when ==*== is tokenized,
			//   even if we're inside a template argument
			//   or other context which won't end up putting the heading
			//   on the output page.  T213468/T214538
			$this->headingIndex++;
			$openDP->getTemp()->headingIndex = $this->headingIndex;

			// Closing tag covers the last $level characters before $endTPos
			$closeDP = new DataParsoid();
			$closeDP->tsr = new SourceRange( $endTPos - $level, $endTPos );

			$tagName = 'h' . $level;
			$res = [ new TagTk( $tagName, [], $openDP ) ];
			PHPUtils::pushArray( $res, $c );
			$res[] = new EndTagTk( $tagName, [], $closeDP );
			$res[] = $spc;
			return $res;
		}
	) { return $r; }


// Behavior switches. See:
// https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches
behavior_switch =
	bs:$('__' behavior_text '__') {
		if ( !$this->siteConfig->isBehaviorSwitch( $bs ) ) {
			// Not a switch known to this site: hand the text back as-is.
			return [ $bs ];
		}

		$dp = new DataParsoid();
		$dp->tsr = $this->tsrOffsets();
		$dp->src = $bs;
		$dp->magicSrc = $bs;
		$switchTk = new SelfclosingTagTk( 'behavior-switch', [ new KV( 'word', $bs ) ], $dp );
		return [ $switchTk ];
	}

// Instead of defining a charset, the old parser's doDoubleUnderscore concats a
// regexp of all the language specific aliases of the behavior switches and
// then does a match and replace. Just be as permissive as possible and let the
// BehaviorSwitchPreprocessor back out of any overreach.
// One or more text_char or "-" characters, none of them starting a "__"
// sequence; returned as the matched source text.
behavior_text = $( !'__' ( text_char / "-" ) )+


/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// A bare autolink: autourl, autoref (RFC/PMID) or ISBN. Fails when the
// extlink parameter is set or when the preceding character is a word
// character.
autolink =
	! <extlink>
	// this must be a word boundary, so previous character must be non-word
	! { return Utils::isUniWord(Utils::lastUniChar( $this->input, $this->endOffset() ) ); }
	r:(
		autourl
		/ autoref
		/ isbn
	) { return $r; }

// Bracketed external link: "[" target, optional spaces, optional content, "]".
// Returns a single `extlink` token with `href`, `mw:content` and `spaces`
// attributes.
extlink "extlink" =
	! <extlink> // extlink cannot be nested
	r:(
		"["
		// p0..p1 delimit the link target, p2..p3 the link content
		p0:( "" { return $this->endOffset(); })
		flat:(
			addr:(url_protocol ipv6urladdr / "")
			target:(extlink_nonipv6url<extlink> / "")
			{ return TokenizerUtils::flattenString( [ $addr, $target ] ); }
		)
		& {
			// Protocol must be valid and there ought to be at least one
			// post-protocol character.  So strip last char off target
			// before testing protocol.
			if ( is_array( $flat ) ) {
				// There are templates present, alas.
				return count( $flat ) > 0;
			}
			return Utils::isProtocolValid( substr( $flat, 0, -1 ), $this->env );
		}
		p1:( "" { return $this->endOffset(); })
		sp:$( space / unispace )*
		p2:( "" { return $this->endOffset(); })
		content:inlineline<extlink>?
		p3:( "" { return $this->endOffset(); })
		"]" {
			$tsr1 = new SourceRange( $p0, $p1 );
			$tsr2 = new SourceRange( $p2, $p3 );
			$dp = new DataParsoid;
			$dp->tsr = $this->tsrOffsets();
			$dp->extLinkContentOffsets = $tsr2;
			return [
				new SelfclosingTagTk(
					'extlink',
					[
						new KV( 'href', $flat, $tsr1->expandTsrV() ),
						// content is optional; use '' when absent
						new KV( 'mw:content', $content ?? '', $tsr2->expandTsrV() ),
						new KV( 'spaces', $sp )
					],
					$dp
				)
			]; }
	) { return $r; }

// Magic link of the form "RFC 1234" / "PMID 1234": returns one `extlink`
// token pointing at the corresponding external URL.
autoref =
	ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word
	{
		// sprintf() formats for the link target, keyed by magic word
		$hrefFormats = [
			'RFC' => 'https://tools.ietf.org/html/rfc%s',
			'PMID' => '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
		];

		$dp = new DataParsoid();
		$dp->tsr = $this->tsrOffsets();
		$dp->stx = 'magiclink';

		$href = sprintf( $hrefFormats[ $ref ], $identifier );
		$content = TokenizerUtils::flattenString( [ $ref, $sp, $identifier ] );
		$attribs = [
			new KV( 'href', $href ),
			new KV( 'mw:content', $content, $dp->tsr->expandTsrV() ),
			new KV( 'typeof', 'mw:ExtLink/' . $ref )
		];
		return [ new SelfclosingTagTk( 'extlink', $attribs, $dp ) ];
	}

// Magic link of the form "ISBN 0-123-45678-9": returns one `extlink`
// token pointing at Special:BookSources for the normalized code.
isbn =
	'ISBN' sp:space_or_nbsp+ isbn:(
		[0-9]
		((space_or_nbsp_or_dash / "") [0-9])+
		((space_or_nbsp_or_dash / "") [xX] / "")
	)
	isbncode:(
		end_of_word
		{
			// Join the plain-string parts of the match (non-string parts
			// are dropped), then keep only digits and X, uppercased.
			$parts = TokenizerUtils::flattenStringlist( $isbn );
			$joined = implode( '', array_filter( $parts, 'is_string' ) );
			return strtoupper( preg_replace( '/[^\dX]/i', '', $joined ) );
		}
	)
	&{
		// ISBNs can only be 10 or 13 digits long (with a specific format)
		$len = strlen( $isbncode );
		if ( $len === 10 ) {
			return true;
		}
		return $len === 13 && preg_match( '/^97[89]/', $isbncode );
	}
	{
		$dp = new DataParsoid();
		$dp->stx = 'magiclink';
		$dp->tsr = $this->tsrOffsets();

		$content = TokenizerUtils::flattenString( [ 'ISBN', $sp, $isbn ] );
		$attribs = [
			new KV( 'href', 'Special:BookSources/' . $isbncode ),
			new KV( 'mw:content', $content, $dp->tsr->expandTsrV() ),
			new KV( 'typeof', 'mw:WikiLink/ISBN' )
		];
		return [ new SelfclosingTagTk( 'extlink', $attribs, $dp ) ];
	}


/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */

// Matches either "//" or a scheme (a letter, then letters, digits, '-',
// '+', '.') followed by ":" and an optional "//", and accepts it only if
// Utils::isProtocolValid() approves. Returns the matched text.
url_protocol =
	p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? )
	& { return Utils::isProtocolValid( $p, $this->env ); }
	{ return $p; }

// no punctuation, and '{<' to trigger directives
// Excluded: brackets, quotes, angle brackets, '&', '{', control characters
// (\x00-\x20, \x7f) and the listed Unicode space characters.
no_punctuation_char = [^ \]\[\r\n"'<>\x00-\x20\x7f&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

// this is the general url rule
// on the PHP side, the path part matches EXT_LINK_URL_CLASS
// which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'
// Returns protocol, optional IPv6 address and path pieces combined with
// TokenizerUtils::flattenString().
url =
	proto:url_protocol
	addr:(ipv6urladdr / "")
	path:(
		!inline_breaks c:(
			no_punctuation_char
			/ comment
			/ tplarg_or_template
			/ ['{]
			// An "&" is allowed unless it starts &lt; or &gt; (any case);
			// it is decoded as an html entity where possible, else kept
			// as a literal "&".
			/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
				r:(
					& "&" he:htmlentity { return $he; }
					/ "&"
				) { return $r; }
		) { return $c; }
	)*
	// Must be at least one character after the protocol
	& { return $addr !== '' || count( $path ) > 0; }
	{
		return TokenizerUtils::flattenString( array_merge( [ $proto, $addr ], $path ) );
	}

// this is the somewhat-restricted rule used in autolinks
// See Parser::doMagicLinks and Parser.php::makeFreeExternalLink.
// The `path` portion matches EXT_LINK_URL_CLASS, as in the general
// url rule.  As in PHP, we do some fancy fixup to yank out
// trailing punctuation, perhaps including parentheses.
// Returns a single `urllink` token whose `href` is the matched url with
// trailing punctuation removed; the removed characters are handed back to
// the input by rewinding currPos.
autourl =
	! '//' // protocol-relative autolinks not allowed (T32269)
	r:(
		proto:url_protocol
		addr:(ipv6urladdr / "")
		path:(
			!inline_breaks c:(
				no_punctuation_char
				/ comment
				/ tplarg_or_template
				/ $("'" !"'") // single quotes are ok, double quotes are bad
				/ "{"
				// An entity is allowed unless it decodes to '<', '>' or a
				// non-breaking space.
				/ ! ( rhe:raw_htmlentity &{ return $rhe === '<' || $rhe === '>' || $rhe === "\u{A0}"; } )
					r:(
						& "&" he:htmlentity { return $he; }
						/ "&"
					) { return $r; }
			) { return $c; }
		)*
		{
			// as in Parser.php::makeFreeExternalLink, we're going to
			// yank trailing punctuation out of this match.
			$url = TokenizerUtils::flattenStringlist( array_merge( [ $proto, $addr ], $path ) );
			// only need to look at last element; HTML entities are strip-proof.
			$last = PHPUtils::lastItem( $url );
			$trim = 0;
			if ( is_string( $last ) ) {
				// The set of terminating characters depends on whether the
				// path contains a '(' element.
				$strip = TokenizerUtils::getAutoUrlTerminatingChars( in_array( '(', $path, true ) );
				// Count the trailing characters of $last that are in $strip.
				$trim = strspn( strrev( $last ), $strip );
				$url[ count( $url ) - 1 ] = substr( $last, 0, strlen( $last ) - $trim );
			}
			$url = TokenizerUtils::flattenStringlist( $url );
			if ( count( $url ) === 1 && is_string( $url[0] ) && strlen( $url[0] ) <= strlen( $proto ) ) {
				return null; // ensure we haven't stripped everything: T106945
			}
			// Give the trimmed characters back to the input.
			$this->currPos -= $trim;
			return $url;
		}
	)
	// A null result from the action above means nothing was left after trimming.
	&{ return $r !== null; }
	{
		$tsr = $this->tsrOffsets();
		$dp = new DataParsoid;
		$dp->tsr = $tsr;
		$res = [ new SelfclosingTagTk( 'urllink', [ new KV( 'href', $r, $tsr->expandTsrV() ) ], $dp ) ];
		return $res;
	}

// This is extracted from EXT_LINK_ADDR in Parser.php: a simplified
// expression to match an IPv6 address.  The IPv4 address and "at least
// one character of a host name" portions are punted to the `path`
// component of the `autourl` and `url` productions
// "[" + one or more hex digits, ':' or '.' + "]"; returns the matched text.
ipv6urladdr =
	$( "[" [0-9A-Fa-f:.]+ "]" )

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 * This is only if close has > 3 braces; otherwise we just match open
 * and close as we find them.
 */
// Entry point for "{{"-initiated constructs: requires "{{" ahead, then
// delegates to tplarg_or_template_guarded with the intemplate parameter set.
tplarg_or_template =
	&'{{'
	t:tplarg_or_template_guarded<intemplate=true> {
		return $t;
	}

// Ordered choice over the brace-count cases:
tplarg_or_template_guarded =
	// "{{" followed by one or more full "{{{" groups and a tplarg at offset 2:
	// parse as a template (or broken template)
	&('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return $a; }
	// optional stray "{" (kept as text) followed by a tplarg
	/ a:$('{' &('{{{'+ !'{'))? b:tplarg { return [ $a, $b ]; }
	// optional stray "{" (kept as text) followed by a template
	/ a:$('{' &('{{' !'{'))? b:template { return [ $a, $b ]; }
	/ broken_template

// Start rule: one or more items, each a tplarg_or_template or else any
// single character; the results are flattened into one list.
tplarg_or_template_or_bust =
	r:(tplarg_or_template / .)+ { return TokenizerUtils::flattenIfArray( $r ); }

// template_preproc invoked with the preproc reference parameter set to "}}".
template =
	template_preproc<&preproc="}}">

// The old preprocessor maintains a single stack of "closing token we
// are currently looking for", with no backtracking.  This means that
// once you see `[[ {{` you are looking only for `}}` -- if that template
// turns out to be broken you will never pop the `}}` and there is no way
// to close the `[[`.  Since the PEG tokenizer in Parsoid uses backtracking
// and parses in a single pass (instead of PHP's split preprocessor/parser)
// we have to be a little more careful when we emulate this behavior.
// If we use a rule like:
//   template = "{{" tplname tplargs* "}}"?
// Then we end up having to reinterpret `tplname tplargs*` as a tlb if it
// turns out we never find the `}}`, which involves a lot of tedious gluing
// tokens back together with fingers crossed we haven't discarded any
// significant newlines/whitespace/etc.  An alternative would be a rule like:
//   broken_template = "{{" tlb
// but again, `template` is used in many different contexts; `tlb` isn't
// necessarily the right one to recursively invoke.  Instead we get the
// broken template off of the PEGjs production stack by returning immediately
// after `{{`, but we reset the "preproc" reference parameter to null (the
// reference parameter feature having been introduced for this sole purpose)
// to indicate to the parent rule that we're "still in" the {{ context and
// shouldn't ever inlineBreak for any closing tokens above this one.  For
// example:
//   [[Foo{{Bar]]
// This will match as:
//   wikilink->text,template->text             --> FAILS looking for }}
//     backtracks, popping "bracket_bracket" and "brace_brace" off preproc stack
//   wikilink->text,broken_template,text       --> FAILS looking for ]]
//     backtracks, popping "bracket_bracket" and null off preproc stack
//   broken_wikilink,text,broken_template,text --> OK
//     with [null, null] left on the preproc stack

// Matches only the two opening braces of a template and returns them as a
// string. As a side effect the `preproc` reference parameter (bound by
// reference from the calling rule) is reset to null.
broken_template =
	preproc:<&preproc>
	t:"{{" {
		// Written through the reference, so the caller observes the reset.
		$preproc = null;
		return $t;
	}

// Parses a complete template: opening braces, target, zero or more
// pipe-separated parameters and closing braces, and returns a 'template'
// SelfclosingTagTk. The second alternative accepts opening and closing
// braces separated only by spaces/newlines and returns the matched text.
template_preproc =
	"{{" leadWS:$( nl_comment_space* )
	target:template_param_value
	params:(
		nl_comment_space* "|"
		r:(
			p0:("" { return $this->endOffset(); })
			v:nl_comment_space*
			p1:("" { return $this->endOffset(); })
			&("|" / "}}")
			{
				// Empty argument: only whitespace/comments before the next
				// pipe or the closing delimiter.
				$emptyRange = new SourceRange( $p0, $p1 );
				$emptyValue = TokenizerUtils::flattenIfArray( $v );
				return new KV( '', $emptyValue, $emptyRange->expandTsrV() );
			}
			/ template_param
		) { return $r; }
	)*
	trailWS:$( nl_comment_space* )
	inline_breaks "}}"
	{
		// The target is placed in front of the parameters as the first
		// positional attribute so that it is expanded generically; the
		// TemplateHandler then needs to shift it back out.
		$targetKV = new KV(
			TokenizerUtils::flattenIfArray( $target['tokens'] ),
			'',
			$target['srcOffsets']->expandTsrK()
		);
		array_unshift( $params, $targetKV );

		$dataParsoid = new DataParsoid;
		$dataParsoid->tsr = $this->tsrOffsets();
		$dataParsoid->src = $this->text();

		// Whitespace surrounding the target is kept in the temporary data.
		$tempData = $dataParsoid->getTemp();
		$tempData->leadWS = $leadWS;
		$tempData->trailWS = $trailWS;

		return new SelfclosingTagTk( 'template', $params, $dataParsoid );
	}
	/ $('{{' space_or_newline* '}}')

// Parses a template argument by invoking tplarg_preproc with the `preproc`
// reference parameter set to "}}".
tplarg =
	tplarg_preproc<&preproc="}}">

// Parses a template argument: triple opening braces, an optional target,
// zero or more pipe-separated values and triple closing braces. Returns a
// 'templatearg' SelfclosingTagTk whose first attribute carries the target
// as key and whose remaining attributes carry the values.
tplarg_preproc =
	"{{{"
	p:("" { return $this->endOffset(); })
	target:template_param_value?
	params:(
		nl_comment_space* "|"
		r:(
			p0:("" { return $this->endOffset(); })
			v:nl_comment_space*
			p1:("" { return $this->endOffset(); })
			&("|" / "}}}")
			{
				// Empty argument: same shape as a template_param_value result.
				$emptyRange = new SourceRange( $p0, $p1 );
				return [ 'tokens' => $v, 'srcOffsets' => $emptyRange ];
			}
			/ template_param_value
		) { return $r; }
	)*
	nl_comment_space*
	inline_breaks "}}}"
	{
		// A missing target is represented by an empty string with a
		// zero-width range located right after the opening delimiter.
		$target = $target ?? [ 'tokens' => '', 'srcOffsets' => new SourceRange( $p, $p ) ];

		// The target goes first, as a positional attribute, so that it is
		// expanded generically; the TemplateHandler then needs to shift it
		// back out.
		$attribs = [
			new KV(
				TokenizerUtils::flattenIfArray( $target['tokens'] ),
				'',
				$target['srcOffsets']->expandTsrK()
			),
		];

		foreach ( $params as $param ) {
			$attribs[] = new KV(
				'',
				TokenizerUtils::flattenIfArray( $param['tokens'] ),
				$param['srcOffsets']->expandTsrV()
			);
		}

		$dataParsoid = new DataParsoid;
		$dataParsoid->tsr = $this->tsrOffsets();
		$dataParsoid->src = $this->text();

		return new SelfclosingTagTk( 'templatearg', $attribs, $dataParsoid );
	}

// Parses one template parameter and returns it as a KV:
//  - named form (name, "=", optional value): a KV carrying a KVSourceRange
//    that spans the key and the value source;
//  - positional form (no "="): empty key, the parsed text as value;
//  - empty form (next character is a pipe or a closing brace, not
//    consumed): empty key and empty value.
template_param =
	name:template_param_name
	val:(
		kEndPos:("" { return $this->endOffset(); })
		// no optionalSpaceToken here, it's eaten by template_param_name
		"="
		vStartPos:("" { return $this->endOffset(); })
		optSp:optionalSpaceToken
		tpv:template_param_value? {
			return [
				'kEndPos' => $kEndPos,
				'vStartPos' => $vStartPos,
				'value' => TokenizerUtils::flattenString( [ $optSp, $tpv['tokens'] ?? [] ] ),
			];
		}
	)? {
		if ( $val !== null ) {
			// Built before branching so that the valueless branch below
			// also has a defined source range to hand to the KV.
			$so = new KVSourceRange(
				$this->startOffset(), $val['kEndPos'],
				$val['vStartPos'], $this->endOffset()
			);
			if ( $val['value'] !== null ) {
				return new KV(
					$name,
					TokenizerUtils::flattenIfArray( $val['value'] ),
					$so
				);
			} else {
				return new KV(
					TokenizerUtils::flattenIfArray( $name ),
					'',
					$so
				);
			}
		} else {
			// Positional parameter: what was parsed as the name is the value.
			$so = new SourceRange( $this->startOffset(), $this->endOffset() );
			return new KV(
				'',
				TokenizerUtils::flattenIfArray( $name ),
				$so->expandTsrV()
			);
		}
	}
	// empty parameter
	/ & [|}] {
		$so = new SourceRange( $this->startOffset(), $this->endOffset() );
		return new KV( '', '', $so->expandTsrV() );
	}

// Either template_param_text parsed with the `equal` flag set, or, when the
// next character is "=" (checked but not consumed), the empty string.
template_param_name =
	template_param_text<equal> / (&'=' { return ''; })

// Parses template_param_text with the `equal` flag cleared and returns an
// array with the parsed tokens under 'tokens' and the source range of the
// match under 'srcOffsets'.
template_param_value =
	tpt:template_param_text<equal=false>
	{
		return [ 'tokens' => $tpt, 'srcOffsets' => $this->tsrOffsets() ];
	}

// One or more nested blocks (parsed with templateArg set and the table,
// extlink and tableCellArg flags cleared) or newline tokens. Returns a
// plain string when the flattened result is a single string, otherwise the
// flattened array.
template_param_text =
	il:(nested_block<table=false, extlink=false, templateArg=true, tableCellArg=false> / newlineToken)+
	{
		// $il is always an array here, so flattenIfArray always hands back
		// an array as well.
		$flat = TokenizerUtils::flattenIfArray( $il );
		$isLoneString = count( $flat ) === 1 && is_string( $flat[0] );
		return $isLoneString ? $flat[0] : $flat;
	}

//// Language converter block markup of language variants: -{ ... }-

// Note that "rightmost opening" precedence rule (see
// https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means
// that neither -{{ nor -{{{ are parsed as a -{ token, although
// -{{{{ is (since {{{ has precedence over {{).

// Ordered alternatives for input starting with "-":
//  1. Lookahead sees the variant opener immediately followed by a run of
//     triple open braces (not followed by another brace) and a tplarg that
//     parses: parse a lang_variant.
//  2. A "-" directly in front of such a triple-brace run, then a tplarg;
//     returns [ "-", tplarg ].
//  3. A "-" in front of two open braces plus any number of triple-brace
//     groups (not followed by another brace), then a template; returns
//     [ "-", template ].
//  4. Otherwise, if the input starts with the variant opener, a lang_variant.
lang_variant_or_tpl =
	&('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return $a; }
	/ a:$('-' &('{{{'+ !'{')) b:tplarg { return [ $a, $b ]; }
	/ a:$('-' &('{{' '{{{'* !'{')) b:template { return [ $a, $b ]; }
	/ &'-{' a:lang_variant { return $a; }

// Matches only the language-variant opening delimiter and returns it as a
// string, resetting the caller's `preproc` reference parameter to null.
broken_lang_variant =
	r:"-{"
	preproc:<&preproc>
	{
		// Written through the reference, so the caller observes the reset.
		$preproc = null;
		return $r;
	}

// A complete language-variant block (lang_variant_preproc, with `preproc`
// set to the closing delimiter and `extlink` cleared) or, failing that,
// just the opening delimiter via broken_lang_variant.
lang_variant =
	// FIXME: Maybe this should suppress "table" and "tableCellArg" like 'template_param_text' does too
	lang_variant_preproc<&preproc="}-", extlink=false>
	/ broken_lang_variant

// Parses a complete language-variant block.
//  - lv0 / lv1 record the source offsets of the start of the opening
//    delimiter and the end of the closing delimiter.
//  - f holds the parsed flags (with a computed 'raw' entry) when the
//    language converter is enabled, or just [ 'raw' => true ] otherwise.
//  - ts holds the content: a single 'text' entry when raw, otherwise the
//    result of lang_variant_option_list.
// Returns the literal delimiters around the text tokens when the converter
// is disabled, otherwise a one-element array holding a 'language-variant'
// SelfclosingTagTk.
lang_variant_preproc =
	lv0:("-{" { return $this->startOffset(); })
	f:(
		&{ return $this->env->langConverterEnabled(); }
		ff:opt_lang_variant_flags {
			// if flags contains 'R', then don't treat ; or : specially inside.
			// (The 'N' flag and a variants-only result are treated the same way.)
			if ( isset( $ff['flags'] ) ) {
				$ff['raw'] = isset( $ff['flags']['R'] ) || isset( $ff['flags']['N'] );
			} elseif ( isset( $ff['variants'] ) ) {
				$ff['raw'] = true;
			}
			return $ff;
		} /
		&{ return !$this->env->langConverterEnabled(); }
		"" {
			// if language converter not enabled, don't try to parse inside.
			return [ 'raw' => true ];
		}
	)
	ts:(
		&{ return $f['raw']; }
		lv:lang_variant_text { return [ [ 'text' => $lv ] ]; }
		/
		&{ return !$f['raw']; }
		lv:lang_variant_option_list { return $lv; }
	)
	inline_breaks
	lv1:("}-" { return $this->endOffset(); })
	{
		// With the converter disabled the content was parsed raw, so $ts
		// holds exactly one 'text' entry; emit it between the delimiters.
		if ( !$this->env->langConverterEnabled() ) {
			return [ '-{', $ts[0]['text']['tokens'], '}-' ];
		}
		$lvsrc = substr( $this->input, $lv0, $lv1 - $lv0 );
		$attribs = [];

		// $t is taken by reference because each processed field is replaced
		// in place by the name of the attribute that now carries its tokens.
		foreach ( $ts as &$t ) {
			// move token strings into KV attributes so that they are
			// properly expanded by early stages of the token pipeline
			foreach ( [ 'text', 'from', 'to' ] as $fld ) {
				if ( !isset( $t[$fld] ) ) {
					continue;
				}
				$name = 'mw:lv' . count( $attribs );
				// Note that AttributeExpander will expect the tokens array to be
				// flattened.  We do that in lang_variant_text / lang_variant_nowiki
				$attribs[] = new KV( $name, $t[$fld]['tokens'], $t[$fld]['srcOffsets']->expandTsrV() );
				$t[$fld] = $name;
			}
		}
		// Drop the dangling reference left behind by the by-reference loop.
		unset( $t );

		// Flag and variant names are the array keys; both lists are sorted.
		$flags = isset( $f['flags'] ) ? array_keys( $f['flags'] ) : [];
		sort( $flags );
		$variants = isset( $f['variants'] ) ? array_keys( $f['variants'] ) : [];
		sort( $variants );

		$dp = new DataParsoid;
		$dp->tsr = new SourceRange( $lv0, $lv1 );
		$dp->src = $lvsrc;
		$dp->flags = $flags;
		$dp->variants = $variants;
		$dp->original = $f['original'];
		$dp->flagSp = $f['sp'];
		$dp->texts = $ts;

		return [
			new SelfclosingTagTk(
				'language-variant',
				$attribs,
				$dp
			)
		];
	}

// Parses an optional flag list terminated by a pipe and normalizes it.
// Returns either
//   [ 'flags' => map, 'original' => flags in source order, 'sp' => spaces ]
// or, when only variant names were given and no earlier flag rule applied,
//   [ 'variants' => map, 'original' => variants in source order, 'sp' => spaces ].
// When no flag list is present, $f is null and the result carries only the
// default '$S' flag.
opt_lang_variant_flags =
	f:( ff:lang_variant_flags "|" { return $ff; } )?
	{
		// Collect & separate flags and variants into a hashtable (by key) and ordered list
		$flags = [];
		$variants = [];
		$flagList = [];
		$flagSpace = [];
		$variantList = [];
		$variantSpace = [];
		$useVariants = false;
		if ( $f !== null ) {
			// lang_variant_flags returns arrays in reverse order.
			// Both arrays are therefore walked from the end; $spPtr tracks
			// the next unread entry of the space list.
			$spPtr = count( $f['sp'] ) - 1;
			for ( $i = count( $f['flags'] ) - 1; $i >= 0; $i--) {
				$item = $f['flags'][$i];
				// Each recognized item consumes two space entries: the one
				// before it and the one after it.
				if ( isset( $item['flag'] ) ) {
					$flagSpace[] = $f['sp'][$spPtr--];
					$flags[$item['flag']] = true;
					$flagList[] = $item['flag'];
					$flagSpace[] = $f['sp'][$spPtr--];
				}
				if ( isset( $item['variant'] ) ) {
					$variantSpace[] = $f['sp'][$spPtr--];
					$variants[$item['variant']] = true;
					$variantList[] = $item['variant'];
					$variantSpace[] = $f['sp'][$spPtr--];
				}
			}
			if ( $spPtr >= 0 ) {
				// handle space after a trailing semicolon
				$flagSpace[] = $f['sp'][$spPtr];
				$variantSpace[] = $f['sp'][$spPtr];
			}
		}
		// Parse flags (this logic is from core/languages/ConverterRule.php
		// in the parseFlags() function)
		// The branches are order-sensitive: the first matching rule wins.
		if ( count( $flags ) === 0 && count( $variants ) === 0 ) {
			$flags['$S'] = true;
		} elseif ( isset( $flags['R'] ) ) {
			$flags = [ 'R' => true ]; // remove other flags
		} elseif ( isset( $flags['N'] ) ) {
			$flags = [ 'N' => true ]; // remove other flags
		} elseif ( isset( $flags['-'] ) ) {
			$flags = [ '-' => true ]; // remove other flags
		} elseif ( isset( $flags['T'] ) && count( $flags ) === 1 ) {
			$flags['H'] = true;
		} elseif ( isset( $flags['H'] ) ) {
			// Replace A flag, and remove other flags except T and D
			$nf = [ '$+' => true, 'H' => true ];
			if ( isset( $flags['T'] ) ) { $nf['T'] = true; }
			if ( isset( $flags['D'] ) ) { $nf['D'] = true; }
			$flags = $nf;
		} elseif ( count( $variants ) > 0 ) {
			$useVariants = true;
		} else {
			if ( isset( $flags['A'] ) ) {
				$flags['$+'] = true;
				$flags['$S'] = true;
			}
			if ( isset( $flags['D'] ) ) {
				unset( $flags['$S'] );
			}
		}
		if ( $useVariants ) {
			return [ 'variants' => $variants, 'original' => $variantList, 'sp' => $variantSpace ];
		} else {
			return [ 'flags' => $flags, 'original' => $flagList, 'sp' => $flagSpace ];
		}
	}

// Right-recursive, semicolon-separated list of flags. Returns
// [ 'sp' => spaces, 'flags' => items ], both in REVERSE source order; for
// every flag the space after it is stored before the space in front of it.
// The second alternative matches only whitespace and yields a single space
// entry with no flags.
lang_variant_flags =
	sp1:$(space_or_newline*) f:lang_variant_flag sp2:$(space_or_newline*)
	more:( ";" lang_variant_flags? )?
	{
		// Start from the result of the recursive tail when there is one.
		$acc = [ 'sp' => [], 'flags' => [] ];
		if ( $more && $more[1] ) {
			$acc = $more[1];
		}
		// Appending (rather than prepending) is what produces the reversed
		// order that callers have to account for.
		array_push( $acc['sp'], $sp2, $sp1 );
		$acc['flags'][] = $f;
		return $acc;
	}
	/ sp:$(space_or_newline*) {
		return [ 'sp' => [ $sp ], 'flags' => [] ];
	}

// A single flag item, tagged by kind:
//  - 'flag': one character out of "-", "+" or an uppercase ASCII letter;
//  - 'variant': a lang_variant_name;
//  - 'bogus': a run of characters that are not whitespace, not the start
//    of a nowiki, and none of the two braces, the pipe or the semicolon.
lang_variant_flag =
	f:[-+A-Z]           { return [ 'flag' => $f ]; }
	/ v:lang_variant_name { return [ 'variant' => $v ]; }
	/ b:$(!space_or_newline !nowiki [^{}|;])+ { return [ 'bogus' => $b ]; /* bad flag */}

// language variant name, like zh, zh-cn, etc.
lang_variant_name =
	// Plain form, returned as matched text: a lowercase ASCII letter followed
	// by one or more ASCII letters or hyphens.
	$([a-z] [-a-zA-Z]+)
	// Escaped otherwise-unrepresentable language names
	// Primarily for supporting html2html round trips; PHP doesn't support
	// using nowikis here (yet!)
	/ nowiki_text

// A semicolon-separated list of lang_variant_option entries, optionally
// followed by semicolon-separated bogus trailing options. Returns the list
// of options; if the last bogus option consists only of whitespace, a
// [ 'semi' => true, 'sp' => whitespace ] marker is appended and all other
// bogus content is discarded. Falls back to a single 'text' entry parsed by
// lang_variant_text when no option list matches.
lang_variant_option_list =
	o:lang_variant_option rest:( ";" oo:lang_variant_option { return $oo; })*
	tr:( ";" $bogus_lang_variant_option )* // optional trailing crap
	{
		$options = array_merge( [ $o ], $rest );
		// Each $tr entry is a pair: the semicolon and the bogus text.
		$trailing = count( $tr ) > 0 ? $tr[count( $tr ) - 1][1] : null;
		if ( $trailing !== null && preg_match( '/^\s*$/Du', $trailing ) ) {
			// Only whitespace after the final semicolon: keep it.
			$options[] = [ 'semi' => true, 'sp' => $trailing ];
		}
		return $options;
	}
	/ lvtext:lang_variant_text { return [ [ 'text' => $lvtext ] ]; }

// Trailing content after an option list: an optional lang_variant_text.
bogus_lang_variant_option =
	lang_variant_text?

// One conversion option, in one of two forms:
//  - two-way: variant name, ":", text. Returns 'twoway', 'lang', 'text' and
//    the three whitespace runs under 'sp'.
//  - one-way: source text, "=>", variant name, ":", target text. Returns
//    'oneway', 'from', 'lang', 'to' and the four whitespace runs under 'sp'.
// The texts may be given as a nowiki (lang_variant_nowiki) or as plain
// variant text that stops at a semicolon (and, for the source text of the
// one-way form, at the arrow).
lang_variant_option =
	sp1:$(space_or_newline*) lang:lang_variant_name
	sp2:$(space_or_newline*) ":"
	sp3:$(space_or_newline*)
	lvtext:(lang_variant_nowiki / lang_variant_text_no_semi)
	{
		return [
			'twoway' => true,
			'lang' => $lang,
			'text' => $lvtext,
			'sp' => [ $sp1, $sp2, $sp3 ]
		];
	}
	/ sp1:$(space_or_newline*)
	from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow)
	"=>"
	sp2:$(space_or_newline*) lang:lang_variant_name
	sp3:$(space_or_newline*) ":"
	sp4:$(space_or_newline*)
	to:(lang_variant_nowiki / lang_variant_text_no_semi)
	{
		return [
			'oneway' => true,
			'from' => $from,
			'lang' => $lang,
			'to' => $to,
			'sp' => [ $sp1, $sp2, $sp3, $sp4 ]
		];
	}

// html2wt support: If a language name or conversion string can't be
// represented w/o breaking wikitext, just wrap it in a <nowiki>.
// PHP doesn't support this (yet), but Parsoid does.
// A nowiki-protected string followed by optional whitespace. Returns the
// same tokens/srcOffsets shape as lang_variant_text, with the decoded
// nowiki text as the only token.
lang_variant_nowiki =
	n:nowiki_text
	sp:$space_or_newline* {
		// The trailing whitespace is consumed by this rule but left out of
		// the reported source range.
		$range = $this->tsrOffsets();
		$range->end = $range->end - strlen( $sp );
		return [ 'tokens' => [ $n ], 'srcOffsets' => $range ];
	}

// Zero or more inlineline matches or literal pipes. Returns the tokens
// flattened with TokenizerUtils::flattenStringlist under 'tokens' and the
// source range of the match under 'srcOffsets'.
lang_variant_text =
	tokens:(inlineline / "|" )*
	{
		return [
			'tokens' => TokenizerUtils::flattenStringlist( $tokens ),
			'srcOffsets' => $this->tsrOffsets(),
		];
	}

// lang_variant_text parsed with the `semicolon` flag set.
lang_variant_text_no_semi =
	lang_variant_text<semicolon>

// lang_variant_text_no_semi parsed with the `arrow` flag set as well.
lang_variant_text_no_semi_or_arrow =
	lang_variant_text_no_semi<arrow>

// Zero or more pipe-introduced segments of a wikilink. Each segment yields
// a 'mw:maybeContent' KV whose value is the parsed link text (an empty
// array when absent) and whose vsrc is the raw source after the pipe.
wikilink_content =
	(
		pipe
		startPos:("" { return $this->endOffset(); })
		lt:link_text? {
			$endPos = $this->endOffset();
			$contentRange = new SourceRange( $startPos, $endPos );
			$contentKV = new KV( 'mw:maybeContent', $lt ?? [], $contentRange->expandTsrV() );
			$contentKV->vsrc = substr( $this->input, $startPos, $endPos - $startPos );
			return $contentKV;
		}
	)*

// A complete wikilink (wikilink_preproc with `preproc` set to the closing
// "]]") or, failing that, a broken_wikilink.
wikilink =
	wikilink_preproc<&preproc="]]">
	/ broken_wikilink

// `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the
// second bracket could start an extlink.  Set preproc to null as a reference
// parameter in the parent since we haven't seen a double-close bracket.
// (See full explanation above broken_template production.)
// Requires the input to start with "[[" (lookahead only), resets the
// caller's `preproc` reference parameter to null, then consumes the first
// "[" followed by either an extlink or a second literal "[". Returns the
// pair as an array.
broken_wikilink =
	&"[["
	preproc:<&preproc>
	// Predicate used purely for its side effect; it always succeeds.
	&{ $preproc =  null; return true; }
	a:("[" (extlink / "[")) {
		return $a;
	}

// Parses a complete wikilink: "[[", optional target, pipe-separated content
// segments and "]]". Returns a one-element array holding a 'wikilink'
// SelfclosingTagTk, except when there is no target or the link has a
// single empty content segment; in those cases the pieces are returned as
// plain text tokens instead.
wikilink_preproc =
	"[["
	spos:("" { return $this->endOffset(); })
	target:wikilink_preprocessor_text?
	tpos:("" { return $this->endOffset(); })
	lcs:wikilink_content
	inline_breaks "]]"
	{
		// Exactly one content segment, and that segment is empty.
		$isPipeTrick = count( $lcs ) === 1 && count( $lcs[0]->v ) === 0;

		if ( $target === null || $isPipeTrick ) {
			// Re-emit the source pieces as text.
			$pieces = [ '[[' ];
			if ( $target ) {
				$pieces[] = $target;
			}
			foreach ( $lcs as $contentKV ) {
				// each entry is a mw:maybeContent attribute
				$pieces[] = '|';
				if ( count( $contentKV->v ) > 0 ) {
					$pieces[] = $contentKV->v;
				}
			}
			$pieces[] = ']]';
			return $pieces;
		}

		$targetRange = new SourceRange( $spos, $tpos );
		$hrefKV = new KV( 'href', $target, $targetRange->expandTsrV() );
		$hrefKV->vsrc = $targetRange->substr( $this->input );

		$dataParsoid = new DataParsoid;
		$dataParsoid->tsr = $this->tsrOffsets();
		$dataParsoid->src = $this->text();

		// XXX: Point to object with path, revision and input information
		$linkTk = new SelfclosingTagTk( 'wikilink' );
		// The href comes first, followed by the content attributes.
		$linkTk->attribs = array_merge( $linkTk->attribs, [ $hrefKV ], $lcs );
		$linkTk->dataParsoid = $dataParsoid;
		return [ $linkTk ];
	}

// Tables are allowed inside image captions.
// Suppress the equal flag temporarily in this rule to consume the '=' here.
// Delegates to link_text_parameterized with `equal` cleared and `linkdesc` set.
link_text = link_text_parameterized<equal = false, linkdesc = true>

// One or more link-text pieces, returned flattened through
// TokenizerUtils::flattenStringlist. Each piece is, in order of preference:
// a start-of-line heading, hr or full table; urltext; or, provided
// inline_breaks does not match here, an inline element, a single-bracketed
// run of text characters, or any single character.
link_text_parameterized =
	c:(
		// This group is similar to "block_line" but "list_item"
		// is omitted since `doBlockLevels` happens after
		// `handleInternalLinks2`, where newlines are stripped.
		(sol (heading / hr / full_table_in_link_caption))
		/ urltext
		/ (
			!inline_breaks
			r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return $r; }
		)
	)+ {
		return TokenizerUtils::flattenStringlist( $c );
	}

/* Generic quote rule for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
 *
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free. */
// Matches a run of two or more apostrophes and returns an array holding an
// optional string of leading plain apostrophes followed by an 'mw-quote'
// SelfclosingTagTk carrying the remaining apostrophes as its 'value'.
quote =
	quotes:$("''" "'"*) {
		$len = strlen( $quotes );
		// Runs of exactly four, or of more than five, apostrophes begin
		// with plain-text apostrophes; the trailing three (respectively
		// five) are the actual markup.
		if ( $len === 4 ) {
			$plainCount = 1;
		} elseif ( $len > 5 ) {
			$plainCount = $len - 5;
		} else {
			$plainCount = 0;
		}

		// The token's source range starts after the plain apostrophes.
		$markupRange = $this->tsrOffsets();
		$markupRange->start += $plainCount;
		$dataParsoid = new DataParsoid;
		$dataParsoid->tsr = $markupRange;

		// The mw-quote token is consumed later on by token transforms.
		$quoteTk = new SelfclosingTagTk(
			'mw-quote',
			[ new KV( 'value', substr( $quotes, $plainCount ) ) ],
			$dataParsoid
		);
		if ( $len > 2 ) {
			// Record whether the one or two characters in front of the
			// markup are spaces.
			$pos = $markupRange->start;
			$quoteTk->addAttribute( 'isSpace_1', $pos > 0 && substr( $this->input, $pos - 1, 1 ) === ' ' );
			$quoteTk->addAttribute( 'isSpace_2', $pos > 1 && substr( $this->input, $pos - 2, 1 ) === ' ' );
		}

		return $plainCount > 0
			? [ substr( $quotes, 0, $plainCount ), $quoteTk ]
			: [ $quoteTk ];
	}


/***********************************************************
 * Xmlish tags
 ***********************************************************/

// FIXME: Temporary (?) hack to let us not horribly break on old tvar syntax
// In coordination with language team, get rid of this hack once all old uses
// are migrated to new syntax (T274881).
// Matches a bare "/>" (the "<" in front of it has already been consumed by
// the caller) when annotations are enabled and 'tvar' is an annotation tag.
// Returns a one-element array with a tvar end-annotation meta token, or an
// empty array when the pipeline's 'inTemplate' option is set.
tvar_old_syntax_closing_HACK =
	"/>"
	& { return $this->env->hasAnnotations && $this->siteConfig->isAnnotationTag( 'tvar' ); }
	{
		$attrs = [ new KV( 'typeof', 'mw:Annotation/tvar/End' ) ];
		$dataParsoid = new DataParsoid();
		$dataParsoid->tsr = $this->tsrOffsets();
		// Widen the range by one to cover the "<" matched by the caller.
		$dataParsoid->tsr->start -= 1;

		if ( !empty( $this->pipelineOpts['inTemplate'] ) ) {
			// suppress meta tags from pipeline output
			return [];
		}
		return [ new SelfclosingTagTk( 'meta', $attrs, $dataParsoid ) ];
	}

// Matches an extension_annotation_tag but accepts it only when the result
// is a Token whose name is not 'extension'; returns that token.
annotation_tag =
	annToken:extension_annotation_tag
	&{
		return ( $annToken instanceof Token && $annToken->getName() !== 'extension' );
	}
	{ return $annToken; }

// Matches "<" followed by an extension or annotation tag. The rule fails
// outright when the `annOrExtTag` flag is already set. The xmlish_tag
// result is accepted when it is empty, or when its first token is named
// 'extension', or is a 'meta' whose typeof matches
// WTUtils::ANNOTATION_META_TYPE_REGEXP. Returns that first token (or the
// empty string for an empty result); otherwise falls back to the old tvar
// closing syntax.
extension_annotation_tag =
	!<annOrExtTag>
	"<" tag:(
		extToken:xmlish_tag<annOrExtTag>
		// Account for `maybeAnnotationOrExtensionTag` returning unmatched start / end tags
		&{ return !$extToken || $extToken[0]->getName() === 'extension' ||
			($extToken[0]->getName() === 'meta' && preg_match( WTUtils::ANNOTATION_META_TYPE_REGEXP, $extToken[0]->getAttributeV( 'typeof' ) ?? '' ) > 0); }
			{ return !$extToken ? '' : $extToken[0]; }
	/ tvar_old_syntax_closing_HACK ) { return $tag; }

// Matches an extension_annotation_tag, but only where the input starts
// with "<", an optional "/" and the case-insensitive word "nowiki".
nowiki =
	& ("<" "/"? "nowiki"i )
	extToken:extension_annotation_tag
	{ return $extToken; }

// Used by lang_variant productions to protect special language names or
// conversion strings.
// Returns the body of a nowiki tag with wikitext entities decoded.
nowiki_text =
	extToken:nowiki
	{
		// Extract the raw body first, then decode the entities in it.
		return Utils::decodeWtEntities( Utils::extractExtBody( $extToken ) );
	}

/* Generic XML-like tags
 *
 * These also cover extensions (including Cite), which will hook into the
 * token stream for further processing. The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added
 * to allow reconstructing the unparsed text from the input. */

// See http://www.w3.org/TR/html5/syntax.html#tag-open-state and the following
// paragraphs.  Note that we don't enforce ascii alpha for the first character
// here because we need to be more permissive for extension tag names.  That
// happens in xmlish_tag below.
// Returns the matched text: one or more characters other than tab, newline,
// vertical tab, space, "/", ">" and NUL.
tag_name = $[^\t\n\v />\0]+

// This rule is used in carefully crafted places of xmlish tag tokenizing with
// the inclusion of solidus to match where the spec would ignore those
// characters.  In particular, it does not belong in between attribute name
// and value.
// Either space_or_newline, or a "/" that is not immediately followed by
// ">" (so the solidus of a self-closing tag end is never consumed here).
space_or_newline_or_solidus = space_or_newline / (s:"/" !">" { return $s; })

// Parses the remainder of an XML-like tag; the caller must already have
// consumed the opening "<" (asserted by the first predicate). Accepts an
// optional "/" (end tag), a tag name, attributes, an optional self-closing
// "/" and ">". Always returns an array of tokens: the result of
// maybeAnnotationOrExtensionTag, wrapped in an array if it is not one
// already.
xmlish_tag =
	& {
		$this->assert(
			$this->input[$this->currPos - 1] === '<',
			'Failed to open xmlish_tag before entering.'
		);
		return true;
	}
	end:"/"?
	name: tag_name
	annOrExtTag: <annOrExtTag>
	& {
		// With the annOrExtTag flag set only annotation/extension tag names
		// are accepted; without it, only names recognized by isXMLTag.
		if ( $annOrExtTag ) {
			return WTUtils::isAnnOrExtTag( $this->env, $name );
		} else {
			// Only enforce ascii alpha first char for non-extension tags.
			// See tag_name above for the details.
			return preg_match( '/^[A-Za-z]/', $name ) && $this->isXMLTag( $name );
		}
	}
	// By the time we get to `doTableStuff` in the old parser, we've already
	// safely encoded element attributes. See 55313f4e in core.
	attribs:generic_newline_attributes<table=false, tableCellArg=false>
	space_or_newline_or_solidus* // No need to preserve this -- canonicalize on RT via dirty diff
	selfclose:"/"?
	space* // not preserved - canonicalized on RT via dirty diff
	">"
	{
		$lcName = mb_strtolower( $name );

		// Extension tags don't necessarily have the same semantics as html tags,
		// so don't treat them as void elements.
		$isVoidElt = Utils::isVoidElement( $lcName ) && !$annOrExtTag;

		// Support </br>
		// (the end-tag marker is dropped, so it is built like a start tag)
		if ( $lcName === 'br' && $end ) {
			$end = null;
		}

		$tsr = $this->tsrOffsets();
		$tsr->start--; // For "<" matched at the start of xmlish_tag rule
		// Void elements are built as self-closing even without a "/".
		$res = TokenizerUtils::buildXMLTag( $name, $lcName, $attribs, $end, !!$selfclose || $isVoidElt, $tsr );

		// change up data-attribs in one scenario
		// void-elts that aren't self-closed ==> useful for accurate RT-ing
		if ( !$selfclose && $isVoidElt ) {
			unset( $res->dataParsoid->selfClose );
			$res->dataParsoid->noClose = true;
		}

		$met = $this->maybeAnnotationOrExtensionTag( $res, $end, $attribs, $tsr );
		return is_array( $met ) ? $met : [ $met ];
	}

// A generic attribute that can span multiple lines.
// Parses one attribute (name plus optional "=" and value), allowing
// whitespace, newlines and stray solidi in front of it. Returns a KV with
// source offsets; vsrc is set when a value is present and ksrc when the
// name is an array.
generic_newline_attribute =
	space_or_newline_or_solidus*
	namePos0:("" { return $this->endOffset(); })
	name:generic_attribute_name
	namePos1:("" { return $this->endOffset(); })
	vd:(space_or_newline* "=" v:generic_att_value? { return $v; })?
{
	// NB: Keep in sync with table_attribute
	$nameRange = new SourceRange( $namePos0, $namePos1 );

	// Encapsulate protected attributes.
	if ( is_string( $name ) ) {
		$name = TokenizerUtils::protectAttrs( $name );
	}

	if ( $vd === null ) {
		// No value was parsed: empty value, key offsets only.
		$attr = new KV( $name, '', $nameRange->expandTsrK() );
	} else {
		$valueRange = $vd['srcOffsets'];
		$attr = new KV( $name, $vd['value'], $nameRange->join( $valueRange ) );
		$attr->vsrc = $valueRange->substr( $this->input );
	}

	// For array names, also record the original key source.
	if ( is_array( $name ) ) {
		$attr->ksrc = $nameRange->substr( $this->input );
	}
	return $attr;
}

// A single-line attribute.
// Single-line counterpart of generic_newline_attribute: parses one
// attribute (name plus optional "=" and value) with only optional space
// tokens around it. Returns a KV with source offsets; vsrc is set when a
// value is present and ksrc when the name is an array.
table_attribute =
	s:optionalSpaceToken
	namePos0:("" { return $this->endOffset(); })
	name:table_attribute_name
	namePos1:("" { return $this->endOffset(); })
	vd:(optionalSpaceToken "=" v:table_att_value? { return $v; })?
{
	// NB: Keep in sync with generic_newline_attribute
	$nameRange = new SourceRange( $namePos0, $namePos1 );

	// Encapsulate protected attributes.
	if ( is_string( $name ) ) {
		$name = TokenizerUtils::protectAttrs( $name );
	}

	if ( $vd === null ) {
		// No value was parsed: empty value, key offsets only.
		$attr = new KV( $name, '', $nameRange->expandTsrK() );
	} else {
		$valueRange = $vd['srcOffsets'];
		$attr = new KV( $name, $vd['value'], $nameRange->join( $valueRange ) );
		$attr->vsrc = $valueRange->substr( $this->input );
	}

	// For array names, also record the original key source.
	if ( is_array( $name ) ) {
		$attr->ksrc = $nameRange->substr( $this->input );
	}
	return $attr;
}

// The old parser's Sanitizer::removeHTMLtags explodes on < so that it can't
// be found anywhere in xmlish tags.  This is a divergence from html5 tokenizing
// which happily permits it in attribute positions.  Extension tags being the
// exception, since they're stripped beforehand.
// Matches a literal "<", but only while the `annOrExtTag` flag is set;
// returns the matched text.
less_than =
	$(
		&<annOrExtTag>
		"<"
	)

// The arrangement of chars is to emphasize the split between what's disallowed
// by html5 and what's necessary to give directive a chance.
// See: http://www.w3.org/TR/html5/syntax.html#attributes-0
// Parses an attribute name: an optional leading quote or "=" character,
// then any number of plain-character runs, directives, "<" (see less_than)
// or other single characters that are neither whitespace nor one of NUL,
// "/", "=", ">", "<". Fails when nothing at all was matched. Returns the
// pieces flattened through TokenizerUtils::flattenString.
generic_attribute_name =
	q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
	r:(
		$[^ \t\r\n\0/=><&{}\-!|]+
		/ !inline_breaks
		// \0/=> is the html5 attribute name set we do not want.
		t:(
			directive
			/ less_than
			/ $( !( space_or_newline / [\0/=><] ) . )
		) { return $t; }
	)*
	& { return count( $r ) > 0 || $q !== ''; }
	{
		// Put the optional leading character back in front of the name.
		array_unshift( $r, $q );
		return TokenizerUtils::flattenString( $r );
	}

// Also accept these chars in a wikitext table or tr attribute name position.
// They are normally not matched by the table_attribute_name.
// Returns a KV with the matched character (NUL, "/", "=" or ">") as key and
// an empty value.
broken_table_attribute_name_char = c:[\0/=>] { return new KV( $c, '' ); }

// Same as generic_attribute_name, except for accepting tags and wikilinks.
// (That doesn't make sense (ie. match the old parser) in the generic case.)
// We also give a chance to break on \[ (see T2553).
// Parses a table attribute name: an optional leading quote or "="
// character, then any number of plain-character runs (which also stop at
// "["), wikilinks (as source text), directives, inline xmlish tags with
// their trailing inline content, or other single characters that are
// neither whitespace nor one of NUL, "/", "=", ">". Fails when nothing at
// all was matched. Returns the pieces flattened through
// TokenizerUtils::flattenString.
table_attribute_name =
	q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive
	r:(
		$[^ \t\r\n\0/=><&{}\-!|\[]+
		/ !inline_breaks
		// \0/=> is the html5 attribute name set we do not want.
		t:(
			$wikilink
			/ directive
			// Accept tags-inside-attributes as attribute names.
			// The sanitizer will strip and shadow them for roundtripping.
			// Example: <hiddentext>generated with.. </hiddentext>
			/ x:inline_xmlish_tag ill:inlineline? { return array_merge( $x, $ill ?: [] ); }
			/ $( !( space_or_newline / [\0/=>] ) . )
		) { return $t; }
	)*
	& { return count( $r ) > 0 || $q !== ''; }
	{
		// Put the optional leading character back in front of the name.
		array_unshift( $r, $q );
		return TokenizerUtils::flattenString( $r );
	}

// Attribute value, quoted variants can span multiple lines.
// Missing end quote: accept /> look-ahead as heuristic.
// These need to be kept in sync with the attribute_preprocessor_text_*
// Three alternatives: single-quoted, double-quoted and unquoted. In every
// case the result comes from TokenizerUtils::getAttrVal, called with the
// parsed value and the start/end offsets of the value itself, i.e.
// excluding leading whitespace, the opening quote and any closing quote.
// $q is empty when the closing quote is missing and only the lookahead for
// an optional "/" followed by ">" matched.
generic_att_value =
	s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) );
	}
	/ s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) );
	}
	/ s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() );
	}

// Attribute value, restricted to a single line.
// Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic.
// These need to be kept in sync with the table_attribute_preprocessor_text_*
// Three alternatives: single-quoted, double-quoted and unquoted. In every
// case the result comes from TokenizerUtils::getAttrVal, called with the
// parsed value and the start/end offsets of the value itself, i.e.
// excluding leading spaces, the opening quote and any closing quote.
// $q is empty when the closing quote is missing and only the lookahead for
// "!!", a pipe, CR or LF matched.
table_att_value =
	s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) );
	}
	/ s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() - strlen( $q ) );
	}
	/ s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') {
		return TokenizerUtils::getAttrVal( $t, $this->startOffset() + strlen( $s ), $this->endOffset() );
	}

/*********************************************************
 *   Lists
 *********************************************************/
// Tried in this order: definition-list items (dtdd), colon-indented tables
// (hacky_dl_uses), then ordinary list items (li).
list_item = dtdd / hacky_dl_uses / li

// An ordinary list item: one or more list characters followed by optional
// inline content, up to end of line/file or an inline break. Returns a
// 'listItem' TagTk followed by the content tokens.
li =
	bullets:list_char+
	c:inlineline?
	// The inline_break is to check if we've hit a template end delimiter.
	&(eolf / inline_breaks)
	{
		// The bullets stay an array because the list handler expects that.
		// The range begins at the rule start (presumably zero-width until
		// extended) and is widened to cover the bullet characters.
		$bulletRange = $this->tsrOffsets( 'start' );
		$bulletRange->end += count( $bullets );

		$dataParsoid = new DataParsoid;
		$dataParsoid->tsr = $bulletRange;

		$bulletsKV = new KV( 'bullets', $bullets, $bulletRange->expandTsrV() );
		$tokens = [ new TagTk( 'listItem', [ $bulletsKV ], $dataParsoid ) ];
		if ( $c ) {
			$tokens = array_merge( $tokens, $c );
		}
		return $tokens;
	}

/*
 * This rule is required to support wikitext of this form
 *   ::{|border="1"|foo|bar|baz|}
 * where the leading colons are used to indent the entire table.
 * This hack was added back in 2006 in commit
 * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl
 * Fürstenberg.
 */
// One or more colons followed by one or more table lines and optional
// trailing inline content, up to end of line/file. Returns a flattened
// list: a 'listItem' TagTk for the colons, the table tokens and the
// trailing content.
hacky_dl_uses =
	bullets:":"+
	tbl:(table_line (sol+ table_line)*)
	line:inlineline?
	&((space / comment)* eolf)
{
	// The bullets stay an array because the list handler expects that.
	$bulletRange = $this->tsrOffsets( 'start' );
	$bulletRange->end += count( $bullets );

	$dataParsoid = new DataParsoid;
	$dataParsoid->tsr = $bulletRange;

	$bulletsKV = new KV( 'bullets', $bullets, $bulletRange->expandTsrV() );
	$listItem = new TagTk( 'listItem', [ $bulletsKV ], $dataParsoid );
	$trailing = $line ?: [];
	return TokenizerUtils::flattenIfArray( [ $listItem, $tbl, $trailing ] );
}

// Optional inline content that stops at a colon, followed by the colon
// itself. Returns [ content or null, offset just past the colon ].
dtdd_colon = c:inlineline_break_on_colon? cpos:(":" { return $this->endOffset(); })
{
	return [ $c, $cpos ];
}

// A definition-list line: optional leading list characters, ";", any
// number of colon-terminated segments and optional remaining content, up
// to end of line/file. Returns a 'listItem' TagTk for the ";" item, then
// for each segment its content followed by a 'listItem' TagTk for the ":"
// item (marked with stx 'row'), then the remaining content tokens.
dtdd =
	bullets:(!(";" !list_char) lc:list_char { return $lc; })*
	";"
	colons:dtdd_colon*
	d:inlineline?
	&eolf {
		// Bullets stay arrays because the list handler expects that.
		$termBullets = array_merge( $bullets, [ ';' ] );
		$defBullets = array_merge( $bullets, [ ':' ] );

		// TSR: the leading bullets plus one for the ";".
		$termRange = $this->tsrOffsets( 'start' );
		$termRange->end += count( $bullets ) + 1;
		$termDp = new DataParsoid;
		$termDp->tsr = $termRange;

		$tokens = [
			new TagTk( 'listItem', [ new KV( 'bullets', $termBullets, $termRange->expandTsrV() ) ], $termDp ),
		];

		foreach ( $colons as $segment ) {
			$content = $segment[0];
			$colonEnd = $segment[1];
			// The content is null when nothing precedes the colon.
			if ( $content ) {
				$tokens[] = $content;
			}
			// TSR: covers just the one-character ":" ending at $colonEnd.
			$defRange = new SourceRange( $colonEnd - 1, $colonEnd );
			$defDp = new DataParsoid;
			$defDp->tsr = $defRange;
			$defDp->stx = 'row';
			$tokens[] = new TagTk( 'listItem', [ new KV( 'bullets', $defBullets, $defRange->expandTsrV() ) ], $defDp );
		}

		return $d ? array_merge( $tokens, $d ) : $tokens;
	}

// A single list marker character: "*", "#", ":" or ";".
list_char = [*#:;]

// inlineline parsed with the `colon` flag set.
inlineline_break_on_colon =
	inlineline<colon>

/******************************************************************************
 * Tables
 * ------
 * Table rules are geared to support independent parsing of fragments in
 * templates (the common table start / row / table end use case). The tokens
 * produced by these fragments then match up to a table while building the
 * DOM tree. For similar reasons, table rows do not emit explicit end tag
 * tokens.
 *
 * The separate table_line rule is faster than moving those rules
 * directly to block_lines.
 *
 * Notes about the full_table_in_link_caption rule
 * -----------------------------------------------------
 * However, for link-tables, we have introduced a stricter parse wherein
 * we require table-start and table-end tags to not come from a template.
 * In addition, this new rule doesn't accept fosterable-content in
 * the table unlike the more lax (sol table_line)+ rule.
 *
 * This is the best we can do at this time since we cannot distinguish
 * between table rows and image options entirely in the tokenizer.
 *
 * Consider the following examples:
 *
 * Example 1:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{This-template-returns-a-table-start-tag}}
 * |foo
 * {{This-template-returns-a-table-end-tag}}
 * ]]
 *
 * Example 2:
 *
 * [[Image:Foo.jpg|left|30px|Example 1
 * {{1x|a}}
 * |foo
 * {{1x|b}}
 * ]]
 *
 * So, we cannot know a priori (without preprocessing or fully expanding
 * all templates) if "|foo" in the two examples is a table cell or an image
 * option. This is a limitation of our tokenizer-based approach compared to
 * the preprocessing-based approach of the old parser.
 *
 * Given this limitation, we are okay forcing a full-table context in
 * link captions (if necessary, we can relax the fosterable-content requirement
 * but that is broken wikitext anyway, so we can force that edge-case wikitext
 * to get fixed by rejecting it).
 ******************************************************************************/

// Parses a complete table inside a link caption, provided inline_breaks
// does not match at this position. Delegates to the parameterized rule
// with `linkdesc` cleared, `table` set and `tableDataBlock` cleared, and
// returns its result unchanged.
full_table_in_link_caption =
	!inline_breaks
	// Note that "linkdesc" is suppressed here to provide a nested parsing
	// context in which to parse the table.  Otherwise, we may break
	// on pipes in the `table_start_tag` and `table_row_tag` attributes.
	// However, as a result, this can be more permissive than the old
	// implementation (legacy parser?), but likelier to match the users intent.
	// Suppress the recursion protection from tableDataBlock since we're trying
	// to parse a full table and if the link is itself nested in a table this
	// will always stop.  Hopefully, this won't result in any overflows.
	r: full_table_in_link_caption_parameterized<linkdesc=false, table, tableDataBlock=false> {
		return $r;
	}

// A table start tag followed by one or more groups, each made of any number
// of start-of-line content lines (or templates / template arguments) and
// closed by a start-of-line table end tag.
full_table_in_link_caption_parameterized =
	table_start_tag
	// Accept multiple end tags since a nested table may have been
	// opened in the table content line.
	(
		(sol+ (table_content_line / tplarg_or_template))*
		sol+ table_end_tag
	)+

// This rule assumes start-of-line position!
// Matches a single table line: a start tag, a content line (parsed with the
// `table` parameter set), or an end tag, tried in that order.
table_line =
	// Proceed when no inline break applies here, or when the input at this
	// position is the "{{!}}" pipe template.
	(! inline_breaks / & '{{!}}' )
	tl:(
		table_start_tag
		/ table_content_line<table>
		/ table_end_tag
	) {
		return $tl;
	}

// Optional leading spaces/comments followed by exactly one kind of table
// content. The alternatives are tried in the order listed: heading cells,
// row tag, data cells, caption.
table_content_line =
	(space / comment)* (
		table_heading_tags
		/ table_row_tag
		/ table_data_tags
		/ table_caption_tag
	)

// Table opening line: optional leading spaces/comments, an opening brace
// followed by a pipe (literal or pipe template), the table attributes and
// optional trailing spaces. Produces a flat list made of the leading items,
// the 'table' TagTk, any comments popped off the attributes, and the
// trailing spaces.
table_start_tag "table_start_tag" =
	leading:(space / comment)*
	tagStart:("" { return $this->endOffset(); })
	brace:"{" bar:pipe
	// ok to normalize away stray |} on rt (see T59360)
	attrs:(table_attributes<table=false> / &{ $this->unreachable(); })
	attrsEnd:("" { return $this->endOffset(); })
	trailing:space*
	{
		// When comments were popped, the tag's source range stops where the
		// first popped comment starts rather than at the end of the attributes.
		// NOTE(review): popComments presumably strips those comments out of
		// $attrs as well - confirm in TokenizerUtils.
		$popped = TokenizerUtils::popComments( $attrs );
		$tagEnd = $popped ? $popped['commentStartPos'] : $attrsEnd;

		$dp = new DataParsoid;
		$dp->tsr = new SourceRange( $tagStart, $tagEnd );
		if ( $bar !== '|' ) {
			// Variation from default: remember the non-literal pipe source
			$dp->startTagSrc = $brace . $bar;
		}

		$tableTk = new TagTk( 'table', $attrs, $dp );
		$commentToks = $popped ? $popped['buf'] : [];
		return array_merge( $leading, [ $tableTk ], $commentToks, $trailing );
	}

// FIXME: Not sure if we want to support it, but this should allow columns.
// Table caption: a pipe followed by "+", optional cell-style arguments, then
// the caption content. The tokens are built by buildTableTokens with its
// final argument set to true.
table_caption_tag =
	// avoid recursion via nested_block_in_table
	! <tableDataBlock>
	bar:pipe "+"
	captionArgs:row_syntax_table_args?
	argsEnd:("" { return $this->endOffset(); })
	content:nested_block_in_table* {
		return TokenizerUtils::buildTableTokens(
			'caption', '|+', $captionArgs,
			// source range of the opening markup, up to the end of the args
			new SourceRange( $this->startOffset(), $argsEnd ),
			$this->endOffset(), $content, true
		);
	}

// Table row line: a pipe, one or more dashes, the row attributes and optional
// trailing spaces. Produces the 'tr' TagTk followed by any comments popped
// off the attributes and the trailing spaces.
table_row_tag =
	// avoid recursion via nested_block_in_table
	! <tableDataBlock>
	bar:pipe hyphens:$"-"+
	attrs:(table_attributes<table=false> / &{ $this->unreachable(); })
	attrsEnd:("" { return $this->endOffset(); })
	trailing:space*
	{
		// When comments were popped, the tag's source range stops where the
		// first popped comment starts.
		$popped = TokenizerUtils::popComments( $attrs );
		$tagEnd = $popped ? $popped['commentStartPos'] : $attrsEnd;

		$dp = new DataParsoid;
		$dp->tsr = new SourceRange( $this->startOffset(), $tagEnd );
		$dp->startTagSrc = $bar . $hyphens;

		// We rely on our tree builder to close the row as needed. This is
		// needed to support building tables from fragment templates with
		// individual cells or rows.
		$rowTk = new TagTk( 'tr', $attrs, $dp );
		$commentToks = $popped ? $popped['buf'] : [];

		return array_merge( [ $rowTk ], $commentToks, $trailing );
	}

// Entry point for a line of heading cells. Delegates to the parameterized
// rule with the `th` parameter supplied as `&th`.
// NOTE(review): `&th` presumably makes `th` a reference parameter that
// table_heading_tag can reset - confirm against the wikipeg documentation.
table_heading_tags = table_heading_tags_parameterized<&th>

// A "!" followed by the first heading cell and any further same-line heading
// cells. Returns a list whose first entry is the first cell's token array
// (with its source range widened to cover the "!") followed by the entries
// produced by ths.
table_heading_tags_parameterized =
	"!"
	first:table_heading_tag
	rest:ths {
		// Work on copies so that a cached result is never modified
		$cell = clone $first[0];
		$dp = clone $cell->dataParsoid;
		$cell->dataParsoid = $dp;
		$dp->tsr = clone $dp->tsr;
		--$dp->tsr->start; // include "!"
		$first[0] = $cell;

		return array_merge( [ $first ], $rest );
	}

// One heading cell: optional cell arguments followed by any number of nested
// blocks. While collecting content, the `th` parameter is reset to false as
// soon as the text matched so far contains a newline outside of {{..}}.
table_heading_tag =
	cellArgs:row_syntax_table_args?
	argsEnd:("" { return $this->endOffset(); })
	content:(
		th:<&th>
		blk:nested_block_in_table {
			// Ignore newlines found in transclusions!
			// This is not perfect (since {{..}} may not always tokenize to transclusions).
			if ( $th !== false ) {
				$outsideTransclusions = preg_replace( "/{{[\s\S]+?}}/", "", $this->text() );
				if ( strpos( $outsideTransclusions, "\n" ) !== false ) {
					// There's been a newline. Remove the break and continue
					// tokenizing nested_block_in_tables.
					$th = false;
				}
			}
			return $blk;
		}
	)* {
		return TokenizerUtils::buildTableTokens(
			'th', '!', $cellArgs,
			// source range of the opening markup, up to the end of the args
			new SourceRange( $this->startOffset(), $argsEnd ),
			$this->endOffset(), $content
		);
	}

// Zero or more additional heading cells on the same line, each introduced by
// "!!" or a double pipe. Every cell is marked as row syntax, flagged as
// non-mergeable, and has its source range widened to cover the separator.
ths = (
		sep:("!!" / pipe_pipe)
		cellToks:table_heading_tag {
			// Work on copies so that a cached dataParsoid object is never modified
			$cell = clone $cellToks[0];
			$dp = clone $cell->dataParsoid;
			$cell->dataParsoid = $dp;
			$dp->tsr = clone $dp->tsr;
			$dp->stx = 'row';
			$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			$dp->tsr->start -= strlen( $sep ); // include "!!" or "||"

			// Variation from default: record the actual separator, keeping
			// everything after the first character of an existing startTagSrc
			$prevSrc = $dp->startTagSrc ?? null;
			if ( $sep !== '!!' || ( $prevSrc !== null && $prevSrc !== $sep ) ) {
				$dp->startTagSrc = $sep . ( $prevSrc !== null ? substr( $prevSrc, 1 ) : '' );
			}

			$cellToks[0] = $cell;
			return $cellToks;
		}
	)*

// A line of data cells: a pipe not followed by "+" or "-", the first cell,
// and any further same-line cells. Returns the first cell's tokens (with the
// source range widened to cover the pipe) merged with the tokens from tds.
table_data_tags =
	// avoid recursion via nested_block_in_table
	! <tableDataBlock>
	bar:pipe
	![+-] first:table_data_tag
	rest:tds {
		// Work on copies so that a cached result is never modified
		$cell = clone $first[0];
		$dp = clone $cell->dataParsoid;
		$cell->dataParsoid = $dp;
		$dp->tsr = clone $dp->tsr;
		$dp->tsr->start -= strlen( $bar ); // include "|"
		if ( $bar !== '|' ) {
			// Variation from default
			$dp->startTagSrc = $bar;
		}
		$first[0] = $cell;

		return array_merge( $first, $rest );
	}

// One data cell: must not start with a closing brace (that would be the table
// end), then optional cell arguments and any number of nested blocks.
table_data_tag =
	! "}"
	cellArgs:row_syntax_table_args?
	// use inline_breaks to break on tr etc
	argsEnd:("" { return $this->endOffset(); })
	content:nested_block_in_table*
	{
		return TokenizerUtils::buildTableTokens(
			'td', '|', $cellArgs,
			// source range of the opening markup, up to the end of the args
			new SourceRange( $this->startOffset(), $argsEnd ),
			$this->endOffset(), $content
		);
	}

// Zero or more additional data cells on the same line, each introduced by a
// double pipe. Every cell is marked as row syntax, flagged as non-mergeable,
// and has its source range widened to cover the separator.
tds = (
		sep:pipe_pipe
		cellToks:table_data_tag {
			// Work on copies so that a cached dataParsoid object is never modified
			$cell = clone $cellToks[0];
			$dp = clone $cell->dataParsoid;
			$cell->dataParsoid = $dp;
			$dp->tsr = clone $dp->tsr;
			$dp->stx = 'row';
			$dp->setTempFlag( TempData::NON_MERGEABLE_TABLE_CELL );
			$dp->tsr->start -= strlen( $sep ); // include "||"

			// Variation from default: record the actual separator, keeping
			// everything after the first character of an existing startTagSrc
			$prevSrc = $dp->startTagSrc ?? null;
			if ( $sep !== '||' || ( $prevSrc !== null && $prevSrc !== $sep ) ) {
				$dp->startTagSrc = $sep . ( $prevSrc !== null ? substr( $prevSrc, 1 ) : '' );
			}

			$cellToks[0] = $cell;
			return $cellToks;
		}
	)*

// Table closing line: optional leading spaces/comments, then a pipe followed
// by a closing brace. Returns the leading items with the 'table' EndTagTk
// appended.
table_end_tag =
	leading:(space / comment)*
	tagStart:("" { return $this->endOffset(); })
	bar:pipe
	brace:"}"
	{
		$dp = new DataParsoid;
		$dp->tsr = new SourceRange( $tagStart, $this->endOffset() );
		$endTk = new EndTagTk( 'table', [], $dp );
		if ( $bar !== '|' ) {
			// Variation from default: the pipe came from the pipe template.
			// p+"<brace-char>" is triggering some bug in pegJS
			// I cannot even use that expression in the comment!
			$endTk->dataParsoid->endTagSrc = $bar . $brace;
		}
		$leading[] = $endTk;
		return $leading;
	}

/**
 * Table parameters separated from the content by a single pipe. Does *not*
 * match if followed by double pipe (row-based syntax).
 *
 * Returns a three-element list: the parsed attributes, the optional spaces
 * that follow them, and the pipe source text.
 */
row_syntax_table_args =
	attrs:table_attributes<tableCellArg> ws:optional_spaces bar:pipe !pipe {
		return [ $attrs, $ws, $bar ];
	}


/*******************************************************************
 * Text variants and other general rules
 *******************************************************************/

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text rule on plain
 * content.
 *
 * TODO: Much of this should really be context-dependent (syntactic
 * flags). The wikilink_preprocessor_text rule is an example where
 * text_char is not quite right and had to be augmented. Try to minimize /
 * clarify this carefully!
 *
 * This character class is inlined into $this->reUrltextLookahead. Changes
 * here may also need to be reflected there.
 *
 * Matches exactly one character that is not in the excluded set below.
 */

text_char = [^-'<[{\n\r:;\]}|!=]

/* Legend
 * '    quotes (italic/bold)
 * <    start of xmlish_tag
 * [    start of links
 * {    start of parser functions, transclusion and template args
 * \n   all sorts of block-level markup at start of line
 * \r   ditto
 * A-Za-z autolinks (http(s), nttp(s), mailto, ISBN, PMID, RFC)
 *
 * _    behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
 * ! and | table cell delimiters, might be better to specialize those
 * =    headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * :    separate definition in ; term : definition
 * ]    end of link
 * }    end of parser func/transclusion/template arg
 * -    start of lang_variant -{ ... }-
 * ;    separator in lang_variant
 */

// One or more pieces of URL-ish text. Each piece is, in order of preference:
// a plain text segment found by the lookahead regexp, an autolink, an HTML
// entity, a behavior switch, or a single text_char.
urltext =
	(
		/* Very special performance hack:
		 * Look for a plain text sequence, and if found, pretend to match the
		 * empty string, but then advance currPos in the action and return the
		 * whole plain text segment as a single result.
		 */
		& {
			$found = preg_match( $this->reUrltextLookahead, $this->input, $m, 0, $this->currPos );
			if ( !$found ) {
				$this->urltextFoundAutolink = false;
				return false;
			}
			// Group 1 is the plain segment; a non-empty group 2 signals that
			// an autolink follows (consumed by the next alternative).
			$this->urltextPlainSegment = $m[1];
			$this->urltextFoundAutolink = isset( $m[2] ) && $m[2] !== '';
			return $m[1] !== '';
		}
		'' {
			$segment = $this->urltextPlainSegment;
			$this->currPos += strlen( $segment );
			return $segment;
		}
		/ & { return $this->urltextFoundAutolink; } al:autolink { return $al; }
		/ & "&" he:htmlentity { return $he; }
		/ & ('__') bs:behavior_switch { return $bs; }
		/ text_char
	)+

// Matches the raw source of an entity ("&", one or more characters from the
// class below, then ";") and returns the result of passing that source text
// to Utils::decodeWtEntities.
raw_htmlentity =
	m:$("&" [#0-9a-zA-Zרלמرلم]+ ";") {
		return Utils::decodeWtEntities( $m );
	}

// An HTML entity. Returns either the decoded text on its own (when the
// entity is considered invalid) or a three-element list: an opening
// 'mw:Entity' span tag, the decoded text, and the closing span tag.
htmlentity =
	decoded:raw_htmlentity {
		// A decoded entity is 1-2 codepoints long (some, like &acE;, decode
		// to 2 codepoints!). Anything longer is an invalid entity, which is
		// not tagged with 'mw:Entity'.
		if ( mb_strlen( $decoded ) > 2 ) {
			return $decoded;
		}

		$openDp = new DataParsoid;
		$openDp->src = $this->text();
		$openDp->srcContent = $decoded;
		$openDp->tsr = $this->tsrOffsets( 'start' );
		$openTk = new TagTk( 'span', [ new KV( 'typeof', 'mw:Entity' ) ], $openDp );

		$closeDp = new DataParsoid;
		$closeDp->tsr = $this->tsrOffsets( 'end' );
		$closeTk = new EndTagTk( 'span', [], $closeDp );

		// If this changes, the nowiki extension's toDOM will need to follow suit
		return [ $openTk, $decoded, $closeTk ];
	}

/**
 * noinclude / includeonly / onlyinclude rules. These are normally
 * handled by the xmlish_tag rule, except where generic tags are not
 * allowed- for example in directives, which are allowed in various attribute
 * names and -values.
 *
 * Example test case:
 * {|
 * |-<includeonly>
 * foo
 * </includeonly>
 * |Hello
 * |}
 */

// Matches an opening or closing includeonly / noinclude / onlyinclude tag
// (case-insensitive) via xmlish_tag, and returns what xmlish_tag produced.
// The predicate rejects tags that are not include tags, and - when the
// sol_il parameter is set - opening noinclude/onlyinclude tags whose last
// content line holds anything other than comments.
include_limits =
	& ("<" "/"? ( "includeonly"i / "noinclude"i / "onlyinclude"i ) )
	"<" il:xmlish_tag<annOrExtTag>
	sol_il: <sol_il>
	& {
		$tag = $il[0];
		$name = mb_strtolower( $tag->getName() );
		if ( !WTUtils::isIncludeTag( $name ) ) {
			return false;
		}

		// Preserve SOL where necessary (for onlyinclude and noinclude)
		// Note that this only works because we encounter <*include*> tags in
		// the toplevel content and we rely on the php preprocessor to expand
		// templates, so we shouldn't ever be tokenizing inInclude.
		$needsLastLineCheck = $name !== 'includeonly' && $sol_il && $tag instanceof TagTk;
		if ( !$needsLastLineCheck ) {
			return true;
		}

		$dp = $tag->dataParsoid;
		$content = $dp->extTagOffsets->stripTags( $dp->src );
		$lastNl = strrpos( $content, "\n" );
		$lastLine = $lastNl === false ? $content : substr( $content, $lastNl + 1 );

		// Last line should be empty (except for comments)
		return (bool)preg_match( '/^(<!--([^-]|-(?!->))*-->)*$/D', $lastLine );
	}
	{
		return $il;
	}

// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
// Alternatives are tried in the order listed. The "-{" and "&" lookaheads
// only gate entry into lang_variant_or_tpl and htmlentity respectively;
// they consume no input themselves.
directive =
	comment
	/ extension_annotation_tag
	/ tplarg_or_template
	/ & "-{" v:lang_variant_or_tpl { return $v; }
	/ & "&" e:htmlentity { return $e; }
	/ include_limits

// Text with preprocessor support, as used inside wikilinks. Collects one or
// more pieces and hands the list to TokenizerUtils::flattenStringlist.
wikilink_preprocessor_text =
	r:(
		// Fast path: a run of characters outside the stop set
		t:$[^<[{\n\r\t|!\]}{ &\-]+
		// XXX gwicke: any more chars we need to allow here?
		// Otherwise, unless an inline break applies: a directive, or a single
		// character (text_char or one of the extra characters listed) as long
		// as "]]" does not start here.
		/ !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) )
		{ return $wr; }
	)+ {
		return TokenizerUtils::flattenStringlist( $r );
	}

// added special separator character class inline: separates url from
// description / text
// Thin wrapper: runs the parameterized rule with `linkdesc` forced to false.
extlink_nonipv6url =
	// Prevent breaking on pipes when we're in a link description.
	// See the test, 'Images with the "|" character in the comment'.
	extlink_nonipv6url_parameterized<linkdesc=false>

// Collects one or more URL pieces and hands the list to
// TokenizerUtils::flattenString.
extlink_nonipv6url_parameterized =
	r:(
		// Fast path: a run of characters outside the stop set (which includes
		// ASCII space, tab and the listed Unicode space characters)
		$[^<[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+
		// Otherwise, unless an inline break applies: a directive or one of
		// the listed single characters
		/ !inline_breaks s:( directive / [&|{\-!}=] ) { return $s; }
		/ $(['] ![']) // single quotes are ok, double quotes are bad
	)+ {
		return TokenizerUtils::flattenString( $r );
	}

// Attribute values with preprocessor support

// n.b. / is a permissible char in the three rules below.
// We only break on />, enforced by the negated expression.
// Hence, it isn't included in the stop set.

// The stop set is space_or_newline and > which matches generic_att_value.
// Unquoted variant: requires at least one piece (`+`).
attribute_preprocessor_text =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-|/ \t\n\r\x0c>]+
		// Otherwise, unless an inline break applies or "/>" starts here:
		// a directive, a less_than match, or one of the listed characters
		/ !inline_breaks
		!'/>'
		s:( directive / less_than / [{}&\-|/] ) { return $s; }
	)+ {
		return TokenizerUtils::flattenString( $r );
	}

// The stop set is '> which matches generic_att_value.
// Single-quoted variant: may match nothing at all (`*`), and unlike the
// unquoted variant does not stop at whitespace.
attribute_preprocessor_text_single =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-|/'>]+
		// Otherwise, unless an inline break applies or "/>" starts here:
		// a directive, a less_than match, or one of the listed characters
		/ !inline_breaks
		!'/>'
		s:( directive / less_than / [{}&\-|/] ) { return $s; }
	)* {
		return TokenizerUtils::flattenString( $r );
	}

// The stop set is "> which matches generic_att_value.
// Double-quoted variant: may match nothing at all (`*`), and unlike the
// unquoted variant does not stop at whitespace.
attribute_preprocessor_text_double =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-|/">]+
		// Otherwise, unless an inline break applies or "/>" starts here:
		// a directive, a less_than match, or one of the listed characters
		/ !inline_breaks
		!'/>'
		s:( directive / less_than / [{}&\-|/] ) { return $s; }
	)* {
		return TokenizerUtils::flattenString( $r );
	}

// Variants with the entire attribute on a single line

// n.b. ! is a permissible char in the three rules below.
// We only break on !! in th, enforced by the inline break.
// Hence, it isn't included in the stop set.
// [ is also permissible but we give a chance to break
// for the [[ special case in the old parser's doTableStuff (See T2553).

// The stop set is space_or_newline and | which matches table_att_value.
// Unquoted variant: requires at least one piece (`+`).
table_attribute_preprocessor_text =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-!\[ \t\n\r\x0c|]+
		// Otherwise, unless an inline break applies: a directive or one of
		// the listed single characters
		/ !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
	)+ {
		return TokenizerUtils::flattenString( $r );
	}

// The stop set is '\r\n| which matches table_att_value.
// Single-quoted variant: may match nothing at all (`*`); spaces and tabs are
// allowed inside the value.
table_attribute_preprocessor_text_single =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-!\['\r\n|]+
		// Otherwise, unless an inline break applies: a directive or one of
		// the listed single characters
		/ !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
	)* {
		return TokenizerUtils::flattenString( $r );
	}

// The stop set is "\r\n| which matches table_att_value.
// Double-quoted variant: may match nothing at all (`*`); spaces and tabs are
// allowed inside the value.
table_attribute_preprocessor_text_double =
	r:(
		// Fast path: a run of characters that need no special handling
		$[^{}&<\-!\["\r\n|]+
		// Otherwise, unless an inline break applies: a directive or one of
		// the listed single characters
		/ !inline_breaks s:( directive / [{}&<\-!\[] ) { return $s; }
	)* {
		return TokenizerUtils::flattenString( $r );
	}

// Special-case support for those pipe templates
// Matches a literal "|" or the "{{!}}" template source; the matched text is
// the rule's result, so callers can tell the two forms apart.
pipe = "|" / "{{!}}"

// SSS FIXME: what about |{{!}} and {{!}}|
// Double pipe, either as two literal pipes or as two "{{!}}" templates.
// Mixed forms are not matched (see the FIXME above).
pipe_pipe = "||" / "{{!}}{{!}}"

// Exactly one ASCII space or tab.
space = [ \t]

// Zero or more spaces/tabs, returned as one string (possibly empty).
optional_spaces = $[ \t]*

// Start of file
// Zero-width: succeeds only at offset 0 of the input and only when
// $this->pipelineOffset is falsy.
sof = & { return $this->endOffset() === 0 && !$this->pipelineOffset; }

// End of file
// Zero-width: succeeds only when the current offset equals the input length.
eof = & { return $this->endOffset() === $this->inputLength; }

// A line break: LF or CRLF. A CR that is not followed by LF is not matched.
newline = '\n' / '\r\n'

// A line break wrapped as a one-element list holding an NlTk that carries
// the source offsets of the match.
newlineToken = newline { return [ new NlTk( $this->tsrOffsets() ) ]; }

// End of line or end of file: consumes a line break if present, otherwise
// succeeds without consuming input at the end of the input.
eolf = newline / eof

// The old parser does a straight str.replace(/<!--((?!-->).)*-->/g, "")
// but, as always, things around here are a little more complicated.
//
// We accept the same comments, but because we emit them as HTML comments
// instead of deleting them, we have to encode the data to ensure that
// we always emit a valid HTML5 comment.  See the encodeComment helper
// for further details.
//
// A comment left unterminated at the end of the input is accepted as well and
// flagged with unclosedComment. The result is a one-element list holding the
// CommentTk.
comment =
	'<!--' body:$(!"-->" .)* closer:$('-->' / eof) {
		$dp = new DataParsoid;
		$dp->tsr = $this->tsrOffsets();
		if ( $closer !== '-->' ) {
			// Matched eof instead of the closing delimiter
			$dp->unclosedComment = true;
		}
		return [ new CommentTk( WTUtils::encodeComment( $body ), $dp ) ];
	}

// One newline token, one space/tab, or one comment (tried in that order).
nl_comment_space = newlineToken / space / comment

// Optional spaces as a token list: a one-element list holding the matched
// whitespace, or an empty list when nothing was matched.
optionalSpaceToken =
	ws:optional_spaces {
		return $ws === '' ? [] : [ $ws ];
	}

/* This rule corresponds to \s in the PHP preg_* functions,
 * which is used frequently in the old parser.  The inclusion of
 * form feed (but not other whitespace, like vertical tab) is a quirk
 * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular
 * Expressions) library.
 *
 * Matches exactly one character: space, tab, LF, CR or form feed (\x0c).
 */
space_or_newline = [ \t\n\r\x0c]

/* This rule corresponds to \b in the PHP preg_* functions,
 * after a word character.  That is, it's a zero-width lookahead that
 * the next character is not a word character.
 *
 * Only ASCII letters, digits and underscore count as word characters here.
 * The end of the input also counts as the end of a word.
 */
end_of_word = eof / ![A-Za-z0-9_]

// Unicode "separator, space" category.  It covers the \u0020 space as well
// as \u3000 IDEOGRAPHIC SPACE (see bug 19052).  In PHP this is \p{Zs}.
// Keep this up-to-date with the characters tagged ;Zs; in
// http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// Note: the stop set in extlink_nonipv6url_parameterized additionally lists
// \u180E, which is not part of this class.
unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]

// Non-newline whitespace, including non-breaking spaces.  Used for magic links.
space_or_nbsp =
	space // includes \t
	/ unispace
	// An HTML entity also counts, but only when htmlentity returned its
	// token list (not a bare string) and the decoded text at index 1 is
	// U+00A0. In that case the whole token list is the result.
	/ & "&" he:htmlentity &{ return is_array( $he ) && $he[ 1 ] === "\u{A0}"; }
	{ return $he; }

// Used within ISBN magic links
// Anything space_or_nbsp accepts, or a single ASCII hyphen.
space_or_nbsp_or_dash =
	space_or_nbsp / "-"

// Elements that do not break beginning or end of line for blocks (headers for instance)
// include_limits is invoked with the sol_il parameter set, which enables its
// extra last-line check.
sol_transparent = comment / include_limits<sol_il> / annotation_tag / behavior_switch

// Start of line: either an empty line that carries comments or a plain
// sol_prefix, followed by any number of sol-transparent elements.
// empty_line_with_comments is tried first because it begins with sol_prefix.
sol = (empty_line_with_comments / sol_prefix) sol_transparent*

// What may introduce a start-of-line position: either a newline token, or -
// at offset 0 only - the 'sol' option being true, in which case an empty
// list is produced.
sol_prefix =
	newlineToken
	/ & {
		// Flag should always be an actual boolean (not falsy or undefined)
		$solFlag = $this->options['sol'];
		$this->assert( is_bool( $solFlag ), 'sol should be boolean' );
		// Use the sol flag only at the start of the input
		return $this->endOffset() === 0 && $solFlag;
	} {
		return [];
	}

// This rule requires at least one comment to be matched
// Matches a sol_prefix followed by one or more lines that hold only comments
// and spaces/tabs. Returns the prefix result plus an 'mw:EmptyLine' meta tag
// whose dataParsoid records the source range and the matched tokens.
empty_line_with_comments =
	prefix:sol_prefix lineStart:("" { return $this->endOffset(); }) lines:(space* comment (space / comment)* newline)+ {
		$dp = new DataParsoid;
		$dp->tsr = new SourceRange( $lineStart, $this->endOffset() );
		$dp->tokens = TokenizerUtils::flattenIfArray( $lines );

		$emptyLineMeta = new SelfclosingTagTk(
			'meta', [ new KV( 'typeof', 'mw:EmptyLine' ) ], $dp
		);
		return [ $prefix, $emptyLineMeta ];
	}