Skip to content

Commit

Permalink
Add support for new parse-latin, parse-english
Browse files Browse the repository at this point in the history
This is a breaking change: these parsers were updated,
which cleans up their API a lot and adds types.
  • Loading branch information
wooorm committed Jan 9, 2023
1 parent 9b8cca5 commit 95bc616
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 17 deletions.
102 changes: 91 additions & 11 deletions lib/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
* @typedef {import('nlcst').Root} NlcstRoot
* @typedef {import('nlcst').Paragraph} NlcstParagraph
* @typedef {import('nlcst').WhiteSpace} NlcstWhiteSpace
* @typedef {import('nlcst').Sentence} NlcstSentence
* @typedef {import('nlcst').Source} NlcstSource
* @typedef {import('nlcst').Content} NlcstContent
* @typedef {import('nlcst').SentenceContent} NlcstSentenceContent
* @typedef {NlcstRoot|NlcstContent} NlcstNode
* @typedef {Extract<NlcstNode, import('unist').Parent>} NlcstParent
*
* @typedef {import('hast').Root} HastRoot
* @typedef {import('hast').Element} HastElement
Expand All @@ -19,11 +21,11 @@
* @typedef {import('vfile').VFile} VFile
*
* @typedef {{
* parse(nodes: Array<NlcstContent>): NlcstRoot
* tokenizeSource(value: string): NlcstSource
* tokenizeWhiteSpace(value: string): NlcstWhiteSpace
* tokenizeParagraph(nodes: Array<NlcstSentenceContent>): NlcstParagraph
* tokenize(value: string): Array<NlcstSentenceContent>
* tokenizeSentencePlugins: Array<(node: NlcstSentence) => void>,
* tokenizeParagraphPlugins: Array<(node: NlcstParagraph) => void>,
* parse(value: string | null | undefined): NlcstRoot
* tokenizeParagraph(value: string | null | undefined): NlcstParagraph
* tokenize(value: string | null | undefined): Array<NlcstSentenceContent>
* }} ParserInstance
* @typedef {new () => ParserInstance} ParserConstructor
*/
Expand All @@ -34,7 +36,7 @@ import {phrasing} from 'hast-util-phrasing'
import {toString} from 'hast-util-to-string'
import {whitespace} from 'hast-util-whitespace'
import {toString as nlcstToString} from 'nlcst-to-string'
import {pointStart} from 'unist-util-position'
import {pointStart, pointEnd} from 'unist-util-position'
import {location} from 'vfile-location'

// Element test built from the tag name `code` and the `dataNlcstSourced` check.
// Elements that pass it are turned into a `SourceNode` holding their text.
// NOTE(review): presumably passes when either test matches — confirm against
// `convertElement`.
const source = convertElement(['code', dataNlcstSourced])
Expand Down Expand Up @@ -74,6 +76,10 @@ const flowAccepting = convertElement([
'dialog'
])

// Ported from:
// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/expressions.js#L5>
// Matches a string that consists entirely of one or more terminal markers:
// `!`, `.`, `?`, U+2026 (horizontal ellipsis), or U+203D (interrobang).
const terminalMarker = /^([!.?\u2026\u203D]+)$/

/**
* Transform `tree` to nlcst.
*
Expand Down Expand Up @@ -179,11 +185,37 @@ export function toNlcst(tree, file, Parser) {
* @param {HastElementContent|Array<HastElementContent>} node
*/
function add(node) {
// Convert `node` (a single node or a list of nodes) to nlcst sentence content
// and, when that yields anything, wrap it in a paragraph pushed onto `results`.
// NOTE(review): this span comes from a rendered diff, so pre-change and
// post-change lines appear side by side; the two `@type` comments below are
// the old and new spelling of the same annotation.
/** @type {Array<NlcstSentenceContent>|undefined} */
/** @type {Array<NlcstSentenceContent> | undefined} */
const result = Array.isArray(node) ? all(node) : one(node)

if (result && result.length > 0) {
// NOTE(review): presumably the pre-change line — it hands a list of nodes to
// `tokenizeParagraph`, which the updated `ParserInstance` typedef declares as
// taking a string; the lines that follow build the paragraph by hand instead.
results.push(parser.tokenizeParagraph(result))
// Span of the converted content: start of the first node, end of the last.
const start = pointStart(result[0])
const end = pointEnd(result[result.length - 1])

// Turn into a sentence.
/** @type {NlcstSentence} */
const sentence = {type: 'SentenceNode', children: result}
// Only set a position when both ends are known.
if (start && end) sentence.position = {start, end}

// Call every sentence plugin of the parser, in order, on the sentence.
// The loop stops at the first missing (falsy) entry.
let index = -1
while (parser.tokenizeSentencePlugins[++index]) {
parser.tokenizeSentencePlugins[index](sentence)
}

// Turn into a paragraph.
// The single sentence is split into several sentences with `splitNode`, using
// punctuation that matches `terminalMarker` as the split points.
/** @type {NlcstParagraph} */
const paragraph = {
type: 'ParagraphNode',
children: splitNode(sentence, 'PunctuationNode', terminalMarker)
}
// Copy the points so the paragraph does not share them with the sentence.
if (start && end) paragraph.position = {start: {...start}, end: {...end}}

// Call every paragraph plugin of the parser, in order, on the paragraph.
index = -1
while (parser.tokenizeParagraphPlugins[++index]) {
parser.tokenizeParagraphPlugins[index](paragraph)
}

results.push(paragraph)
}
}

Expand Down Expand Up @@ -238,13 +270,13 @@ export function toNlcst(tree, file, Parser) {
change = true
} else if (node.type === 'element' && !ignore(node)) {
if (node.tagName === 'wbr') {
replacement = [parser.tokenizeWhiteSpace(' ')]
replacement = [{type: 'WhiteSpaceNode', value: ' '}]
change = true
} else if (node.tagName === 'br') {
replacement = [parser.tokenizeWhiteSpace('\n')]
replacement = [{type: 'WhiteSpaceNode', value: '\n'}]
change = true
} else if (source(node)) {
replacement = [parser.tokenizeSource(toString(node))]
replacement = [{type: 'SourceNode', value: toString(node)}]
change = true
} else {
replacement = all(node.children)
Expand Down Expand Up @@ -327,3 +359,51 @@ function dataNlcstSourced(node) {
function dataNlcstIgnore(node) {
return Boolean(node.properties && node.properties.dataNlcst === 'ignore')
}

// Ported from:
// <https://github.com/wooorm/parse-latin/blob/ea33f09/lib/index.js#L266-L310>
/**
* A function that splits one node into several nodes.
*
* @template {NlcstParent} TheNode
* @param {TheNode} node
* @param {RegExp} expression
* @param {NlcstContent['type']} childType
* @returns {Array<TheNode>}
*/
function splitNode(node, childType, expression) {
/** @type {Array<TheNode>} */
const result = []
let index = -1
let start = 0

while (++index < node.children.length) {
const token = node.children[index]

if (
index === node.children.length - 1 ||
(token.type === childType && expression.test(nlcstToString(token)))
) {
/** @type {TheNode} */
// @ts-expect-error: fine
const parent = {
type: node.type,
children: node.children.slice(start, index + 1)
}

const first = node.children[start]
const last = token
if (first.position && last.position) {
parent.position = {
start: first.position.start,
end: last.position.end
}
}

result.push(parent)
start = index + 1
}
}

return result
}
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
"@types/tape": "^4.0.0",
"c8": "^7.0.0",
"is-hidden": "^2.0.0",
"parse-dutch": "^5.0.0",
"parse-english": "^5.0.0",
"parse-latin": "^5.0.0",
"parse-dutch": "^6.0.0",
"parse-english": "^6.0.0",
"parse-latin": "^6.0.0",
"prettier": "^2.0.0",
"rehype": "^12.0.0",
"remark-cli": "^11.0.0",
Expand Down
3 changes: 0 additions & 3 deletions test/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@ import path from 'node:path'
import test from 'tape'
import {rehype} from 'rehype'
import {VFile} from 'vfile'
// @ts-expect-error: to do type.
import {ParseLatin} from 'parse-latin'
// @ts-expect-error: to do type.
import {ParseDutch} from 'parse-dutch'
// @ts-expect-error: to do type.
import {ParseEnglish} from 'parse-english'
import {isHidden} from 'is-hidden'
import {toNlcst} from '../index.js'
Expand Down

0 comments on commit 95bc616

Please sign in to comment.