-
-
Notifications
You must be signed in to change notification settings - Fork 4.8k
Safe UTF-8 Extraction and Robust <style> Block Detection in Upgrade Codemods #19410
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -553,11 +553,17 @@ where | |
| a | ||
| }) | ||
| .into_iter() | ||
| .map(|s| unsafe { String::from_utf8_unchecked(s.to_vec()) }) | ||
| .filter_map(|s| match String::from_utf8(s.to_vec()) { | ||
| Ok(s) => Some(s), | ||
| Err(_e) => { | ||
| // Optionally log or handle invalid UTF-8 here | ||
| // eprintln!("Skipped invalid UTF-8 candidate, error: {:?}", _e); | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
|
|
||
| // SAFETY: Unstable sort is faster and in this scenario it's also safe because we are | ||
| // guaranteed to have unique candidates. | ||
| // Unstable sort is performant & remains correct with unique candidates | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While not a traditional safety message this is 100% intentional because behavior would be incorrect in the face of non-unique candidates + an unstable sort. |
||
| result.par_sort_unstable(); | ||
|
|
||
| result | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| import { parseCandidate } from '../../../../tailwindcss/src/candidate' | ||
| import { parse as parseHtml } from 'node-html-parser' | ||
| import type { DesignSystem } from '../../../../tailwindcss/src/design-system' | ||
| import { DefaultMap } from '../../../../tailwindcss/src/utils/default-map' | ||
| import * as version from '../../utils/version' | ||
|
|
@@ -187,31 +188,24 @@ export function isSafeMigration( | |
| return true | ||
| } | ||
|
|
||
| // Assumptions: | ||
| // - All `<style` tags appear before the next `</style>` tag | ||
| // - All `<style` tags are closed with `</style>` | ||
| // - No nested `<style>` tags | ||
| // Robustly locates all <style> blocks (with or without attributes) using an HTML parser. | ||
| const styleBlockRanges = new DefaultMap((source: string) => { | ||
| let ranges: number[] = [] | ||
| let offset = 0 | ||
|
|
||
| while (true) { | ||
| let startTag = source.indexOf('<style', offset) | ||
| if (startTag === -1) return ranges | ||
|
|
||
| offset = startTag + 1 | ||
|
|
||
| // Ensure the style looks like: | ||
| // - `<style>` (closed) | ||
| // - `<style …>` (with attributes) | ||
| if (!source[startTag + 6].match(/[>\s]/)) continue | ||
|
|
||
| let endTag = source.indexOf('</style>', offset) | ||
| if (endTag === -1) return ranges | ||
| offset = endTag + 1 | ||
|
|
||
| ranges.push(startTag, endTag) | ||
| const ranges: number[] = [] | ||
| try { | ||
| const root = parseHtml(source, { lowerCaseTagName: false, comment: false, blockTextElements: { style: true } }) | ||
| const styleNodes = root.querySelectorAll('style') | ||
| styleNodes.forEach(node => { | ||
| const nodeHtml = node.toString() | ||
| const start = typeof node.range === 'object' && node.range !== null | ||
| ? node.range[0] | ||
| : source.indexOf(nodeHtml) | ||
| const end = start + nodeHtml.length | ||
| ranges.push(start, end) | ||
| }) | ||
| } catch (_) { | ||
| // fallback: do nothing if parser fails | ||
| } | ||
| return ranges | ||
| }) | ||
|
Comment on lines
+191
to
209
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Robust HTML parser implementation with minor fallback concern. The replacement of manual string scanning with
However, the fallback on line 201 ( The silent error handling (line 205-207) is reasonable for graceful degradation, but consider logging errors in development mode to aid debugging. Optional improvement for the indexOf fallback: - const start = typeof node.range === 'object' && node.range !== null
- ? node.range[0]
- : source.indexOf(nodeHtml)
+ // Only use indexOf as last resort; range should be available in most cases
+ const start = typeof node.range === 'object' && node.range !== null && node.range[0] !== undefined
+ ? node.range[0]
+ : source.indexOf(nodeHtml)🤖 Prompt for AI Agents |
||
|
|
||
| const BACKSLASH = 0x5c | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an intentional performance optimization.
String::from_utf8does a validation pass over the string which should be uncnessary here. The extractor's implementation should guarantee that the resulting candidates are already UTF-8.If you have a test case for this that crashes when scanning files please open a separate issue and I will take a look.