Skip to content
Permalink
Browse files

New: Add utils to traverse the DOM using an HTMLDocument

  • Loading branch information...
sarvaje authored and antross committed Mar 8, 2019
1 parent 0439f9b commit d8490b3085b4ab51bedab152327a9ef909df6871
@@ -0,0 +1,12 @@
import { HTMLDocument } from '../../types/html';
import * as parse5 from 'parse5';
import * as htmlparser2Adapter from 'parse5-htmlparser2-tree-adapter';

/**
 * Build an `HTMLDocument` from raw HTML.
 *
 * The markup is parsed by `parse5` using the htmlparser2 tree adapter, with
 * source code location info enabled, and the resulting tree is wrapped in
 * an `HTMLDocument`.
 *
 * @param html The HTML source to parse.
 * @returns An `HTMLDocument` wrapping the parsed tree.
 */
export default (html: string): HTMLDocument => {
    const parserOptions = {
        sourceCodeLocationInfo: true,
        treeAdapter: htmlparser2Adapter
    };

    return new HTMLDocument(parse5.parse(html, parserOptions));
};
@@ -0,0 +1,58 @@
import { URL } from 'url';

import { HTMLDocument, HTMLElement } from '../../types/html';

/**
 * Extract the URLs from the value of a `srcset` attribute.
 *
 * A `srcset` is a comma separated list of candidates, each one being a URL
 * optionally followed by whitespace and a descriptor (e.g. `2x`, `100w`).
 * Only the URL of each candidate is returned; empty candidates are skipped.
 *
 * @param srcset The raw attribute value; `null` or an empty string yields
 * an empty list.
 * @returns The candidate URLs, in the order they appear.
 */
const getSrcsetUrls = (srcset: string | null): string[] => {
    if (!srcset) {
        return [];
    }

    return srcset
        .split(',')
        /*
         * The URL and its descriptor can be separated by any whitespace
         * (spaces, tabs, new lines), not only by a single space.
         */
        .map((candidate) => {
            return candidate.trim().split(/\s+/)[0];
        })
        .filter((url): url is string => {
            return Boolean(url);
        });
};

/**
 * Resolve `candidate` against `base` and return the absolute URL, or `null`
 * when `new URL()` rejects the pair (e.g. `base` is not an absolute URL).
 */
const toAbsoluteUrl = (candidate: string, base: string): string | null => {
    try {
        return new URL(candidate, base).href;
    } catch (e) {
        return null;
    }
};

/**
 * Find the first element of `dom` that references `url` through its `href`,
 * `src`, `poster` or `srcset` attribute.
 *
 * An element matches when one of its URLs is equal to `url` as written, or
 * when it is equal to `url` once resolved against `url` itself.
 *
 * @param dom The document to search in.
 * @param url The URL of the resource to look for.
 * @returns The first matching element, or `null` if there is none.
 */
export default (dom: HTMLDocument, url: string): HTMLElement | null => {
    // TODO: Cache dom.querySelectorAll?.
    const elements = dom.querySelectorAll('[href],[src],[poster],[srcset]').filter((element: any) => {
        const elementUrl = element.getAttribute('href') || element.getAttribute('src') || element.getAttribute('poster');
        const srcsetUrls = getSrcsetUrls(element.getAttribute('srcset'));

        /*
         * Ignore missing or empty attributes: resolving `null` or `''`
         * against `url` would produce bogus matches (an empty reference
         * resolves to the base URL itself).
         */
        const elementUrls: string[] = elementUrl ? [elementUrl, ...srcsetUrls] : srcsetUrls;

        if (elementUrls.includes(url)) {
            return true;
        }

        /*
         * `toAbsoluteUrl` never throws, so an unresolvable URL in one element
         * doesn't prevent the following elements from being checked.
         */
        // TODO: Cache the absolute URL, so we don't run new URL() for the same URL.
        return elementUrls.some((candidate) => {
            return toAbsoluteUrl(candidate, url) === url;
        });
    });

    /*
     * Even if there are multiple elements with the same URL,
     * it's the first one that triggers the download in the browser
     * and thus the one we should be reporting.
     */
    if (elements.length > 0) {
        return elements[0];
    }

    return null;
};
@@ -0,0 +1,35 @@
import { HTMLDocument, HTMLElement } from '../../types/html';
import { Engine } from '../../engine';
import { TraverseUp, TraverseDown, Event } from '../../types/events';

/**
 * Walk `element` and its descendants depth-first, emitting events through
 * `engine` for every element visited.
 *
 * For each element the following events are emitted, in order:
 * `element::<lowercased node name>`, `traverse::down`, the events of each
 * child (recursively), and finally `traverse::up`. Every emission is awaited
 * before moving on.
 *
 * @param element The element to visit.
 * @param document The document being traversed; only forwarded to the
 * recursive calls.
 * @param engine The engine used to emit the events.
 * @param resource The resource included in every event payload.
 */
const traverseAndNotify = async (element: HTMLElement, document: HTMLDocument, engine: Engine, resource: string): Promise<void> => {
    const elementEventName = `element::${element.nodeName.toLowerCase()}` as 'element::*';

    await engine.emitAsync(elementEventName, { element, resource });

    // The same payload object is reused when going down and when coming back up.
    const payload = { element, resource } as TraverseDown | TraverseUp;

    await engine.emitAsync('traverse::down', payload);

    // Visit the children one at a time, waiting for each subtree to finish.
    for (const descendant of element.children) {
        await traverseAndNotify(descendant, document, engine, resource);
    }

    await engine.emitAsync('traverse::up', payload);
};

/**
 * Traverse a whole document, notifying `engine` along the way.
 *
 * Emits `traverse::start`, then visits the tree starting at
 * `document.documentElement` via `traverseAndNotify`, and finally emits
 * `traverse::end`. Each step is awaited before the next one begins.
 *
 * @param document The document to traverse.
 * @param engine The engine used to emit the events.
 * @param resource The resource included in the event payloads.
 */
export default async (document: HTMLDocument, engine: Engine, resource: string): Promise<void> => {
    // Grab the root before any event is emitted.
    const { documentElement: root } = document;
    const boundaryEvent = { resource } as Event;

    await engine.emitAsync('traverse::start', boundaryEvent);
    await traverseAndNotify(root, document, engine, resource);
    await engine.emitAsync('traverse::end', boundaryEvent);
};

0 comments on commit d8490b3

Please sign in to comment.
You can’t perform that action at this time.