-
${item.crawlCount}
${pluralOf("items", item.crawlCount)}
@@ -69,26 +82,40 @@ export class LinkedCollectionsListItem extends TailwindElement {
}
if (this.removable) {
+ const button = html`
+ this.dispatchEvent(
+ new CustomEvent(
+ "btrix-remove",
+ {
+ detail: {
+ item: item,
+ },
+ bubbles: true,
+ composed: true,
+ },
+ ),
+ )}
+ >`;
+
content.push(
html`
-
-
- this.dispatchEvent(
- new CustomEvent(
- "btrix-remove",
- {
- detail: {
- item: item,
- },
- bubbles: true,
- composed: true,
- },
- ),
+ ${dedupeEnabled
+ ? html`
-
+ >
+ ${button}
+ `
+ : html`
+ ${button}
+ `}
+
+
`,
);
}
diff --git a/frontend/src/features/collections/linked-collections/linked-collections-list.ts b/frontend/src/features/collections/linked-collections/linked-collections-list.ts
index 7b9508f2ed..9e9837ea65 100644
--- a/frontend/src/features/collections/linked-collections/linked-collections-list.ts
+++ b/frontend/src/features/collections/linked-collections/linked-collections-list.ts
@@ -21,6 +21,9 @@ export class LinkedCollectionsList extends TailwindElement {
@property({ type: String })
baseUrl?: string;
+ @property({ type: String })
+ dedupeId?: string;
+
@property({ type: Boolean })
removable?: boolean;
@@ -37,6 +40,7 @@ export class LinkedCollectionsList extends TailwindElement {
class=${clsx(tw`contents`, i > 0 && tw`part-[base]:border-t`)}
.item=${until(request, item)}
baseUrl=${ifDefined(this.baseUrl)}
+ ?dedupeSource=${Boolean(this.dedupeId && item.id === this.dedupeId)}
?removable=${this.removable}
?loading=${until(
request.then(() => false),
diff --git a/frontend/src/features/collections/linked-collections/linked-collections.ts b/frontend/src/features/collections/linked-collections/linked-collections.ts
index ad73a6b3e4..6b9fc7568a 100644
--- a/frontend/src/features/collections/linked-collections/linked-collections.ts
+++ b/frontend/src/features/collections/linked-collections/linked-collections.ts
@@ -28,6 +28,12 @@ export class LinkedCollections extends BtrixElement {
@property({ type: Array, hasChanged: isNotEqual })
collections: (string | CollectionLikeItem)[] = [];
+ /**
+ * ID of the collection that is used as the deduplication source
+ */
+ @property({ type: String })
+ dedupeId?: string;
+
@property({ type: Boolean })
removable?: boolean;
@@ -112,6 +118,7 @@ export class LinkedCollections extends BtrixElement {
aria-live="polite"
.collections=${collections}
baseUrl="${this.navigate.orgBasePath}/collections/view"
+ .dedupeId=${this.dedupeId}
?removable=${this.removable}
>`;
}
diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts
index cc3e629c63..2970f18dc6 100644
--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -40,9 +40,13 @@ import compact from "lodash/fp/compact";
import flow from "lodash/fp/flow";
import isEqual from "lodash/fp/isEqual";
import throttle from "lodash/fp/throttle";
+import union from "lodash/fp/union";
import uniq from "lodash/fp/uniq";
+import without from "lodash/fp/without";
import queryString from "query-string";
+import type { CollectionNameInputChangeEvent } from "../collections/collection-name-input";
+
import {
SELECTOR_DELIMITER,
type LinkSelectorTable,
@@ -205,6 +209,10 @@ const getDefaultProgressState = (hasConfigId = false): ProgressState => {
error: false,
completed: hasConfigId,
},
+ deduplication: {
+ error: false,
+ completed: hasConfigId,
+ },
collections: {
error: false,
completed: hasConfigId,
@@ -387,6 +395,11 @@ export class WorkflowEditor extends BtrixElement {
"": "",
};
+ private readonly dedupeTypeLabels: Record
= {
+ collection: msg("Deduplicate using a collection"),
+ none: msg("No deduplication"),
+ };
+
@query(`form[name="${formName}"]`)
private readonly formElem?: HTMLFormElement;
@@ -652,20 +665,15 @@ export class WorkflowEditor extends BtrixElement {
const el = e.currentTarget as SlDetails;
// Check if there's any invalid elements before hiding
- let invalidEl: SlInput | null = null;
-
- if (required) {
- invalidEl = el.querySelector("[required][data-invalid]");
- }
-
- invalidEl =
- invalidEl || el.querySelector("[data-user-invalid]");
+ const invalidEl =
+ el.querySelector("[data-invalid]") ||
+ el.querySelector(":invalid");
if (invalidEl) {
e.preventDefault();
invalidEl.focus();
- invalidEl.checkValidity();
+ invalidEl.reportValidity();
}
})}
@sl-after-show=${this.handleCurrentTarget(
@@ -1666,7 +1674,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
- ${labelFor.selectLink}
+ ${labelFor.selectLinks}
${isCustom
? html`${selectors.length}`
: ""}
@@ -1784,7 +1792,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
`;
return html`
- ${this.renderSectionHeading(labelFor.behaviors)}
+ ${this.renderSectionHeading(sectionStrings.behaviors)}
${inputCol(
html` {
+ this.updateProgressState({ activeTab: "collections" });
+ await this.updateComplete;
+ void this.scrollToActivePanel();
+ }}
+ >
+ ${msg("Auto-Add to Collections")}
+ `;
+
+ return html` ${inputCol(html`
+ {
+ const dedupeType = (e.target as SlRadio)
+ .value as FormState["dedupeType"];
+
+ const formState: Partial = {
+ dedupeType,
+ dedupeCollectionId: null,
+ dedupeCollectionName: null,
+ };
+
+ if (dedupeType === "none" && this.formState.dedupeCollectionId) {
+ formState.autoAddCollections = without(
+ [this.formState.dedupeCollectionId],
+ this.formState.autoAddCollections,
+ );
+ }
+
+ this.updateFormState(formState, true);
+ }}
+ >
+ ${this.dedupeTypeLabels["none"]}
+ ${this.dedupeTypeLabels["collection"]}
+
+ ${when(
+ this.formState.dedupeType === "none" &&
+ this.initialWorkflow?.dedupCollId,
+ () => html`
+
+
+
+ ${msg(
+ "Disabling deduplication will also disable auto-adding to the collection.",
+ )}
+
+ ${msg(
+ html`To continue to auto-add to the collection without
+ deduplication enabled, update the
+ ${link_to_collections_settings} setting.`,
+ )}
+
+ `,
+ )}
+
+ `)}
+ ${this.renderHelpTextCol(
+ msg(
+ `Enable duplication checks before and during a crawl to avoid duplicate content in archived items.`,
+ ),
+ )}
+ ${when(
+ this.formState.dedupeType === "collection",
+ this.renderDedupeCollection,
+ )}`;
+ }
+
+ private readonly renderDedupeCollection = () => {
+ return html`
+ ${this.renderSectionHeading(msg("Collection to Use"))}
+ ${inputCol(html`
+ {
+ const { id, name } = e.detail.value;
+
+ if (id) {
+ this.updateFormState(
+ {
+ dedupeCollectionId: id,
+ dedupeCollectionName: name || "",
+ autoAddCollections: union(this.formState.autoAddCollections, [
+ id,
+ ]),
+ },
+ true,
+ );
+ } else if (name) {
+ this.updateFormState({
+ dedupeCollectionId: null,
+ dedupeCollectionName: name,
+ });
+ }
+ }}
+ @btrix-clear=${() => {
+ if (this.formState.dedupeCollectionId) {
+ this.updateFormState(
+ {
+ dedupeCollectionId: null,
+ dedupeCollectionName: null,
+ autoAddCollections: without(
+ [this.formState.dedupeCollectionId],
+ this.formState.autoAddCollections,
+ ),
+ },
+ true,
+ );
+ } else {
+ this.updateFormState({
+ dedupeCollectionName: null,
+ });
+ }
+ }}
+ >
+
+ ${when(
+ this.formState.dedupeCollectionName &&
+ !this.formState.dedupeCollectionId,
+ () => {
+ const workflow_name = html`${this.formState.dedupeCollectionName}`;
+ return html`
+
+ ${msg(
+ html`A new collection named “${workflow_name}” will be created
+ when this workflow is saved.`,
+ )}
+
+ `;
+ },
+ )}
+ `)}
+ ${this.renderHelpTextCol(
+ msg(
+ "This collection will be used as the deduplication source for all crawls of this workflow.",
+ ),
+ )}
+ `;
+ };
+
private renderCollections() {
+ const newDedupeCollectionName =
+ this.formState.dedupeType === "collection" &&
+ !this.formState.dedupeCollectionId &&
+ this.formState.dedupeCollectionName;
+ const showDedupeWarning =
+ !isEqual(
+ this.initialWorkflow?.autoAddCollections,
+ this.formState.autoAddCollections,
+ ) &&
+ (this.formState.dedupeCollectionId || newDedupeCollectionName);
+
return html`
${inputCol(html`
this.updateFormState(
{
@@ -2298,14 +2475,37 @@ https://archiveweb.page/images/${"logo.svg"}`}
},
true,
)}
- >
+ >
+ ${when(
+ showDedupeWarning,
+ () => html`
+
+ ${msg(
+ "Adding deduplicated crawls to a collection other than the deduplication source may result in incomplete replay of the non-deduplicated collection.",
+ )}
+
+ `,
+ )}
+
${when(
- !this.formState.autoAddCollections.length,
- () => html`
-
-
${msg("No collections selected.")}
-
+ newDedupeCollectionName,
+ (name) => html`
+
`,
+ () =>
+ when(
+ !this.formState.autoAddCollections.length,
+ () => html`
+
+
${msg("No collections selected.")}
+
+ `,
+ ),
)}
`)}
${this.renderHelpTextCol(
@@ -2470,6 +2670,11 @@ https://archiveweb.page/images/${"logo.svg"}`}
desc: msg("Schedule recurring crawls."),
render: this.renderJobScheduling,
},
+ {
+ name: "deduplication",
+ desc: msg("Prevent duplicate content from being crawled and stored."),
+ render: this.renderDeduplication,
+ },
{
name: "collections",
desc: msg("Add crawls from this workflow to one or more collections."),
@@ -2938,6 +3143,25 @@ https://archiveweb.page/images/${"logo.svg"}`}
this.isSubmitting = true;
+ // Create the deduplication collection first when a name was entered but no existing collection ID is set
+ if (
+ this.formState.dedupeType === "collection" &&
+ this.formState.dedupeCollectionName &&
+ !this.formState.dedupeCollectionId
+ ) {
+ const { id } = await this.createCollection({
+ name: this.formState.dedupeCollectionName,
+ });
+
+ this.updateFormState(
+ {
+ dedupeCollectionId: id,
+ autoAddCollections: union(this.formState.autoAddCollections, [id]),
+ },
+ true,
+ );
+ }
+
const uploadParams: Parameters[0] = {};
// Upload seed file first if it exists, since ID will be used to
@@ -3205,6 +3429,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
maxCrawlSize: this.formState.maxCrawlSizeGB * BYTES_PER_GB,
tags: this.formState.tags,
autoAddCollections: this.formState.autoAddCollections,
+ dedupCollId: this.formState.dedupeCollectionId || "",
config: {
...(isPageScopeType(this.formState.scopeType)
? this.parseUrlListConfig(uploadParams)
@@ -3386,4 +3611,18 @@ https://archiveweb.page/images/${"logo.svg"}`}
console.debug(e);
}
}
+
+ private async createCollection(
+ params: { name: string },
+ signal?: AbortSignal,
+ ) {
+ return this.api.fetch<{ added: boolean; id: string; name: string }>(
+ `/orgs/${this.orgId}/collections`,
+ {
+ method: "POST",
+ body: JSON.stringify(params),
+ signal,
+ },
+ );
+ }
}
diff --git a/frontend/src/pages/org/archived-items.ts b/frontend/src/pages/org/archived-items.ts
index 53b35e5635..06a5f5b0e2 100644
--- a/frontend/src/pages/org/archived-items.ts
+++ b/frontend/src/pages/org/archived-items.ts
@@ -16,6 +16,7 @@ import {
type FilterChip,
} from "@/components/ui/filter-chip";
import { parsePage, type PageChangeEvent } from "@/components/ui/pagination";
+import type { BtrixSearchComboboxSelectEvent } from "@/components/ui/search-combobox";
import { ClipboardController } from "@/controllers/clipboard";
import { SearchParamsValue } from "@/controllers/searchParamsValue";
import { type BtrixChangeArchivedItemStateFilterEvent } from "@/features/archived-items/archived-item-state-filter";
@@ -720,6 +721,7 @@ export class CrawlsList extends BtrixElement {
.searchKeys=${this.searchKeys}
.searchOptions=${this.searchOptions}
.keyLabels=${CrawlsList.FieldLabels}
+ size="small"
selectedKey=${ifDefined(this.selectedSearchFilterKey)}
searchByValue=${ifDefined(
this.selectedSearchFilterKey &&
@@ -730,8 +732,11 @@ export class CrawlsList extends BtrixElement {
: this.itemType === "crawl"
? msg("Search all crawls by name or crawl start URL")
: msg("Search all items by name or crawl start URL")}
- @btrix-select=${(e: CustomEvent) => {
- const { key, value } = e.detail;
+ @btrix-select=${(e: BtrixSearchComboboxSelectEvent) => {
+ const { key, value } = e.detail.item;
+
+ if (key == null) return;
+
this.filterBy.setValue({
...this.filterBy.value,
[key]: value,
diff --git a/frontend/src/pages/org/settings/components/crawling-defaults.ts b/frontend/src/pages/org/settings/components/crawling-defaults.ts
index a0ae5f4757..633b8b7bf3 100644
--- a/frontend/src/pages/org/settings/components/crawling-defaults.ts
+++ b/frontend/src/pages/org/settings/components/crawling-defaults.ts
@@ -146,7 +146,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
};
const behaviors = {
customBehavior: html`
-
+
) => {
- const { key, value } = e.detail;
+ @btrix-select=${(e: BtrixSearchComboboxSelectEvent) => {
+ const { key, value } = e.detail.item;
if (key == null) return;
this.filterBy = {
[key]: value,
diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts
index 6c6692fa22..46fbca3f75 100644
--- a/frontend/src/pages/org/workflows-new.ts
+++ b/frontend/src/pages/org/workflows-new.ts
@@ -72,6 +72,7 @@ export class WorkflowsNew extends BtrixElement {
jobType: "custom",
browserWindows: this.appState.settings?.numBrowsersPerInstance || 1,
autoAddCollections: [],
+ dedupCollId: null,
crawlerChannel: "default",
proxyId: null,
};
diff --git a/frontend/src/strings/crawl-workflows/labels.ts b/frontend/src/strings/crawl-workflows/labels.ts
index 38dfe8882d..50ca071464 100644
--- a/frontend/src/strings/crawl-workflows/labels.ts
+++ b/frontend/src/strings/crawl-workflows/labels.ts
@@ -1,14 +1,16 @@
import { msg } from "@lit/localize";
+import type { FormStateField } from "@/utils/workflow";
+
export const labelFor = {
- behaviors: msg("Behaviors"),
- customBehaviors: msg("Custom Behaviors"),
+ customBehavior: msg("Custom Behaviors"),
autoscrollBehavior: msg("Autoscroll"),
autoclickBehavior: msg("Autoclick"),
pageLoadTimeoutSeconds: msg("Page Load Limit"),
postLoadDelaySeconds: msg("Delay After Page Load"),
behaviorTimeoutSeconds: "Behavior Limit",
pageExtraDelaySeconds: msg("Delay Before Next Page"),
- selectLink: msg("Link Selectors"),
+ selectLinks: msg("Link Selectors"),
clickSelector: msg("Click Selector"),
-};
+ dedupeType: msg("Crawl Deduplication"),
+} as const satisfies Partial>;
diff --git a/frontend/src/strings/crawl-workflows/section.ts b/frontend/src/strings/crawl-workflows/section.ts
index 1b09c14064..2e30505bae 100644
--- a/frontend/src/strings/crawl-workflows/section.ts
+++ b/frontend/src/strings/crawl-workflows/section.ts
@@ -8,6 +8,7 @@ const section: Record = {
behaviors: msg("Page Behavior"),
browserSettings: msg("Browser Settings"),
scheduling: msg("Scheduling"),
+ deduplication: msg("Deduplication"),
collections: msg("Collections"),
metadata: msg("Metadata"),
};
diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts
index dd06b2c795..6e4d44708b 100644
--- a/frontend/src/types/collection.ts
+++ b/frontend/src/types/collection.ts
@@ -2,6 +2,9 @@ import { z } from "zod";
import { storageFileSchema } from "./storage";
+export const COLLECTION_NAME_MAX_LENGTH = 50;
+export const COLLECTION_CAPTION_MAX_LENGTH = 150;
+
export enum CollectionAccess {
Private = "private",
Public = "public",
@@ -50,6 +53,7 @@ export const publicCollectionSchema = z.object({
homeUrlPageId: z.string().nullable(),
homeUrlTs: z.string().datetime().nullable(),
access: z.nativeEnum(CollectionAccess),
+ hasDedupIndex: z.boolean(),
});
export type PublicCollection = z.infer;
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 3794355138..3b5f9e69b9 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -70,6 +70,7 @@ export type WorkflowParams = {
autoAddCollections: string[];
crawlerChannel: string;
proxyId: string | null;
+ dedupCollId: string | null;
};
export type CrawlConfig = WorkflowParams & {
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index d6930f2fae..0aac3fd67a 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -39,6 +39,7 @@ export const SECTIONS = [
"behaviors",
"browserSettings",
"scheduling",
+ "deduplication",
"collections",
"metadata",
] as const;
@@ -51,6 +52,7 @@ export enum GuideHash {
Behaviors = "page-behavior",
BrowserSettings = "browser-settings",
Scheduling = "scheduling",
+ Deduplication = "deduplication",
Collections = "collections",
Metadata = "metadata",
}
@@ -66,6 +68,7 @@ export const workflowTabToGuideHash: Record = {
behaviors: GuideHash.Behaviors,
browserSettings: GuideHash.BrowserSettings,
scheduling: GuideHash.Scheduling,
+ deduplication: GuideHash.Deduplication,
collections: GuideHash.Collections,
metadata: GuideHash.Metadata,
};
@@ -169,6 +172,9 @@ export type FormState = {
* Custom schedule in cron format.
*/
scheduleCustom?: string;
+ dedupeType: "none" | "collection";
+ dedupeCollectionId: string | null;
+ dedupeCollectionName: string | null;
jobName: WorkflowParams["name"];
browserProfile: Profile | null;
tags: Tags;
@@ -231,6 +237,9 @@ export const getDefaultFormState = (): FormState => ({
minute: 0,
period: "AM",
},
+ dedupeType: "none",
+ dedupeCollectionId: null,
+ dedupeCollectionName: null,
jobName: "",
browserProfile: null,
tags: [],
@@ -335,6 +344,11 @@ export function getInitialFormState(params: {
formState.autoAddCollections = params.initialWorkflow.autoAddCollections;
}
+ if (params.initialWorkflow.dedupCollId) {
+ formState.dedupeType = "collection";
+ formState.dedupeCollectionId = params.initialWorkflow.dedupCollId;
+ }
+
const secondsToMinutes = (value: unknown, fallback = 0) => {
if (typeof value === "number" && value > 0) return value / 60;
return fallback;