handle(ScanTarget scanTarget) {
});
}
+ /**
+ * Scans a target and returns the result as a Document. This is the core scanning functionality
+ * that must be implemented by subclasses.
+ *
+ * @param scanTarget The target to scan
+ * @return The scan result as a Document
+ */
public abstract Document scan(ScanTarget scanTarget);
+ /**
+ * Initializes this worker if it hasn't been initialized yet. This method is thread-safe and
+ * will only initialize once.
+ *
+ * @return True if this call performed the initialization, false if already initialized
+ */
public final boolean init() {
// synchronize such that no thread runs before being initialized
// but only synchronize if not already initialized
@@ -78,6 +119,13 @@ public final boolean init() {
return false;
}
+ /**
+ * Cleans up this worker if it has been initialized and has no active jobs. This method is
+ * thread-safe and will only clean up once.
+ * If there are still active jobs, it will enqueue the cleanup for later.
+ *
+ * @return True if this call performed the cleanup, false otherwise
+ */
public final boolean cleanup() {
// synchronize such that init and cleanup do not run simultaneously
// but only synchronize if already initialized
@@ -99,7 +147,17 @@ public final boolean cleanup() {
return false;
}
+ /**
+ * Performs the actual initialization of this worker. This method is called exactly once by
+ * {@link #init()} when initialization is needed. Subclasses must implement this method to
+ * initialize their specific resources.
+ */
protected abstract void initInternal();
+ /**
+ * Performs the actual cleanup of this worker. This method is called exactly once by {@link
+ * #cleanup()} when cleanup is needed. Subclasses must implement this method to clean up their
+ * specific resources.
+ */
protected abstract void cleanupInternal();
}
diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java
index 57c37e4..285e8b9 100644
--- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java
+++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java
@@ -14,18 +14,37 @@
import de.rub.nds.crawler.data.BulkScanInfo;
import de.rub.nds.crawler.data.ScanConfig;
import de.rub.nds.crawler.data.ScanJobDescription;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.exception.UncheckedException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.bson.Document;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Each ScanJob has its own BulkScanWorker. This class manages the BulkScanWorkers and ensures that
+ * each BulkScanWorker is only created once and is cleaned up after it is not used anymore.
+ *
+ * More concretely: If a scan with ID 1 is started, a worker is created. This worker will exist as long as scan targets
+ * for scan ID 1 are processed. If the worker does not receive jobs for more than 30 Minutes, it is considered stale.
+ * Upon receiving a new job, stale workers are removed.
+ * I.e. if after 30 mins a new ID 1 job arrives, the worker persists.
+ * If a new ID 2 job arrives, a worker for that is created, and stale workers are removed.
+ *
+ * This class manages the mechanism above and acts as a singleton factory and manager for BulkScanWorker instances.
+ */
public class BulkScanWorkerManager {
private static final Logger LOGGER = LogManager.getLogger();
private static volatile BulkScanWorkerManager instance;
+ /**
+ * Gets the singleton instance of the BulkScanWorkerManager. Creates the instance if it doesn't
+ * exist yet.
+ *
+ * @return The singleton instance
+ */
public static BulkScanWorkerManager getInstance() {
if (instance == null) {
synchronized (BulkScanWorkerManager.class) {
@@ -37,6 +56,14 @@ public static BulkScanWorkerManager getInstance() {
return instance;
}
+ /**
+ * Static convenience method to handle a scan job. See also {@link #handle(ScanJobDescription, int, int)}.
+ *
+ * @param scanJobDescription The scan job to handle
+ * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist)
+ * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist)
+ * @return A future that returns the scan result when the target is scanned is done
+ */
public static Future handleStatic(
ScanJobDescription scanJobDescription,
int parallelConnectionThreads,
@@ -62,6 +89,17 @@ private BulkScanWorkerManager() {
.build();
}
+ /**
+ * Gets or creates a bulk scan worker for the specified bulk scan. Workers are cached and reused
+ * to ensure thread limits.
+ *
+ * @param bulkScanId The ID of the bulk scan to get the worker for (used as key in the cache)
+ * @param scanConfig The scan configuration (used to create worker if it does not exist)
+ * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist)
+ * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist)
+ * @return A bulk scan worker for the specified bulk scan
+ * @throws UncheckedException If a worker cannot be created
+ */
public BulkScanWorker> getBulkScanWorker(
String bulkScanId,
ScanConfig scanConfig,
@@ -83,6 +121,15 @@ public BulkScanWorker> getBulkScanWorker(
}
}
+ /**
+ * Handles a scan job by creating or retrieving the appropriate worker and submitting the scan
+ * target for processing.
+ *
+ * @param scanJobDescription The scan job to handle
+ * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist)
+ * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist)
+ * @return A future that returns the scan result when the target is scanned is done
+ */
public Future handle(
ScanJobDescription scanJobDescription,
int parallelConnectionThreads,
diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java
index b70b0b2..90e548f 100644
--- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java
+++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java
@@ -9,17 +9,24 @@
package de.rub.nds.crawler.data;
import de.rub.nds.crawler.constant.JobStatus;
+
+import javax.persistence.Id;
import java.io.Serializable;
import java.time.Instant;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.EnumMap;
import java.util.Map;
-import javax.persistence.Id;
+/**
+ * Represents a bulk scanning operation that manages multiple scanning jobs. This class tracks
+ * metadata about a scan batch including scan configuration, timing information, job statistics, and
+ * version information.
+ */
public class BulkScan implements Serializable {
- @Id private String _id;
+ @Id
+ private String _id;
private String name;
@@ -55,8 +62,20 @@ public class BulkScan implements Serializable {
DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm").withZone(ZoneId.systemDefault());
@SuppressWarnings("unused")
- private BulkScan() {}
-
+ private BulkScan() {
+ }
+
+ /**
+ * Creates a new bulk scan with the given parameters.
+ *
+ * @param scannerClass A scanner implementation class for retrieving version information
+ * @param crawlerClass A crawler implementation class for retrieving version information
+ * @param name The name of the bulk scan
+ * @param scanConfig The configuration to use for this scan
+ * @param startTime The start time as a timestamp in milliseconds
+ * @param monitored Whether this scan should be monitored for progress
+ * @param notifyUrl Optional URL to notify when the scan is complete
+ */
public BulkScan(
Class> scannerClass,
Class> crawlerClass,
@@ -75,141 +94,312 @@ public BulkScan(
this.collectionName = name + "_" + DATE_FORMATTER.format(Instant.ofEpochMilli(startTime));
this.notifyUrl = notifyUrl;
}
-
- // Getter naming important for correct serialization, do not change!
+
+ /**
+ * Gets the database ID for this bulk scan.
+ *
+ * @return The database ID
+ */
public String get_id() {
+ // Getter naming important for correct serialization, do not change!
return _id;
}
+ /**
+ * Gets the name of this bulk scan.
+ *
+ * @return The name
+ */
public String getName() {
return this.name;
}
+ /**
+ * Gets the collection name where scan results will be stored.
+ *
+ * @return The collection name
+ */
public String getCollectionName() {
return this.collectionName;
}
+ /**
+ * Gets the scan configuration for this bulk scan.
+ *
+ * @return The scan configuration
+ */
public ScanConfig getScanConfig() {
return this.scanConfig;
}
+ /**
+ * Checks if this bulk scan is monitored for progress.
+ *
+ * @return True if the scan is monitored, false otherwise
+ */
public boolean isMonitored() {
return this.monitored;
}
+ /**
+ * Checks if this bulk scan has finished.
+ *
+ * @return True if the scan is finished, false otherwise
+ */
public boolean isFinished() {
return this.finished;
}
+ /**
+ * Gets the start time of this bulk scan.
+ *
+ * @return The start time as a timestamp in milliseconds
+ */
public long getStartTime() {
return this.startTime;
}
+ /**
+ * Gets the end time of this bulk scan.
+ *
+ * @return The end time as a timestamp in milliseconds
+ */
public long getEndTime() {
return this.endTime;
}
+ /**
+ * Gets the total number of targets provided for this bulk scan.
+ *
+ * @return The number of targets
+ */
public int getTargetsGiven() {
return this.targetsGiven;
}
+ /**
+ * Gets the number of scan jobs published for this bulk scan.
+ *
+ * @return The number of scan jobs published
+ */
public long getScanJobsPublished() {
return this.scanJobsPublished;
}
+ /**
+ * Gets the number of successful scans completed for this bulk scan.
+ *
+ * @return The number of successful scans
+ */
public int getSuccessfulScans() {
return this.successfulScans;
}
+ /**
+ * Gets the URL to notify when this bulk scan is complete.
+ *
+ * @return The notification URL
+ */
public String getNotifyUrl() {
return this.notifyUrl;
}
+ /**
+ * Gets the version of the scanner used for this bulk scan.
+ *
+ * @return The scanner version
+ */
public String getScannerVersion() {
return this.scannerVersion;
}
+ /**
+ * Gets the version of the crawler used for this bulk scan.
+ *
+ * @return The crawler version
+ */
public String getCrawlerVersion() {
return this.crawlerVersion;
}
// Setter naming important for correct serialization, do not change!
+
+ /**
+ * Sets the database ID for this bulk scan.
+ *
+ * @param _id The database ID
+ */
public void set_id(String _id) {
this._id = _id;
}
+ /**
+ * Sets the name of this bulk scan.
+ *
+ * @param name The name
+ */
public void setName(String name) {
this.name = name;
}
+ /**
+ * Sets the collection name where scan results will be stored.
+ *
+ * @param collectionName The collection name
+ */
public void setCollectionName(String collectionName) {
this.collectionName = collectionName;
}
+ /**
+ * Sets the scan configuration for this bulk scan.
+ *
+ * @param scanConfig The scan configuration
+ */
public void setScanConfig(ScanConfig scanConfig) {
this.scanConfig = scanConfig;
}
+ /**
+ * Sets whether this bulk scan is monitored for progress.
+ *
+ * @param monitored True if the scan should be monitored, false otherwise
+ */
public void setMonitored(boolean monitored) {
this.monitored = monitored;
}
+ /**
+ * Sets whether this bulk scan is finished.
+ *
+ * @param finished True if the scan is finished, false otherwise
+ */
public void setFinished(boolean finished) {
this.finished = finished;
}
+ /**
+ * Sets the start time of this bulk scan.
+ *
+ * @param startTime The start time as a timestamp in milliseconds
+ */
public void setStartTime(long startTime) {
this.startTime = startTime;
}
+ /**
+ * Sets the end time of this bulk scan.
+ *
+ * @param endTime The end time as a timestamp in milliseconds
+ */
public void setEndTime(long endTime) {
this.endTime = endTime;
}
+ /**
+ * Sets the total number of targets for this bulk scan.
+ *
+ * @param targetsGiven The number of targets
+ */
public void setTargetsGiven(int targetsGiven) {
this.targetsGiven = targetsGiven;
}
+ /**
+ * Sets the number of scan jobs published for this bulk scan.
+ *
+ * @param scanJobsPublished The number of scan jobs published
+ */
public void setScanJobsPublished(long scanJobsPublished) {
this.scanJobsPublished = scanJobsPublished;
}
+ /**
+ * Sets the number of successful scans completed for this bulk scan.
+ *
+ * @param successfulScans The number of successful scans
+ */
public void setSuccessfulScans(int successfulScans) {
this.successfulScans = successfulScans;
}
+ /**
+ * Sets the URL to notify when this bulk scan is complete.
+ *
+ * @param notifyUrl The notification URL
+ */
public void setNotifyUrl(String notifyUrl) {
this.notifyUrl = notifyUrl;
}
+ /**
+ * Sets the version of the scanner used for this bulk scan.
+ *
+ * @param scannerVersion The scanner version
+ */
public void setScannerVersion(String scannerVersion) {
this.scannerVersion = scannerVersion;
}
+ /**
+ * Sets the version of the crawler used for this bulk scan.
+ *
+ * @param crawlerVersion The crawler version
+ */
public void setCrawlerVersion(String crawlerVersion) {
this.crawlerVersion = crawlerVersion;
}
+ /**
+ * Gets the job status counters for this bulk scan.
+ *
+ * @return A map of job status to count
+ */
public Map getJobStatusCounters() {
return jobStatusCounters;
}
+ /**
+ * Sets the job status counters for this bulk scan.
+ *
+ * @param jobStatusCounters A map of job status to count
+ */
public void setJobStatusCounters(Map jobStatusCounters) {
this.jobStatusCounters = jobStatusCounters;
}
+ /**
+ * Gets the number of scan jobs that failed due to domain resolution errors.
+ *
+ * @return The number of resolution errors
+ */
public long getScanJobsResolutionErrors() {
return scanJobsResolutionErrors;
}
+ /**
+ * Sets the number of scan jobs that failed due to domain resolution errors.
+ *
+ * @param scanJobsResolutionErrors The number of resolution errors
+ */
public void setScanJobsResolutionErrors(long scanJobsResolutionErrors) {
this.scanJobsResolutionErrors = scanJobsResolutionErrors;
}
+ /**
+ * Gets the number of scan jobs skipped due to denylisting.
+ *
+ * @return The number of denylisted scan jobs
+ */
public long getScanJobsDenylisted() {
return scanJobsDenylisted;
}
+ /**
+ * Sets the number of scan jobs skipped due to denylisting.
+ *
+ * @param scanJobsDenylisted The number of denylisted scan jobs
+ */
public void setScanJobsDenylisted(long scanJobsDenylisted) {
this.scanJobsDenylisted = scanJobsDenylisted;
}
diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java
index 1e40e41..4937ee4 100644
--- a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java
+++ b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java
@@ -21,24 +21,51 @@ public class BulkScanInfo implements Serializable {
private final boolean isMonitored;
+ /**
+ * Creates a new BulkScanInfo from a BulkScan.
+ *
+ * @param bulkScan The bulk scan to extract information from
+ */
public BulkScanInfo(BulkScan bulkScan) {
this.bulkScanId = bulkScan.get_id();
this.scanConfig = bulkScan.getScanConfig();
this.isMonitored = bulkScan.isMonitored();
}
+ /**
+ * Gets the ID of the bulk scan.
+ *
+ * @return The bulk scan ID
+ */
public String getBulkScanId() {
return bulkScanId;
}
+ /**
+ * Gets the scan configuration for this bulk scan.
+ *
+ * @return The scan configuration
+ */
public ScanConfig getScanConfig() {
return scanConfig;
}
+ /**
+ * Gets the scan configuration cast to a specific type.
+ *
+ * @param The type to cast the scan configuration to
+ * @param clazz The class of the type to cast to
+ * @return The scan configuration cast to the specified type
+ */
public T getScanConfig(Class clazz) {
return clazz.cast(scanConfig);
}
+ /**
+ * Checks if this bulk scan is being monitored.
+ *
+ * @return True if the scan is monitored, false otherwise
+ */
public boolean isMonitored() {
return isMonitored;
}
diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java
index bfaac3a..d452392 100644
--- a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java
+++ b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java
@@ -9,10 +9,16 @@
package de.rub.nds.crawler.data;
import de.rub.nds.crawler.constant.JobStatus;
+
import java.util.EnumMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
+/**
+ * Counter class for tracking job statistics during a bulk scan. This class maintains thread-safe
+ * counters for each job status type.
+ * Used to track statistics of finished jobs during a bulk scan.
+ */
public class BulkScanJobCounters {
private final BulkScan bulkScan;
@@ -20,6 +26,12 @@ public class BulkScanJobCounters {
private final AtomicInteger totalJobDoneCount = new AtomicInteger(0);
private final Map jobStatusCounters = new EnumMap<>(JobStatus.class);
+ /**
+ * Creates a new BulkScanJobCounters instance for the given bulk scan. Initializes counters for
+ * all job statuses except TO_BE_EXECUTED.
+ *
+ * @param bulkScan The bulk scan to track counters for
+ */
public BulkScanJobCounters(BulkScan bulkScan) {
this.bulkScan = bulkScan;
for (JobStatus jobStatus : JobStatus.values()) {
@@ -30,10 +42,21 @@ public BulkScanJobCounters(BulkScan bulkScan) {
}
}
+ /**
+ * Gets the bulk scan associated with these counters.
+ *
+ * @return The bulk scan
+ */
public BulkScan getBulkScan() {
return bulkScan;
}
+ /**
+ * Gets a copy of the job status counters as a non-atomic map. This creates a snapshot of the
+ * current counter values.
+ *
+ * @return A map of job status to count
+ */
public Map getJobStatusCountersCopy() {
EnumMap ret = new EnumMap<>(JobStatus.class);
for (Map.Entry entry : jobStatusCounters.entrySet()) {
@@ -42,10 +65,22 @@ public Map getJobStatusCountersCopy() {
return ret;
}
+ /**
+ * Gets the count for a specific job status.
+ *
+ * @param jobStatus The job status to get the count for
+ * @return The current count for the given status
+ */
public int getJobStatusCount(JobStatus jobStatus) {
return jobStatusCounters.get(jobStatus).get();
}
+ /**
+ * Increments the count for a specific job status and the total job count.
+ *
+ * @param jobStatus The job status to increment the count for
+ * @return The new total job count after incrementing
+ */
public int increaseJobStatusCount(JobStatus jobStatus) {
jobStatusCounters.get(jobStatus).incrementAndGet();
return totalJobDoneCount.incrementAndGet();
diff --git a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java
index 8f91fc2..80ff97d 100644
--- a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java
+++ b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java
@@ -12,6 +12,10 @@
import de.rub.nds.scanner.core.config.ScannerDetail;
import java.io.Serializable;
+/**
+ * Abstract base class for scan configurations. Contains common configuration options for all
+ * scanner types and defines required factory methods to create workers.
+ */
public abstract class ScanConfig implements Serializable {
private ScannerDetail scannerDetail;
@@ -23,36 +27,82 @@ public abstract class ScanConfig implements Serializable {
@SuppressWarnings("unused")
private ScanConfig() {}
+ /**
+ * Creates a new scan configuration with the specified parameters.
+ *
+ * @param scannerDetail The level of detail for the scan
+ * @param reexecutions The number of times to retry failed scans
+ * @param timeout The timeout for each scan in seconds
+ */
protected ScanConfig(ScannerDetail scannerDetail, int reexecutions, int timeout) {
this.scannerDetail = scannerDetail;
this.reexecutions = reexecutions;
this.timeout = timeout;
}
+ /**
+ * Gets the scanner detail level.
+ *
+ * @return The scanner detail level
+ */
public ScannerDetail getScannerDetail() {
return this.scannerDetail;
}
+ /**
+ * Gets the number of reexecutions for failed scans.
+ *
+ * @return The number of reexecutions
+ */
public int getReexecutions() {
return this.reexecutions;
}
+ /**
+ * Gets the timeout for each scan in seconds.
+ *
+ * @return The timeout in seconds
+ */
public int getTimeout() {
return this.timeout;
}
+ /**
+ * Sets the scanner detail level.
+ *
+ * @param scannerDetail The scanner detail level
+ */
public void setScannerDetail(ScannerDetail scannerDetail) {
this.scannerDetail = scannerDetail;
}
+ /**
+ * Sets the number of reexecutions for failed scans.
+ *
+ * @param reexecutions The number of reexecutions
+ */
public void setReexecutions(int reexecutions) {
this.reexecutions = reexecutions;
}
+ /**
+ * Sets the timeout for each scan in seconds.
+ *
+ * @param timeout The timeout in seconds
+ */
public void setTimeout(int timeout) {
this.timeout = timeout;
}
+ /**
+ * Creates a worker for this scan configuration. Each implementation must provide a factory
+ * method to create the appropriate worker type.
+ *
+ * @param bulkScanID The ID of the bulk scan this worker is for
+ * @param parallelConnectionThreads The number of parallel connection threads to use
+ * @param parallelScanThreads The number of parallel scan threads to use
+ * @return A worker for this scan configuration
+ */
public abstract BulkScanWorker extends ScanConfig> createWorker(
String bulkScanID, int parallelConnectionThreads, int parallelScanThreads);
}
diff --git a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java
index 841b410..3bd92a7 100644
--- a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java
+++ b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java
@@ -12,9 +12,12 @@
import java.io.IOException;
import java.io.Serializable;
import java.util.Optional;
+import java.util.UUID;
public class ScanJobDescription implements Serializable {
+ private final UUID id = UUID.randomUUID();
+
private final ScanTarget scanTarget;
// Metadata
@@ -52,6 +55,10 @@ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus st
status);
}
+ public UUID getId() {
+ return id;
+ }
+
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
// handle deserialization, cf. https://stackoverflow.com/a/3960558
diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java
index ebd5de5..b2b1c2d 100644
--- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java
+++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java
@@ -8,6 +8,7 @@
*/
package de.rub.nds.crawler.data;
+import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import de.rub.nds.crawler.constant.JobStatus;
import java.io.Serializable;
@@ -26,8 +27,12 @@ public class ScanResult implements Serializable {
private final Document result;
+ @JsonCreator
private ScanResult(
- String bulkScan, ScanTarget scanTarget, JobStatus jobStatus, Document result) {
+ @JsonProperty("bulkScan") String bulkScan,
+ @JsonProperty("scanTarget") ScanTarget scanTarget,
+ @JsonProperty("resultStatus") JobStatus jobStatus,
+ @JsonProperty("result") Document result) {
this.id = UUID.randomUUID().toString();
this.bulkScan = bulkScan;
this.scanTarget = scanTarget;
@@ -35,6 +40,14 @@ private ScanResult(
this.result = result;
}
+ public ScanResult() {
+ // Default constructor for Jackson deserialization
+ this.bulkScan = "";
+ this.scanTarget = null;
+ this.jobStatus = null;
+ this.result = null;
+ }
+
public ScanResult(ScanJobDescription scanJobDescription, Document result) {
this(
scanJobDescription.getBulkScanInfo().getBulkScanId(),
diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
index 33fccd5..a50a5b4 100644
--- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
+++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java
@@ -10,14 +10,19 @@
import de.rub.nds.crawler.constant.JobStatus;
import de.rub.nds.crawler.denylist.IDenylistProvider;
-import java.io.Serializable;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.validator.routines.InetAddressValidator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import java.io.Serializable;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+/**
+ * Represents a target to be scanned by the crawler. Contains information about the hostname, IP
+ * address, port, and ranking information.
+ */
public class ScanTarget implements Serializable {
private static final Logger LOGGER = LogManager.getLogger();
@@ -25,8 +30,8 @@ public class ScanTarget implements Serializable {
* Initializes a ScanTarget object from a string that potentially contains a hostname, an ip, a
* port, the tranco rank.
*
- * @param targetString from which to create the ScanTarget object
- * @param defaultPort that used if no port is present in targetString
+ * @param targetString from which to create the ScanTarget object
+ * @param defaultPort that used if no port is present in targetString
* @param denylistProvider which provides info if a host is denylisted
* @return ScanTarget object
*/
@@ -139,49 +144,111 @@ public static Pair fromTargetString(
return Pair.of(target, JobStatus.TO_BE_EXECUTED);
}
+ /**
+ * The IP address of the target.
+ */
private String ip;
+ /**
+ * The hostname of the target.
+ */
private String hostname;
+ /**
+ * The port number to connect to.
+ */
private int port;
+ /**
+ * The Tranco rank of the target (if applicable).
+ */
private int trancoRank;
- public ScanTarget() {}
+ /**
+ * Creates a new empty scan target. Fields should be set using the setter methods.
+ */
+ public ScanTarget() {
+ }
+ /**
+ * Returns a string representation of this scan target. Uses the hostname if available,
+ * otherwise uses the IP address.
+ *
+ * @return The string representation
+ */
@Override
public String toString() {
return hostname != null ? hostname : ip;
}
+ /**
+ * Gets the IP address of this target.
+ *
+ * @return The IP address
+ */
public String getIp() {
return this.ip;
}
+ /**
+ * Gets the hostname of this target.
+ *
+ * @return The hostname
+ */
public String getHostname() {
return this.hostname;
}
+ /**
+ * Gets the port number to connect to.
+ *
+ * @return The port number
+ */
public int getPort() {
return this.port;
}
+ /**
+ * Gets the Tranco rank of this target (if applicable).
+ *
+ * @return The Tranco rank
+ */
public int getTrancoRank() {
return this.trancoRank;
}
+ /**
+ * Sets the IP address of this target.
+ *
+ * @param ip The IP address
+ */
public void setIp(String ip) {
this.ip = ip;
}
+ /**
+ * Sets the hostname of this target.
+ *
+ * @param hostname The hostname
+ */
public void setHostname(String hostname) {
this.hostname = hostname;
}
+ /**
+ * Sets the port number to connect to.
+ *
+ * @param port The port number
+ */
public void setPort(int port) {
this.port = port;
}
+ /**
+ * Sets the Tranco rank of this target.
+ *
+ * @param trancoRank The Tranco rank
+ */
public void setTrancoRank(int trancoRank) {
this.trancoRank = trancoRank;
}
diff --git a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java
index ed1e4c5..3dba32b 100644
--- a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java
+++ b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java
@@ -10,7 +10,17 @@
import de.rub.nds.crawler.data.ScanTarget;
+/**
+ * Interface for providers that check if a scan target is on a denylist. This can be used to skip
+ * scanning of certain targets for various reasons (legal, ethical, or technical).
+ */
public interface IDenylistProvider {
+ /**
+ * Checks if a scan target is on the denylist.
+ *
+ * @param target The scan target to check
+ * @return True if the target is denylisted, false otherwise
+ */
boolean isDenylisted(ScanTarget target);
}
diff --git a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java
index 9af1769..90ae8c0 100644
--- a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java
+++ b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java
@@ -10,8 +10,18 @@
import de.rub.nds.crawler.data.ScanJobDescription;
+/**
+ * Functional interface for consumers that handle completion notifications of scan jobs. Used to
+ * notify controllers when workers have completed their assigned tasks.
+ */
@FunctionalInterface
public interface DoneNotificationConsumer {
+ /**
+ * Consumes a notification that a scan job has completed.
+ *
+ * @param consumerTag A tag identifying the consumer
+ * @param scanJobDescription The description of the completed scan job
+ */
void consumeDoneNotification(String consumerTag, ScanJobDescription scanJobDescription);
}
diff --git a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java
index 628b0ee..f565eab 100644
--- a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java
+++ b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java
@@ -10,8 +10,17 @@
import de.rub.nds.crawler.data.ScanJobDescription;
+/**
+ * Functional interface for consumers that process scan jobs. Used by workers to receive jobs from
+ * the orchestration system.
+ */
@FunctionalInterface
public interface ScanJobConsumer {
+ /**
+ * Consumes and processes a scan job.
+ *
+ * @param scanJobDescription The description of the scan job to process
+ */
void consumeScanJob(ScanJobDescription scanJobDescription);
}
diff --git a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java
index 50e3626..734876c 100644
--- a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java
+++ b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java
@@ -11,6 +11,7 @@
import de.rub.nds.crawler.data.BulkScan;
import de.rub.nds.crawler.data.ScanJobDescription;
import de.rub.nds.crawler.data.ScanResult;
+import java.util.List;
/**
* Persistence provider interface. Exposes methods to write out the different stages of a task to a
@@ -40,4 +41,24 @@ public interface IPersistenceProvider {
* @param bulkScan The bulk scan to update.
*/
void updateBulkScan(BulkScan bulkScan);
+
+ /**
+ * Retrieve scan results for a specific target hostname or IP.
+ *
+ * @param dbName The database name where the scan results are stored.
+ * @param collectionName The collection name where the scan results are stored.
+ * @param target The hostname or IP address to search for.
+ * @return A list of scan results matching the target.
+ */
+ List getScanResultsByTarget(String dbName, String collectionName, String target);
+
+ /**
+ * Retrieve a specific scan result by its ID.
+ *
+ * @param dbName The database name where the scan result is stored.
+ * @param collectionName The collection name where the scan result is stored.
+ * @param id The ID of the scan result to retrieve.
+ * @return The scan result, or null if not found.
+ */
+ ScanResult getScanResultById(String dbName, String collectionName, String id);
}
diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java
index 22c0999..2f668b3 100644
--- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java
+++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java
@@ -9,6 +9,7 @@
package de.rub.nds.crawler.persistence;
import com.fasterxml.jackson.annotation.JsonFormat;
+import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -35,7 +36,9 @@
import java.math.BigDecimal;
import java.nio.file.Files;
import java.nio.file.Paths;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.tuple.Pair;
@@ -128,6 +131,10 @@ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) {
private static ObjectMapper createMapper() {
ObjectMapper mapper = new ObjectMapper();
+ mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+ mapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
+ mapper.configure(DeserializationFeature.FAIL_ON_NULL_CREATOR_PROPERTIES, false);
+
if (!serializers.isEmpty()) {
SimpleModule serializerModule = new SimpleModule();
for (JsonSerializer> serializer : serializers) {
@@ -222,6 +229,7 @@ private JacksonMongoCollection getBulkScanCollection(String dbName) {
@Override
public void insertBulkScan(@NonNull BulkScan bulkScan) {
+ LOGGER.info("Inserting bulk scan with name: {}", bulkScan.getName());
this.getBulkScanCollection(bulkScan.getName()).insertOne(bulkScan);
}
@@ -243,6 +251,10 @@ private void writeResultToDatabase(
@Override
public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDescription) {
+ LOGGER.info(
+ "Inserting scan result for job ID: {} with status: {}",
+ scanJobDescription.getId(),
+ scanResult.getResultStatus());
if (scanResult.getResultStatus() != scanJobDescription.getStatus()) {
LOGGER.error(
"ScanResult status ({}) does not match ScanJobDescription status ({})",
@@ -251,6 +263,7 @@ public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDe
throw new IllegalArgumentException(
"ScanResult status does not match ScanJobDescription status");
}
+ scanResult.setId(scanJobDescription.getId().toString());
try {
writeResultToDatabase(
scanJobDescription.getDbName(),
@@ -270,4 +283,75 @@ public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDe
}
}
}
+
+ @Override
+ public List getScanResultsByTarget(
+ String dbName, String collectionName, String target) {
+ LOGGER.info(
+ "Retrieving scan results for target {} from collection: {}.{}",
+ target,
+ dbName,
+ collectionName);
+
+ try {
+ var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName));
+
+ // Create a query that matches either hostname or IP
+ var query = new org.bson.Document();
+ var orQuery = new ArrayList();
+ orQuery.add(new org.bson.Document("scanTarget.hostname", target));
+ orQuery.add(new org.bson.Document("scanTarget.ip", target));
+ query.append("$or", orQuery);
+
+ var iterable = collection.find(query);
+
+ List results = new ArrayList<>();
+ iterable.forEach(results::add);
+
+ LOGGER.info(
+ "Retrieved {} scan results for target {} from collection: {}.{}",
+ results.size(),
+ target,
+ dbName,
+ collectionName);
+
+ return results;
+ } catch (Exception e) {
+ LOGGER.error("Exception while retrieving scan results from MongoDB: ", e);
+ throw new RuntimeException("Failed to retrieve scan results for target: " + target, e);
+ }
+ }
+
+ @Override
+ public ScanResult getScanResultById(String dbName, String collectionName, String id) {
+ LOGGER.info(
+ "Retrieving scan result with ID {} from collection: {}.{}",
+ id,
+ dbName,
+ collectionName);
+
+ try {
+ var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName));
+ var result = collection.findOneById(id);
+
+ if (result == null) {
+ LOGGER.warn(
+ "No scan result found with ID: {} in collection: {}.{}",
+ id,
+ dbName,
+ collectionName);
+ } else {
+ LOGGER.info(
+ "Retrieved scan result with ID: {} from collection: {}.{}",
+ id,
+ dbName,
+ collectionName);
+ }
+
+ return result;
+ } catch (Exception e) {
+ LOGGER.error("Exception while retrieving scan result from MongoDB: ", e);
+ throw new RuntimeException("Failed to retrieve scan result with ID: " + id, e);
+ }
+ }
}
diff --git a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java
index 5e4662f..98bc542 100644
--- a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java
+++ b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java
@@ -10,7 +10,16 @@
import java.util.List;
+/**
+ * Interface for providers that supply lists of targets to scan. Implementations can retrieve
+ * targets from different sources such as files, databases, or web services.
+ */
public interface ITargetListProvider {
+ /**
+ * Gets the list of targets to scan.
+ *
+ * @return A list of target hostnames or IP addresses
+ */
List getTargetList();
}
diff --git a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java
index f4d14fd..ae0f457 100644
--- a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java
+++ b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java
@@ -10,7 +10,20 @@
import java.util.concurrent.*;
+/**
+ * A custom thread pool executor that creates cancellable futures. This executor allows tasks to
+ * return a partial result even when cancelled.
+ */
public class CanceallableThreadPoolExecutor extends ThreadPoolExecutor {
+ /**
+ * Creates a new thread pool executor with the given parameters.
+ *
+ * @param corePoolSize The number of threads to keep in the pool, even if idle
+ * @param maximumPoolSize The maximum number of threads to allow in the pool
+ * @param keepAliveTime How long idle threads should be kept alive
+ * @param unit The time unit for the keepAliveTime
+ * @param workQueue The queue to use for holding tasks before they are executed
+ */
public CanceallableThreadPoolExecutor(
int corePoolSize,
int maximumPoolSize,
@@ -20,6 +33,16 @@ public CanceallableThreadPoolExecutor(
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
}
+ /**
+ * Creates a new thread pool executor with the given parameters.
+ *
+ * @param corePoolSize The number of threads to keep in the pool, even if idle
+ * @param maximumPoolSize The maximum number of threads to allow in the pool
+ * @param keepAliveTime How long idle threads should be kept alive
+ * @param unit The time unit for the keepAliveTime
+ * @param workQueue The queue to use for holding tasks before they are executed
+ * @param threadFactory The factory to use when creating new threads
+ */
public CanceallableThreadPoolExecutor(
int corePoolSize,
int maximumPoolSize,
@@ -30,6 +53,16 @@ public CanceallableThreadPoolExecutor(
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
}
+ /**
+ * Creates a new thread pool executor with the given parameters.
+ *
+ * @param corePoolSize The number of threads to keep in the pool, even if idle
+ * @param maximumPoolSize The maximum number of threads to allow in the pool
+ * @param keepAliveTime How long idle threads should be kept alive
+ * @param unit The time unit for the keepAliveTime
+ * @param workQueue The queue to use for holding tasks before they are executed
+ * @param handler The handler to use when execution is blocked
+ */
public CanceallableThreadPoolExecutor(
int corePoolSize,
int maximumPoolSize,
@@ -40,6 +73,17 @@ public CanceallableThreadPoolExecutor(
super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
}
+ /**
+ * Creates a new thread pool executor with the given parameters.
+ *
+ * @param corePoolSize The number of threads to keep in the pool, even if idle
+ * @param maximumPoolSize The maximum number of threads to allow in the pool
+ * @param keepAliveTime How long idle threads should be kept alive
+ * @param unit The time unit for the keepAliveTime
+ * @param workQueue The queue to use for holding tasks before they are executed
+ * @param threadFactory The factory to use when creating new threads
+ * @param handler The handler to use when execution is blocked
+ */
public CanceallableThreadPoolExecutor(
int corePoolSize,
int maximumPoolSize,
@@ -58,11 +102,26 @@ public CanceallableThreadPoolExecutor(
handler);
}
+ /**
+ * Creates a new cancellable future for the given callable.
+ *
+ * @param The type of the result
+ * @param callable The callable to be executed
+ * @return A new cancellable future for the callable
+ */
@Override
protected RunnableFuture newTaskFor(Callable callable) {
return new CancellableFuture<>(callable);
}
+ /**
+ * Creates a new cancellable future for the given runnable and result value.
+ *
+ * @param The type of the result
+ * @param runnable The runnable to be executed
+ * @param value The result value to return when the runnable completes
+ * @return A new cancellable future for the runnable
+ */
@Override
protected RunnableFuture newTaskFor(Runnable runnable, T value) {
return new CancellableFuture<>(runnable, value);
diff --git a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java
index d7706b1..b166c85 100644
--- a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java
+++ b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java
@@ -12,12 +12,25 @@
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicReference;
+/**
+ * A cancellable future implementation that can return partial results even when cancelled. This
+ * class wraps a standard FutureTask but captures the result when available, allowing it to be
+ * retrieved even after cancellation.
+ *
+ * @param The result type returned by this future
+ */
public class CancellableFuture implements RunnableFuture {
private final AtomicReference result = new AtomicReference<>();
private final RunnableFuture innerFuture;
private final Semaphore resultWritten = new Semaphore(0);
+ /**
+ * Creates a new cancellable future for the given callable. When the callable completes, the
+ * result is stored for retrieval even after cancellation.
+ *
+ * @param callable The callable to be executed
+ */
public CancellableFuture(Callable callable) {
innerFuture =
new FutureTask<>(
@@ -29,6 +42,13 @@ public CancellableFuture(Callable callable) {
});
}
+ /**
+ * Creates a new cancellable future for the given runnable and result value. When the runnable
+ * completes, the result value is stored for retrieval even after cancellation.
+ *
+ * @param runnable The runnable to be executed
+ * @param res The result value to return when the runnable completes
+ */
public CancellableFuture(Runnable runnable, V res) {
innerFuture =
new FutureTask<>(
@@ -40,21 +60,46 @@ public CancellableFuture(Runnable runnable, V res) {
});
}
+ /**
+ * Attempts to cancel execution of this task.
+ *
+ * @param mayInterruptIfRunning True if the thread executing this task should be interrupted
+ * @return True if the task was cancelled, false otherwise
+ */
@Override
- public boolean cancel(boolean b) {
- return innerFuture.cancel(b);
+ public boolean cancel(boolean mayInterruptIfRunning) {
+ return innerFuture.cancel(mayInterruptIfRunning);
}
+ /**
+ * Returns true if this task was cancelled before it completed normally.
+ *
+ * @return True if this task was cancelled before it completed
+ */
@Override
public boolean isCancelled() {
return innerFuture.isCancelled();
}
+ /**
+ * Returns true if this task completed. Completion may be due to normal termination, an
+ * exception, or cancellation.
+ *
+ * @return True if this task completed
+ */
@Override
public boolean isDone() {
return innerFuture.isDone();
}
+ /**
+ * Waits if necessary for the computation to complete, and then retrieves its result. If the
+ * task was cancelled but the result was captured, returns the captured result.
+ *
+ * @return The computed result
+ * @throws InterruptedException If the current thread was interrupted while waiting
+ * @throws ExecutionException If the computation threw an exception
+ */
@Override
public V get() throws InterruptedException, ExecutionException {
try {
@@ -65,19 +110,32 @@ public V get() throws InterruptedException, ExecutionException {
}
}
+ /**
+ * Waits if necessary for at most the given time for the computation to complete, and then
+ * retrieves its result. If the task was cancelled but the result was captured, returns the
+ * captured result if available within the timeout.
+ *
+ * @param timeout The maximum time to wait
+ * @param timeUnit The time unit of the timeout argument
+ * @return The computed result
+ * @throws InterruptedException If the current thread was interrupted while waiting
+ * @throws ExecutionException If the computation threw an exception
+ * @throws TimeoutException If the wait timed out
+ */
@Override
- public V get(long l, @NonNull TimeUnit timeUnit)
+ public V get(long timeout, @NonNull TimeUnit timeUnit)
throws InterruptedException, ExecutionException, TimeoutException {
try {
- return innerFuture.get(l, timeUnit);
+ return innerFuture.get(timeout, timeUnit);
} catch (CancellationException e) {
- if (resultWritten.tryAcquire(l, timeUnit)) {
+ if (resultWritten.tryAcquire(timeout, timeUnit)) {
return result.get();
}
throw new TimeoutException("Timeout while waiting for cancelled result");
}
}
+ /** Executes the underlying task. */
@Override
public void run() {
innerFuture.run();
diff --git a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java
index 9c2bd00..dabd334 100644
--- a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java
+++ b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java
@@ -13,6 +13,7 @@
import de.rub.nds.crawler.data.ScanResult;
import de.rub.nds.crawler.persistence.IPersistenceProvider;
import java.util.ArrayList;
+import java.util.LinkedList;
import java.util.List;
public class DummyPersistenceProvider implements IPersistenceProvider {
@@ -31,4 +32,18 @@ public void insertBulkScan(BulkScan bulkScan) {
@Override
public void updateBulkScan(BulkScan bulkScan) {}
+
+ @Override
+ public List getScanResultsByTarget(
+ String dbName, String collectionName, String target) {
+ return new LinkedList<>();
+ }
+
+ @Override
+ public ScanResult getScanResultById(String dbName, String collectionName, String id) {
+ return results.stream()
+ .filter(result -> result.getId().equals(id))
+ .findFirst()
+ .orElse(null);
+ }
}