diff --git a/pom.xml b/pom.xml index 2ca6ca8..2f88a1a 100644 --- a/pom.xml +++ b/pom.xml @@ -125,7 +125,7 @@ de.rub.nds scanner-core - 5.5.0 + 6.2.3 org.apache.commons diff --git a/src/main/java/de/rub/nds/crawler/CommonMain.java b/src/main/java/de/rub/nds/crawler/CommonMain.java index ce13f5f..78abeaa 100644 --- a/src/main/java/de/rub/nds/crawler/CommonMain.java +++ b/src/main/java/de/rub/nds/crawler/CommonMain.java @@ -18,9 +18,19 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +/** + * Common functionality for crawler main entry points. Provides functionality to parse command line arguments and start the worker/controller as specified by the user. + */ public class CommonMain { private static final Logger LOGGER = LogManager.getLogger(); + /** + * Main entry point for the application. Uses JCommander to parse the controller/worker command and parse the arguments into the respective configuration object. Then starts the controller/worker. + * + * @param args Command line arguments + * @param controllerCommandConfig Configuration for the controller. Will be filled by JCommander from the command line if first argument is "controller". + * @param workerCommandConfig Configuration for the worker. Will be filled by JCommander from the command line if first argument is "worker". + */ public static void main( String[] args, ControllerCommandConfig controllerCommandConfig, @@ -71,6 +81,13 @@ public static void main( } } + /** + * Convenience method to start the application with just a controller configuration. Creates a + * default worker configuration. See {@link #main(String[], ControllerCommandConfig, WorkerCommandConfig)} for details. + * + * @param args Command line arguments + * @param controllerConfig Configuration for the controller. Will be filled by JCommander from the command line if first argument is "controller". + */ public static void main(String[] args, ControllerCommandConfig controllerConfig) { main(args, controllerConfig, new WorkerCommandConfig()); } diff --git a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java index becc425..efb0d3b 100644 --- a/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/ControllerCommandConfig.java @@ -22,11 +22,19 @@ import org.apache.commons.validator.routines.UrlValidator; import org.quartz.CronScheduleBuilder; +/** + * Configuration class for controller instances used to parse command line parameters. Contains settings for the controller's behavior, + * including scan parameters, target selection, and notification settings. This abstract class + * provides the base configuration, while specific scanner implementations must extend it to provide + * scanner-specific configuration. + */ public abstract class ControllerCommandConfig { - @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; + @ParametersDelegate + private final RabbitMqDelegate rabbitMqDelegate; - @ParametersDelegate private final MongoDbDelegate mongoDbDelegate; + @ParametersDelegate + private final MongoDbDelegate mongoDbDelegate; @Parameter(names = "-portToBeScanned", description = "The port that should be scanned.") private int port = 443; @@ -112,7 +120,17 @@ public void validate() { } } + /** + * Validator that ensures parameter values are positive integers. + */ public static class PositiveInteger implements IParameterValidator { + /** + * Validates that the parameter value is a positive integer. + * + * @param name The parameter name + * @param value The parameter value + * @throws ParameterException If the value is not a positive integer + */ public void validate(String name, String value) throws ParameterException { int n = Integer.parseInt(value); if (n < 0) { @@ -122,7 +140,17 @@ public void validate(String name, String value) throws ParameterException { } } + /** + * Validator that ensures parameter values are valid cron expressions. + */ public static class CronSyntax implements IParameterValidator { + /** + * Validates that the parameter value is a valid cron expression. + * + * @param name The parameter name + * @param value The parameter value + * @throws ParameterException If the value is not a valid cron expression + */ public void validate(String name, String value) throws ParameterException { CronScheduleBuilder.cronSchedule(value); } diff --git a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java index 63dc681..ce1f37d 100644 --- a/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java +++ b/src/main/java/de/rub/nds/crawler/config/WorkerCommandConfig.java @@ -13,11 +13,17 @@ import de.rub.nds.crawler.config.delegate.MongoDbDelegate; import de.rub.nds.crawler.config.delegate.RabbitMqDelegate; +/** + * Configuration class for worker instances used to parse command line parameters. Contains settings for the worker's behavior, including + * thread counts and timeouts, as well as MongoDB and RabbitMQ connection settings. + */ public class WorkerCommandConfig { - @ParametersDelegate private final RabbitMqDelegate rabbitMqDelegate; + @ParametersDelegate + private final RabbitMqDelegate rabbitMqDelegate; - @ParametersDelegate private final MongoDbDelegate mongoDbDelegate; + @ParametersDelegate + private final MongoDbDelegate mongoDbDelegate; @Parameter( names = "-numberOfThreads", @@ -43,34 +49,74 @@ public WorkerCommandConfig() { mongoDbDelegate = new MongoDbDelegate(); } + /** + * Gets the configuration for RabbitMQ, which contains the connection settings for RabbitMQ. + * + * @return The RabbitMQ connection settings + */ public RabbitMqDelegate getRabbitMqDelegate() { return rabbitMqDelegate; } + /** + * Gets the configuration for MongoDB, which contains the connection settings for MongoDB. + * + * @return The MongoDB connection settings + */ public MongoDbDelegate getMongoDbDelegate() { return mongoDbDelegate; } + /** + * Gets the number of parallel scan threads to use. Each thread starts a scanner instance. + * + * @return The number of scan threads + */ public int getParallelScanThreads() { return parallelScanThreads; } + /** + * Gets the number of parallel connection threads to use per bulk scan (i.e., per scan ID). + * + * @return The number of connection threads + */ public int getParallelConnectionThreads() { return parallelConnectionThreads; } + /** + * Gets the timeout for scanning an individual target. + * + * @return The scan timeout in milliseconds + */ public int getScanTimeout() { return scanTimeout; } + /** + * Sets the number of parallel scan threads to use. Each thread starts a scanner instance. + * + * @param parallelScanThreads The number of scan threads + */ public void setParallelScanThreads(int parallelScanThreads) { this.parallelScanThreads = parallelScanThreads; } + /** + * Sets the number of parallel connection threads to use bulk scan (i.e., per scan ID). + * + * @param parallelConnectionThreads The number of connection threads + */ public void setParallelConnectionThreads(int parallelConnectionThreads) { this.parallelConnectionThreads = parallelConnectionThreads; } + /** + * Sets the timeout for scanning an individual target. + * + * @param scanTimeout The scan timeout in milliseconds + */ public void setScanTimeout(int scanTimeout) { this.scanTimeout = scanTimeout; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java index 3cfd571..58a8958 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/MongoDbDelegate.java @@ -10,16 +10,19 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate that holds MongoDB connection settings. + */ public class MongoDbDelegate { @Parameter( names = "-mongoDbHost", - description = "Host of the MongoDB instance this crawler saves to.") + description = "Host of the MongoDB instance this crawler saves results to.") private String mongoDbHost; @Parameter( names = "-mongoDbPort", - description = "Port of the MongoDB instance this crawler saves to.") + description = "Port of the MongoDB instance this crawler saves results to.") private int mongoDbPort; @Parameter( @@ -29,12 +32,12 @@ public class MongoDbDelegate { @Parameter( names = "-mongoDbPass", - description = "The passwort to be used to authenticate with MongoDB.") + description = "The password to be used to authenticate with MongoDB.") private String mongoDbPass; @Parameter( names = "-mongoDbPassFile", - description = "The passwort to be used to authenticate with MongoDB.") + description = "The password to be used to authenticate with MongoDB.") private String mongoDbPassFile; @Parameter( @@ -42,50 +45,110 @@ public class MongoDbDelegate { description = "The DB within the MongoDB instance, in which the user:pass is defined.") private String mongoDbAuthSource; + /** + * Gets the MongoDB host address. + * + * @return The MongoDB host address + */ public String getMongoDbHost() { return mongoDbHost; } + /** + * Gets the MongoDB port number. + * + * @return The MongoDB port number + */ public int getMongoDbPort() { return mongoDbPort; } + /** + * Gets the MongoDB username for authentication. + * + * @return The MongoDB username + */ public String getMongoDbUser() { return mongoDbUser; } + /** + * Gets the MongoDB password for authentication. + * + * @return The MongoDB password + */ public String getMongoDbPass() { return mongoDbPass; } + /** + * Gets the file path containing the MongoDB password. + * + * @return The MongoDB password file path + */ public String getMongoDbPassFile() { return mongoDbPassFile; } + /** + * Gets the MongoDB authentication source database name. + * + * @return The authentication source database name + */ public String getMongoDbAuthSource() { return mongoDbAuthSource; } + /** + * Sets the MongoDB host address. + * + * @param mongoDbHost The MongoDB host address + */ public void setMongoDbHost(String mongoDbHost) { this.mongoDbHost = mongoDbHost; } + /** + * Sets the MongoDB port number. + * + * @param mongoDbPort The MongoDB port number + */ public void setMongoDbPort(int mongoDbPort) { this.mongoDbPort = mongoDbPort; } + /** + * Sets the MongoDB username for authentication. + * + * @param mongoDbUser The MongoDB username + */ public void setMongoDbUser(String mongoDbUser) { this.mongoDbUser = mongoDbUser; } + /** + * Sets the MongoDB password for authentication. + * + * @param mongoDbPass The MongoDB password + */ public void setMongoDbPass(String mongoDbPass) { this.mongoDbPass = mongoDbPass; } + /** + * Sets the file path containing the MongoDB password. + * + * @param mongoDbPassFile The MongoDB password file path + */ public void setMongoDbPassFile(String mongoDbPassFile) { this.mongoDbPassFile = mongoDbPassFile; } + /** + * Sets the MongoDB authentication source database name. + * + * @param mongoDbAuthSource The authentication source database name + */ public void setMongoDbAuthSource(String mongoDbAuthSource) { this.mongoDbAuthSource = mongoDbAuthSource; } diff --git a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java index 9d89180..1959997 100644 --- a/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java +++ b/src/main/java/de/rub/nds/crawler/config/delegate/RabbitMqDelegate.java @@ -10,70 +10,137 @@ import com.beust.jcommander.Parameter; +/** + * Configuration delegate that holds RabbitMQ connection settings. + */ public class RabbitMqDelegate { - @Parameter(names = "-rabbitMqHost") + @Parameter(names = "-rabbitMqHost", description = "Host of the RabbitMQ instance") private String rabbitMqHost; - @Parameter(names = "-rabbitMqPort") + @Parameter(names = "-rabbitMqPort", description = "Port of the RabbitMQ instance") private int rabbitMqPort; - @Parameter(names = "-rabbitMqUser") + @Parameter(names = "-rabbitMqUser", description = "Username for RabbitMQ authentication") private String rabbitMqUser; - @Parameter(names = "-rabbitMqPass") + @Parameter(names = "-rabbitMqPass", description = "Password for RabbitMQ authentication. Alternatively use -rabbitMqPassFile") private String rabbitMqPass; - @Parameter(names = "-rabbitMqPassFile") + @Parameter( + names = "-rabbitMqPassFile", + description = "File containing the password for RabbitMQ authentication. Alternatively use -rabbitMqPass") private String rabbitMqPassFile; - @Parameter(names = "-rabbitMqTLS") + @Parameter( + names = "-rabbitMqTLS", + description = "Whether to use TLS for the RabbitMQ connection") private boolean rabbitMqTLS; + /** + * Gets the RabbitMQ host address. + * + * @return The RabbitMQ host address + */ public String getRabbitMqHost() { return rabbitMqHost; } + /** + * Gets the RabbitMQ port number. + * + * @return The RabbitMQ port number + */ public int getRabbitMqPort() { return rabbitMqPort; } + /** + * Gets the RabbitMQ username for authentication. + * + * @return The RabbitMQ username + */ public String getRabbitMqUser() { return rabbitMqUser; } + /** + * Gets the RabbitMQ password for authentication. + * + * @return The RabbitMQ password + */ public String getRabbitMqPass() { return rabbitMqPass; } + /** + * Gets the file path containing the RabbitMQ password. + * + * @return The RabbitMQ password file path + */ public String getRabbitMqPassFile() { return rabbitMqPassFile; } + /** + * Checks if TLS should be used for the RabbitMQ connection. + * + * @return True if TLS should be used, false otherwise + */ public boolean isRabbitMqTLS() { return rabbitMqTLS; } + /** + * Sets the RabbitMQ host address. + * + * @param rabbitMqHost The RabbitMQ host address + */ public void setRabbitMqHost(String rabbitMqHost) { this.rabbitMqHost = rabbitMqHost; } + /** + * Sets the RabbitMQ port number. + * + * @param rabbitMqPort The RabbitMQ port number + */ public void setRabbitMqPort(int rabbitMqPort) { this.rabbitMqPort = rabbitMqPort; } + /** + * Sets the RabbitMQ username for authentication. + * + * @param rabbitMqUser The RabbitMQ username + */ public void setRabbitMqUser(String rabbitMqUser) { this.rabbitMqUser = rabbitMqUser; } + /** + * Sets the RabbitMQ password for authentication. + * + * @param rabbitMqPass The RabbitMQ password + */ public void setRabbitMqPass(String rabbitMqPass) { this.rabbitMqPass = rabbitMqPass; } + /** + * Sets the file path containing the RabbitMQ password. + * + * @param rabbitMqPassFile The RabbitMQ password file path + */ public void setRabbitMqPassFile(String rabbitMqPassFile) { this.rabbitMqPassFile = rabbitMqPassFile; } + /** + * Sets whether TLS should be used for the RabbitMQ connection. + * + * @param rabbitMqTLS True if TLS should be used, false otherwise + */ public void setRabbitMqTLS(boolean rabbitMqTLS) { this.rabbitMqTLS = rabbitMqTLS; } diff --git a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java index 8eafb0e..a5e27f0 100644 --- a/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java +++ b/src/main/java/de/rub/nds/crawler/constant/CruxListNumber.java @@ -8,21 +8,42 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of different Crux list sizes available for scanning. Each enum constant represents a + * specific list of top websites, with the value indicating the number of entries in that list. + */ public enum CruxListNumber { + /** Top 1,000 websites */ TOP_1k(1000), + /** Top 5,000 websites */ TOP_5K(5000), + /** Top 10,000 websites */ TOP_10K(10000), + /** Top 50,000 websites */ TOP_50K(50000), + /** Top 100,000 websites */ TOP_100K(100000), + /** Top 500,000 websites */ TOP_500k(500000), + /** Top 1,000,000 websites */ TOP_1M(1000000); private final int number; + /** + * Constructor for the enum constants. + * + * @param number The number of entries in the list + */ CruxListNumber(int number) { this.number = number; } + /** + * Gets the number of entries in this list. + * + * @return The number of entries + */ public int getNumber() { return number; } diff --git a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java index fe6d26d..4297f0f 100644 --- a/src/main/java/de/rub/nds/crawler/constant/JobStatus.java +++ b/src/main/java/de/rub/nds/crawler/constant/JobStatus.java @@ -8,6 +8,10 @@ */ package de.rub.nds.crawler.constant; +/** + * Enumeration of possible job status values. Indicates the current state or final result of a scan + * job. + */ public enum JobStatus { /** Job is waiting to be executed. */ TO_BE_EXECUTED(false), @@ -38,10 +42,20 @@ public enum JobStatus { private final boolean isError; + /** + * Constructor for the enum constants. + * + * @param isError Whether this status represents an error condition + */ JobStatus(boolean isError) { this.isError = isError; } + /** + * Checks if this status represents an error condition. + * + * @return True if this status is an error, false otherwise + */ public boolean isError() { return isError; } diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java index d69a791..c959d5a 100644 --- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java +++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorker.java @@ -12,13 +12,23 @@ import de.rub.nds.crawler.data.ScanTarget; import de.rub.nds.crawler.util.CanceallableThreadPoolExecutor; import de.rub.nds.scanner.core.execution.NamedThreadFactory; -import java.util.concurrent.*; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.bson.Document; +import java.util.concurrent.Future; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A worker that scans all targets for a single scan ID. + * Instances are managed using the {@link BulkScanWorkerManager}. + * + * @param The specific ScanConfig type used by this worker + */ public abstract class BulkScanWorker { private static final Logger LOGGER = LogManager.getLogger(); private final AtomicInteger activeJobs = new AtomicInteger(0); @@ -26,6 +36,10 @@ public abstract class BulkScanWorker { private final AtomicBoolean shouldCleanupSelf = new AtomicBoolean(false); private final Object initializationLock = new Object(); protected final String bulkScanId; + + /** + * The scan configuration for this worker + */ protected final T scanConfig; /** @@ -34,6 +48,13 @@ public abstract class BulkScanWorker { */ private final ThreadPoolExecutor timeoutExecutor; + /** + * Creates a new bulk scan worker. This should only be called by the {@link BulkScanWorkerManager}. + * + * @param bulkScanId The ID of the bulk scan this worker is associated with + * @param scanConfig The scan configuration for this worker + * @param parallelScanThreads The number of parallel scan threads to use, i.e., how many {@link ScanTarget}s to handle in parallel. + */ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThreads) { this.bulkScanId = bulkScanId; this.scanConfig = scanConfig; @@ -48,6 +69,13 @@ protected BulkScanWorker(String bulkScanId, T scanConfig, int parallelScanThread new NamedThreadFactory("crawler-worker: scan executor")); } + /** + * Handles a scan target by submitting it to the executor. + * If init was not called, it will initialize itself. In this case it will also clean up itself if all jobs are done. + * + * @param scanTarget The target to scan. + * @return A future that resolves to the scan result once the scan is done. + */ public Future handle(ScanTarget scanTarget) { // if we initialized ourself, we also clean up ourself shouldCleanupSelf.weakCompareAndSetAcquire(false, init()); @@ -62,8 +90,21 @@ public Future handle(ScanTarget scanTarget) { }); } + /** + * Scans a target and returns the result as a Document. This is the core scanning functionality + * that must be implemented by subclasses. + * + * @param scanTarget The target to scan + * @return The scan result as a Document + */ public abstract Document scan(ScanTarget scanTarget); + /** + * Initializes this worker if it hasn't been initialized yet. This method is thread-safe and + * will only initialize once. + * + * @return True if this call performed the initialization, false if already initialized + */ public final boolean init() { // synchronize such that no thread runs before being initialized // but only synchronize if not already initialized @@ -78,6 +119,13 @@ public final boolean init() { return false; } + /** + * Cleans up this worker if it has been initialized and has no active jobs. This method is + * thread-safe and will only clean up once. + * If there are still active jobs, it will enqueue the cleanup for later. + * + * @return True if this call performed the cleanup, false otherwise + */ public final boolean cleanup() { // synchronize such that init and cleanup do not run simultaneously // but only synchronize if already initialized @@ -99,7 +147,17 @@ public final boolean cleanup() { return false; } + /** + * Performs the actual initialization of this worker. This method is called exactly once by + * {@link #init()} when initialization is needed. Subclasses must implement this method to + * initialize their specific resources. + */ protected abstract void initInternal(); + /** + * Performs the actual cleanup of this worker. This method is called exactly once by {@link + * #cleanup()} when cleanup is needed. Subclasses must implement this method to clean up their + * specific resources. + */ protected abstract void cleanupInternal(); } diff --git a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java index 57c37e4..285e8b9 100644 --- a/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java +++ b/src/main/java/de/rub/nds/crawler/core/BulkScanWorkerManager.java @@ -14,18 +14,37 @@ import de.rub.nds.crawler.data.BulkScanInfo; import de.rub.nds.crawler.data.ScanConfig; import de.rub.nds.crawler.data.ScanJobDescription; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.exception.UncheckedException; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.bson.Document; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +/** + * Each ScanJob has its own BulkScanWorker. This class manages the BulkScanWorkers and ensures that + * each BulkScanWorker is only created once and is cleaned up after it is not used anymore. + *

+ * More concretely: If a scan with ID 1 is started, a worker is created. This worker will exist as long as scan targets + * for scan ID 1 are processed. If the worker does not receive jobs for more than 30 Minutes, it is considered stale. + * Upon receiving a new job, stale workers are removed. + * I.e. if after 30 mins a new ID 1 job arrives, the worker persists. + * If a new ID 2 job arrives, a worker for that is created, and stale workers are removed. + *

+ * This class manages the mechanism above and acts as a singleton factory and manager for BulkScanWorker instances. + */ public class BulkScanWorkerManager { private static final Logger LOGGER = LogManager.getLogger(); private static volatile BulkScanWorkerManager instance; + /** + * Gets the singleton instance of the BulkScanWorkerManager. Creates the instance if it doesn't + * exist yet. + * + * @return The singleton instance + */ public static BulkScanWorkerManager getInstance() { if (instance == null) { synchronized (BulkScanWorkerManager.class) { @@ -37,6 +56,14 @@ public static BulkScanWorkerManager getInstance() { return instance; } + /** + * Static convenience method to handle a scan job. See also {@link #handle(ScanJobDescription, int, int)}. + * + * @param scanJobDescription The scan job to handle + * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist) + * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist) + * @return A future that returns the scan result when the target is scanned is done + */ public static Future handleStatic( ScanJobDescription scanJobDescription, int parallelConnectionThreads, @@ -62,6 +89,17 @@ private BulkScanWorkerManager() { .build(); } + /** + * Gets or creates a bulk scan worker for the specified bulk scan. Workers are cached and reused + * to ensure thread limits. + * + * @param bulkScanId The ID of the bulk scan to get the worker for (used as key in the cache) + * @param scanConfig The scan configuration (used to create worker if it does not exist) + * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist) + * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist) + * @return A bulk scan worker for the specified bulk scan + * @throws UncheckedException If a worker cannot be created + */ public BulkScanWorker getBulkScanWorker( String bulkScanId, ScanConfig scanConfig, @@ -83,6 +121,15 @@ public BulkScanWorker getBulkScanWorker( } } + /** + * Handles a scan job by creating or retrieving the appropriate worker and submitting the scan + * target for processing. + * + * @param scanJobDescription The scan job to handle + * @param parallelConnectionThreads The number of parallel connection threads to use (used to create worker if it does not exist) + * @param parallelScanThreads The number of parallel scan threads to use (used to create worker if it does not exist) + * @return A future that returns the scan result when the target is scanned is done + */ public Future handle( ScanJobDescription scanJobDescription, int parallelConnectionThreads, diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScan.java b/src/main/java/de/rub/nds/crawler/data/BulkScan.java index b70b0b2..90e548f 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScan.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScan.java @@ -9,17 +9,24 @@ package de.rub.nds.crawler.data; import de.rub.nds.crawler.constant.JobStatus; + +import javax.persistence.Id; import java.io.Serializable; import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; import java.util.EnumMap; import java.util.Map; -import javax.persistence.Id; +/** + * Represents a bulk scanning operation that manages multiple scanning jobs. This class tracks + * metadata about a scan batch including scan configuration, timing information, job statistics, and + * version information. + */ public class BulkScan implements Serializable { - @Id private String _id; + @Id + private String _id; private String name; @@ -55,8 +62,20 @@ public class BulkScan implements Serializable { DateTimeFormatter.ofPattern("yyyy-MM-dd_HH-mm").withZone(ZoneId.systemDefault()); @SuppressWarnings("unused") - private BulkScan() {} - + private BulkScan() { + } + + /** + * Creates a new bulk scan with the given parameters. + * + * @param scannerClass A scanner implementation class for retrieving version information + * @param crawlerClass A crawler implementation class for retrieving version information + * @param name The name of the bulk scan + * @param scanConfig The configuration to use for this scan + * @param startTime The start time as a timestamp in milliseconds + * @param monitored Whether this scan should be monitored for progress + * @param notifyUrl Optional URL to notify when the scan is complete + */ public BulkScan( Class scannerClass, Class crawlerClass, @@ -75,141 +94,312 @@ public BulkScan( this.collectionName = name + "_" + DATE_FORMATTER.format(Instant.ofEpochMilli(startTime)); this.notifyUrl = notifyUrl; } - - // Getter naming important for correct serialization, do not change! + + /** + * Gets the database ID for this bulk scan. + * + * @return The database ID + */ public String get_id() { + // Getter naming important for correct serialization, do not change! return _id; } + /** + * Gets the name of this bulk scan. + * + * @return The name + */ public String getName() { return this.name; } + /** + * Gets the collection name where scan results will be stored. + * + * @return The collection name + */ public String getCollectionName() { return this.collectionName; } + /** + * Gets the scan configuration for this bulk scan. + * + * @return The scan configuration + */ public ScanConfig getScanConfig() { return this.scanConfig; } + /** + * Checks if this bulk scan is monitored for progress. + * + * @return True if the scan is monitored, false otherwise + */ public boolean isMonitored() { return this.monitored; } + /** + * Checks if this bulk scan has finished. + * + * @return True if the scan is finished, false otherwise + */ public boolean isFinished() { return this.finished; } + /** + * Gets the start time of this bulk scan. + * + * @return The start time as a timestamp in milliseconds + */ public long getStartTime() { return this.startTime; } + /** + * Gets the end time of this bulk scan. + * + * @return The end time as a timestamp in milliseconds + */ public long getEndTime() { return this.endTime; } + /** + * Gets the total number of targets provided for this bulk scan. + * + * @return The number of targets + */ public int getTargetsGiven() { return this.targetsGiven; } + /** + * Gets the number of scan jobs published for this bulk scan. + * + * @return The number of scan jobs published + */ public long getScanJobsPublished() { return this.scanJobsPublished; } + /** + * Gets the number of successful scans completed for this bulk scan. + * + * @return The number of successful scans + */ public int getSuccessfulScans() { return this.successfulScans; } + /** + * Gets the URL to notify when this bulk scan is complete. + * + * @return The notification URL + */ public String getNotifyUrl() { return this.notifyUrl; } + /** + * Gets the version of the scanner used for this bulk scan. + * + * @return The scanner version + */ public String getScannerVersion() { return this.scannerVersion; } + /** + * Gets the version of the crawler used for this bulk scan. + * + * @return The crawler version + */ public String getCrawlerVersion() { return this.crawlerVersion; } // Setter naming important for correct serialization, do not change! + + /** + * Sets the database ID for this bulk scan. + * + * @param _id The database ID + */ public void set_id(String _id) { this._id = _id; } + /** + * Sets the name of this bulk scan. + * + * @param name The name + */ public void setName(String name) { this.name = name; } + /** + * Sets the collection name where scan results will be stored. + * + * @param collectionName The collection name + */ public void setCollectionName(String collectionName) { this.collectionName = collectionName; } + /** + * Sets the scan configuration for this bulk scan. + * + * @param scanConfig The scan configuration + */ public void setScanConfig(ScanConfig scanConfig) { this.scanConfig = scanConfig; } + /** + * Sets whether this bulk scan is monitored for progress. + * + * @param monitored True if the scan should be monitored, false otherwise + */ public void setMonitored(boolean monitored) { this.monitored = monitored; } + /** + * Sets whether this bulk scan is finished. + * + * @param finished True if the scan is finished, false otherwise + */ public void setFinished(boolean finished) { this.finished = finished; } + /** + * Sets the start time of this bulk scan. + * + * @param startTime The start time as a timestamp in milliseconds + */ public void setStartTime(long startTime) { this.startTime = startTime; } + /** + * Sets the end time of this bulk scan. + * + * @param endTime The end time as a timestamp in milliseconds + */ public void setEndTime(long endTime) { this.endTime = endTime; } + /** + * Sets the total number of targets for this bulk scan. + * + * @param targetsGiven The number of targets + */ public void setTargetsGiven(int targetsGiven) { this.targetsGiven = targetsGiven; } + /** + * Sets the number of scan jobs published for this bulk scan. + * + * @param scanJobsPublished The number of scan jobs published + */ public void setScanJobsPublished(long scanJobsPublished) { this.scanJobsPublished = scanJobsPublished; } + /** + * Sets the number of successful scans completed for this bulk scan. + * + * @param successfulScans The number of successful scans + */ public void setSuccessfulScans(int successfulScans) { this.successfulScans = successfulScans; } + /** + * Sets the URL to notify when this bulk scan is complete. + * + * @param notifyUrl The notification URL + */ public void setNotifyUrl(String notifyUrl) { this.notifyUrl = notifyUrl; } + /** + * Sets the version of the scanner used for this bulk scan. + * + * @param scannerVersion The scanner version + */ public void setScannerVersion(String scannerVersion) { this.scannerVersion = scannerVersion; } + /** + * Sets the version of the crawler used for this bulk scan. + * + * @param crawlerVersion The crawler version + */ public void setCrawlerVersion(String crawlerVersion) { this.crawlerVersion = crawlerVersion; } + /** + * Gets the job status counters for this bulk scan. + * + * @return A map of job status to count + */ public Map getJobStatusCounters() { return jobStatusCounters; } + /** + * Sets the job status counters for this bulk scan. + * + * @param jobStatusCounters A map of job status to count + */ public void setJobStatusCounters(Map jobStatusCounters) { this.jobStatusCounters = jobStatusCounters; } + /** + * Gets the number of scan jobs that failed due to domain resolution errors. + * + * @return The number of resolution errors + */ public long getScanJobsResolutionErrors() { return scanJobsResolutionErrors; } + /** + * Sets the number of scan jobs that failed due to domain resolution errors. + * + * @param scanJobsResolutionErrors The number of resolution errors + */ public void setScanJobsResolutionErrors(long scanJobsResolutionErrors) { this.scanJobsResolutionErrors = scanJobsResolutionErrors; } + /** + * Gets the number of scan jobs skipped due to denylisting. + * + * @return The number of denylisted scan jobs + */ public long getScanJobsDenylisted() { return scanJobsDenylisted; } + /** + * Sets the number of scan jobs skipped due to denylisting. + * + * @param scanJobsDenylisted The number of denylisted scan jobs + */ public void setScanJobsDenylisted(long scanJobsDenylisted) { this.scanJobsDenylisted = scanJobsDenylisted; } diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java index 1e40e41..4937ee4 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanInfo.java @@ -21,24 +21,51 @@ public class BulkScanInfo implements Serializable { private final boolean isMonitored; + /** + * Creates a new BulkScanInfo from a BulkScan. + * + * @param bulkScan The bulk scan to extract information from + */ public BulkScanInfo(BulkScan bulkScan) { this.bulkScanId = bulkScan.get_id(); this.scanConfig = bulkScan.getScanConfig(); this.isMonitored = bulkScan.isMonitored(); } + /** + * Gets the ID of the bulk scan. + * + * @return The bulk scan ID + */ public String getBulkScanId() { return bulkScanId; } + /** + * Gets the scan configuration for this bulk scan. + * + * @return The scan configuration + */ public ScanConfig getScanConfig() { return scanConfig; } + /** + * Gets the scan configuration cast to a specific type. + * + * @param The type to cast the scan configuration to + * @param clazz The class of the type to cast to + * @return The scan configuration cast to the specified type + */ public T getScanConfig(Class clazz) { return clazz.cast(scanConfig); } + /** + * Checks if this bulk scan is being monitored. + * + * @return True if the scan is monitored, false otherwise + */ public boolean isMonitored() { return isMonitored; } diff --git a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java index bfaac3a..d452392 100644 --- a/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java +++ b/src/main/java/de/rub/nds/crawler/data/BulkScanJobCounters.java @@ -9,10 +9,16 @@ package de.rub.nds.crawler.data; import de.rub.nds.crawler.constant.JobStatus; + import java.util.EnumMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; +/** + * Counter class for tracking job statistics during a bulk scan. This class maintains thread-safe + * counters for each job status type. + * Used to track statistics of finished jobs during a bulk scan. + */ public class BulkScanJobCounters { private final BulkScan bulkScan; @@ -20,6 +26,12 @@ public class BulkScanJobCounters { private final AtomicInteger totalJobDoneCount = new AtomicInteger(0); private final Map jobStatusCounters = new EnumMap<>(JobStatus.class); + /** + * Creates a new BulkScanJobCounters instance for the given bulk scan. Initializes counters for + * all job statuses except TO_BE_EXECUTED. + * + * @param bulkScan The bulk scan to track counters for + */ public BulkScanJobCounters(BulkScan bulkScan) { this.bulkScan = bulkScan; for (JobStatus jobStatus : JobStatus.values()) { @@ -30,10 +42,21 @@ public BulkScanJobCounters(BulkScan bulkScan) { } } + /** + * Gets the bulk scan associated with these counters. + * + * @return The bulk scan + */ public BulkScan getBulkScan() { return bulkScan; } + /** + * Gets a copy of the job status counters as a non-atomic map. This creates a snapshot of the + * current counter values. + * + * @return A map of job status to count + */ public Map getJobStatusCountersCopy() { EnumMap ret = new EnumMap<>(JobStatus.class); for (Map.Entry entry : jobStatusCounters.entrySet()) { @@ -42,10 +65,22 @@ public Map getJobStatusCountersCopy() { return ret; } + /** + * Gets the count for a specific job status. + * + * @param jobStatus The job status to get the count for + * @return The current count for the given status + */ public int getJobStatusCount(JobStatus jobStatus) { return jobStatusCounters.get(jobStatus).get(); } + /** + * Increments the count for a specific job status and the total job count. + * + * @param jobStatus The job status to increment the count for + * @return The new total job count after incrementing + */ public int increaseJobStatusCount(JobStatus jobStatus) { jobStatusCounters.get(jobStatus).incrementAndGet(); return totalJobDoneCount.incrementAndGet(); diff --git a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java index 8f91fc2..80ff97d 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanConfig.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanConfig.java @@ -12,6 +12,10 @@ import de.rub.nds.scanner.core.config.ScannerDetail; import java.io.Serializable; +/** + * Abstract base class for scan configurations. Contains common configuration options for all + * scanner types and defines required factory methods to create workers. + */ public abstract class ScanConfig implements Serializable { private ScannerDetail scannerDetail; @@ -23,36 +27,82 @@ public abstract class ScanConfig implements Serializable { @SuppressWarnings("unused") private ScanConfig() {} + /** + * Creates a new scan configuration with the specified parameters. + * + * @param scannerDetail The level of detail for the scan + * @param reexecutions The number of times to retry failed scans + * @param timeout The timeout for each scan in seconds + */ protected ScanConfig(ScannerDetail scannerDetail, int reexecutions, int timeout) { this.scannerDetail = scannerDetail; this.reexecutions = reexecutions; this.timeout = timeout; } + /** + * Gets the scanner detail level. + * + * @return The scanner detail level + */ public ScannerDetail getScannerDetail() { return this.scannerDetail; } + /** + * Gets the number of reexecutions for failed scans. + * + * @return The number of reexecutions + */ public int getReexecutions() { return this.reexecutions; } + /** + * Gets the timeout for each scan in seconds. + * + * @return The timeout in seconds + */ public int getTimeout() { return this.timeout; } + /** + * Sets the scanner detail level. + * + * @param scannerDetail The scanner detail level + */ public void setScannerDetail(ScannerDetail scannerDetail) { this.scannerDetail = scannerDetail; } + /** + * Sets the number of reexecutions for failed scans. + * + * @param reexecutions The number of reexecutions + */ public void setReexecutions(int reexecutions) { this.reexecutions = reexecutions; } + /** + * Sets the timeout for each scan in seconds. + * + * @param timeout The timeout in seconds + */ public void setTimeout(int timeout) { this.timeout = timeout; } + /** + * Creates a worker for this scan configuration. Each implementation must provide a factory + * method to create the appropriate worker type. + * + * @param bulkScanID The ID of the bulk scan this worker is for + * @param parallelConnectionThreads The number of parallel connection threads to use + * @param parallelScanThreads The number of parallel scan threads to use + * @return A worker for this scan configuration + */ public abstract BulkScanWorker createWorker( String bulkScanID, int parallelConnectionThreads, int parallelScanThreads); } diff --git a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java index 841b410..3bd92a7 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java @@ -12,9 +12,12 @@ import java.io.IOException; import java.io.Serializable; import java.util.Optional; +import java.util.UUID; public class ScanJobDescription implements Serializable { + private final UUID id = UUID.randomUUID(); + private final ScanTarget scanTarget; // Metadata @@ -52,6 +55,10 @@ public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus st status); } + public UUID getId() { + return id; + } + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { // handle deserialization, cf. https://stackoverflow.com/a/3960558 diff --git a/src/main/java/de/rub/nds/crawler/data/ScanResult.java b/src/main/java/de/rub/nds/crawler/data/ScanResult.java index ebd5de5..b2b1c2d 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanResult.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanResult.java @@ -8,6 +8,7 @@ */ package de.rub.nds.crawler.data; +import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import de.rub.nds.crawler.constant.JobStatus; import java.io.Serializable; @@ -26,8 +27,12 @@ public class ScanResult implements Serializable { private final Document result; + @JsonCreator private ScanResult( - String bulkScan, ScanTarget scanTarget, JobStatus jobStatus, Document result) { + @JsonProperty("bulkScan") String bulkScan, + @JsonProperty("scanTarget") ScanTarget scanTarget, + @JsonProperty("resultStatus") JobStatus jobStatus, + @JsonProperty("result") Document result) { this.id = UUID.randomUUID().toString(); this.bulkScan = bulkScan; this.scanTarget = scanTarget; @@ -35,6 +40,14 @@ private ScanResult( this.result = result; } + public ScanResult() { + // Default constructor for Jackson deserialization + this.bulkScan = ""; + this.scanTarget = null; + this.jobStatus = null; + this.result = null; + } + public ScanResult(ScanJobDescription scanJobDescription, Document result) { this( scanJobDescription.getBulkScanInfo().getBulkScanId(), diff --git a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java index 33fccd5..a50a5b4 100644 --- a/src/main/java/de/rub/nds/crawler/data/ScanTarget.java +++ b/src/main/java/de/rub/nds/crawler/data/ScanTarget.java @@ -10,14 +10,19 @@ import de.rub.nds.crawler.constant.JobStatus; import de.rub.nds.crawler.denylist.IDenylistProvider; -import java.io.Serializable; -import java.net.InetAddress; -import java.net.UnknownHostException; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.validator.routines.InetAddressValidator; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.io.Serializable; +import java.net.InetAddress; +import java.net.UnknownHostException; + +/** + * Represents a target to be scanned by the crawler. Contains information about the hostname, IP + * address, port, and ranking information. + */ public class ScanTarget implements Serializable { private static final Logger LOGGER = LogManager.getLogger(); @@ -25,8 +30,8 @@ public class ScanTarget implements Serializable { * Initializes a ScanTarget object from a string that potentially contains a hostname, an ip, a * port, the tranco rank. * - * @param targetString from which to create the ScanTarget object - * @param defaultPort that used if no port is present in targetString + * @param targetString from which to create the ScanTarget object + * @param defaultPort that used if no port is present in targetString * @param denylistProvider which provides info if a host is denylisted * @return ScanTarget object */ @@ -139,49 +144,111 @@ public static Pair fromTargetString( return Pair.of(target, JobStatus.TO_BE_EXECUTED); } + /** + * The IP address of the target. + */ private String ip; + /** + * The hostname of the target. + */ private String hostname; + /** + * The port number to connect to. + */ private int port; + /** + * The Tranco rank of the target (if applicable). + */ private int trancoRank; - public ScanTarget() {} + /** + * Creates a new empty scan target. Fields should be set using the setter methods. + */ + public ScanTarget() { + } + /** + * Returns a string representation of this scan target. Uses the hostname if available, + * otherwise uses the IP address. + * + * @return The string representation + */ @Override public String toString() { return hostname != null ? hostname : ip; } + /** + * Gets the IP address of this target. + * + * @return The IP address + */ public String getIp() { return this.ip; } + /** + * Gets the hostname of this target. + * + * @return The hostname + */ public String getHostname() { return this.hostname; } + /** + * Gets the port number to connect to. + * + * @return The port number + */ public int getPort() { return this.port; } + /** + * Gets the Tranco rank of this target (if applicable). + * + * @return The Tranco rank + */ public int getTrancoRank() { return this.trancoRank; } + /** + * Sets the IP address of this target. + * + * @param ip The IP address + */ public void setIp(String ip) { this.ip = ip; } + /** + * Sets the hostname of this target. + * + * @param hostname The hostname + */ public void setHostname(String hostname) { this.hostname = hostname; } + /** + * Sets the port number to connect to. + * + * @param port The port number + */ public void setPort(int port) { this.port = port; } + /** + * Sets the Tranco rank of this target. + * + * @param trancoRank The Tranco rank + */ public void setTrancoRank(int trancoRank) { this.trancoRank = trancoRank; } diff --git a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java index ed1e4c5..3dba32b 100644 --- a/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java +++ b/src/main/java/de/rub/nds/crawler/denylist/IDenylistProvider.java @@ -10,7 +10,17 @@ import de.rub.nds.crawler.data.ScanTarget; +/** + * Interface for providers that check if a scan target is on a denylist. This can be used to skip + * scanning of certain targets for various reasons (legal, ethical, or technical). + */ public interface IDenylistProvider { + /** + * Checks if a scan target is on the denylist. + * + * @param target The scan target to check + * @return True if the target is denylisted, false otherwise + */ boolean isDenylisted(ScanTarget target); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java index 9af1769..90ae8c0 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/DoneNotificationConsumer.java @@ -10,8 +10,18 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consumers that handle completion notifications of scan jobs. Used to + * notify controllers when workers have completed their assigned tasks. + */ @FunctionalInterface public interface DoneNotificationConsumer { + /** + * Consumes a notification that a scan job has completed. + * + * @param consumerTag A tag identifying the consumer + * @param scanJobDescription The description of the completed scan job + */ void consumeDoneNotification(String consumerTag, ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java index 628b0ee..f565eab 100644 --- a/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java +++ b/src/main/java/de/rub/nds/crawler/orchestration/ScanJobConsumer.java @@ -10,8 +10,17 @@ import de.rub.nds.crawler.data.ScanJobDescription; +/** + * Functional interface for consumers that process scan jobs. Used by workers to receive jobs from + * the orchestration system. + */ @FunctionalInterface public interface ScanJobConsumer { + /** + * Consumes and processes a scan job. + * + * @param scanJobDescription The description of the scan job to process + */ void consumeScanJob(ScanJobDescription scanJobDescription); } diff --git a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java index 50e3626..734876c 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/IPersistenceProvider.java @@ -11,6 +11,7 @@ import de.rub.nds.crawler.data.BulkScan; import de.rub.nds.crawler.data.ScanJobDescription; import de.rub.nds.crawler.data.ScanResult; +import java.util.List; /** * Persistence provider interface. Exposes methods to write out the different stages of a task to a @@ -40,4 +41,24 @@ public interface IPersistenceProvider { * @param bulkScan The bulk scan to update. */ void updateBulkScan(BulkScan bulkScan); + + /** + * Retrieve scan results for a specific target hostname or IP. + * + * @param dbName The database name where the scan results are stored. + * @param collectionName The collection name where the scan results are stored. + * @param target The hostname or IP address to search for. + * @return A list of scan results matching the target. + */ + List getScanResultsByTarget(String dbName, String collectionName, String target); + + /** + * Retrieve a specific scan result by its ID. + * + * @param dbName The database name where the scan result is stored. + * @param collectionName The collection name where the scan result is stored. + * @param id The ID of the scan result to retrieve. + * @return The scan result, or null if not found. + */ + ScanResult getScanResultById(String dbName, String collectionName, String id); } diff --git a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java index 22c0999..2f668b3 100644 --- a/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java +++ b/src/main/java/de/rub/nds/crawler/persistence/MongoPersistenceProvider.java @@ -9,6 +9,7 @@ package de.rub.nds.crawler.persistence; import com.fasterxml.jackson.annotation.JsonFormat; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.Module; import com.fasterxml.jackson.databind.ObjectMapper; @@ -35,7 +36,9 @@ import java.math.BigDecimal; import java.nio.file.Files; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.concurrent.TimeUnit; import org.apache.commons.lang3.tuple.Pair; @@ -128,6 +131,10 @@ private static MongoClient createMongoClient(MongoDbDelegate mongoDbDelegate) { private static ObjectMapper createMapper() { ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + mapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false); + mapper.configure(DeserializationFeature.FAIL_ON_NULL_CREATOR_PROPERTIES, false); + if (!serializers.isEmpty()) { SimpleModule serializerModule = new SimpleModule(); for (JsonSerializer serializer : serializers) { @@ -222,6 +229,7 @@ private JacksonMongoCollection getBulkScanCollection(String dbName) { @Override public void insertBulkScan(@NonNull BulkScan bulkScan) { + LOGGER.info("Inserting bulk scan with name: {}", bulkScan.getName()); this.getBulkScanCollection(bulkScan.getName()).insertOne(bulkScan); } @@ -243,6 +251,10 @@ private void writeResultToDatabase( @Override public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDescription) { + LOGGER.info( + "Inserting scan result for job ID: {} with status: {}", + scanJobDescription.getId(), + scanResult.getResultStatus()); if (scanResult.getResultStatus() != scanJobDescription.getStatus()) { LOGGER.error( "ScanResult status ({}) does not match ScanJobDescription status ({})", @@ -251,6 +263,7 @@ public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDe throw new IllegalArgumentException( "ScanResult status does not match ScanJobDescription status"); } + scanResult.setId(scanJobDescription.getId().toString()); try { writeResultToDatabase( scanJobDescription.getDbName(), @@ -270,4 +283,75 @@ public void insertScanResult(ScanResult scanResult, ScanJobDescription scanJobDe } } } + + @Override + public List getScanResultsByTarget( + String dbName, String collectionName, String target) { + LOGGER.info( + "Retrieving scan results for target {} from collection: {}.{}", + target, + dbName, + collectionName); + + try { + var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName)); + + // Create a query that matches either hostname or IP + var query = new org.bson.Document(); + var orQuery = new ArrayList(); + orQuery.add(new org.bson.Document("scanTarget.hostname", target)); + orQuery.add(new org.bson.Document("scanTarget.ip", target)); + query.append("$or", orQuery); + + var iterable = collection.find(query); + + List results = new ArrayList<>(); + iterable.forEach(results::add); + + LOGGER.info( + "Retrieved {} scan results for target {} from collection: {}.{}", + results.size(), + target, + dbName, + collectionName); + + return results; + } catch (Exception e) { + LOGGER.error("Exception while retrieving scan results from MongoDB: ", e); + throw new RuntimeException("Failed to retrieve scan results for target: " + target, e); + } + } + + @Override + public ScanResult getScanResultById(String dbName, String collectionName, String id) { + LOGGER.info( + "Retrieving scan result with ID {} from collection: {}.{}", + id, + dbName, + collectionName); + + try { + var collection = resultCollectionCache.getUnchecked(Pair.of(dbName, collectionName)); + var result = collection.findOneById(id); + + if (result == null) { + LOGGER.warn( + "No scan result found with ID: {} in collection: {}.{}", + id, + dbName, + collectionName); + } else { + LOGGER.info( + "Retrieved scan result with ID: {} from collection: {}.{}", + id, + dbName, + collectionName); + } + + return result; + } catch (Exception e) { + LOGGER.error("Exception while retrieving scan result from MongoDB: ", e); + throw new RuntimeException("Failed to retrieve scan result with ID: " + id, e); + } + } } diff --git a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java index 5e4662f..98bc542 100644 --- a/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java +++ b/src/main/java/de/rub/nds/crawler/targetlist/ITargetListProvider.java @@ -10,7 +10,16 @@ import java.util.List; +/** + * Interface for providers that supply lists of targets to scan. Implementations can retrieve + * targets from different sources such as files, databases, or web services. + */ public interface ITargetListProvider { + /** + * Gets the list of targets to scan. + * + * @return A list of target hostnames or IP addresses + */ List getTargetList(); } diff --git a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java index f4d14fd..ae0f457 100644 --- a/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java +++ b/src/main/java/de/rub/nds/crawler/util/CanceallableThreadPoolExecutor.java @@ -10,7 +10,20 @@ import java.util.concurrent.*; +/** + * A custom thread pool executor that creates cancellable futures. This executor allows tasks to + * return a partial result even when cancelled. + */ public class CanceallableThreadPoolExecutor extends ThreadPoolExecutor { + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -20,6 +33,16 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param threadFactory The factory to use when creating new threads + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -30,6 +53,16 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param handler The handler to use when execution is blocked + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -40,6 +73,17 @@ public CanceallableThreadPoolExecutor( super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler); } + /** + * Creates a new thread pool executor with the given parameters. + * + * @param corePoolSize The number of threads to keep in the pool, even if idle + * @param maximumPoolSize The maximum number of threads to allow in the pool + * @param keepAliveTime How long idle threads should be kept alive + * @param unit The time unit for the keepAliveTime + * @param workQueue The queue to use for holding tasks before they are executed + * @param threadFactory The factory to use when creating new threads + * @param handler The handler to use when execution is blocked + */ public CanceallableThreadPoolExecutor( int corePoolSize, int maximumPoolSize, @@ -58,11 +102,26 @@ public CanceallableThreadPoolExecutor( handler); } + /** + * Creates a new cancellable future for the given callable. + * + * @param The type of the result + * @param callable The callable to be executed + * @return A new cancellable future for the callable + */ @Override protected RunnableFuture newTaskFor(Callable callable) { return new CancellableFuture<>(callable); } + /** + * Creates a new cancellable future for the given runnable and result value. + * + * @param The type of the result + * @param runnable The runnable to be executed + * @param value The result value to return when the runnable completes + * @return A new cancellable future for the runnable + */ @Override protected RunnableFuture newTaskFor(Runnable runnable, T value) { return new CancellableFuture<>(runnable, value); diff --git a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java index d7706b1..b166c85 100644 --- a/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java +++ b/src/main/java/de/rub/nds/crawler/util/CancellableFuture.java @@ -12,12 +12,25 @@ import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicReference; +/** + * A cancellable future implementation that can return partial results even when cancelled. This + * class wraps a standard FutureTask but captures the result when available, allowing it to be + * retrieved even after cancellation. + * + * @param The result type returned by this future + */ public class CancellableFuture implements RunnableFuture { private final AtomicReference result = new AtomicReference<>(); private final RunnableFuture innerFuture; private final Semaphore resultWritten = new Semaphore(0); + /** + * Creates a new cancellable future for the given callable. When the callable completes, the + * result is stored for retrieval even after cancellation. + * + * @param callable The callable to be executed + */ public CancellableFuture(Callable callable) { innerFuture = new FutureTask<>( @@ -29,6 +42,13 @@ public CancellableFuture(Callable callable) { }); } + /** + * Creates a new cancellable future for the given runnable and result value. When the runnable + * completes, the result value is stored for retrieval even after cancellation. + * + * @param runnable The runnable to be executed + * @param res The result value to return when the runnable completes + */ public CancellableFuture(Runnable runnable, V res) { innerFuture = new FutureTask<>( @@ -40,21 +60,46 @@ public CancellableFuture(Runnable runnable, V res) { }); } + /** + * Attempts to cancel execution of this task. + * + * @param mayInterruptIfRunning True if the thread executing this task should be interrupted + * @return True if the task was cancelled, false otherwise + */ @Override - public boolean cancel(boolean b) { - return innerFuture.cancel(b); + public boolean cancel(boolean mayInterruptIfRunning) { + return innerFuture.cancel(mayInterruptIfRunning); } + /** + * Returns true if this task was cancelled before it completed normally. + * + * @return True if this task was cancelled before it completed + */ @Override public boolean isCancelled() { return innerFuture.isCancelled(); } + /** + * Returns true if this task completed. Completion may be due to normal termination, an + * exception, or cancellation. + * + * @return True if this task completed + */ @Override public boolean isDone() { return innerFuture.isDone(); } + /** + * Waits if necessary for the computation to complete, and then retrieves its result. If the + * task was cancelled but the result was captured, returns the captured result. + * + * @return The computed result + * @throws InterruptedException If the current thread was interrupted while waiting + * @throws ExecutionException If the computation threw an exception + */ @Override public V get() throws InterruptedException, ExecutionException { try { @@ -65,19 +110,32 @@ public V get() throws InterruptedException, ExecutionException { } } + /** + * Waits if necessary for at most the given time for the computation to complete, and then + * retrieves its result. If the task was cancelled but the result was captured, returns the + * captured result if available within the timeout. + * + * @param timeout The maximum time to wait + * @param timeUnit The time unit of the timeout argument + * @return The computed result + * @throws InterruptedException If the current thread was interrupted while waiting + * @throws ExecutionException If the computation threw an exception + * @throws TimeoutException If the wait timed out + */ @Override - public V get(long l, @NonNull TimeUnit timeUnit) + public V get(long timeout, @NonNull TimeUnit timeUnit) throws InterruptedException, ExecutionException, TimeoutException { try { - return innerFuture.get(l, timeUnit); + return innerFuture.get(timeout, timeUnit); } catch (CancellationException e) { - if (resultWritten.tryAcquire(l, timeUnit)) { + if (resultWritten.tryAcquire(timeout, timeUnit)) { return result.get(); } throw new TimeoutException("Timeout while waiting for cancelled result"); } } + /** Executes the underlying task. */ @Override public void run() { innerFuture.run(); diff --git a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java index 9c2bd00..dabd334 100644 --- a/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java +++ b/src/test/java/de/rub/nds/crawler/dummy/DummyPersistenceProvider.java @@ -13,6 +13,7 @@ import de.rub.nds.crawler.data.ScanResult; import de.rub.nds.crawler.persistence.IPersistenceProvider; import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; public class DummyPersistenceProvider implements IPersistenceProvider { @@ -31,4 +32,18 @@ public void insertBulkScan(BulkScan bulkScan) { @Override public void updateBulkScan(BulkScan bulkScan) {} + + @Override + public List getScanResultsByTarget( + String dbName, String collectionName, String target) { + return new LinkedList<>(); + } + + @Override + public ScanResult getScanResultById(String dbName, String collectionName, String id) { + return results.stream() + .filter(result -> result.getId().equals(id)) + .findFirst() + .orElse(null); + } }