diff --git a/Classes/CheckLinks/CrawlDelay.php b/Classes/CheckLinks/CrawlDelay.php
index 8db9d6e7..3ca0cd1f 100644
--- a/Classes/CheckLinks/CrawlDelay.php
+++ b/Classes/CheckLinks/CrawlDelay.php
@@ -24,11 +24,6 @@
  */
 class CrawlDelay
 {
-    /**
-     * @var int
-     */
-    protected $delaySeconds;
-
     /**
      * @var array
      */
@@ -41,10 +36,11 @@ class CrawlDelay
      */
     protected $lastCheckedDomainTimestamps = [];
 
+    protected Configuration $configuration;
+
     public function setConfiguration(Configuration $config): void
     {
-        $this->delaySeconds = $config->getCrawlDelaySeconds();
-        $this->noCrawlDelayDomains = $config->getCrawlDelayNodelay();
+        $this->configuration = $config;
     }
 
     /**
@@ -56,8 +52,8 @@ public function setConfiguration(Configuration $config): void
      */
     public function crawlDelay(string $domain): int
     {
-        if ($domain === '' || in_array($domain, $this->noCrawlDelayDomains)) {
-            // skip delay
+        $delaySeconds = $this->getCrawlDelayByDomain($domain);
+        if ($delaySeconds === 0) {
             return 0;
         }
         /**
@@ -67,7 +63,7 @@ public function crawlDelay(string $domain): int
         $current = \time();
 
         // check if delay necessary
-        $wait = $this->delaySeconds - ($current-$lastTimestamp);
+        $wait = $delaySeconds - ($current-$lastTimestamp);
         if ($wait > 0) {
             // wait now
             sleep($wait);
@@ -93,4 +89,28 @@ public function setLastCheckedTime(string $domain): bool
         $this->lastCheckedDomainTimestamps[$domain] = $current;
         return true;
     }
+
+    /**
+     * Determine the crawl delay (in seconds) that applies to a domain.
+     *
+     * Returns 0 for an empty domain and for domains excluded via crawlDelay.nodelay
+     * (either a regular expression or a list of domains), otherwise the configured delay.
+     */
+    protected function getCrawlDelayByDomain(string $domain): int
+    {
+        if ($domain === '') {
+            return 0;
+        }
+
+        // check if domain should be skipped: do not use crawlDelay
+        if ($this->configuration->isCrawlDelayNoDelayRegex()) {
+            if (preg_match($this->configuration->getCrawlDelayNoDelayRegex(), $domain) === 1) {
+                return 0;
+            }
+        } elseif (in_array($domain, $this->configuration->getCrawlDelayNodelayDomains(), true)) {
+            // skip delay
+            return 0;
+        }
+        return $this->configuration->getCrawlDelaySeconds();
+    }
 }
diff --git a/Classes/Configuration/Configuration.php b/Classes/Configuration/Configuration.php
index 01d70acc..1d740527 100644
--- a/Classes/Configuration/Configuration.php
+++ b/Classes/Configuration/Configuration.php
@@ -431,11 +431,58 @@ public function getCrawlDelaySeconds(): int
     }
 
     /**
+     * Return crawlDelay nodelay domains.
      * @return array
+     * @deprecated Use getCrawlDelayNodelayDomains
      */
     public function getCrawlDelayNodelay(): array
     {
-        return explode(',', $this->tsConfig['crawlDelay.']['nodelay'] ?? '');
+        return $this->getCrawlDelayNodelayDomains();
+    }
+
+    /**
+     * Return crawlDelay.nodelay domains.
+     *
+     * Entries are trimmed, so "example.org, example.com" yields both domains.
+     * Returns an empty array if crawlDelay.nodelay holds a regular expression.
+     *
+     * @return string[]
+     */
+    public function getCrawlDelayNodelayDomains(): array
+    {
+        $noDelayString = $this->getCrawlDelayNodelayString();
+        if (str_starts_with($noDelayString, 'regex:')) {
+            return [];
+        }
+        return array_map('trim', explode(',', $noDelayString));
+    }
+
+    /**
+     * Whether crawlDelay.nodelay is given as a regular expression (prefix "regex:").
+     */
+    public function isCrawlDelayNoDelayRegex(): bool
+    {
+        return str_starts_with($this->getCrawlDelayNodelayString(), 'regex:');
+    }
+
+    /**
+     * Return the regular expression from crawlDelay.nodelay (without the "regex:" prefix),
+     * or an empty string if crawlDelay.nodelay is not a regular expression.
+     */
+    public function getCrawlDelayNoDelayRegex(): string
+    {
+        if ($this->isCrawlDelayNoDelayRegex()) {
+            return trim(substr($this->getCrawlDelayNodelayString(), strlen('regex:')));
+        }
+        return '';
+    }
+
+    /**
+     * Return the trimmed raw crawlDelay.nodelay setting (empty string if not set).
+     */
+    public function getCrawlDelayNodelayString(): string
+    {
+        return trim($this->tsConfig['crawlDelay.']['nodelay'] ?? '');
     }
 
     public function getDocsUrl(): string
diff --git a/Configuration/TsConfig/Page/pagetsconfig.tsconfig b/Configuration/TsConfig/Page/pagetsconfig.tsconfig
index 7d780500..0e26c4fb 100644
--- a/Configuration/TsConfig/Page/pagetsconfig.tsconfig
+++ b/Configuration/TsConfig/Page/pagetsconfig.tsconfig
@@ -62,7 +62,9 @@ mod.brofix {
         # minimum number of second delay between checking URLs of the same domain
         seconds = 5
 
-        # comma separated list of domains to not use crawlDelay (can be used for internal sites)
+        # comma separated list of domains or regex to not use crawlDelay (should be used for internal sites only!)
+        # e.g. nodelay = example.org, example.com
+        # e.g. nodelay = regex:/^(.*\.)?example\.(org|com)$/
         nodelay =
     }
 
diff --git a/Documentation/Setup/TsconfigReference.rst b/Documentation/Setup/TsconfigReference.rst
index e593b379..9be7726b 100644
--- a/Documentation/Setup/TsconfigReference.rst
+++ b/Documentation/Setup/TsconfigReference.rst
@@ -616,6 +616,11 @@ crawlDelay.nodelay
 
          crawlDelay.nodelay = example.org,example.com
 
+      .. code-block:: typoscript
+         :caption: regex example
+
+         crawlDelay.nodelay = regex:/^(.*\.)?example\.(org|com)$/
+
    Default
       empty
 