Skip to content

Commit

Permalink
Make it possible to use regex in crawlDelay.nodelay (#298)
Browse files Browse the repository at this point in the history
* Make it possible to use regex in crawlDelay.nodelay

E.g. by using this in TSconfig:

mod.brofix.crawlDelay.nodelay = regex:/(.*\.)?example.(org|com)/

Using nodelay, it is possible to exclude local domains from crawlDelay.
  • Loading branch information
sypets committed Feb 25, 2024
1 parent 0de0038 commit c22a519
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 12 deletions.
34 changes: 24 additions & 10 deletions Classes/CheckLinks/CrawlDelay.php
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@
*/
class CrawlDelay
{
/**
* @var int
*/
protected $delaySeconds;

/**
* @var array<string>
*/
Expand All @@ -41,10 +36,11 @@ class CrawlDelay
*/
protected $lastCheckedDomainTimestamps = [];

protected Configuration $configuration;

/**
 * Inject the configuration used to decide whether and how long to delay.
 *
 * Stores the Configuration object itself and additionally caches the
 * configured delay (in seconds) and the list of no-delay domains.
 *
 * NOTE(review): getCrawlDelayByDomain() reads its values from
 * $this->configuration on demand, and Configuration::getCrawlDelayNodelay()
 * is marked @deprecated - confirm whether the two cached copies below are
 * still needed.
 */
public function setConfiguration(Configuration $config): void
{
$this->delaySeconds = $config->getCrawlDelaySeconds();
$this->noCrawlDelayDomains = $config->getCrawlDelayNodelay();
$this->configuration = $config;
}

/**
Expand All @@ -56,8 +52,8 @@ public function setConfiguration(Configuration $config): void
*/
public function crawlDelay(string $domain): int
{
if ($domain === '' || in_array($domain, $this->noCrawlDelayDomains)) {
// skip delay
$delaySeconds = $this->getCrawlDelayByDomain($domain);
if ($delaySeconds === 0) {
return 0;
}
/**
Expand All @@ -67,7 +63,7 @@ public function crawlDelay(string $domain): int
$current = \time();

// check if delay necessary
$wait = $this->delaySeconds - ($current-$lastTimestamp);
$wait = $delaySeconds - ($current-$lastTimestamp);
if ($wait > 0) {
// wait now
sleep($wait);
Expand All @@ -93,4 +89,22 @@ public function setLastCheckedTime(string $domain): bool
$this->lastCheckedDomainTimestamps[$domain] = $current;
return true;
}

/**
 * Determine the crawl delay (in seconds) that applies to a domain.
 *
 * Returns 0 when the domain is empty or when it is excluded via
 * crawlDelay.nodelay (either by regular expression or by exact match
 * against the configured domain list). Otherwise the configured
 * crawl delay is returned.
 *
 * @param string $domain Domain of the URL about to be checked
 * @return int Delay in seconds, 0 meaning "do not delay"
 */
protected function getCrawlDelayByDomain(string $domain): int
{
    if ($domain === '') {
        return 0;
    }

    // check if domain should be skipped: do not use crawlDelay
    if ($this->configuration->isCrawlDelayNoDelayRegex()) {
        $regex = $this->configuration->getCrawlDelayNoDelayRegex();
        // An empty pattern ("regex:" with nothing after it) would only make
        // preg_match() emit a warning, so it is treated as "no match".
        // preg_match() returns 1 on a match, 0 on no match and false on error.
        if ($regex !== '' && preg_match($regex, $domain) === 1) {
            return 0;
        }
    } elseif (in_array($domain, $this->configuration->getCrawlDelayNodelayDomains(), true)) {
        // exact (strict) match against the configured domain list: skip delay
        return 0;
    }

    return $this->configuration->getCrawlDelaySeconds();
}
}
39 changes: 38 additions & 1 deletion Classes/Configuration/Configuration.php
Original file line number Diff line number Diff line change
Expand Up @@ -431,11 +431,48 @@ public function getCrawlDelaySeconds(): int
}

/**
 * Return crawlDelay.nodelay domains.
 *
 * Kept for backward compatibility only; it returns exactly what
 * getCrawlDelayNodelayDomains() returns (an empty list when
 * crawlDelay.nodelay is configured as a regular expression).
 *
 * @return array<string>
 * @deprecated Use getCrawlDelayNodelayDomains
 */
public function getCrawlDelayNodelay(): array
{
    return $this->getCrawlDelayNodelayDomains();
}

/**
 * Return the crawlDelay.nodelay domains.
 *
 * The setting is a comma separated list. Whitespace around each entry is
 * removed and empty entries are dropped, so "example.org, example.com"
 * yields ['example.org', 'example.com'] and an empty setting yields [].
 * When the setting is a regular expression (prefix "regex:"), there is
 * no domain list and an empty array is returned.
 *
 * @return array<string>
 */
public function getCrawlDelayNodelayDomains(): array
{
    $noDelayString = $this->getCrawlDelayNodelayString();
    if ($noDelayString === '' || str_starts_with($noDelayString, 'regex:')) {
        return [];
    }

    $domains = array_map('trim', explode(',', $noDelayString));

    // array_filter() preserves keys; reindex so callers always get a list
    return array_values(
        array_filter(
            $domains,
            static fn (string $domain): bool => $domain !== ''
        )
    );
}

/**
 * Tell whether crawlDelay.nodelay is configured as a regular expression,
 * i.e. whether the (trimmed) setting starts with the prefix "regex:".
 */
public function isCrawlDelayNoDelayRegex(): bool
{
    $noDelaySetting = $this->getCrawlDelayNodelayString();

    return str_starts_with($noDelaySetting, 'regex:');
}

/**
 * Return the regular expression configured in crawlDelay.nodelay.
 *
 * The "regex:" prefix is stripped and surrounding whitespace removed.
 * An empty string is returned when the setting is not a regular expression.
 */
public function getCrawlDelayNoDelayRegex(): string
{
    if (!$this->isCrawlDelayNoDelayRegex()) {
        return '';
    }

    $pattern = substr($this->getCrawlDelayNodelayString(), strlen('regex:'));

    return trim($pattern);
}

/**
 * Return the raw crawlDelay.nodelay setting with surrounding whitespace
 * removed, or an empty string when the setting is absent.
 */
public function getCrawlDelayNodelayString(): string
{
    $rawSetting = $this->tsConfig['crawlDelay.']['nodelay'] ?? '';

    return trim($rawSetting);
}

public function getDocsUrl(): string
Expand Down
4 changes: 3 additions & 1 deletion Configuration/TsConfig/Page/pagetsconfig.tsconfig
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ mod.brofix {
# minimum number of second delay between checking URLs of the same domain
seconds = 5

# comma separated list of domains to not use crawlDelay (can be used for internal sites)
# comma-separated list of domains, or a regex (prefixed with "regex:"), for which crawlDelay is not applied (should be used for internal sites only!)
# e.g. nodelay = example.org, example.com
# e.g. nodelay = regex:/^(.*\.)?example\.(org|com)$/
nodelay =
}

Expand Down
4 changes: 4 additions & 0 deletions Documentation/Setup/TsconfigReference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,10 @@ crawlDelay.nodelay
crawlDelay.nodelay = example.org,example.com
.. code-block:: typoscript
:caption: regex example
crawlDelay.nodelay = regex:/^(.*\.)?example\.(org|com)$/
Default
empty

Expand Down

0 comments on commit c22a519

Please sign in to comment.