Skip to content

Commit

Permalink
add CrawlerHasBeenDetectedException
Browse files Browse the repository at this point in the history
  • Loading branch information
win7user10 committed Sep 22, 2023
1 parent 1378209 commit c5e01f3
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 8 deletions.
16 changes: 11 additions & 5 deletions src/Laraue.Crawling.Crawler/BaseCrawlerJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,28 @@ public override async Task<TimeSpan> ExecuteAsync(JobState<TState> jobState, Can
{
var link = await GetNextLinkAsync(jobState, stoppingToken).ConfigureAwait(false);
_logger.LogInformation("Page {Page} processing started", link);

var result = await ParseLinkAsync(link, jobState, stoppingToken).ConfigureAwait(false);
await AfterLinkParsedAsync(link, result, jobState, stoppingToken).ConfigureAwait(false);

_logger.LogInformation(
"Page {Page} processing finished for {Time}",
link,
pageStopwatch.Elapsed);

stoppingToken.ThrowIfCancellationRequested();
}
catch (SessionInterruptedException e)
{
_logger.LogInformation("Session should be finished. Reason: {Message}", e.Message);

return await RunSessionFinishAsync(jobState, stoppingToken);

return await RunSessionFinishAsync(jobState, stoppingToken).ConfigureAwait(false);
}
catch (CrawlerHasBeenDetectedException e)
{
_logger.LogInformation("Crawler has been detected. {Message}", e.Message);

await e.SwitchToCorrectStateAsync().ConfigureAwait(false);
}
}
}
Expand Down
23 changes: 23 additions & 0 deletions src/Laraue.Crawling.Crawler/CrawlerHasBeenDetectedException.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
namespace Laraue.Crawling.Crawler;

/// <summary>
/// Exception that tells the system the crawler has been detected and that the supplied
/// delegate should be executed to continue the crawling process.
/// </summary>
public sealed class CrawlerHasBeenDetectedException : Exception
{
    /// <summary>
    /// Delegate that returns the crawler to the normal state.
    /// </summary>
    public Func<Task> SwitchToCorrectStateAsync { get; }

    /// <summary>
    /// Initializes a new instance of <see cref="CrawlerHasBeenDetectedException"/>.
    /// </summary>
    /// <param name="message">Text forwarded to the base <see cref="Exception"/> as its message.</param>
    /// <param name="switchToCorrectStateAsync">
    /// Delegate exposed through <see cref="SwitchToCorrectStateAsync"/>; must not be <c>null</c>.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="switchToCorrectStateAsync"/> is <c>null</c>.
    /// </exception>
    public CrawlerHasBeenDetectedException(string message, Func<Task> switchToCorrectStateAsync) : base(message)
    {
        // Fail at construction time: a null delegate would otherwise only surface later,
        // as a NullReferenceException in whichever handler tries to invoke it.
        SwitchToCorrectStateAsync = switchToCorrectStateAsync
            ?? throw new ArgumentNullException(nameof(switchToCorrectStateAsync));
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ public interface IPageParser
/// <param name="page"></param>
/// <param name="schema"></param>
/// <returns></returns>
Task<TResult?> ParseAsync<TResult>(IPage page, ICompiledHtmlSchema<IElementHandle, TResult> schema);
Task<TResult> ParseAsync<TResult>(IPage page, ICompiledHtmlSchema<IElementHandle, TResult> schema);
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ public static class ServiceCollectionExtensions
/// <param name="serviceCollection"></param>
/// <param name="launchOptions"></param>
/// <returns></returns>
public static IServiceCollection AddPuppeterFactory(
public static IServiceCollection AddCrawlingServices(
this IServiceCollection serviceCollection,
LaunchOptions launchOptions)
{
Expand Down
2 changes: 1 addition & 1 deletion src/Laraue.Crawling.Dynamic.PuppeterSharp/PageParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public PageParser(IHtmlSchemaParser<IElementHandle> schemaParser)
}

/// <inheritdoc />
public async Task<TResult?> ParseAsync<TResult>(IPage page, ICompiledHtmlSchema<IElementHandle, TResult> schema)
public async Task<TResult> ParseAsync<TResult>(IPage page, ICompiledHtmlSchema<IElementHandle, TResult> schema)
{
var element = await page.QuerySelectorAsync("body")
.ConfigureAwait(false);
Expand Down

0 comments on commit c5e01f3

Please sign in to comment.