forked from dgtlmoon/changedetection.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_worker.py
584 lines (467 loc) · 32 KB
/
update_worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
from .processors.exceptions import ProcessorException
import changedetectionio.content_fetchers.exceptions as content_fetchers_exceptions
from changedetectionio.processors.text_json_diff.processor import FilterNotFoundInResponse
from changedetectionio import html_tools
import importlib
import os
import queue
import threading
import time
# A single update worker
#
# Requests for checking on a single site(watch) from a queue of watches
# (another process inserts watches into the queue that are time-ready for checking)
from loguru import logger
class update_worker(threading.Thread):
current_uuid = None
def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
self.q = q
self.app = app
self.notification_q = notification_q
self.datastore = datastore
super().__init__(*args, **kwargs)
def queue_notification_for_watch(self, notification_q, n_object, watch):
from changedetectionio import diff
dates = []
trigger_text = ''
now = time.time()
if watch:
watch_history = watch.history
dates = list(watch_history.keys())
trigger_text = watch.get('trigger_text', [])
# Add text that was triggered
if len(dates):
snapshot_contents = watch.get_history_snapshot(dates[-1])
else:
snapshot_contents = "No snapshot/history available, the watch should fetch atleast once."
# HTML needs linebreak, but MarkDown and Text can use a linefeed
if n_object.get('notification_format') == 'HTML':
line_feed_sep = "<br>"
# Snapshot will be plaintext on the disk, convert to some kind of HTML
snapshot_contents = snapshot_contents.replace('\n', line_feed_sep)
else:
line_feed_sep = "\n"
triggered_text = ''
if len(trigger_text):
from . import html_tools
triggered_text = html_tools.get_triggered_text(content=snapshot_contents, trigger_text=trigger_text)
if triggered_text:
triggered_text = line_feed_sep.join(triggered_text)
# Could be called as a 'test notification' with only 1 snapshot available
prev_snapshot = "Example text: example test\nExample text: change detection is cool\nExample text: some more examples\n"
current_snapshot = "Example text: example test\nExample text: change detection is fantastic\nExample text: even more examples\nExample text: a lot more examples"
if len(dates) > 1:
prev_snapshot = watch.get_history_snapshot(dates[-2])
current_snapshot = watch.get_history_snapshot(dates[-1])
n_object.update({
'current_snapshot': snapshot_contents,
'diff': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep),
'diff_added': diff.render_diff(prev_snapshot, current_snapshot, include_removed=False, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(prev_snapshot, current_snapshot, include_equal=True, line_feed_sep=line_feed_sep),
'diff_patch': diff.render_diff(prev_snapshot, current_snapshot, line_feed_sep=line_feed_sep, patch_format=True),
'diff_removed': diff.render_diff(prev_snapshot, current_snapshot, include_added=False, line_feed_sep=line_feed_sep),
'notification_timestamp': now,
'screenshot': watch.get_screenshot() if watch and watch.get('notification_screenshot') else None,
'triggered_text': triggered_text,
'uuid': watch.get('uuid') if watch else None,
'watch_url': watch.get('url') if watch else None,
})
n_object.update(watch.extra_notification_token_values())
logger.trace(f"Main rendered notification placeholders (diff_added etc) calculated in {time.time()-now:.3f}s")
logger.debug("Queued notification for sending")
notification_q.put(n_object)
# Prefer - Individual watch settings > Tag settings > Global settings (in that order)
def _check_cascading_vars(self, var_name, watch):
from changedetectionio.notification import (
default_notification_format_for_watch,
default_notification_body,
default_notification_title
)
# Would be better if this was some kind of Object where Watch can reference the parent datastore etc
v = watch.get(var_name)
if v and not watch.get('notification_muted'):
if var_name == 'notification_format' and v == default_notification_format_for_watch:
return self.datastore.data['settings']['application'].get('notification_format')
return v
tags = self.datastore.get_all_tags_for_watch(uuid=watch.get('uuid'))
if tags:
for tag_uuid, tag in tags.items():
v = tag.get(var_name)
if v and not tag.get('notification_muted'):
return v
if self.datastore.data['settings']['application'].get(var_name):
return self.datastore.data['settings']['application'].get(var_name)
# Otherwise could be defaults
if var_name == 'notification_format':
return default_notification_format_for_watch
if var_name == 'notification_body':
return default_notification_body
if var_name == 'notification_title':
return default_notification_title
return None
def send_content_changed_notification(self, watch_uuid):
n_object = {}
watch = self.datastore.data['watching'].get(watch_uuid)
if not watch:
return
watch_history = watch.history
dates = list(watch_history.keys())
# Theoretically it's possible that this could be just 1 long,
# - In the case that the timestamp key was not unique
if len(dates) == 1:
raise ValueError(
"History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
)
# Should be a better parent getter in the model object
# Prefer - Individual watch settings > Tag settings > Global settings (in that order)
n_object['notification_urls'] = self._check_cascading_vars('notification_urls', watch)
n_object['notification_title'] = self._check_cascading_vars('notification_title', watch)
n_object['notification_body'] = self._check_cascading_vars('notification_body', watch)
n_object['notification_format'] = self._check_cascading_vars('notification_format', watch)
# (Individual watch) Only prepare to notify if the rules above matched
queued = False
if n_object and n_object.get('notification_urls'):
queued = True
count = watch.get('notification_alert_count', 0) + 1
self.datastore.update_watch(uuid=watch_uuid, update_obj={'notification_alert_count': count})
self.queue_notification_for_watch(notification_q=self.notification_q, n_object=n_object, watch=watch)
return queued
def send_filter_failure_notification(self, watch_uuid):
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts')
watch = self.datastore.data['watching'].get(watch_uuid)
if not watch:
return
n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
'notification_body': "Your configured CSS/xPath filters of '{}' for {{{{watch_url}}}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{{{base_url}}}}/edit/{{{{watch_uuid}}}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
", ".join(watch['include_filters']),
threshold),
'notification_format': 'text'}
if len(watch['notification_urls']):
n_object['notification_urls'] = watch['notification_urls']
elif len(self.datastore.data['settings']['application']['notification_urls']):
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
n_object.update({
'watch_url': watch['url'],
'uuid': watch_uuid,
'screenshot': None
})
self.notification_q.put(n_object)
logger.debug(f"Sent filter not found notification for {watch_uuid}")
else:
logger.debug(f"NOT sending filter not found notification for {watch_uuid} - no notification URLs")
def send_step_failure_notification(self, watch_uuid, step_n):
watch = self.datastore.data['watching'].get(watch_uuid, False)
if not watch:
return
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts')
n_object = {'notification_title': "Changedetection.io - Alert - Browser step at position {} could not be run".format(step_n+1),
'notification_body': "Your configured browser step at position {} for {{{{watch_url}}}} "
"did not appear on the page after {} attempts, did the page change layout? "
"Does it need a delay added?\n\nLink: {{{{base_url}}}}/edit/{{{{watch_uuid}}}}\n\n"
"Thanks - Your omniscient changedetection.io installation :)\n".format(step_n+1, threshold),
'notification_format': 'text'}
if len(watch['notification_urls']):
n_object['notification_urls'] = watch['notification_urls']
elif len(self.datastore.data['settings']['application']['notification_urls']):
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
n_object.update({
'watch_url': watch['url'],
'uuid': watch_uuid
})
self.notification_q.put(n_object)
logger.error(f"Sent step not found notification for {watch_uuid}")
def cleanup_error_artifacts(self, uuid):
# All went fine, remove error artifacts
cleanup_files = ["last-error-screenshot.png", "last-error.txt"]
for f in cleanup_files:
full_path = os.path.join(self.datastore.datastore_path, uuid, f)
if os.path.isfile(full_path):
os.unlink(full_path)
def run(self):
now = time.time()
while not self.app.config.exit.is_set():
update_handler = None
try:
queued_item_data = self.q.get(block=False)
except queue.Empty:
pass
else:
uuid = queued_item_data.item.get('uuid')
self.current_uuid = uuid
if uuid in list(self.datastore.data['watching'].keys()) and self.datastore.data['watching'][uuid].get('url'):
changed_detected = False
contents = b''
process_changedetection_results = True
update_obj = {}
# Clear last errors (move to preflight func?)
self.datastore.data['watching'][uuid]['browser_steps_last_error_step'] = None
watch = self.datastore.data['watching'].get(uuid)
logger.info(f"Processing watch UUID {uuid} Priority {queued_item_data.priority} URL {watch['url']}")
now = time.time()
try:
# Processor is what we are using for detecting the "Change"
processor = watch.get('processor', 'text_json_diff')
# Init a new 'difference_detection_processor', first look in processors
processor_module_name = f"changedetectionio.processors.{processor}.processor"
try:
processor_module = importlib.import_module(processor_module_name)
except ModuleNotFoundError as e:
print(f"Processor module '{processor}' not found.")
raise e
update_handler = processor_module.perform_site_check(datastore=self.datastore,
watch_uuid=uuid
)
update_handler.call_browser()
changed_detected, update_obj, contents = update_handler.run_changedetection(watch=watch)
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
# if not isinstance(contents, (bytes, bytearray)):
# raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
logger.critical(f"File permission error updating file, watch: {uuid}")
logger.critical(str(e))
process_changedetection_results = False
# A generic other-exception thrown by processors
except ProcessorException as e:
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': e.message})
process_changedetection_results = False
except content_fetchers_exceptions.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
extra_help = ""
if e.has_filters:
# Maybe it contains an image? offer a more helpful link
has_img = html_tools.include_filters(include_filters='img',
html_content=e.html_content)
if has_img:
extra_help = ", it's possible that the filters you have give an empty result or contain only an image."
else:
extra_help = ", it's possible that the filters were found, but contained no usable text."
self.datastore.update_watch(uuid=uuid, update_obj={
'last_error': f"Got HTML content but no text found (With {e.status_code} reply code){extra_help}"
})
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data)
process_changedetection_results = False
except content_fetchers_exceptions.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
err_text = "Error - 404 (Page not found) received"
elif e.status_code == 407:
err_text = "Error - 407 (Proxy authentication required) received, did you need a username and password for the proxy?"
elif e.status_code == 500:
err_text = "Error - 500 (Internal server error) received from the web site"
else:
extra = ' (Access denied or blocked)' if str(e.status_code).startswith('4') else ''
err_text = f"Error - Request returned a HTTP error code {e.status_code}{extra}"
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data, as_error=True)
if e.page_text:
watch.save_error_text(contents=e.page_text)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False
except FilterNotFoundInResponse as e:
if not self.datastore.data['watching'].get(uuid):
continue
err_text = "Warning, no filters were found, no change detection ran - Did the page change layout? update your Visual Filter if necessary."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
# Filter wasnt found, but we should still update the visual selector so that they can have a chance to set it up again
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot)
if e.xpath_data:
watch.save_xpath_data(data=e.xpath_data)
# Only when enabled, send the notification
if watch.get('filter_failure_notification_send', False):
c = watch.get('consecutive_filter_failures', 0)
c += 1
# Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts', 0)
logger.debug(f"Filter for {uuid} not found, consecutive_filter_failures: {c} of threshold {threshold}")
if c >= threshold:
if not watch.get('notification_muted'):
logger.debug(f"Sending filter failed notification for {uuid}")
self.send_filter_failure_notification(uuid)
c = 0
logger.debug(f"Reset filter failure count back to zero")
self.datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
else:
logger.trace(f"{uuid} - filter_failure_notification_send not enabled, skipping")
process_changedetection_results = False
except content_fetchers_exceptions.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
except content_fetchers_exceptions.BrowserConnectError as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers_exceptions.BrowserFetchTimedOut as e:
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': e.msg
}
)
process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsStepException as e:
if not self.datastore.data['watching'].get(uuid):
continue
error_step = e.step_n + 1
from playwright._impl._errors import TimeoutError, Error
# Generally enough info for TimeoutError (couldnt locate the element after default seconds)
err_text = f"Browser step at position {error_step} could not run, check the watch, add a delay if necessary, view Browser Steps to see screenshot at that step."
if e.original_e.name == "TimeoutError":
# Just the first line is enough, the rest is the stack trace
err_text += " Could not find the target."
else:
# Other Error, more info is good.
err_text += " " + str(e.original_e).splitlines()[0]
logger.debug(f"BrowserSteps exception at step {error_step} {str(e.original_e)}")
self.datastore.update_watch(uuid=uuid,
update_obj={'last_error': err_text,
'browser_steps_last_error_step': error_step
}
)
if watch.get('filter_failure_notification_send', False):
c = watch.get('consecutive_filter_failures', 0)
c += 1
# Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts',
0)
logger.error(f"Step for {uuid} not found, consecutive_filter_failures: {c}")
if threshold > 0 and c >= threshold:
if not watch.get('notification_muted'):
self.send_step_failure_notification(watch_uuid=uuid, step_n=e.step_n)
c = 0
self.datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
process_changedetection_results = False
except content_fetchers_exceptions.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time or page was too long - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetchers_exceptions.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
if e.screenshot:
watch.save_screenshot(screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code,
'has_ldjson_price_data': None})
process_changedetection_results = False
except content_fetchers_exceptions.BrowserStepsInUnsupportedFetcher as e:
err_text = "This watch has Browser Steps configured and so it cannot run with the 'Basic fast Plaintext/HTTP Client', either remove the Browser Steps or select a Chrome fetcher."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False
logger.error(f"Exception (BrowserStepsInUnsupportedFetcher) reached processing watch UUID: {uuid}")
except Exception as e:
logger.error(f"Exception reached processing watch UUID: {uuid}")
logger.error(str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Exception: " + str(e)})
# Other serious error
process_changedetection_results = False
else:
# Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
if not self.datastore.data['watching'].get(uuid):
continue
update_obj['content-type'] = update_handler.fetcher.get_all_headers().get('content-type', '').lower()
# Mark that we never had any failures
if not watch.get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
# Everything ran OK, clean off any previous error
update_obj['last_error'] = False
self.cleanup_error_artifacts(uuid)
if not self.datastore.data['watching'].get(uuid):
continue
#
# Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
if process_changedetection_results:
# Extract <title> as title if possible/requested.
if self.datastore.data['settings']['application'].get('extract_title_as_title') or watch['extract_title_as_title']:
if not watch['title'] or not len(watch['title']):
try:
update_obj['title'] = html_tools.extract_element(find='title', html_content=update_handler.fetcher.content)
logger.info(f"UUID: {uuid} Extract <title> updated title to '{update_obj['title']}")
except Exception as e:
logger.warning(f"UUID: {uuid} Extract <title> as watch title was enabled, but couldn't find a <title>.")
# Now update after running everything
timestamp = round(time.time())
try:
self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
# Also save the snapshot on the first time checked, "last checked" will always be updated, so we just check history length.
if changed_detected or not watch.history_n:
if update_handler.screenshot:
watch.save_screenshot(screenshot=update_handler.screenshot)
if update_handler.xpath_data:
watch.save_xpath_data(data=update_handler.xpath_data)
# Small hack so that we sleep just enough to allow 1 second between history snapshots
# this is because history.txt indexes/keys snapshots by epoch seconds and we dont want dupe keys
if watch.newest_history_key and int(timestamp) == int(watch.newest_history_key):
logger.warning(
f"Timestamp {timestamp} already exists, waiting 1 seconds so we have a unique key in history.txt")
timestamp = str(int(timestamp) + 1)
time.sleep(1)
watch.save_history_text(contents=contents,
timestamp=timestamp,
snapshot_id=update_obj.get('previous_md5', 'none'))
if update_handler.fetcher.content:
watch.save_last_fetched_html(contents=update_handler.fetcher.content, timestamp=timestamp)
# Notifications should only trigger on the second time (first time, we gather the initial snapshot)
if watch.history_n >= 2:
logger.info(f"Change detected in UUID {uuid} - {watch['url']}")
if not watch.get('notification_muted'):
self.send_content_changed_notification(watch_uuid=uuid)
except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
logger.critical("!!!! Exception in update_worker while processing process_changedetection_results !!!")
logger.critical(str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Always record that we atleast tried
count = watch.get('check_count', 0) + 1
# Record the 'server' header reply, can be used for actions in the future like cloudflare/akamai workarounds
try:
server_header = update_handler.fetcher.headers.get('server', '').strip().lower()[:255]
self.datastore.update_watch(uuid=uuid,
update_obj={'remote_server_reply': server_header}
)
except Exception as e:
pass
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
'last_checked': round(time.time()),
'check_count': count
})
self.current_uuid = None # Done
self.q.task_done()
logger.debug(f"Watch {uuid} done in {time.time()-now:.2f}s")
# Give the CPU time to interrupt
time.sleep(0.1)
self.app.config.exit.wait(1)