forked from dgtlmoon/changedetection.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_worker.py
386 lines (311 loc) · 21.6 KB
/
update_worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
import os
import threading
import queue
import time
from changedetectionio import content_fetcher
from .processors.text_json_diff import FilterNotFoundInResponse
# A single update worker
#
# Requests for checking on a single site(watch) from a queue of watches
# (another process inserts watches into the queue that are time-ready for checking)
import logging
import sys
class update_worker(threading.Thread):
current_uuid = None
def __init__(self, q, notification_q, app, datastore, *args, **kwargs):
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
self.q = q
self.app = app
self.notification_q = notification_q
self.datastore = datastore
super().__init__(*args, **kwargs)
def send_content_changed_notification(self, t, watch_uuid):
from changedetectionio import diff
from changedetectionio.notification import (
default_notification_format_for_watch
)
n_object = {}
watch = self.datastore.data['watching'].get(watch_uuid, False)
if not watch:
return
watch_history = watch.history
dates = list(watch_history.keys())
# Theoretically it's possible that this could be just 1 long,
# - In the case that the timestamp key was not unique
if len(dates) == 1:
raise ValueError(
"History index had 2 or more, but only 1 date loaded, timestamps were not unique? maybe two of the same timestamps got written, needs more delay?"
)
n_object['notification_urls'] = watch['notification_urls'] if len(watch['notification_urls']) else \
self.datastore.data['settings']['application']['notification_urls']
n_object['notification_title'] = watch['notification_title'] if watch['notification_title'] else \
self.datastore.data['settings']['application']['notification_title']
n_object['notification_body'] = watch['notification_body'] if watch['notification_body'] else \
self.datastore.data['settings']['application']['notification_body']
n_object['notification_format'] = watch['notification_format'] if watch['notification_format'] != default_notification_format_for_watch else \
self.datastore.data['settings']['application']['notification_format']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object and n_object['notification_urls']:
# HTML needs linebreak, but MarkDown and Text can use a linefeed
if n_object['notification_format'] == 'HTML':
line_feed_sep = "<br>"
else:
line_feed_sep = "\n"
# Add text that was triggered
snapshot_contents = watch.get_history_snapshot(dates[-1])
trigger_text = watch.get('trigger_text', [])
triggered_text = ''
if len(trigger_text):
from . import html_tools
triggered_text = html_tools.get_triggered_text(content=snapshot_contents, trigger_text=trigger_text)
if triggered_text:
triggered_text = line_feed_sep.join(triggered_text)
n_object.update({
'current_snapshot': snapshot_contents,
'diff': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), line_feed_sep=line_feed_sep),
'diff_added': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_removed=False, line_feed_sep=line_feed_sep),
'diff_full': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_equal=True, line_feed_sep=line_feed_sep),
'diff_removed': diff.render_diff(watch.get_history_snapshot(dates[-2]), watch.get_history_snapshot(dates[-1]), include_added=False, line_feed_sep=line_feed_sep),
'screenshot': watch.get_screenshot() if watch.get('notification_screenshot') else None,
'triggered_text': triggered_text,
'uuid': watch_uuid,
'watch_url': watch['url'],
})
logging.info (">> SENDING NOTIFICATION")
self.notification_q.put(n_object)
else:
logging.info (">> NO Notification sent, notification_url was empty in both watch and system")
def send_filter_failure_notification(self, watch_uuid):
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts')
watch = self.datastore.data['watching'].get(watch_uuid, False)
if not watch:
return
n_object = {'notification_title': 'Changedetection.io - Alert - CSS/xPath filter was not present in the page',
'notification_body': "Your configured CSS/xPath filters of '{}' for {{{{watch_url}}}} did not appear on the page after {} attempts, did the page change layout?\n\nLink: {{{{base_url}}}}/edit/{{{{watch_uuid}}}}\n\nThanks - Your omniscient changedetection.io installation :)\n".format(
", ".join(watch['include_filters']),
threshold),
'notification_format': 'text'}
if len(watch['notification_urls']):
n_object['notification_urls'] = watch['notification_urls']
elif len(self.datastore.data['settings']['application']['notification_urls']):
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
n_object.update({
'watch_url': watch['url'],
'uuid': watch_uuid,
'screenshot': None
})
self.notification_q.put(n_object)
print("Sent filter not found notification for {}".format(watch_uuid))
def send_step_failure_notification(self, watch_uuid, step_n):
watch = self.datastore.data['watching'].get(watch_uuid, False)
if not watch:
return
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts')
n_object = {'notification_title': "Changedetection.io - Alert - Browser step at position {} could not be run".format(step_n+1),
'notification_body': "Your configured browser step at position {} for {{watch['url']}} "
"did not appear on the page after {} attempts, did the page change layout? "
"Does it need a delay added?\n\nLink: {{base_url}}/edit/{{watch_uuid}}\n\n"
"Thanks - Your omniscient changedetection.io installation :)\n".format(step_n+1, threshold),
'notification_format': 'text'}
if len(watch['notification_urls']):
n_object['notification_urls'] = watch['notification_urls']
elif len(self.datastore.data['settings']['application']['notification_urls']):
n_object['notification_urls'] = self.datastore.data['settings']['application']['notification_urls']
# Only prepare to notify if the rules above matched
if 'notification_urls' in n_object:
n_object.update({
'watch_url': watch['url'],
'uuid': watch_uuid
})
self.notification_q.put(n_object)
print("Sent step not found notification for {}".format(watch_uuid))
def cleanup_error_artifacts(self, uuid):
# All went fine, remove error artifacts
cleanup_files = ["last-error-screenshot.png", "last-error.txt"]
for f in cleanup_files:
full_path = os.path.join(self.datastore.datastore_path, uuid, f)
if os.path.isfile(full_path):
os.unlink(full_path)
def run(self):
from .processors import text_json_diff, restock_diff
while not self.app.config.exit.is_set():
try:
queued_item_data = self.q.get(block=False)
except queue.Empty:
pass
else:
uuid = queued_item_data.item.get('uuid')
self.current_uuid = uuid
if uuid in list(self.datastore.data['watching'].keys()):
changed_detected = False
contents = b''
process_changedetection_results = True
update_obj = {}
print("> Processing UUID {} Priority {} URL {}".format(uuid, queued_item_data.priority,
self.datastore.data['watching'][uuid]['url']))
now = time.time()
try:
processor = self.datastore.data['watching'][uuid].get('processor','text_json_diff')
# @todo some way to switch by name
if processor == 'restock_diff':
update_handler = restock_diff.perform_site_check(datastore=self.datastore)
else:
# Used as a default and also by some tests
update_handler = text_json_diff.perform_site_check(datastore=self.datastore)
changed_detected, update_obj, contents = update_handler.run(uuid, skip_when_checksum_same=queued_item_data.item.get('skip_when_checksum_same'))
# Re #342
# In Python 3, all strings are sequences of Unicode characters. There is a bytes type that holds raw bytes.
# We then convert/.decode('utf-8') for the notification etc
if not isinstance(contents, (bytes, bytearray)):
raise Exception("Error - returned data from the fetch handler SHOULD be bytes")
except PermissionError as e:
self.app.logger.error("File permission error updating", uuid, str(e))
process_changedetection_results = False
except content_fetcher.ReplyWithContentButNoText as e:
# Totally fine, it's by choice - just continue on, nothing more to care about
# Page had elements/content but no renderable text
# Backend (not filters) gave zero output
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': "Got HTML content but no text found (With {} reply code).".format(e.status_code)})
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot)
process_changedetection_results = False
except content_fetcher.Non200ErrorCodeReceived as e:
if e.status_code == 403:
err_text = "Error - 403 (Access denied) received"
elif e.status_code == 404:
err_text = "Error - 404 (Page not found) received"
elif e.status_code == 500:
err_text = "Error - 500 (Internal server Error) received"
else:
err_text = "Error - Request returned a HTTP error code {}".format(str(e.status_code))
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
if e.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=e.xpath_data, as_error=True)
if e.page_text:
self.datastore.save_error_text(watch_uuid=uuid, contents=e.page_text)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
process_changedetection_results = False
except FilterNotFoundInResponse as e:
if not self.datastore.data['watching'].get(uuid):
continue
err_text = "Warning, no filters were found, no change detection ran."
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
# Only when enabled, send the notification
if self.datastore.data['watching'][uuid].get('filter_failure_notification_send', False):
c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
c += 1
# Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts',
0)
print("Filter for {} not found, consecutive_filter_failures: {}".format(uuid, c))
if threshold > 0 and c >= threshold:
if not self.datastore.data['watching'][uuid].get('notification_muted'):
self.send_filter_failure_notification(uuid)
c = 0
self.datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
process_changedetection_results = False
except content_fetcher.checksumFromPreviousCheckWasTheSame as e:
# Yes fine, so nothing todo, don't continue to process.
process_changedetection_results = False
changed_detected = False
except content_fetcher.BrowserStepsStepTimout as e:
if not self.datastore.data['watching'].get(uuid):
continue
err_text = "Warning, browser step at position {} could not run, target not found, check the watch, add a delay if necessary.".format(e.step_n+1)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text})
if self.datastore.data['watching'][uuid].get('filter_failure_notification_send', False):
c = self.datastore.data['watching'][uuid].get('consecutive_filter_failures', 5)
c += 1
# Send notification if we reached the threshold?
threshold = self.datastore.data['settings']['application'].get('filter_failure_notification_threshold_attempts',
0)
print("Step for {} not found, consecutive_filter_failures: {}".format(uuid, c))
if threshold > 0 and c >= threshold:
if not self.datastore.data['watching'][uuid].get('notification_muted'):
self.send_step_failure_notification(watch_uuid=uuid, step_n=e.step_n)
c = 0
self.datastore.update_watch(uuid=uuid, update_obj={'consecutive_filter_failures': c})
process_changedetection_results = False
except content_fetcher.EmptyReply as e:
# Some kind of custom to-str handler in the exception handler that does this?
err_text = "EmptyReply - try increasing 'Wait seconds before extracting text', Status Code {}".format(e.status_code)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.ScreenshotUnavailable as e:
err_text = "Screenshot unavailable, page did not render fully in the expected time - try increasing 'Wait seconds before extracting text'"
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.JSActionExceptions as e:
err_text = "Error running JS Actions - Page request - "+e.message
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except content_fetcher.PageUnloadable as e:
err_text = "Page request from server didnt respond correctly"
if e.message:
err_text = "{} - {}".format(err_text, e.message)
if e.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=e.screenshot, as_error=True)
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': err_text,
'last_check_status': e.status_code})
process_changedetection_results = False
except Exception as e:
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
# Other serious error
process_changedetection_results = False
else:
# Crash protection, the watch entry could have been removed by this point (during a slow chrome fetch etc)
if not self.datastore.data['watching'].get(uuid):
continue
# Mark that we never had any failures
if not self.datastore.data['watching'][uuid].get('ignore_status_codes'):
update_obj['consecutive_filter_failures'] = 0
self.cleanup_error_artifacts(uuid)
#
# Different exceptions mean that we may or may not want to bump the snapshot, trigger notifications etc
if process_changedetection_results:
try:
watch = self.datastore.data['watching'].get(uuid)
self.datastore.update_watch(uuid=uuid, update_obj=update_obj)
# Also save the snapshot on the first time checked
if changed_detected or not watch['last_checked']:
watch.save_history_text(contents=contents,
timestamp=str(round(time.time())),
snapshot_id=update_obj.get('previous_md5', 'none'))
# A change was detected
if changed_detected:
print (">> Change detected in UUID {} - {}".format(uuid, watch['url']))
# Notifications should only trigger on the second time (first time, we gather the initial snapshot)
if watch.history_n >= 2:
if not self.datastore.data['watching'][uuid].get('notification_muted'):
self.send_content_changed_notification(self, watch_uuid=uuid)
except Exception as e:
# Catch everything possible here, so that if a worker crashes, we don't lose it until restart!
print("!!!! Exception in update_worker !!!\n", e)
self.app.logger.error("Exception reached processing watch UUID: %s - %s", uuid, str(e))
self.datastore.update_watch(uuid=uuid, update_obj={'last_error': str(e)})
if self.datastore.data['watching'].get(uuid):
# Always record that we atleast tried
count = self.datastore.data['watching'][uuid].get('check_count', 0) + 1
self.datastore.update_watch(uuid=uuid, update_obj={'fetch_time': round(time.time() - now, 3),
'last_checked': round(time.time()),
'check_count': count
})
# Always save the screenshot if it's available
if update_handler.screenshot:
self.datastore.save_screenshot(watch_uuid=uuid, screenshot=update_handler.screenshot)
if update_handler.xpath_data:
self.datastore.save_xpath_data(watch_uuid=uuid, data=update_handler.xpath_data)
self.current_uuid = None # Done
self.q.task_done()
# Give the CPU time to interrupt
time.sleep(0.1)
self.app.config.exit.wait(1)