PEP-8 cleanups and Refactoring

Changes in this commit:
- Add .gitignore
- Cleanups / update copyright years
- Use regex-based filter by default
- Use argparse
- JobState, exception handling
- Fix mode of lib/urlwatch/report.py
- Split handler.py into jobs.py, filters.py, storage.py, util.py
- Config file support, custom reporter support
- Fix unit tests
thp · Jan 16, 2016 · 68329e0 · 68329e0
1 parent 3b871b0
commit 68329e0
Show file tree

Hide file tree

Showing 17 changed files with 1,161 additions and 869 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,3 +1,2 @@
 include ChangeLog COPYING
 recursive-include share *
-include convert-to-python3.sh
diff --git a/lib/urlwatch/__init__.py b/lib/urlwatch/__init__.py
@@ -0,0 +1,15 @@
+"""Watch web pages and arbitrary URLs for changes
+
+This script is intended to help you watch URLs and get notified (via email or
+in your terminal) of any changes. The change notification will include the URL
+that has changed and a unified diff of what has changed.
+"""
+
+# Package name; interpolated into the user agent string below.
+pkgname = 'urlwatch'
+
+# Package metadata exposed as module-level dunder attributes.
+__copyright__ = 'Copyright 2008-2016 Thomas Perl'
+__author__ = 'Thomas Perl <m@thp.io>'
+__license__ = 'BSD'
+__url__ = 'http://thp.io/2008/urlwatch/'
+__version__ = '1.18'
+# User agent string of the form "<pkgname>/<version> (+<info URL>)".
+__user_agent__ = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)
diff --git a/lib/urlwatch/filters.py b/lib/urlwatch/filters.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of urlwatch (https://thp.io/2008/urlwatch/).
+# Copyright (c) 2008-2016 Thomas Perl <thp.io/about>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+# 3. The name of the author may not be used to endorse or promote products
+#    derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import re
+import logging
+import itertools
+
+from .util import TrackSubClasses
+
+
+logger = logging.getLogger(__name__)
+
+
+class FilterBase(object, metaclass=TrackSubClasses):
+    # Base class of all filters. The two registries below are class-level
+    # containers shared by the hierarchy; they are read here and are
+    # presumably populated by the TrackSubClasses metaclass (defined in
+    # .util) -- verify there.
+    #   __subclasses__: maps a filter kind (string) to its filter class
+    #   __anonymous_subclasses__: filter classes that are not looked up by kind
+    __subclasses__ = {}
+    __anonymous_subclasses__ = []
+
+    def __init__(self, job, state):
+        """Remember the job and the job state this filter operates on."""
+        self.state = state
+        self.job = job
+
+    def _no_subfilters(self, subfilter):
+        """Raise ValueError if a subfilter was passed to a filter without any."""
+        if subfilter is None:
+            return
+
+        raise ValueError('No subfilters supported for {}'.format(self.__kind__))
+
+    @classmethod
+    def filter_documentation(cls):
+        """Return one '  * kind - docstring' line per registered filter kind."""
+        lines = ['  * %s - %s' % (filtercls.__kind__, filtercls.__doc__)
+                 for filtercls in cls.__subclasses__.values()]
+        return '\n'.join(lines)
+
+    @classmethod
+    def auto_process(cls, state, data):
+        """Run data through every filter whose match() accepts the job.
+
+        Filters registered by kind are tried first, ordered by kind, followed
+        by the anonymous filters. The output of each matching filter becomes
+        the input of the next one; the final result is returned.
+        """
+        registry = cls.__subclasses__
+        by_kind = [registry[kind] for kind in sorted(registry)]
+        candidates = itertools.chain(by_kind, cls.__anonymous_subclasses__)
+
+        for candidate in candidates:
+            instance = candidate(state.job, state)
+            if not instance.match():
+                continue
+
+            logger.info('Auto-applying filter %r to %s', instance, state.job.get_location())
+            data = instance.filter(data)
+
+        return data
+
+    @classmethod
+    def process(cls, filter_kind, subfilter, state, data):
+        """Apply the filter registered as filter_kind to data.
+
+        Raises ValueError if no filter is registered under filter_kind.
+        """
+        filtercls = cls.__subclasses__.get(filter_kind)
+        if filtercls is not None:
+            return filtercls(state.job, state).filter(data, subfilter)
+
+        raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter))
+
+    def match(self):
+        """Tell whether auto_process() should apply this filter (default: no)."""
+        return False
+
+    def filter(self, data, subfilter=None):
+        """Return the filtered data; must be overridden by subclasses."""
+        raise NotImplementedError()
+
+
+class AutoMatchFilter(FilterBase):
+    """Automatically matches subclass filters with a given location"""
+    # Subclasses set MATCH to a dict of job attributes that all have to be
+    # equal to the job's values; None means the filter never matches.
+    MATCH = None
+
+    def match(self):
+        """Return True if every MATCH entry equals the job's dict entry."""
+        if self.MATCH is None:
+            return False
+
+        job_fields = self.job.to_dict()
+        matched = True
+        for key, expected in self.MATCH.items():
+            if not job_fields.get(key) == expected:
+                matched = False
+                break
+
+        logger.debug('Matching %r with %r result: %r', self, self.job, matched)
+        return matched
+
+
+class Html2TextFilter(FilterBase):
+    """Convert HTML to plaintext"""
+
+    __kind__ = 'html2text'
+
+    def filter(self, data, subfilter=None):
+        """Convert data using html2text(); subfilter picks the method.
+
+        Without a subfilter, the 're' method is used.
+        """
+        method = 're' if subfilter is None else subfilter
+
+        # Imported on use, so the converter module is only loaded when needed
+        from .html2txt import html2text
+        return html2text(data, method=method)
+
+
+class Ical2TextFilter(FilterBase):
+    """Convert iCalendar to plaintext"""
+
+    __kind__ = 'ical2text'
+
+    def filter(self, data, subfilter=None):
+        """Convert data using ical2text(); subfilters are rejected."""
+        # Reject subfilters before attempting the import below
+        self._no_subfilters(subfilter)
+
+        from .ical2txt import ical2text
+        converted = ical2text(data)
+        return converted
+
+
+class GrepFilter(FilterBase):
+    """Filter only lines matching a regular expression"""
+
+    __kind__ = 'grep'
+
+    def filter(self, data, subfilter=None):
+        """Keep only the lines of data in which the subfilter regex is found.
+
+        Raises ValueError if no regular expression was given as subfilter.
+        """
+        if subfilter is None:
+            raise ValueError('The grep filter needs a regular expression')
+
+        # re.search() returns a match object (always true) or None
+        matching = [line for line in data.splitlines()
+                    if re.search(subfilter, line)]
+        return '\n'.join(matching)
+
+
+class StripFilter(FilterBase):
+    """Strip leading and trailing whitespace"""
+
+    __kind__ = 'strip'
+
+    def filter(self, data, subfilter=None):
+        """Return data without surrounding whitespace; subfilters are rejected."""
+        self._no_subfilters(subfilter)
+
+        stripped = data.strip()
+        return stripped