From 02bc7776cab461fbc12d94045de620d68e3a7bcf Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Sun, 15 Oct 2017 22:47:23 -0700 Subject: [PATCH] config and docs work: (#255) config and docs work: - autoindexing now set in config via 'autoindex: ' option - autoindexing only runs in first uwsgi worker if in uwsgi - recorder config: rename props to 'rollover_' to match docs - docs: write configuring.rst section for recording mode, autoindexing and proxy mode! - update README for new pywb release, point to new docs! --- README.rst | 283 ++---------------------------------- docs/manual/configuring.rst | 148 ++++++++++++++++++- pywb/apps/cli.py | 21 +-- pywb/apps/frontendapp.py | 22 ++- pywb/manager/autoindex.py | 15 +- 5 files changed, 189 insertions(+), 300 deletions(-) diff --git a/README.rst b/README.rst index c06830e0d..4a78f90ab 100644 --- a/README.rst +++ b/README.rst @@ -1,288 +1,29 @@ -PyWb 0.33.2 -=========== +Webrecorder pywb 2.0 +==================== .. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master :target: https://travis-ci.org/ikreymer/pywb .. image:: https://coveralls.io/repos/ikreymer/pywb/badge.svg?branch=master :target: https://coveralls.io/r/ikreymer/pywb?branch=master -**pywb** is a Python (2 and 3) implementation of web archival replay tools, sometimes also known as 'Wayback Machine'. +**pywb** is a Python (2 and 3) web archive replay and recording toolkit. -**pywb** allows high-quality replay (browsing) of archived web data stored in standardized `ARC `_ and `WARC `_, -and it can also serve as a customizable rewriting proxy to live web content. +This toolset forms the foundation of Webrecorder, but also provides a variety of web archiving tools, +such as the traditional "Wayback Machine" functionality. -The replay system is designed to accurately replay complex dynamic sites, including `video and audio content `_ and sites -with complex JavaScript. 
+Note: this version, which represents a major overhaul of pywb, is not yet released on pypi, but you can: -Additionally, **pywb** includes an extensive `index query api `_ for querying information about archived content. +* Install with ``python setup.py install`` -The software can run as a traditional web application or an HTTP or HTTPS proxy server, and has been tested on Linux, OS X and Windows platforms. +* Run tests with ``python setup.py test`` -**pywb** is fully compliant with the `Memento `_ protocol (`RFC-7089 `_). +* Run Wayback with ``wayback`` (see docs for info on how to setup collections) -**pywb** supports Python 2.6+ and Python 3.3+ +* Build docs locally with: ``cd docs; make html`` +* ..and a lot more! -Getting Started -- Run your own Web Archive -------------------------------------------- +Please see the `Webrecorder pywb documentation for usage and configuration info `_ -With release 0.9.0, **pywb** provides new simplified, directory-based init system to create and -run your own web archive replay system (wayback machine) directly from archive collections on disk. -A new utility, ``wb-manager`` performs the most common collection management tasks from the command line. - - -1. Archive a Web Page -""""""""""""""""""""" - -If you do not have any web archive files (WARCS), you can create easily create one from any page by using the free -https://webrecorder.io/ service - -For example, you may visit https://webrecorder.io/record/http://example.com, then (after a few seconds), -click *Download -> Web Archive (WARC)* to get the WARC file (.warc.gz) - -Everything you have seen in your browser during the recording session was archived. - - -2. Create a new Collection -"""""""""""""""""""""""""" - -Each collections contains an arbitrary amount of WARC files. 
- -Once you have at least one WARC/ARC file, you can set up a quick collection as follows, including installing -**pywb**: - -:: - - pip install pywb - wb-manager init my_coll - wb-manager add my_coll - wayback - - -Point your browser to ``http://localhost:8080/my_coll//`` where ```` is a url you recorded before into your WARC/ARC file. (If you just recorded ``http://example.com/``, you should be able to view ``http://localhost:8080/my_coll/http://example.com/``) - -If all worked well, you should see your archived version of ````. Congrats, you are now running your own web archive! - - -`A more detailed tutorial is available on the wiki `_ - - -Using Existing Web Archive Collections --------------------------------------- - -Existing archives of WARCs/ARCs files can be used with pywb with minimal amount of setup. By using ``wb-manager add``, -WARC/ARC files will automatically be placed in the collection archive directory and indexed. - -If you have a large number of existing CDX index files, pywb will be able to read them as well without having to reindex. -It is recommended that any index files be converted to the latest JSON based format, which can be done by running: -``wb-manager cdx-convert `` - -To setup a collection with existing ARC/WARCs and CDX index files, you can: - -1. Run ``wb-manager init ``. This will initialize all the required collection directories. -2. Copy any archive files (WARCs and ARCs) to ``collections//archive/`` -3. Copy any existing cdx indexes to ``collections//indexes/`` -4. Run ``wb-manager cdx-convert collections//indexes/``. This step is optional but strongly recommended, as it will - ensure that the CDX indexes are in a consistent format. - -This will fully migrate your archive and indexes the collection. Any new WARCs added with ``wb-manager add`` will be indexed and added to the existing collection. -You may use the auto-indexing features (explained below) to add new content to the existing collection. 
- -`Legacy installation instructions `_ contain additional -information and testing examples, and use a custom ``config.yaml`` file. These instructions are from previous releases but -still compatible with pywb 0.9.x. - - -Custom UI and User Metadata ---------------------------- - -**pywb** makes it easy to customize most aspects of the UI around archived content, including a custom banner insert, query calendar, search and home pages, -via HTML Jinja2 templates. - -You can see a list of all available UI templates by running: ``wb-manager template --list`` - -To copy a default template to the file system (for modification), you can run ``wb-manager template --add `` - -**pywb** now supports custom user metadata for each collection. The metadata may be specified in the ``metadata.yaml`` in each collection's directory. - -The metadata is accessible to all UI templates and may be displayed to the user as needed. - -See the `Collections Manager Tutorial `_ and the -and `UI Customization `_ page for more details. - - -Automatic Indexing ------------------- - -**pywb** now also includes support for automatic indexing of any web archive files (WARC or ARC). - -Whenever a WARC/ARC file is added or changed, pywb will update the internal index automatically and make the archived content -instantly available for replay, without manual intervention or restart. (Of course, indexing will take some time if adding -many gigabytes of data all at once, but is quite useful for smaller archive updates). - -To enable auto-indexing, you can run the ``wayback -a`` when running command line, or run -``wb-manager autoindex `` as a seperate program. - - -Samples and Tests -------------------------- - -To run with the bundled sample and test suite, you'll need to clone pywb locally: - -1. ``git clone https://github.com/ikreymer/pywb.git; cd pywb`` - -2. ``python setup.py install`` - -3. ``wayback`` to run samples - -4. 
Browse to http://localhost:8080/pywb/\*/example.com to see capture of http://example.com - -To run tests on your system, you may run ``python setup.py test`` - -(The HTTPS proxy tests require the optional ``certauth`` package and are skipped if the package is not installed) - - -Additional Samples and Other Projects -"""""""""""""""""""""""""""""""""""""" - -Additional (older) samples can be found in the `pywb-samples `_ repository. - -For additional reference on how pywb is being used, you may check some of the `public projects using with pywb `_ - - -Desktop Web Archive Player --------------------------- - -There is now also a downloadable point-and-click `Web Archive Player `_ which provides -a native OS X and Windows desktop client application for browsing web archives, built using **pywb**. - -You can use this tool to quickly check the contents of any WARC or ARC file through a simple point-and-click GUI interface, no command line tools needed. - - -pywb Tools Overview -------------------- - -In addition to the standard Wayback Machine, **pywb** tool suite includes a -number of useful command-line and web server tools. The tools should be available to use after installing with -``pip install pywb``: - - -* ``wayback`` -- The Wayback Machine application itself. - - -* ``wb-manager`` -- A command-line utility for managing collections, adding WARC/ARC files, metadata and UI templates. - See ``wb-manager --help`` for an up-to-date listing of commands and options. - - -* ``live-rewrite-server`` -- a demo live rewriting web server which accepts requests using wayback machine url format at ``/live/`` path, eg, ``/live/http://example.com/`` and applies the same url rewriting rules as are used for archived content. - This is useful for checking how live content will appear when archived before actually creating any archive files, or for recording data. - The `webrecorder.io `_ service extends upon this functionality. 
- - -* ``cdx-indexer`` -- a command-line tool for manually creating CDX indexes from WARC and ARC files. Supports SURT and - non-SURT based cdx files, optional sorting, and several formats. See ``cdx-indexer -h`` for all options. Using ``wb-manager`` is recommended - for higher-level collection file management, but this tool can be used for any custom indexing needs. - - -* ``cdx-server`` -- a CDX API only server which returns a responses about CDX captures in bulk. See `CDX Server API `_ - for an updated documentation on the latest query api. - - -Latest Changes --------------- - -See `CHANGES.rst `_ for an up-to-date changelist. - - -Running as Rewriting Live Web Proxy ------------------------------------ - -In addition to replaying archived web content, pywb can serve as a rewriting proxy to the live web. This allows **pywb** -to serve live content, and inject customized code into any web page on the fly. This allow for a variety of use cases beyond archive replay. - -For example, the `pywb-webrecorder `_ demonstrates a way to use pywb live web rewriting -together with a recording proxy (warcprox) to record content while browsing. - -The `via.hypothes.is `_ project provides an example of using pywb to inject annotations into any live web page. - - -Running in HTTP/HTTPS Proxy Mode --------------------------------- - -**pywb** can also be used as an actual HTTP and/or HTTPS proxy server. See `pywb Proxy Mode Usage `_ for more details -on configuring proxy mode. - -To run as an HTTPS proxy server, pywb uses the `certauth `_ tool for generating a custom self-signed root certificate, which can be used to replay HTTPS content from the archive. (The certificate should be used with caution within a controlled setting). - -Using these features requiring an extra dependency: installing *certauth* with ``pip install certauth``. (This will also install the ``pyOpenSSL`` package which is used to handle the -ssl functionality). 
- -Collection and Timestamp Selection In Proxy Mode -"""""""""""""""""""""""""""""""""""""""""""""""" - -When running in proxy mode, the current collection and current timestamp are not included in the page url and need to be set separeately. pywb provides several options for 'resolving' the collection and timestamp: - -- *By Proxy Auth*: Proxy Authorization settings are used to select a (fixed) collection and Memento API can be used to pick the timestamp. - -- *By IP*: Settings for current collection and timestamp can be set per-IP using a seperate HTTP request to the proxy. Useful for fixed-IP deployments, such as when running in Docker. - -- *By Cookie*: The most complex but dynamic option, this allows a user to switch collection and current timestamp through cookies that are propagated across domains. - -For more info, see `Proxy Mode Usage `_. - -The `pywb-proxy-demo `_ project also contains a working configuration of proxy mode deployment. - - -Running with any WSGI Container -------------------------------- - -The command-line ``wayback`` utility starts pywb using the standard Python library `WSGIRef `_ server. This should be sufficient for basic usage and testing, but is not recommended for production. In the future, a different default option will be provided. - -Since pywb conforms to the Python `WSGI `_ specification, it can be run with any standard WSGI container/server -and can be embedded in larger applications. - -When running with a different container, specify ``pywb.apps.wayback`` as the WSGI application module. - -For production deployments, `uWSGI `_ with gevent is the recommended container and the ``uwsgi.ini and ``run-uwsgi.sh`` -scripts in this repo provides examples of running pywb with uWSGI. 
- - -Wayback Machine Compatibility ------------------------------ - -**pywb** is compatible with the standard `Wayback Machine `_ url format, which was developed by the Internet Archive: - -Replay: ``http://///`` - -- ex: http://pywb.herokuapp.com/pywb/20140127171238/http://www.iana.org - -- ex: http://web.archive.org/web/20150316213720/http://www.example.com/ - -Query Listing: ``http:////*/`` - -- ex: http://pywb.herokuapp.com/pywb/\*/http://iana.org/ - -- ex: http://web.archive.org/web/\*/http://www.example.com/ - - -Additional Reference --------------------- - -- The `wiki `_ will have - additional technical documentation about various aspects of pywb - -- The sample ``config.yaml`` file, although not required, provides a listing of various advanced configuration options: - `config.yaml `_ - - -Contributions & Bug Reports ---------------------------- - -Users are encouraged to fork and contribute to this project to improve any and all aspects of web archival -replay and web proxy services. - -Please take a look at list of current -`issues `_ and feel -free to open new ones. diff --git a/docs/manual/configuring.rst b/docs/manual/configuring.rst index 697615eca..396159fbc 100644 --- a/docs/manual/configuring.rst +++ b/docs/manual/configuring.rst @@ -32,7 +32,11 @@ To disable framed replay add: Directory Structure ------------------- -The pywb system assumes the following default directory structure for a web archive:: +The pywb system is designed to automatically access and manage web archive collections that follow a defined directory structure. +The directory structure can be fully customized and "special" collections can be defined outside the structure as well. + +The default directory structure for a web archive is as follows:: + +-- config.yaml (optional) | @@ -63,7 +67,7 @@ The pywb system assumes the following default directory structure for a web arch If running with default settings, the ``config.yaml`` can be omitted. 
-It is possible to config these paths in the config.yaml +It is possible to configure these directory paths in the config.yaml The following are some of the implicit default settings which can be customized:: collections_root: collections @@ -107,6 +111,8 @@ In addition, several "special" collection definitions are possible. All custom defined collections are placed under the ``collections`` key in ``config.yaml`` +.. _live-web: + Live Web Collection ^^^^^^^^^^^^^^^^^^^ @@ -165,18 +171,152 @@ Such a collection must be defined explicitly using the ``$root`` as collection n Note: When a root collection is set, no other collections are currently accessible, they are ignored. +.. _recording-mode: Recording Mode -------------- -TODO +A new recording mode can be enabled for any automatically managed collection by adding a ``recorder`` block in +the root of ``config.yaml``. +The mode can be configured with the following options:: + + recorder: + source_coll: live + rollover_size: 100000000 + rollover_idle_secs: 600 + filename_template: my-warc-{timestamp}-{hostname}-{random}.warc.gz + + +This will enable the ``/record/`` access point under every managed collection, writing new WARCs directly into each collection. +The required ``source_coll`` setting specifies the source collection from which to load content that will be recorded. + +Most likely this will be the :ref:`live-web` collection, which should also be defined. +However, it could be any other collection, allowing for "extraction" from other collections or remote web archives. +Both the request and response are recorded into the WARC file, and most standard HTTP verbs should be recordable. + +The other options are optional and may be omitted. The ``rollover_size`` and ``rollover_idle_secs`` specify +the maximum size and maximum idle time, respectively, after which a new WARC file is created.
+For example, a new WARC will be created if more than 100MB are recorded, or after 600 seconds have elapsed between +subsequent requests. This allows the WARC size to be more manageable and prevents files from being left open for long periods of time. + +The ``filename_template`` specifies the naming convention for WARC files, and allows a timestamp, current hostname, and +random string to be inserted into the filename. + +For example, if recording with the above config into a collection called ``my-coll``, the user would access: + +``http://my-archive.example.com/my-coll/record/http://example.com/``, which would load ``http://example.com/`` from the live web +and write the request and response to a WARC named something like: + +``./collections/my-coll/archive/my-warc-20170102030000000000-archive.example.com-QRTGER.warc.gz`` + +If running with auto-indexing, the WARC will also get automatically indexed and become available for replay after the index interval. + +As a shortcut, ``recorder: live`` can also be used to specify only the ``source_coll`` option. + + +Auto-Indexing Mode +------------------ + +If auto-indexing is enabled, pywb will update the indexes stored in the ``indexes`` directory whenever files are added or modified in the +``archive`` directory. Auto-indexing can be enabled via the ``autoindex`` option set to the check interval in seconds:: + + autoindex: 30 + +This specifies that the ``archive`` directories should be checked every 30 seconds. Auto-indexing is useful when WARCs are being +appended to or added to the ``archive`` by an external operation. + +If a user is manually adding a new WARC to the collection, ``wb-manager add <coll> <path/to/warc>`` is recommended, +as this will add the WARC and perform a one-time reindex of the collection, without the need for auto-indexing. + +Note: Auto-indexing also does not support deletion or removal of WARCs from the ``archive`` directory.
+ +This is not a common operation for web archives; a WARC must be manually removed from the +``collections/<coll>/archive/`` directory and then the collection index can be regenerated from the remaining WARCs +by running ``wb-manager reindex <coll>`` + +The auto-indexing mode can also be enabled via commandline by running ``wayback -a`` or ``wayback -a --auto-interval 30`` to also set the interval. + +(If running pywb with uWSGI in multi-process mode, the auto-indexing is only run in a single worker to avoid race conditions and duplicate indexing) + .. _https-proxy: HTTP/S Proxy Mode ----------------- -TODO +In addition to "url rewriting prefix mode" (the default), pywb can also act as a full-fledged HTTP and HTTPS proxy, allowing +any browser or client supporting HTTP and HTTPS proxy to access web archives through the proxy. + +Proxy mode can provide access to a single collection at a time, e.g. instead of accessing ``http://localhost:8080/my-coll/2017/http://example.com/``, +the user enters ``http://example.com/`` and is served content from the ``my-coll`` collection. +As a result, the collection and timestamp must be specified separately. + +Configuring HTTP Proxy +^^^^^^^^^^^^^^^^^^^^^^ + +At this time, pywb requires the collection to be configured at setup time (though collection switching will be added soon). + +The collection can be specified by running: ``wayback --proxy my-coll`` or by adding to the config:: + + proxy: + coll: my-coll + +For HTTP proxy access, this is all that is needed to use the proxy. If pywb is running on port 8080 on localhost, the following curl command should provide proxy access: ``curl -x "localhost:8080" http://example.com/`` + + +Proxy Recording +^^^^^^^^^^^^^^^ + +The proxy can additionally be set to recording mode, equivalent to accessing the ``/<coll>/record/`` path, +by adding ``recording: true``, as follows:: + + proxy: + coll: my-coll + recording: true + +By default, proxy recording will use the ``live`` collection if not otherwise configured.
+ +See :ref:`recording-mode` for the full set of configurable recording options. + + +HTTPS Proxy and pywb Certificate Authority +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For HTTPS proxy access, pywb provides its own Certificate Authority and dynamically generates certificates for each host and signs the responses +with these certificates. By design, this allows pywb to act as a "man-in-the-middle" serving archived copies of a given site. + +However, the pywb certificate authority (CA) will need to be accepted by the browser. The CA cert can be downloaded from pywb directly +using the special download paths. The recommended setup for using the proxy is as follows: + +1. Configure the browser proxy settings host and port, for example ``localhost`` and ``8080`` (if running locally) + +2. Download the CA: + + * For most browsers, use the PEM format: ``http://wsgiprox/download/pem`` + + * For Windows, use the PKCS12 format: ``http://wsgiprox/download/p12`` + +3. You may need to agree to "Trust this CA" to identify websites. + +The pywb CA file is automatically generated if it does not exist, and may be added to the key store directly. + +Additional proxy options ``ca_name`` and ``ca_file_cache`` allow configuring the location and name of the CA file. + +The following are all the available proxy options (only ``coll`` is required):: + + proxy: + coll: my-coll + ca_name: pywb HTTPS Proxy CA + ca_file_cache: ./proxy-certs/pywb-ca.pem + recording: false + +The HTTP/S functionality is provided by the separate :mod:`wsgiprox` utility which provides HTTP/S proxy +for any WSGI application. + +See the `wsgiprox README `_ for additional details on how it works. + +For more information on custom certificate authority (CA) installation, the `mitmproxy certificate page `_ provides a good overview for installing a custom CA on different platforms.
+ UI Customizations ----------------- diff --git a/pywb/apps/cli.py b/pywb/apps/cli.py index 4abce6893..ec098321b 100644 --- a/pywb/apps/cli.py +++ b/pywb/apps/cli.py @@ -102,28 +102,13 @@ def load(self): self.extra_config['collections'] = {} self.extra_config['collections'][self.r.all_coll] = '$all' + if self.r.autoindex: + self.extra_config['autoindex'] = self.r.auto_interval + import os if self.r.directory: #pragma: no cover os.chdir(self.r.directory) - def run(self): - if self.r.autoindex: - from pywb.manager.autoindex import AutoIndexer - import os - - indexer = AutoIndexer(interval=self.r.auto_interval) - if not os.path.isdir(indexer.root_path): - msg = 'No managed directory "{0}" for auto-indexing' - logging.error(msg.format(indexer.root_path)) - import sys - sys.exit(2) - - msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs' - logging.info(msg.format(indexer.root_path, self.r.auto_interval)) - indexer.start() - - return super(ReplayCli, self).run() - #============================================================================= class WarcServerCli(BaseCli): diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index f37129273..3c581d3f3 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -58,6 +58,8 @@ def __init__(self, config_file='./config.yaml', custom_config=None): self.init_recorder(config.get('recorder')) + self.init_autoindex(config.get('autoindex')) + static_path = config.get('static_url_path', 'pywb/static/').replace('/', os.path.sep) self.static_handler = StaticHandler(static_path) @@ -125,8 +127,8 @@ def init_recorder(self, recorder_config): # TODO: support dedup dedup_index = None warc_writer = MultiFileWARCWriter(self.warcserver.archive_paths, - max_size=int(recorder_config.get('max_size', 1000000000)), - max_idle_secs=int(recorder_config.get('max_idle_secs', 600)), + max_size=int(recorder_config.get('rollover_size', 1000000000)), + max_idle_secs=int(recorder_config.get('rollover_idle_secs', 600)), 
filename_template=recorder_config.get('filename_template'), dedup_index=dedup_index) @@ -136,6 +138,22 @@ def init_recorder(self, recorder_config): self.recorder_path = self.RECORD_API % (recorder_server.port, recorder_coll) + def init_autoindex(self, auto_interval): + if not auto_interval: + return + + from pywb.manager.autoindex import AutoIndexer + indexer = AutoIndexer(interval=int(auto_interval)) + if not os.path.isdir(indexer.root_path): + msg = 'No managed directory "{0}" for auto-indexing' + logging.error(msg.format(indexer.root_path)) + import sys + sys.exit(2) + + msg = 'Auto-Indexing Enabled on "{0}", checking every {1} secs' + logging.info(msg.format(indexer.root_path, auto_interval)) + indexer.start() + def serve_home(self, environ): home_view = BaseInsertView(self.rewriterapp.jinja_env, 'index.html') fixed_routes = self.warcserver.list_fixed_routes() diff --git a/pywb/manager/autoindex.py b/pywb/manager/autoindex.py index 1c06e44f6..26ea37b8f 100644 --- a/pywb/manager/autoindex.py +++ b/pywb/manager/autoindex.py @@ -63,16 +63,13 @@ def check_path(self): index_file = os.path.join(self.manager.indexes_dir, self.AUTO_INDEX_FILE) - if os.path.isfile(index_file): - if os.name != 'nt' and self.is_newer_than(archive_dir, index_file): - continue - else: + if not os.path.isfile(index_file): try: os.makedirs(self.manager.indexes_dir) except Exception as e: pass - logging.info('Collection Possibly Changed: ' + coll) + logging.info('Checking Collection: ' + coll) to_index = [] for dirpath, dirnames, filenames in os.walk(archive_dir): for filename in filenames: @@ -88,6 +85,14 @@ def check_path(self): self.do_index(to_index) def run(self): + try: + # If running in uwsgi, run AutoIndexer only in first worker! + import uwsgi + if uwsgi.worker_id() != 1: + return + except: + pass + try: while self.keep_running: self.check_path()