# trove_newspaper_harvester/cli.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../01_cli.ipynb.

# %% auto 0
__all__ = ['start_harvest', 'restart_harvest', 'report_harvest', 'main']

# %% ../01_cli.ipynb 4
import argparse
from pathlib import Path
from pprint import pprint

from requests.exceptions import HTTPError

from trove_newspaper_harvester.core import (
    Harvester,
    NoQueryError,
    get_harvest,
    get_config,
    prepare_query,
    get_crate
)

# %% ../01_cli.ipynb 5
def start_harvest(
    query=None,
    key=None,
    config_file=None,
    data_dir="data",
    harvest_dir=None,
    text=False,
    pdf=False,
    image=False,
    include_linebreaks=False,
    max=None,
    keep_json=False,
):
    """
    Start a harvest. Note that you must supply either `query` and `key`, or `config_file`.

    Once the harvest has run, and if it found any results, they are saved as a CSV
    file. Unless `keep_json` is set, the `results.ndjson` file in the harvest
    directory is then deleted and removed from the harvest's crate.

    Parameters:

    * `query` [optional, search url from Trove web interface or API, string]
    * `key` [optional, Trove API key, string]
    * `config_file` [optional, path to a config file]
    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]
    * `text` [optional, save articles as text files, True or False]
    * `pdf` [optional, save articles as PDFs, True or False]
    * `image` [optional, save articles as images, True or False]
    * `include_linebreaks` [optional, include linebreaks in text files, True or False]
    * `max` [optional, maximum number of results, integer]
    * `keep_json` [optional, keep the results.ndjson file, True or False]

    """
    # Gather the harvester settings in one place; the query url is converted
    # to a dictionary of parameters first (outside the error handling below).
    harvester_settings = {
        "query_params": prepare_query(query),
        "key": key,
        "config_file": config_file,
        "data_dir": data_dir,
        "harvest_dir": harvest_dir,
        "pdf": pdf,
        "text": text,
        "image": image,
        "include_linebreaks": include_linebreaks,
        "maximum": max,
    }

    try:
        harvester = Harvester(**harvester_settings)
    except HTTPError as error:
        # Only a 403 is reported as an authorisation problem; anything else propagates.
        if error.response.status_code != 403:
            raise
        print("The request could not be authorised, check your API key.")
        return
    except NoQueryError:
        print("No query parameters found, check your query url. You must supply either a query and key, or a config_file.")
        return

    # Go!
    try:
        harvester.harvest()
    except AttributeError:
        # An AttributeError raised while harvesting ends the run quietly.
        return

    # Nothing to save or tidy up if no results were found.
    if not harvester.total > 0:
        return

    harvester.save_csv()
    if not keep_json:
        ndjson_path = Path(harvester.harvest_dir, "results.ndjson")
        ndjson_path.unlink()
        harvester.remove_ndjson_from_crate()


def restart_harvest(data_dir="data", harvest_dir=None):
    """
    Restart a failed harvest.

    The harvest is located with `get_harvest`. It is only restarted if a
    `.sqlite` file, named by joining the parts of the harvest path with
    hyphens, exists; otherwise this function does nothing.

    Parameters:

    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]
    """
    # Pass on only the values that were actually supplied, so a missing
    # `harvest_dir` no longer causes a supplied `data_dir` to be ignored
    # (and vice versa). `get_harvest` uses its own defaults for the rest.
    locator = {}
    if data_dir:
        locator["data_dir"] = data_dir
    if harvest_dir:
        locator["harvest_dir"] = harvest_dir
    harvest = get_harvest(**locator)

    if not Path(f"{'-'.join(harvest.parts)}.sqlite").exists():
        return

    # Split the path into "everything but the last component" and "last
    # component" rather than unpacking `parts`, which fails whenever the
    # path doesn't have exactly two parts (e.g. a nested or absolute data_dir).
    data_dir, harvest_dir = str(harvest.parent), harvest.name
    config_path = Path(data_dir, harvest_dir, "harvester_config.json")
    harvester = Harvester(
        data_dir=data_dir,
        harvest_dir=harvest_dir,
        config_file=config_path
    )
    harvester.harvest()


def report_harvest(data_dir="data", harvest_dir=None):
    """
    Provide some details of a harvest.
    If no harvest is specified, show the most recent.

    Prints the harvest parameters from the harvest's config, followed by
    the results details recorded in the harvest's crate. Nothing is printed
    if no config is found. Details whose entity or property is missing from
    the crate are printed as blank values rather than raising an error.

    Parameters:

    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]
    """
    harvest = get_harvest(data_dir=data_dir, harvest_dir=harvest_dir)
    config = get_config(harvest)
    if not config:
        # Without a config there is nothing to report, so don't load the crate.
        return

    crate = get_crate(harvest)

    def _properties(entity_id):
        # Properties of a crate entity, or an empty dict if the crate has no
        # such entity. NOTE(review): assumes `crate.get` returns None for an
        # unknown id (as ro-crate-py does) -- confirm against `get_crate`.
        entity = crate.get(entity_id)
        return entity.properties() if entity is not None else {}

    harvest_run = _properties("#harvester_run")
    results = _properties("results.csv")
    harvester = crate.get("https://github.com/wragge/trove-newspaper-harvester")

    print("")
    print("HARVEST PARAMETERS")
    print("==================")
    print(f"Harvest path: {config['full_harvest_dir']}")
    print("Query parameters:")
    pprint(config["query_params"], indent=2)
    print(f"Max results: {config['maximum']}")
    print(f"Include PDFs: {config['pdf']}")
    print(f"Include text: {config['text']}")
    print(f"Include images: {config['image']}")
    print(f"Include linebreaks: {config['include_linebreaks']}")
    print("")
    print("HARVEST RESULTS")
    print("===============")
    print(f"Harvest started: {harvest_run.get('startDate', '')}")
    print(f"Harvest ended: {harvest_run.get('endDate', '')}")
    print(f"Total articles: {results.get('size', '')}")
    if harvester is not None:
        print(f"Harvested by: {harvester['name']} version {harvester['softwareVersion']}")

# CLI


def main():
    """
    Sets up the command-line interface

    Defines the `start`, `restart` and `report` sub-commands, parses the
    command line, and calls the matching function with the parsed options.
    If no sub-command is given, the usage help is printed.
    """
    parser = argparse.ArgumentParser(prog="troveharvester")
    subparsers = parser.add_subparsers(dest="action")

    # start
    parser_start = subparsers.add_parser("start", help="start a new harvest")
    parser_start.add_argument("query", nargs="?", default="", help="url of the search you want to harvest")
    parser_start.add_argument("key", nargs="?", default="", help="Your Trove API key")
    # Accept the underscore spelling too, to match every other option.
    parser_start.add_argument(
        "--config-file",
        "--config_file",
        dest="config_file",
        help="The path to a harvester config file",
    )
    parser_start.add_argument(
        "--data_dir", default="data", help="directory for harvests"
    )
    parser_start.add_argument("--harvest_dir", help="directory for this harvest")
    parser_start.add_argument(
        "--max", type=int, default=0, help="maximum number of results to return"
    )
    parser_start.add_argument(
        "--pdf", action="store_true", help="save PDFs of articles"
    )
    parser_start.add_argument(
        "--text", action="store_true", help="save text contents of articles"
    )
    parser_start.add_argument(
        "--image", action="store_true", help="save images of articles"
    )
    parser_start.add_argument(
        "--include_linebreaks",
        action="store_true",
        help="preserve line breaks in text files",
    )
    parser_start.add_argument(
        "--keep_json", action="store_true", help="keep the raw ndjson results file"
    )

    # restart / report
    # `--data_dir` defaults to "data" here as well, as it does for `start`.
    # Without a default argparse yields None, which would be passed on
    # explicitly and override the functions' own `data_dir="data"` default.
    parser_restart = subparsers.add_parser(
        "restart", help="restart an unfinished harvest"
    )
    parser_restart.add_argument(
        "--data_dir", default="data", help="directory for harvests"
    )
    parser_restart.add_argument("--harvest_dir", help="directory for this harvest")
    parser_report = subparsers.add_parser("report", help="report on a harvest")
    parser_report.add_argument(
        "--data_dir", default="data", help="directory for harvests"
    )
    parser_report.add_argument("--harvest_dir", help="directory for this harvest")

    args = parser.parse_args()
    if args.action == "report":
        report_harvest(
            data_dir=args.data_dir,
            harvest_dir=args.harvest_dir,
        )
    elif args.action == "restart":
        restart_harvest(
            data_dir=args.data_dir,
            harvest_dir=args.harvest_dir,
        )
    elif args.action == "start":
        start_harvest(
            query=args.query,
            key=args.key,
            config_file=args.config_file,
            data_dir=args.data_dir,
            harvest_dir=args.harvest_dir,
            text=args.text,
            pdf=args.pdf,
            image=args.image,
            include_linebreaks=args.include_linebreaks,
            keep_json=args.keep_json,
            max=args.max,
        )
    else:
        # No sub-command given: show usage instead of silently doing nothing.
        parser.print_help()