# core

> a harvester for downloading large numbers of digitised newspaper articles from Trove

In [1]:
#| default_exp core

In [2]:
#| export
import argparse
import csv
import datetime
import json
import os
import re
import time
from importlib.metadata import version
from pathlib import Path
from pprint import pprint
from urllib.parse import parse_qs, parse_qsl, urlparse

import arrow
import html2text
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry
from rocrate.rocrate import ROCrate
from rocrate.model.contextentity import ContextEntity
from rocrate.model.person import Person
from tqdm.auto import tqdm
from trove_newspaper_images.articles import download_images
from trove_query_parser.parser import parse_query

In [3]:
#| hide
import shutil

from nbdev.showdoc import *
from fastcore.test import ExceptionExpected
import pandas as pd

# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv

In [4]:
#| export


class NoQueryError(Exception):
    """
    Exception triggered by empty query.
    """
    pass


class Harvester:
    """
    Harvest large quantities of digitised newspaper articles from Trove. Note that you must supply either `query_params` and `key` or `config_file`.

    Parameters:

    * `query_params` [optional, dictionary of parameters]
    * `key` [optional, Trove API key]
    * `config_file` [optional, path to a config file]
    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]
    * `text` [optional, save articles as text files, True or False]
    * `pdf` [optional, save articles as PDFs, True or False]
    * `image` [optional, save articles as images, True or False]
    * `include_linebreaks` [optional, include linebreaks in text files, True or False]
    * `maximum` [optional, maximum number of results, integer]
    """

    zoom = 3
    api_url = "https://api.trove.nla.gov.au/v3/result"

    def __init__(
        self,
        query_params=None,
        key=None,
        data_dir="data",
        harvest_dir=None,
        config_file=None,
        text=False,
        pdf=False,
        image=False,
        include_linebreaks=False,
        maximum=None,
    ):
        if not (query_params and key) and not config_file:
            raise NoQueryError
            
        if config_file:
            config = json.loads(Path(config_file).read_text())
            self.query_params = config["query_params"]
            self.key = config["key"]
            self.pdf = config["pdf"]
            self.text = config["text"]
            self.image = config["image"]
            self.include_linebreaks = config["include_linebreaks"]
            self.maximum = config["maximum"]
        else:
            self.query_params = query_params
            self.key = key
            self.pdf = pdf
            self.text = text
            self.image = image
            self.include_linebreaks = include_linebreaks
            self.maximum = maximum
        if self.text:
            try:
                self.query_params["include"].append("articleText")
            except KeyError:
                self.query_params["include"] = ["articleText"]
        self.data_dir = Path(data_dir)
        if harvest_dir:
            self.harvest_dir = Path(self.data_dir, harvest_dir)
        else:
            self.harvest_dir = Path(
                self.data_dir, arrow.utcnow().format("YYYYMMDDHHmmss")
            )
        self.s = self.initialise_cache()
        self.ndjson_file = Path(self.harvest_dir, "results.ndjson")
        # Deletes existing file in case of restart
        self.ndjson_file.unlink(missing_ok=True)
        self.create_dirs()
        self.harvested = 0
        self.start = "*"
        self.number = 100
        if self.maximum:
            self.total = self.maximum
        else:
            self.total = self._get_total()
        self.save_config()
        self.create_crate()

    def initialise_cache(self):
        cache_name = "-".join(self.harvest_dir.parts)
        s = requests_cache.CachedSession(cache_name)
        retries = Retry(
            total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
        )
        s.mount("http://", HTTPAdapter(max_retries=retries))
        s.mount("https://", HTTPAdapter(max_retries=retries))
        return s

    def delete_cache(self):
        cache_name = f"{'-'.join(self.harvest_dir.parts)}.sqlite"
        Path(cache_name).unlink(missing_ok=True)

    def create_dirs(self):
        self.harvest_dir.mkdir(exist_ok=True, parents=True)
        if self.pdf:
            Path(self.harvest_dir, "pdf").mkdir(exist_ok=True)
        if self.text:
            Path(self.harvest_dir, "text").mkdir(exist_ok=True)
        if self.image:
            Path(self.harvest_dir, "image").mkdir(exist_ok=True)

    def _get_total(self):
        try:
            params = self.query_params.copy()
        except AttributeError:
            raise NoQueryError
        else:
            params["n"] = 0
            response = self.s.get(self.api_url, params=params, headers={"X-API-KEY": self.key}, timeout=30)
            response.raise_for_status()
            # print(response.url)
            try:
                results = response.json()
            except (AttributeError, ValueError):
                return 0
            else:
                return int(results["category"][0]["records"]["total"])

    def log_query(self):
        """
        Do something with details of query -- ie log date?
        """
        pass

    def harvest(self):
        """
        Start the harvest and loop over the result set until finished.
        """
        if self.total > 0:
            params = self.query_params.copy()
            params["n"] = self.number
            with tqdm(total=self.total, unit="article") as pbar:
                pbar.update(self.harvested)
                while self.start and (self.harvested < self.total):
                    params["s"] = self.start
                    response = self.s.get(self.api_url, params=params, headers={"X-API-KEY": self.key}, timeout=30)
                    response.raise_for_status()
                    # print(response.url)
                    try:
                        results = response.json()
                    except (AttributeError, ValueError):
                        # Log errors?
                        pass
                    else:
                        records = results["category"][0]["records"]
                        self.process_results(records, pbar)
                        # pbar.update(len(records['article']))
        # Add the number harvested to the metadata file
        self.update_crate()
        self.delete_cache()
        
    def create_crate(self):
        crate = ROCrate()
        
        # Add CreateAction with datetime started & instrument & object pointing to config
        harvest_properties = {
            "@type": "CreateAction",
            "name": "Run of harvester",
            "startDate": arrow.now().isoformat(),
            "instrument": "https://github.com/wragge/trove-newspaper-harvester",
            "object": "harvester_config.json",
            "actionStatus": {"@id": "http://schema.org/ActiveActionStatus"}
        }
        crate.add(ContextEntity(crate, "#harvester_run", properties=harvest_properties))
        
        # Add link to action from root
        crate.update_jsonld(
            {
                "@id": "./",
                "mainEntity": {"@id": "#harvester_run"}
            }
        )
        
        # Add harvester as software
        harvester_properties = {
            "@type": "SoftwareApplication",
            "name": "Trove Newspaper and Gazette Harvester",
            "description": "The Trove Newspaper (& Gazette) Harvester makes it easy to download large quantities of digitised articles from Trove’s newspapers and gazettes.",
            "documentation": "https://wragge.github.io/trove-newspaper-harvester/",
            "url": "https://github.com/wragge/trove-newspaper-harvester",
            "softwareVersion": version('trove_newspaper_harvester')
        }
        
        crate.add(ContextEntity(crate, "https://github.com/wragge/trove-newspaper-harvester", properties=harvester_properties))
        
        # Add config file
        config_properties = {
            "@type": "File",
            "name": "Trove Newspaper Harvester configuration file",
            "encodingFormat": "application/json"
        }
        
        crate.add_file(Path(self.harvest_dir, "harvester_config.json"), properties=config_properties)
        
        # Add licences
        # For newspaper metadata
        nkc_properties = {
            "@type": "CreativeWork",
            "url": "http://rightsstatements.org/vocab/NKC/1.0/",
            "name": "No Known Copyright",
            "description": "The organization that has made the Item available reasonably believes that the Item is not restricted by copyright or related rights, but a conclusive determination could not be made."
        }
        
        # For text, pdfs, images
        cne_properties = {
            "@type": "CreativeWork",
            "url": "http://rightsstatements.org/vocab/CNE/1.0/",
            "name": "Copyright Not Evaluated",
            "description": "The copyright and related rights status of this Item has not been evaluated."
        }
        
        # For crate metadata
        cc_properties = {
            "name": "CC0 Public Domain Dedication",
            "@type": "CreativeWork",
            "url": "https://creativecommons.org/publicdomain/zero/1.0/"
        }
        
        for licence in [nkc_properties, cne_properties, cc_properties]:
            crate.add(ContextEntity(crate, licence["url"], properties=licence))
        
        # Add licence to metadata
        crate.update_jsonld(
            {
                "@id": "ro-crate-metadata.json",
                "license": {"@id": "https://creativecommons.org/publicdomain/zero/1.0/"}
            }
        )
        
        crate.write(self.harvest_dir)

    def save_config(self):
        """
        Save the harvester config in a JSON file.
        Useful for documenting your harvest.
        """
        config = {
            "query_params": self.query_params,
            "key": self.key,
            "full_harvest_dir": str(self.harvest_dir),
            "maximum": self.maximum,
            "text": self.text,
            "pdf": self.pdf,
            "image": self.image,
            "include_linebreaks": self.include_linebreaks,
            #"date_started": arrow.utcnow().isoformat(),
            #"harvester": f"trove_newspaper_harvester v{version('trove_newspaper_harvester')}",
        }
        with Path(self.harvest_dir, "harvester_config.json").open("w") as config_file:
            json.dump(config, config_file, indent=4)

    def update_crate(self):
        """
        Update the RO-Crate file with the total harvested, end date, and files.
        """
        crate = ROCrate(source=self.harvest_dir)
        
        finished_date = arrow.now().isoformat()
        
        run_update = {
            "@id": "#harvester_run",
            "endDate": finished_date
        }
        
        if self.harvested > 0:
            run_update["actionStatus"] = {"@id": "http://schema.org/CompletedActionStatus"}
            run_update["result"] = [{"@id": "results.ndjson"}]
            
            ndjson_properties = {
                "@type": ["File", "Dataset"],
                "name": "Metadata of harvested articles in NDJSON format",
                "dateCreated": finished_date,
                "encodingFormat": "application/x-ndjson",
                "size": self.harvested,
                "contentSize": Path(self.harvest_dir, "results.ndjson").stat().st_size,
                "license": {"@id": "http://rightsstatements.org/vocab/NKC/1.0/"}
            }

            crate.add_file(Path(self.harvest_dir, "results.ndjson"), properties=ndjson_properties)

            if self.text:
                run_update["result"].append({"@id": "text"})
                text_properties = {
                    "@type": ["File", "Dataset"],
                    "name": "Text files harvested from articles",
                    "description": "There is one text file per article. The file titles include basic article metadata – the date of the article, the id number of the newspaper, and the id number of the article.",
                    "dateCreated": finished_date,
                    "size": len(list(Path(self.harvest_dir, "text").glob('*.txt'))),
                    "license": {"@id": "http://rightsstatements.org/vocab/CNE/1.0/"}
                }
                crate.add_file(Path(self.harvest_dir, "text"), properties=text_properties)

            if self.pdf:
                run_update["result"].append({"@id": "pdf"})
                pdf_properties = {
                    "@type": ["File", "Dataset"],
                    "name": "PDF files of harvested articles",
                    "description": "There is one PDF file per article. The file titles include basic article metadata – the date of the article, the id number of the newspaper, and the id number of the article.",
                    "dateCreated": finished_date,
                    "size": len(list(Path(self.harvest_dir, "pdf").glob('*.pdf'))),
                    "license": {"@id": "http://rightsstatements.org/vocab/CNE/1.0/"}
                }
                crate.add_file(Path(self.harvest_dir, "pdf"), properties=pdf_properties)

            if self.image:
                run_update["result"].append({"@id": "image"})
                image_properties = {
                    "@type": ["File", "Dataset"],
                    "name": "Images of harvested articles",
                    "description": "There can be multiple image files per article if the article was split over multiple pages. The file titles include basic article metadata – the date of the article, the id number of the newspaper, the id number of the article, and the id number of the page.",
                    "dateCreated": finished_date,
                    "size": len(list(Path(self.harvest_dir, "image").glob('*.jpg'))),
                    "license": {"@id": "http://rightsstatements.org/vocab/CNE/1.0/"}
                }
                crate.add_file(Path(self.harvest_dir, "image"), properties=image_properties)
        else:
            run_update["actionStatus"] = {"@id": "http://schema.org/FailedActionStatus"}
                
        crate.update_jsonld(run_update)   
        crate.write(self.harvest_dir)
        
    def add_csv_to_crate(self):
        """
        Update the RO-Crate file with the total harvested, end date, and files.
        """
        crate = ROCrate(source=self.harvest_dir)
        
        csv_properties = {
            "@type": ["File", "Dataset"],
            "name": "Metadata of harvested articles in CSV format",
            "dateCreated": arrow.now().isoformat(),
            "encodingFormat": "text/csv",
            "size": self.harvested,
            "contentSize": Path(self.harvest_dir, "results.csv").stat().st_size,
            "license": {"@id": "http://rightsstatements.org/vocab/NKC/1.0/"}
        }
        
        crate.add_file(Path(self.harvest_dir, "results.csv"), properties=csv_properties)
        
        crate.get("#harvester_run").append_to("result", {"@id": "results.csv"})

        crate.write(self.harvest_dir)
        
    def remove_ndjson_from_crate(self):
        crate = ROCrate(source=self.harvest_dir)
        crate.delete("results.ndjson")
        crate.write(self.harvest_dir)
        outputs = crate.get("#harvester_run").properties()["result"]
        new_outputs = [o for o in outputs if o != {"@id": "results.ndjson"}]
        crate.update_jsonld({"@id": "#harvester_run", "result": new_outputs})
        crate.write(self.harvest_dir)

    def save_csv(self):
        """
        Flatten and rename data in the ndjson file to save as CSV.
        """
        with Path(self.harvest_dir, "results.csv").open('w') as csvfile:
            columns = ['article_id', 'title', 'date', 'page', 'newspaper_id', 'newspaper_title', 'category', 'words', 'illustrated', 'edition', 'supplement', 'section', 'url', 'page_url', 'snippet', 'relevance', 'corrections', 'last_corrected', 'tags', 'comments', 'lists', 'text', 'pdf', 'images']
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            with self.ndjson_file.open("r") as ndjson_file:
                for line in ndjson_file:
                    data = json.loads(line.strip())
                    row = {
                        "article_id": data["id"],
                        "title": data["heading"],
                        "date": data["date"],
                        "page": data["pageSequence"],
                        "newspaper_id": data["title"]["id"],
                        "newspaper_title": data["title"].get("title", ""),
                        "category": data["category"],
                        "words": data["wordCount"],
                        "illustrated": data["illustrated"],
                        "edition": data.get("edition", ""),
                        "supplement": data.get("supplement", ""),
                        "section": data.get("section", ""),
                        "url": data["identifier"],
                        "page_url": data.get("trovePageUrl", ""),
                        "snippet": data.get("snippet", ""),
                        "relevance": data.get("relevance", {}).get("score", ""),
                        "corrections": data.get("correctionCount", 0),
                        "last_corrected": data.get("lastCorrection", {}).get("lastupdated", ""),
                        "tags": data.get("tagCount", 0),
                        "comments": data.get("commentCount", 0),
                        "lists": data.get("listCount", 0),
                        "text": data["articleText"],
                        "pdf": data["pdf"],
                        "images": "|".join(data["images"])
                    }
                    writer.writerow(row)
        self.add_csv_to_crate()

    def make_filename(self, article):
        """
        Create a filename for a text file or PDF.
        For easy sorting/aggregation the filename has the format:
            PUBLICATIONDATE-NEWSPAPERID-ARTICLEID
        """
        # If the article object doesn't have basic info like date, there's something wrong
        # Don't try and save files if that's the case
        try:
            date = article["date"]
        except KeyError:
            return None
        date = date.replace("-", "")
        newspaper_id = article["title"]["id"]
        article_id = article["id"]
        return f"{date}-{newspaper_id}-{article_id}"

    def ping_pdf(self, ping_url):
        """
        Check to see if a PDF is ready for download.
        If a 200 status code is received, return True.
        """
        ready = False
        # req = Request(ping_url)
        try:
            # urlopen(req)
            with self.s.cache_disabled():
                response = self.s.get(ping_url, timeout=30)
            response.raise_for_status()
        except HTTPError:
            if response.status_code == 423:
                ready = False
            else:
                raise
        else:
            ready = True
        return ready

    def get_pdf_url(self, article_id, zoom=3):
        """
        Download the PDF version of an article.
        These can take a while to generate, so we need to ping the server to see if it's ready before we download.
        """
        pdf_url = None
        # Ask for the PDF to be created
        prep_url = "https://trove.nla.gov.au/newspaper/rendition/nla.news-article{}/level/{}/prep".format(
            article_id, zoom
        )
        response = self.s.get(prep_url)
        # Get the hash
        prep_id = response.text
        # Url to check if the PDF is ready
        ping_url = "https://trove.nla.gov.au/newspaper/rendition/nla.news-article{}.{}.ping?followup={}".format(
            article_id, zoom, prep_id
        )
        tries = 0
        ready = False
        time.sleep(1)  # Give some time to generate pdf
        # Are you ready yet?
        while ready is False and tries < 5:
            ready = self.ping_pdf(ping_url)
            if not ready:
                tries += 1
                time.sleep(2)
        # Download if ready
        if ready:
            pdf_url = "https://trove.nla.gov.au/newspaper/rendition/nla.news-article{}.{}.pdf?followup={}".format(
                article_id, zoom, prep_id
            )
        return pdf_url

    def get_aww_text(self, article_id):
        # Download text using the link from the web interface
        url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}.txt"
        response = self.s.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            # Remove the header
            soup.find("p").decompose()
            soup.find("hr").decompose()
            return str(soup)

    def save_text(self, article):
        text_filename = self.make_filename(article)
        if text_filename:
            text_file = Path(self.harvest_dir, "text", f"{text_filename}.txt")
            if not text_file.exists():
                html_text = article.get("articleText")
                if not html_text:
                    # If the text isn't in the API response (as with AWW), download separately
                    html_text = self.get_aww_text(article["id"])
                if html_text:
                    # Convert html to plain text
                    text = html2text.html2text(html_text)
                    if self.include_linebreaks == False:
                        text = re.sub("\s+", " ", text)

                    with open(text_file, "wb") as text_output:
                        text_output.write(text.encode("utf-8"))
                else:
                    return ""
            # Removes the output_dir from path
            return text_file.relative_to(*text_file.parts[:2])
        else:
            return ""

    def save_pdf(self, article):
        pdf_filename = self.make_filename(article)
        if pdf_filename:
            pdf_file = Path(self.harvest_dir, "pdf", f"{pdf_filename}.pdf")
            if not pdf_file.exists():
                pdf_url = self.get_pdf_url(article["id"])
                if pdf_url:
                    response = self.s.get(pdf_url)
                    pdf_file.write_bytes(response.content)
                    # Removes the output_dir from path
                else:
                    return ""
            return pdf_file.relative_to(*pdf_file.parts[:2])
        else:
            return ""

    def process_results(self, records, pbar):
        """
        Processes a page full of results.
        """
        rows = []
        try:
            articles = records["article"]
        except KeyError:
            raise
        else:
            with self.ndjson_file.open("a") as ndjson_file:
                for article in articles:
                    if self.harvested >= self.total:
                        break
                    article_id = article["id"]
                    # rows.append(self.prepare_row(article))

                    if self.pdf:
                        pdf_file = self.save_pdf(article)
                        article["pdf"] = str(pdf_file)
                    else:
                        article["pdf"] = ""
                    if self.text:
                        text_file = self.save_text(article)
                        article["articleText"] = str(text_file)
                    else:
                        article["articleText"] = ""
                    if self.image:
                        images = download_images(
                            article_id, output_dir=Path(self.harvest_dir, "image")
                        )
                        images = [str(Path("image", i)) for i in images]
                        article["images"] = images
                    else:
                        article["images"] = []
                    ndjson_file.write(json.dumps(article) + "\n")
                    pbar.update(1)
                    # Update the number harvested
                    self.harvested += 1
            # Get the nextStart token
            try:
                self.start = records["nextStart"]
            except KeyError:
                self.start = None
            # print('Harvested: {}'.format(self.harvested))


def prepare_query(query):
    """
    Converts a Trove search url into a set of parameters ready for harvesting.

    Parameters:

    * `query` [required, search url from Trove web interface or API, string]

    Returns:

    * a dictionary of parameters
    """
    if query:
        #if text and "articleText" not in query:
            # If text is set to True, make sure the query is getting the article text
            # Adding it here rather than to the params dict to avoid overwriting any existing include values
            #query += "&include=articleText"
        if "api.trove.nla.gov.au" in query:
            # If it's an API url, no further processing of parameters needed
            parsed_url = urlparse(query)
            new_params = parse_qs(parsed_url.query)
        else:
            # These params can be accepted as is.
            new_params = parse_query(query, 3)
        new_params["encoding"] = "json"
        new_params["reclevel"] = "full"
        new_params["bulkHarvest"] = "true"
        
        # The query parser defaults to 'newspaper,gazette' if no zone is set.
        # But multiple zones won't work with bulkHarvest, so set to 'newspaper'.
        #if new_params["zone"] == "newspaper,gazette":
        #    new_params["zone"] = "newspaper"
        # return '{}?{}'.format('https://api.trove.nla.gov.au/v2/result', urlencode(new_params, doseq=True))
        return new_params


def get_harvest(data_dir="data", harvest_dir=None):
    """
    Get the path to a harvest.
    If data_dir and harvest_dir are not supplied, this will return the most recent harvest in the 'data' directory.

    Parameters:

    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]

    Returns:

    * a pathlib.Path object pointing to the harvest directory
    """
    if harvest_dir:
        harvest = Path(data_dir, harvest_dir)
    else:
        harvests = Path("data").glob("*")
        harvests = sorted([d for d in harvests if d.is_dir()])
        harvest = Path(harvests[-1])
    return harvest


def get_config(harvest):
    """
    Get the query config parameters from a harvest directory.

    Parameters:

    * `harvest` [required, path to harvest, string or pathlib.Path]

    Returns:

    * config dictionary
    """
    try:
        with Path(harvest, "harvester_config.json").open("r") as config_file:
            config = json.load(config_file)
    except IOError:
        print("No harvest!")
        config = None
    return config

def get_crate(harvest):
    """
    Get the RO-Crate metadata file from a harvest directory.
    
     Parameters:

    * `harvest` [required, path to harvest, string or pathlib.Path]

    Returns:

    * ROCrate object
    """
    return ROCrate(source=harvest)

In [5]:
show_doc(Harvester)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L41){target="_blank" style="float:right; font-size:smaller"}

### Harvester

>      Harvester (query_params=None, key=None, data_dir='data',
>                 harvest_dir=None, config_file=None, text=False, pdf=False,
>                 image=False, include_linebreaks=False, maximum=None)

Harvest large quantities of digitised newspaper articles from Trove. Note that you must supply either `query_params` and `key` or `config_file`.

Parameters:

* `query_params` [optional, dictionary of parameters]
* `key` [optional, Trove API key]
* `config_file` [optional, path to a config file]
* `data_dir` [optional, directory for harvests, string]
* `harvest_dir` [optional, directory for this harvest, string]
* `text` [optional, save articles as text files, True or False]
* `pdf` [optional, save articles as PDFs, True or False]
* `image` [optional, save articles as images, True or False]
* `include_linebreaks` [optional, include linebreaks in text files, True or False]
* `maximum` [optional, maximum number of results, integer]

The `Harvester` class configures and runs your harvest, saving results in a variety of formats.

By default, the harvester will save harvests in a directory called `data`, with each individual harvest in a directory named according to the current date and time (`YYYYMMDDHHmmss` format). You can change this by setting the `data_dir` and `harvest_dir` parameters. This can help you to manage your harvests by grouping together related searches, or giving them meaningful names.

The harvester generates three data files by default:

* `harvester_config.json` a file that captures the parameters used to launch the harvest
* `ro-crate-metadata.json` a metadata file documenting the harvest in [RO-Crate](https://www.researchobject.org/ro-crate/) format
* `results.ndjson` contains details of all the harvested articles in a newline delimited JSON format (each line is a JSON object)

You can convert the `ndjson` file to a CSV format using `Harvester.save_csv`.

The `text`, `pdf`, and `image` options give you the option to save the contents of the articles as either text files, PDF files, or JPG images. Note that saving PDFs and images can be very slow.

If you only want to harvest part of the results set you can set the `maximum` parameter to the number of records you want.

### Quick start

* You'll need a [Trove API key](https://trove.nla.gov.au/about/create-something/using-api) to use the harvester.
* Just copy the url from a search in the newspapers and gazettes category.

```python
from trove_newspaper_harvester.core import prepare_query, Harvester

my_api_key = "myApIkEy"
search_url = "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"

# Convert the search url into a set of API parameters
my_query_params = prepare_query(search_url)

# Initialise the Harvester
harvester = Harvester(query_params=myquery_params, key=my_api_key)

# Start the harvest
harvester.harvest()
```

If you want to harvest the OCRd text of articles as well as metadata, add `text=True` to the harvester initialisation.

```python
# Initialise the Harvester
harvester = Harvester(query_params=myquery_params, key=my_api_key, text=True)
```

Similarly you can harvest PDFs and images of articles by adding `pdf=True` and `image=True` to the harvester initialisation, but keep in mind that these options will make the harvest much slower!

You *must* supply either `query_params` and `key`, or the path to a `config_file`. If you don't you'll get a `NoQueryError`.

You can generate a set of query parameters from a Trove search url using `prepare_query()`.

In [6]:
# TEST FOR MISSING PARAMETERS
# You need to supply either query_params AND key, OR config_file. 
# If you don't you'll get a NoQueryError
with ExceptionExpected(ex=NoQueryError):
    harvester = Harvester()

In [7]:
show_doc(prepare_query)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L604){target="_blank" style="float:right; font-size:smaller"}

### prepare_query

>      prepare_query (query)

Converts a Trove search url into a set of parameters ready for harvesting.

Parameters:

* `query` [required, search url from Trove web interface or API, string]

Returns:

* a dictionary of parameters

The `prepare_query` function converts a search url from the Trove web interface or API into a set of parameters that you can feed to `Harvester`. It uses the [trove-query-parser](https://pypi.org/project/trove-query-parser/) to do most of the work, but adds in a few extra parameters needed for the harvest.

In [8]:
query_params = prepare_query(
    query="https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=New%20South%20Wales&l-artType=newspapers&l-title=508&l-decade=191&l-category=Article"
)
query_params

{'q': 'wragge',
 'l-state': ['New South Wales'],
 'l-artType': 'newspapers',
 'l-title': ['508'],
 'l-decade': ['191'],
 'l-category': ['Article'],
 'category': 'newspaper',
 'encoding': 'json',
 'reclevel': 'full',
 'bulkHarvest': 'true'}

In [9]:
# TEST query_params()
# Convert a url from the Trove web interface
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"
)

# Test the results
assert query_params == {
    "q": "wragge",
    "category": "newspaper",
    "encoding": "json",
    "reclevel": "full",
    "bulkHarvest": "true",
}

# Convert a url from an API request
query_params = prepare_query(
    "https://api.trove.nla.gov.au/v2/result?q=wragge&category=newspaper&encoding=json&l-category=Article"
)

assert query_params == {
    "q": ["wragge"],
    "category": ["newspaper"],
    "encoding": "json",
    "l-category": ["Article"],
    "reclevel": "full",
    "bulkHarvest": "true",
}

### Initialising a harvest using a `harvester_config.json` file

The parameters used to initialise a harvest are saved into a file called `harvester_config.json()`. This provides useful documentation of your harvest, making it possible to reconstruct the process at a later date.

For example, you might want to re-harvest a particular query a year after your initial harvest to see how the results have changed. Remember, more articles are being added every week! To re-run a harvest, just point the Harvester to the `harvester_config.json()` file. By default, your new harvest will be saved in a fresh directory.

```python
from trove_newspaper_harvester.core import Harvester

harvester = Harvester(config_file="path/to/old/harvest/harvester_config.json")

harvester.harvest()
```

In [10]:
# TEST: Reharvest from config file

API_KEY = os.getenv("TROVE_API_KEY")

test_config = {
    'query_params': {'q': 'wragge',
    'l-state': ['Western Australia'],
    'l-illustrated': 'true',
    'l-illtype': ['Photo'],
    'include': ['articleText'],
    'category': 'newspaper',
    'encoding': 'json',
    'reclevel': 'full',
    'bulkHarvest': 'true'},
    'key': API_KEY,
    'full_harvest_dir': 'harvests/test_harvest',
    'maximum': None,
    'text': True,
    'pdf': False,
    'image': False,
    'include_linebreaks': False
}

Path("harvester_config.json").write_text(json.dumps(test_config))

# Initialise the harvester
harvester = Harvester(config_file="harvester_config.json")

# Start the harvest!
harvester.harvest()

shutil.rmtree(Path("data"))
Path("harvester_config.json").unlink()

  0%|          | 0/174 [00:00<?, ?article/s]

### Where your harvests are saved

By default, harvests are saved in a directory named `data`. Each individual harvest is saved in a directory named according to the current date/time, for example: `data/20230826125205`.

In [11]:
# TEST HARVESTER CREATES DEFAULT HARVEST DIRECTORY
# This example initialises a harvest, but doesn't actually run it.

API_KEY = os.getenv("TROVE_API_KEY")

# Prepare query params
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"
)

# Initialise the Harvester with the query parameters
harvester = Harvester(query_params=query_params, key=API_KEY, text=True)

# if you haven't set the max parameter, the total value will be the total number of results
assert harvester.total > 0
print(f"Total results: {harvester.total:,}")

# Check that the data directory exists
assert Path("data").exists() is True

# Check that a harvest directory with the current date/hour exists in the data directory
assert len(list(Path("data").glob(f'{arrow.utcnow().format("YYYYMMDDHH")}*'))) == 1

# Check that a 'text' directory exists in the harvest directory
assert (
    Path(next(Path("data").glob(f'{arrow.utcnow().format("YYYYMMDDHH")}*'))).exists()
    is True
)

# Check that the cache has been initialised
assert Path(f"{'-'.join(harvester.harvest_dir.parts)}.sqlite").exists()

# Clean up
shutil.rmtree(Path("data"))
harvester.delete_cache()

Total results: 140,658


You can change the default directories using the `data_dir` and `harvest_dir` parameters. For example, if you wanted to keep all the harvests relating to a specific project together, you could set `data_dir="my-cool-project"`. You can use `harvest_dir` to give your harvest a meaningful name, for example `harvest_dir="search-for-cat-photos"`.

In [12]:
# TEST HARVESTER CREATES REQUESTED HARVEST DIRECTORY

query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"
)

harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    data_dir="harvests",
    harvest_dir="my_trove_harvest",
    pdf=True,
    image=True,
)

assert harvester.total > 0
print(f"Total results: {harvester.total:,}")

# Check that the data directory exists
assert Path("harvests").exists() is True

assert Path("harvests", "my_trove_harvest").exists() is True

assert Path("harvests", "my_trove_harvest", "pdf").exists() is True

assert Path("harvests", "my_trove_harvest", "image").exists() is True

# Clean up
shutil.rmtree(Path("harvests"))
harvester.delete_cache()

Total results: 140,658


In [13]:
show_doc(Harvester.harvest)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L168){target="_blank" style="float:right; font-size:smaller"}

### Harvester.harvest

>      Harvester.harvest ()

Start the harvest and loop over the result set until finished.

Once the harvester is initialised, you can start the harvest by calling `Harvester.harvest()`. A progress bar will keep you informed of the status of your harvest.

Add `text=True` to include the OCRd full text of the articles in the harvest. The contents of each article is saved as a separate file in the `text` directory. See the [harvest results](#harvest-results) section below for more information.

In [14]:
# HARVEST WITH TEXT > 100 records

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    data_dir="harvests",
    harvest_dir="test_harvest",
    text=True,
)

# Start the harvest
harvester.harvest()


# ---TESTS---
# Check that the ndjson file exists and lines can be parsed as json
json_data = []
with harvester.ndjson_file.open("r") as ndjson_file:
    for line in ndjson_file:
        json_data.append(json.loads(line.strip()))

# The length of the ndjson file should equal the number of records harvested
assert len(json_data) == harvester.harvested

# Check that the metadata file has been created
config = get_config(harvester.harvest_dir)
assert config["query_params"] == query_params

# Check that the RO-Crate file was created
crate = get_crate(harvester.harvest_dir)
eids = [
    "./", 
    "ro-crate-metadata.json", 
    "#harvester_run", 
    "harvester_config.json", 
    "https://github.com/wragge/trove-newspaper-harvester",
    "results.ndjson",
    "text",
    "https://creativecommons.org/publicdomain/zero/1.0/",
    "http://rightsstatements.org/vocab/CNE/1.0/",
    "http://rightsstatements.org/vocab/NKC/1.0/" 
]
for eid in eids:
    assert crate.get(eid) is not None

# Check that a text file exists and can be read
assert Path("harvests", "test_harvest", json_data[0]["articleText"]).exists()
text = Path("harvests", "test_harvest", json_data[0]["articleText"]).read_text()
assert isinstance(text, str)

# Check that the cache file was deleted
assert Path(f"{'-'.join(harvester.harvest_dir.parts)}.sqlite").exists() is False

shutil.rmtree(Path("harvests"))

  0%|          | 0/174 [00:00<?, ?article/s]

The text of articles in the *Australian Women's Weekly* is not available through the API, so the harvester has to scrape it separately. This happens automatically. The code below is just a little test to make sure it's working as expected.

In [15]:
# ---TEST FOR AWW---
# Prepare query params
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge"
)

# Initialise the Harvester with the query parameters
harvester = Harvester(query_params=query_params, key=API_KEY, text=True)

# Get html text of an article
text = harvester.get_aww_text(51187457)
assert "THE SHAPE OF THINGS TO COME" in text

# Clean up
shutil.rmtree(Path("data"))

You can include PDFs and images of the articles by adding `pdf=True` or `image=True` to the harvester initialisation. It's important to note that this will slow down the harvest a lot, as each file needs to be generated and downloaded individually.

In [16]:
# HARVEST WITH PDF AND IMAGE -- 1 RECORD MAX

# Prepare the query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-illustrationType=Cartoon"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    data_dir="harvests",
    harvest_dir="test_harvest",
    pdf=True,
    image=True,
    maximum=1,
)

# Start the harvest!
harvester.harvest()


# ---TESTS---

# Check that the ndjson file exists and lines can be parsed as json
json_data = []
with harvester.ndjson_file.open("r") as ndjson_file:
    for line in ndjson_file:
        json_data.append(json.loads(line.strip()))

assert harvester.maximum == harvester.harvested

# The length of the ndjson file should equal the number of records harvested
assert len(json_data) == harvester.harvested

# Check that a pdf and image file exist
assert Path("harvests", "test_harvest", json_data[0]["pdf"]).exists()
assert Path("harvests", "test_harvest", json_data[0]["images"][0]).exists()

shutil.rmtree(Path("harvests"))

  0%|          | 0/1 [00:00<?, ?article/s]

Naturally enough, nothing is harvested from a query with no results. Check your search and your API key!

In [17]:
# HARVEST WITH NO RESULTS

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wwgagsgshggshghso"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY
)

# Start the harvest
harvester.harvest()

assert harvester.harvested == 0
shutil.rmtree(Path("data"))

### Restarting a failed harvest

The `Harvester` uses [requests-cache](https://pypi.org/project/requests-cache/) to cache API responses. This makes it easy to restart a failed harvest. All you need to do is call `Harvester.harvest()` again and it will pick up where it left off.

In [18]:
show_doc(Harvester.save_csv)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L395){target="_blank" style="float:right; font-size:smaller"}

### Harvester.save_csv

>      Harvester.save_csv ()

Flatten and rename data in the ndjson file to save as CSV.

Harvested metadata is saved, by default, in a newline-delimited JSON file. If you'd prefer the results in CSV format, just call `Harvester.save_csv()`. See below for more information on results formats.

In [19]:
# TEST - save harvest results as CSV

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    data_dir="harvests",
    harvest_dir="test_harvest",
    text=True,
)

# Start the harvest
harvester.harvest()

# Save results as CSV
harvester.save_csv()

# ---TESTS---

# Check that CSV file exists
csv_file = Path(harvester.harvest_dir, "results.csv")
assert csv_file.exists()

# Open the CSV file and check that the number of rows equals number of records harvested
df = pd.read_csv(csv_file)
assert df.shape[0] == harvester.harvested

#shutil.rmtree(Path("harvests"))

  0%|          | 0/174 [00:00<?, ?article/s]

### Harvest results

There will be at least two files created for each harvest:

* `harvester_config.json` a file that captures the parameters used to launch the harvest
* `ro-crate-metadata.json` a metadata file documenting the harvest in [RO-Crate](https://www.researchobject.org/ro-crate/) format
* `results.ndjson` contains details of all the harvested articles in a newline delimited JSON format (each line is a JSON object)

The `results.ndjson` stores the API results from Trove *as is*, with a couple of exceptions:

* if the `text` parameter has been set to `True`, the `articleText` field will contain the path to a `.txt` file containing the OCRd text contents of the article (rather than containing the text itself)
* similarly if PDFs and images are requests, the `pdf` and `image` fields int the `ndjson` file will point to the saved files.

You'll probably find it easier to work with the results in CSV format. The `Harvester.save_csv()` method flattens the `ndjson` file and renames some columns to make them compatible with previous versions of the harvest. It produces a `results.csv` file, which is a plain text CSV (Comma Separated Values) file. You can open it with any spreadsheet program. The details recorded for each article are:

* `article_id` – a unique identifier for the article
* `title` – the title of the article
* `date` – in ISO format, YYYY-MM-DD
* `page` – page number (of course), but might also indicate the page is part of a supplement or special section
* `newspaper_id` – a unique identifier for the newspaper or gazette title (this can be used to retrieve more information or build a link to the web interface)
* `newspaper_title` – the name of the newspaper (or gazette)
* `category` – one of ‘Article’, ‘Advertising’, ‘Detailed lists, results, guides’, ‘Family Notices’, or ‘Literature’
* `words` – number of words in the article
* `illustrated` – is it illustrated (values are y or n)
* `edition` – edition of newspaper (rarely used)
* `supplement` – section of newspaper (rarely used)
* `section` – section of newspaper (rarely used)
* `url` – the persistent url for the article
* `page_url` – the persistent url of the page on which the article is published
* `snippet` – short text sample
* `relevance` – search relevance score of this result
* `corrections` – number of text corrections
* `last_correction` – date of last correction
* `tags` – number of attached tags
* `comments` – number of attached comments
* `lists` – number of lists this article is included in
* `text` – path to text file
* `pdf` – path to PDF file
* `image` – path to image file

If you’ve asked for text files PDFs or images, there will be additional directories containing those files. Files containing the OCRd text of the articles will be saved in a directory named `text`. These are just plain text files, stripped on any HTML. These files include some basic metadata in their file titles – the date of the article, the id number of the newspaper, and the id number of the article. So, for example, the filename `19460104-1002-206680758.txt` tells you:

* `19460104` – the article was published on 4 January 1946 (YYYYMMDD)
* `1002` – the article was published in [*The Tribune*](https://trove.nla.gov.au/newspaper/title/1002)
* `206680758` – the [article's unique identifier](http://nla.gov.au/nla.news-article206680758)

As you can see, you can use the newspaper and article ids to create direct links into Trove:

* to a newspaper or gazette `https://trove.nla.gov.au/newspaper/title/[newspaper id]`
* to an article `http://nla.gov.au/nla.news-article[article id]`

Similarly, if you've asked for copies of the articles as images, they'll be in a directory named `image`. The image file names are similar to the text files, but with an extra id number for the page from which the image was extracted. So, for example, the image filename `19250411-460-140772994-11900413.jpg` tells you:

* `19250411` – the article was published on 11 April 1925 (YYYYMMDD)
* `460` – the article was published in [*The Australasian*](https://trove.nla.gov.au/newspaper/title/460)
* `140772994` – the [article's unique identifier](http://nla.gov.au/nla.news-article140772994)
* `11900413` – the [page's unique identifier](https://trove.nla.gov.au/newspaper/page/11900413) (some articles can be split over multiple pages)

The text of articles in the *Australian Women's Weekly* is not available through the API, so the harvester has to scrape it separately. This happens automatically. The code below is just a little test to make sure it's working as expected.

In [20]:
show_doc(get_harvest)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L640){target="_blank" style="float:right; font-size:smaller"}

### get_harvest

>      get_harvest (data_dir='data', harvest_dir=None)

Get the path to a harvest.
If data_dir and harvest_dir are not supplied, this will return the most recent harvest in the 'data' directory.

Parameters:

* `data_dir` [optional, directory for harvests, string]
* `harvest_dir` [optional, directory for this harvest, string]

Returns:

* a pathlib.Path object pointing to the harvest directory

In [21]:
# TEST GET HARVEST

# Create test folders
Path("data", "20220919100000").mkdir(parents=True)
Path("data", "20220919200000").mkdir(parents=True)

# Get latest harvest folder
harvest = get_harvest()
print(harvest)

# ---TESTS---
assert harvest.name == "20220919200000"

harvest = get_harvest(data_dir="data", harvest_dir="20220919100000")
assert harvest.name == "20220919100000"

shutil.rmtree(Path("data"))

data/20220919200000


In [22]:
show_doc(get_config)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L663){target="_blank" style="float:right; font-size:smaller"}

### get_config

>      get_config (harvest)

Get the query config parameters from a harvest directory.

Parameters:

* `harvest` [required, path to harvest, string or pathlib.Path]

Returns:

* config dictionary

The `harvester_config.json` file contains the parameters used to initiate a harvest. Using `get_config` you can retrieve the `harvester_config.json` for for a particular harvest. This can be useful if, for example, you want to re-run a harvest at a later data – you can just grab the `query_paramaters` and feed them into a new `Harvester` instance.

In [23]:
# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    text=True,
)

# Start the harvest
harvester.harvest()

# Get the most recent harvest
harvest = get_harvest()

# Get the metadata
config = get_config(harvest)

# Obscure key and display
config["key"] = "########"
display(config)

# ---TESTS---
assert config["query_params"]["q"] == "wragge"
assert config["text"] is True

shutil.rmtree(Path("data"))

  0%|          | 0/174 [00:00<?, ?article/s]

{'query_params': {'q': 'wragge',
  'l-state': ['Western Australia'],
  'l-illustrated': 'true',
  'l-illtype': ['Photo'],
  'category': 'newspaper',
  'encoding': 'json',
  'reclevel': 'full',
  'bulkHarvest': 'true',
  'include': ['articleText']},
 'key': '########',
 'full_harvest_dir': 'data/20230828053742',
 'maximum': None,
 'text': True,
 'pdf': False,
 'image': False,
 'include_linebreaks': False}

In [24]:
show_doc(get_crate)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L683){target="_blank" style="float:right; font-size:smaller"}

### get_crate

>      get_crate (harvest)

Get the RO-Crate metadata file from a harvest directory.

 Parameters:

* `harvest` [required, path to harvest, string or pathlib.Path]

Returns:

* ROCrate object

Trove is changing all the time, so it's important to document your harvests. The Harvester automatically creates a metadata file using the [Research Object Crate (RO-Crate) format](https://www.researchobject.org/ro-crate/). This documents when the harvest was run, how many results were saved, and the version of the harvester. It is linked to the `harvester_config.json` file that save the query parameters and harvester settings. This function retrieves the RO-Crate file for a given harvest. It returns an RO-Crate object – see the [ro-crate.py package](https://github.com/ResearchObject/ro-crate-py) for more information.

In [25]:
# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo"
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    key=API_KEY,
    text=True,
)

# Start the harvest
harvester.harvest()

# Get the most recent harvest
harvest = get_harvest()

# Get the metadata
crate = get_crate(harvest)

for eid in crate.get_entities():
    print(eid.id, eid.type)

assert crate.get("./").type == "Dataset"
assert crate.get("harvester_config.json").properties()["encodingFormat"] == "application/json"
assert crate.get("./").properties()["mainEntity"] == {"@id": "#harvester_run"}

shutil.rmtree(Path("data"))

  0%|          | 0/174 [00:00<?, ?article/s]

ro-crate-metadata.json CreativeWork
./ Dataset
harvester_config.json File
results.ndjson ['File', 'Dataset']
text ['File', 'Dataset']
#harvester_run CreateAction
https://github.com/wragge/trove-newspaper-harvester SoftwareApplication
http://rightsstatements.org/vocab/NKC/1.0/ CreativeWork
http://rightsstatements.org/vocab/CNE/1.0/ CreativeWork
https://creativecommons.org/publicdomain/zero/1.0/ CreativeWork


In [26]:
show_doc(NoQueryError)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L34){target="_blank" style="float:right; font-size:smaller"}

### NoQueryError



Exception triggered by empty query.

In [27]:
#| hide
import nbdev

nbdev.nbdev_export()

----

Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).