# core

> a harvester for downloading large numbers of digitised newspaper articles from Trove

In [None]:
#| default_exp core

In [None]:
#| export
import argparse
import datetime
import json
import os
import re
import time
from importlib.metadata import version
from pathlib import Path
from pprint import pprint
from urllib.parse import parse_qs, parse_qsl, urlparse

import arrow
import html2text
import pandas as pd
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from trove_newspaper_images.articles import download_images
from trove_query_parser.parser import parse_query

In [None]:
#| hide
import shutil

from nbdev.showdoc import *

# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv

In [None]:
#| export


class Harvester:
    """
    Harvest large quantities of digitised newspaper articles from Trove.

    Parameters:

    * `query_params` [required, dictionary of parameters]
    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]
    * `text` [optional, save articles as text files, True or False]
    * `pdf` [optional, save articles as PDFs, True or False]
    * `image` [optional, save articles as images, True or False]
    * `include_linebreaks` [optional, include linebreaks in text files, True or False]
    * `max` [optional, maximum number of results, integer]
    """

    zoom = 3
    api_url = "https://api.trove.nla.gov.au/v2/result"

    def __init__(
        self,
        query_params,
        data_dir="data",
        harvest_dir=None,
        text=False,
        pdf=False,
        image=False,
        include_linebreaks=False,
        max=None,
    ):
        """
        Set up the harvest: create the output directories, initialise the
        request cache, work out how many records to harvest, and write the
        initial `metadata.json`.
        """
        if query_params:
            self.query_params = query_params
        else:
            # NOTE: kept for backwards compatibility -- returning here leaves
            # the instance without any of the attributes set below.
            print("You must provide query parameters")
            return
        self.data_dir = Path(data_dir)
        if harvest_dir:
            self.harvest_dir = Path(self.data_dir, harvest_dir)
        else:
            # Default to a directory named with the current UTC timestamp
            self.harvest_dir = Path(
                self.data_dir, arrow.utcnow().format("YYYYMMDDHHmmss")
            )
        self.s = self.initialise_cache()
        self.ndjson_file = Path(self.harvest_dir, "results.ndjson")
        # Deletes existing file in case of restart
        self.ndjson_file.unlink(missing_ok=True)
        self.pdf = pdf
        self.text = text
        self.image = image
        self.create_dirs()
        self.include_linebreaks = include_linebreaks
        self.harvested = 0
        # "*" is the start token sent with the first request
        self.start = "*"
        # Number of records requested per API call
        self.number = 100
        if max:
            self.maximum = max
        else:
            self._get_total()
        self.save_meta()

    def initialise_cache(self):
        """
        Create a cached requests session that retries on 5xx server errors.
        The cache is named after the harvest directory.

        Returns:

        * a requests_cache.CachedSession
        """
        cache_name = "-".join(self.harvest_dir.parts)
        s = requests_cache.CachedSession(cache_name)
        retries = Retry(
            total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
        )
        s.mount("http://", HTTPAdapter(max_retries=retries))
        s.mount("https://", HTTPAdapter(max_retries=retries))
        return s

    def delete_cache(self):
        """
        Delete the sqlite file used to cache requests for this harvest.
        Does nothing if the file has already been removed.
        """
        cache_name = f"{'-'.join(self.harvest_dir.parts)}.sqlite"
        # missing_ok so that calling this twice (or after a harvest that made
        # no requests) doesn't raise FileNotFoundError
        Path(cache_name).unlink(missing_ok=True)

    def create_dirs(self):
        """
        Create the harvest directory, plus a sub-directory for each of the
        requested file formats (`pdf`, `text`, `image`).
        """
        self.harvest_dir.mkdir(exist_ok=True, parents=True)
        if self.pdf:
            Path(self.harvest_dir, "pdf").mkdir(exist_ok=True)
        if self.text:
            Path(self.harvest_dir, "text").mkdir(exist_ok=True)
        if self.image:
            Path(self.harvest_dir, "image").mkdir(exist_ok=True)

    def _get_total(self):
        """
        Ask the API for the total number of results and save it as
        `self.maximum`. If the response can't be parsed as JSON the maximum
        is set to zero.
        """
        params = self.query_params.copy()
        # Only the total is needed here, so don't request any records
        params["n"] = 0
        response = self.s.get(self.api_url, params=params, timeout=30)
        try:
            results = response.json()
        except (AttributeError, ValueError):
            print("No results!")
            self.maximum = 0
        else:
            self.maximum = int(results["response"]["zone"][0]["records"]["total"])

    def log_query(self):
        """
        Do something with details of query -- ie log date?
        """
        pass

    def harvest(self):
        """
        Start the harvest and loop over the result set until finished.
        """
        params = self.query_params.copy()
        params["n"] = self.number
        with tqdm(total=self.maximum, unit="article") as pbar:
            pbar.update(self.harvested)
            while self.start and (self.harvested < self.maximum):
                params["s"] = self.start
                response = self.s.get(self.api_url, params=params, timeout=30)
                response.raise_for_status()
                try:
                    results = response.json()
                except (AttributeError, ValueError):
                    # Without a parsable response there's no new start token,
                    # so looping again would request the same page forever.
                    print("Couldn't parse the API response, stopping harvest")
                    break
                records = results["response"]["zone"][0]["records"]
                self.process_results(records, pbar)
        # Add the number harvested to the metadata file
        self.update_meta()
        self.delete_cache()

    def save_meta(self):
        """
        Save the query metadata in a JSON file.
        Useful for documenting your harvest.
        """
        meta = {
            "query_parameters": self.query_params,
            "harvest_directory": str(self.harvest_dir),
            "max": self.maximum,
            "text": self.text,
            "pdf": self.pdf,
            "image": self.image,
            "include_linebreaks": self.include_linebreaks,
            "date_started": arrow.utcnow().isoformat(),
            "harvester": f"trove_newspaper_harvester v{version('trove_newspaper_harvester')}",
        }
        with Path(self.harvest_dir, "metadata.json").open("w") as meta_file:
            json.dump(meta, meta_file, indent=4)

    def update_meta(self):
        """
        Update the metadata file with the total harvested.
        """
        # If the metadata file has gone missing, start from an empty dict
        # rather than overwriting it with `null`
        meta = get_metadata(self.harvest_dir) or {}
        meta["harvested"] = self.harvested
        with Path(self.harvest_dir, "metadata.json").open("w") as meta_file:
            json.dump(meta, meta_file, indent=4)

    def create_page_url(self, url):
        """
        Build a url for a newspaper page from a url containing `page/[id]`.

        Returns:

        * the page url as a string, or None if `url` isn't a string
          containing a page id
        """
        # Missing values in a dataframe column arrive here as NaN (a float)
        if not isinstance(url, str):
            return None
        match = re.search(r"page\/(\d+)", url)
        if match is None:
            return None
        return f"http://trove.nla.gov.au/newspaper/page/{match.group(1)}"

    def save_csv(self):
        """
        Flatten and rename data in the ndjson file to save as CSV.
        """
        with self.ndjson_file.open("r") as ndjson_file:
            # Skip blank lines so a stray newline doesn't break parsing
            json_data = [json.loads(line) for line in ndjson_file if line.strip()]
        df = pd.json_normalize(json_data)
        if "trovePageUrl" in df.columns:
            df["page_url"] = df["trovePageUrl"].apply(self.create_page_url)
        if "images" in df.columns:
            # Lists of image paths are saved as a single pipe-separated string
            df["images"] = df["images"].str.join("|")
        # Flattened field names (in output order) mapped to CSV column names
        csv_columns = {
            "id": "article_id",
            "heading": "title",
            "date": "date",
            "pageSequence": "page",
            "title.id": "newspaper_id",
            "title.value": "newspaper_title",
            "category": "category",
            "wordCount": "words",
            "illustrated": "illustrated",
            "edition": "edition",
            "supplement": "supplement",
            "section": "section",
            "identifier": "url",
            "page_url": "page_url",
            "snippet": "snippet",
            "relevance.score": "relevance",
            "correctionCount": "corrections",
            "lastCorrection.lastupdated": "last_corrected",
            "tagCount": "tags",
            "commentCount": "comments",
            "listCount": "lists",
            "articleText": "text",
            "pdf": "pdf",
            "images": "images",
        }
        # Not every field appears in every set of results, so add empty
        # columns for anything that's missing before selecting
        for column in csv_columns:
            if column not in df.columns:
                df[column] = ""
        df = df[list(csv_columns)].rename(columns=csv_columns)

        df.to_csv(Path(self.harvest_dir, "results.csv"), index=False)

    def make_filename(self, article):
        """
        Create a filename for a text file or PDF.
        For easy sorting/aggregation the filename has the format:
            PUBLICATIONDATE-NEWSPAPERID-ARTICLEID
        """
        date = article["date"].replace("-", "")
        newspaper_id = article["title"]["id"]
        article_id = article["id"]
        return f"{date}-{newspaper_id}-{article_id}"

    def ping_pdf(self, ping_url):
        """
        Check to see if a PDF is ready for download.
        If a 200 status code is received, return True.
        A 423 status returns False; any other HTTP error is raised.
        """
        # Bypass the cache so that each ping actually hits the server
        with self.s.cache_disabled():
            response = self.s.get(ping_url, timeout=30)
        try:
            response.raise_for_status()
        except HTTPError:
            # 423 is treated as "not ready yet"
            if response.status_code == 423:
                return False
            raise
        return True

    def get_pdf_url(self, article_id, zoom=3):
        """
        Get the url of the PDF version of an article.
        These can take a while to generate, so we need to ping the server to see if it's ready before we download.

        Parameters:

        * `article_id` [required, Trove article identifier]
        * `zoom` [optional, rendition level, integer]

        Returns:

        * the url of the PDF, or None if it wasn't ready after 5 tries
        """
        pdf_url = None
        rendition = (
            f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}"
        )
        # Ask for the PDF to be created
        response = self.s.get(f"{rendition}/level/{zoom}/prep", timeout=30)
        # Get the hash
        prep_id = response.text
        # Url to check if the PDF is ready
        ping_url = f"{rendition}.{zoom}.ping?followup={prep_id}"
        tries = 0
        ready = False
        time.sleep(1)  # Give some time to generate pdf
        # Are you ready yet?
        while ready is False and tries < 5:
            ready = self.ping_pdf(ping_url)
            if not ready:
                tries += 1
                time.sleep(2)
        # Download if ready
        if ready:
            pdf_url = f"{rendition}.{zoom}.pdf?followup={prep_id}"
        return pdf_url

    def get_aww_text(self, article_id):
        """
        Download the text of an article using the link from the web interface.

        Returns:

        * the text as an HTML string with the header removed, or None if the
          request didn't return a 200 status
        """
        url = f"https://trove.nla.gov.au/newspaper/rendition/nla.news-article{article_id}.txt"
        response = self.s.get(url, timeout=30)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "lxml")
        # Remove the header (first <p> and first <hr>), if present
        for tag_name in ("p", "hr"):
            tag = soup.find(tag_name)
            if tag is not None:
                tag.decompose()
        return str(soup)

    def save_text(self, article):
        """
        Save the text of an article as a plain text file in the `text`
        directory. Existing files are not overwritten.

        Returns:

        * the path to the text file relative to the harvest directory, or an
          empty string if no text was available
        """
        text_filename = self.make_filename(article)
        text_file = Path(self.harvest_dir, "text", f"{text_filename}.txt")
        if not text_file.exists():
            html_text = article.get("articleText")
            if not html_text:
                # If the text isn't in the API response (as with AWW), download separately
                html_text = self.get_aww_text(article["id"])
            if not html_text:
                return ""
            # Convert html to plain text
            text = html2text.html2text(html_text)
            if not self.include_linebreaks:
                # Collapse all runs of whitespace (including newlines)
                text = re.sub(r"\s+", " ", text)
            text_file.write_bytes(text.encode("utf-8"))
        # Removes the harvest directory from the path
        return text_file.relative_to(self.harvest_dir)

    def save_pdf(self, article):
        """
        Save an article as a PDF in the `pdf` directory.
        Existing files are not overwritten.

        Returns:

        * the path to the PDF relative to the harvest directory, or an empty
          string if the PDF wasn't ready for download
        """
        pdf_filename = self.make_filename(article)
        pdf_file = Path(self.harvest_dir, "pdf", f"{pdf_filename}.pdf")
        if not pdf_file.exists():
            pdf_url = self.get_pdf_url(article["id"])
            if not pdf_url:
                return ""
            response = self.s.get(pdf_url, timeout=30)
            pdf_file.write_bytes(response.content)
        # Removes the harvest directory from the path
        return pdf_file.relative_to(self.harvest_dir)

    def process_results(self, records, pbar):
        """
        Processes a page full of results.

        Each article is appended to the ndjson file, with its `pdf`,
        `articleText`, and `images` fields replaced by the paths of any saved
        files. Raises a KeyError if `records` contains no `article` list.
        """
        articles = records["article"]
        with self.ndjson_file.open("a") as ndjson_file:
            for article in articles:
                # Stop mid-page once the requested maximum has been reached
                if self.harvested >= self.maximum:
                    break
                article_id = article["id"]
                article["pdf"] = str(self.save_pdf(article)) if self.pdf else ""
                article["articleText"] = (
                    str(self.save_text(article)) if self.text else ""
                )
                if self.image:
                    images = download_images(
                        article_id, output_dir=Path(self.harvest_dir, "image")
                    )
                    article["images"] = [str(Path("image", i)) for i in images]
                else:
                    article["images"] = []
                ndjson_file.write(json.dumps(article) + "\n")
                pbar.update(1)
                # Update the number harvested
                self.harvested += 1
        # Pause briefly between pages of results
        time.sleep(0.2)
        # Get the nextStart token (None ends the harvest loop)
        self.start = records.get("nextStart")


def prepare_query(query, api_key, text=False):
    """
    Converts a Trove search url into a set of parameters ready for harvesting.

    Parameters:

    * `query` [required, search url from Trove web interface or API, string]
    * `api_key` [required, Trove API key, string]
    * `text` [optional, save text files, True or False]

    Returns:

    * a dictionary of parameters
    """
    if text and "articleText" not in query:
        # If text is set to True, make sure the query is getting the article text
        # Adding it here rather than to the params dict to avoid overwriting any existing include values
        # Use "?" if the url has no query string yet, otherwise the parameter
        # would end up in the path and be lost
        separator = "&" if "?" in query else "?"
        query += f"{separator}include=articleText"
    if "api.trove.nla.gov.au" in query:
        # If it's an API url, no further processing of parameters needed
        # (note that parse_qs returns every value as a list)
        new_params = parse_qs(urlparse(query).query)
    else:
        # Urls from the web interface are converted to API parameters
        new_params = parse_query(query)
    new_params["key"] = api_key
    new_params["encoding"] = "json"
    new_params["reclevel"] = "full"
    new_params["bulkHarvest"] = "true"
    # The query parser defaults to 'newspaper,gazette' if no zone is set.
    # But multiple zones won't work with bulkHarvest, so set to 'newspaper'.
    # API urls may have no zone at all, or hold it as a single-item list.
    if new_params.get("zone") in ("newspaper,gazette", ["newspaper,gazette"]):
        new_params["zone"] = "newspaper"
    return new_params


def get_harvest(data_dir="data", harvest_dir=None):
    """
    Get the path to a harvest.
    If harvest_dir is not supplied, this will return the most recent harvest in data_dir
    (the 'data' directory by default).

    Parameters:

    * `data_dir` [optional, directory for harvests, string]
    * `harvest_dir` [optional, directory for this harvest, string]

    Returns:

    * a pathlib.Path object pointing to the harvest directory
    """
    if harvest_dir:
        return Path(data_dir, harvest_dir)
    # Default harvest directories are named with a timestamp, so the last
    # one in sorted order is the most recent.
    # An IndexError is raised if data_dir contains no directories.
    harvests = sorted(d for d in Path(data_dir).glob("*") if d.is_dir())
    return Path(harvests[-1])


def get_metadata(harvest):
    """
    Load the `metadata.json` file saved in a harvest directory.

    Parameters:

    * `harvest` [required, path to harvest, string or pathlib.Path]

    Returns:

    * metadata dictionary, or None (after printing a message) if the file
      can't be opened
    """
    metadata_path = Path(harvest) / "metadata.json"
    try:
        with metadata_path.open("r") as handle:
            return json.load(handle)
    except OSError:
        # IOError is an alias of OSError -- covers a missing file or directory
        print("No harvest!")
        return None

In [None]:
show_doc(prepare_query)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L415){target="_blank" style="float:right; font-size:smaller"}

### prepare_query

>      prepare_query (query, api_key, text=False)

Converts a Trove search url into a set of parameters ready for harvesting.

Parameters:

* `query` [required, search url from Trove web interface or API, string]
* `api_key` [required, Trove API key, string]
* `text` [optional, save text files, True or False]

Returns:

* a dictionary of parameters

The `prepare_query` function converts a search url from the Trove web interface or API into a set of parameters that you can feed to `Harvester`. It uses the [trove-query-parser](https://pypi.org/project/trove-query-parser/) to do most of the work, but adds in a few extra parameters needed for the harvest.

If you want to save the contents of the articles as text files you need to set `text` to `True`. This ensures that the `articleText` field is included in the results.

In [None]:
# Convert a search url copied from the Trove web interface into API parameters
query_params = prepare_query(
    query="https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=New%20South%20Wales&l-artType=newspapers&l-title=508&l-decade=191&l-category=Article",
    api_key="MY_API_KEY",
)
# Display the resulting dictionary of parameters
query_params

{'q': 'wragge',
 'l-state': ['New South Wales'],
 'zone': 'newspaper',
 'l-title': ['508'],
 'l-decade': ['191'],
 'l-category': ['Article'],
 'key': 'MY_API_KEY',
 'encoding': 'json',
 'reclevel': 'full',
 'bulkHarvest': 'true'}

In [None]:
# TEST prepare_query()
# Convert a url from the Trove web interface, including text
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge",
    api_key="MY_API_KEY",
    text=True,
)

# Test the results
assert query_params == {
    "q": "wragge",
    "include": ["articleText"],
    "zone": "newspaper",
    "key": "MY_API_KEY",
    "encoding": "json",
    "reclevel": "full",
    "bulkHarvest": "true",
}

# Convert a url from an API request
query_params = prepare_query(
    "https://api.trove.nla.gov.au/v2/result?q=wragge&zone=newspaper&encoding=json&l-category=Article",
    api_key="MY_API_KEY",
)

# Parameters taken from an API url are parsed with parse_qs, so their values are lists.
# The values set by prepare_query itself (key, encoding, reclevel, bulkHarvest) are strings.
assert query_params == {
    "q": ["wragge"],
    "zone": ["newspaper"],
    "encoding": "json",
    "l-category": ["Article"],
    "key": "MY_API_KEY",
    "reclevel": "full",
    "bulkHarvest": "true",
}

In [None]:
show_doc(Harvester)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L31){target="_blank" style="float:right; font-size:smaller"}

### Harvester

>      Harvester (query_params, data_dir='data', harvest_dir=None, text=False,
>                 pdf=False, image=False, include_linebreaks=False, max=None)

Harvest large quantities of digitised newspaper articles from Trove.

Parameters:

* `query_params` [required, dictionary of parameters]
* `data_dir` [optional, directory for harvests, string]
* `harvest_dir` [optional, directory for this harvest, string]
* `text` [optional, save articles as text files, True or False]
* `pdf` [optional, save articles as PDFs, True or False]
* `image` [optional, save articles as images, True or False]
* `include_linebreaks` [optional, include linebreaks in text files, True or False]
* `max` [optional, maximum number of results, integer]

The `Harvester` class configures and runs your harvest, saving results in a variety of formats.

By default, the harvester will save harvests in a directory called `data`, with each individual harvest in a directory named according to the current date and time (`YYYYMMDDHHmmss` format). You can change this by setting the `data_dir` and `harvest_dir` parameters. This can help you to manage your harvests by grouping together related searches, or giving them meaningful names.

The harvester generates two data files by default:

* `metadata.json` contains basic information about the harvest
* `results.ndjson` contains details of all the harvested articles in a newline delimited JSON format (each line is a JSON object)

You can convert the `ndjson` file to a CSV format using `Harvester.save_csv`.

The `text`, `pdf`, and `image` options give you the option to save the contents of the articles as either text files, PDF files, or JPG images. Note that saving PDFs and images can be very slow.

If you only want to harvest part of the results set you can set the `max` parameter to the number of records you want.

In [None]:
# TEST HARVESTER CREATES DEFAULT HARVEST DIRECTORY
# This example initialises a harvest, but doesn't actually run it.

API_KEY = os.getenv("TROVE_API_KEY")

# Prepare query params
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge",
    text=True,
    api_key=API_KEY,
)

# Initialise the Harvester with the query parameters
harvester = Harvester(query_params=query_params, text=True)

# if you haven't set the max parameter, the maximum value will be the total number of results
assert harvester.maximum > 0
print(f"Total results: {harvester.maximum:,}")

# Check that the data directory exists
assert Path("data").exists() is True

# Check that a harvest directory with the current date/hour exists in the data directory
assert len(list(Path("data").glob(f'{arrow.utcnow().format("YYYYMMDDHH")}*'))) == 1

# Check that a 'text' directory exists in the harvest directory
# (the harvester was initialised with text=True)
assert Path(harvester.harvest_dir, "text").exists() is True

# Check that the cache has been initialised
assert Path(f"{'-'.join(harvester.harvest_dir.parts)}.sqlite").exists()

# Clean up
shutil.rmtree(Path("data"))
harvester.delete_cache()

Total results: 137,770


In [None]:
# TEST HARVESTER CREATES REQUESTED HARVEST DIRECTORY
# This example initialises a harvest in a named directory, but doesn't actually run it.

query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge",
    api_key=API_KEY,
)

# With pdf and image set to True, 'pdf' and 'image' sub-directories
# are created inside the harvest directory
harvester = Harvester(
    query_params=query_params,
    data_dir="harvests",
    harvest_dir="my_trove_harvest",
    pdf=True,
    image=True,
)

# No max was set, so the maximum is the total number of results
assert harvester.maximum > 0
print(f"Total results: {harvester.maximum:,}")

# Check that the data directory exists
assert Path("harvests").exists() is True

# Check that the named harvest directory exists
assert Path("harvests", "my_trove_harvest").exists() is True

# Check that the 'pdf' directory exists
assert Path("harvests", "my_trove_harvest", "pdf").exists() is True

# Check that the 'image' directory exists
assert Path("harvests", "my_trove_harvest", "image").exists() is True

# Clean up
shutil.rmtree(Path("harvests"))
harvester.delete_cache()

Total results: 137,770


In [None]:
show_doc(Harvester.harvest)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L133){target="_blank" style="float:right; font-size:smaller"}

### Harvester.harvest

>      Harvester.harvest ()

Start the harvest and loop over the result set until finished.

Once the `Harvester` is initialised with your query parameters, you can call `Harvester.harvest` to actually start the process. The harvester will loop over the complete results set until finished.

In [None]:
# HARVEST WITH TEXT > 100 records
# More than 100 records means the harvester needs more than one API request
# (it asks for 100 records at a time)

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo",
    api_key=API_KEY,
    text=True,
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    data_dir="harvests",
    harvest_dir="test_harvest",
    text=True,
)

# Start the harvest
harvester.harvest()


# ---TESTS---
# Check that the ndjson file exists and lines can be parsed as json
json_data = []
with harvester.ndjson_file.open("r") as ndjson_file:
    for line in ndjson_file:
        json_data.append(json.loads(line.strip()))

# The length of the ndjson file should equal the number of records harvested
assert len(json_data) == harvester.harvested

# Check that the metadata file has been created
metadata = get_metadata(harvester.harvest_dir)
assert metadata["query_parameters"] == query_params

# Check that a text file exists and can be read
# (articleText holds the path to the text file, relative to the harvest directory)
assert Path("harvests", "test_harvest", json_data[0]["articleText"]).exists()
text = Path("harvests", "test_harvest", json_data[0]["articleText"]).read_text()
assert isinstance(text, str)

# Check that the cache file was deleted
# (harvest() deletes the cache once it has finished)
assert Path(f"{'-'.join(harvester.harvest_dir.parts)}.sqlite").exists() is False

# Clean up
shutil.rmtree(Path("harvests"))

  0%|          | 0/130 [00:00<?, ?article/s]

In [None]:
# HARVEST WITH PDF AND IMAGE -- 1 RECORD MAX

# Prepare the query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-illustrationType=Cartoon",
    api_key=API_KEY,
    text=True,
)

# Initialise the harvester
# max=1 stops the harvest after a single record
harvester = Harvester(
    query_params=query_params,
    data_dir="harvests",
    harvest_dir="test_harvest",
    pdf=True,
    image=True,
    max=1,
)

# Start the harvest!
harvester.harvest()


# ---TESTS---

# Check that the ndjson file exists and lines can be parsed as json
json_data = []
with harvester.ndjson_file.open("r") as ndjson_file:
    for line in ndjson_file:
        json_data.append(json.loads(line.strip()))

# The number harvested should equal the requested maximum
assert harvester.maximum == harvester.harvested

# The length of the ndjson file should equal the number of records harvested
assert len(json_data) == harvester.harvested

# Check that a pdf and image file exist
# (the pdf and images fields hold paths relative to the harvest directory)
assert Path("harvests", "test_harvest", json_data[0]["pdf"]).exists()
assert Path("harvests", "test_harvest", json_data[0]["images"][0]).exists()

# Clean up
shutil.rmtree(Path("harvests"))

  0%|          | 0/1 [00:00<?, ?article/s]

#### Restarting a failed harvest

The `Harvester` uses [requests-cache](https://pypi.org/project/requests-cache/) to cache API responses. This makes it easy to restart a failed harvest. All you need to do is call `Harvester.harvest()` again and it will pick up where it left off.

In [None]:
show_doc(Harvester.save_csv)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L193){target="_blank" style="float:right; font-size:smaller"}

### Harvester.save_csv

>      Harvester.save_csv ()

Flatten and rename data in the ndjson file to save as CSV.

Harvested metadata is saved, by default, in a newline-delimited JSON file. If you'd prefer the results in CSV format, just call `Harvester.save_csv()`. See below for more information on results formats.

In [None]:
# TEST SAVE_CSV

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo",
    api_key=API_KEY,
    text=True,
)

# Initialise the harvester
harvester = Harvester(
    query_params=query_params,
    data_dir="harvests",
    harvest_dir="test_harvest",
    text=True,
)

# Start the harvest
harvester.harvest()

# Save results as CSV
# (this reads results.ndjson, so the harvest needs to have run first)
harvester.save_csv()

# ---TESTS---

# Check that CSV file exists
csv_file = Path(harvester.harvest_dir, "results.csv")
assert csv_file.exists()

# Open the CSV file and check that the number of rows equals number of records harvested
df = pd.read_csv(csv_file)
assert df.shape[0] == harvester.harvested

# Clean up
shutil.rmtree(Path("harvests"))

  0%|          | 0/130 [00:00<?, ?article/s]

In [None]:
show_doc(get_harvest)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L452){target="_blank" style="float:right; font-size:smaller"}

### get_harvest

>      get_harvest (data_dir='data', harvest_dir=None)

Get the path to a harvest.
If data_dir and harvest_dir are not supplied, this will return the most recent harvest in the 'data' directory.

Parameters:

* `data_dir` [optional, directory for harvests, string]
* `harvest_dir` [optional, directory for this harvest, string]

Returns:

* a pathlib.Path object pointing to the harvest directory

In [None]:
# TEST GET HARVEST

# Create test folders
Path("data", "20220919100000").mkdir(parents=True)
Path("data", "20220919200000").mkdir(parents=True)

# Get latest harvest folder
# (directories are sorted by name, so the latest timestamp comes last)
harvest = get_harvest()
print(harvest)

# ---TESTS---
assert harvest.name == "20220919200000"

# Ask for a specific harvest by name
harvest = get_harvest(data_dir="data", harvest_dir="20220919100000")
assert harvest.name == "20220919100000"

# Clean up
shutil.rmtree(Path("data"))

data/20220919200000


In [None]:
show_doc(get_metadata)

---

[source](https://github.com/wragge/trove-newspaper-harvester/blob/master/trove_newspaper_harvester/core.py#L475){target="_blank" style="float:right; font-size:smaller"}

### get_metadata

>      get_metadata (harvest)

Get the query metadata from a harvest directory.

Parameters:

* `harvest` [required, path to harvest, string or pathlib.Path]

Returns:

* metadata dictionary

The `metadata.json` file contains information about a harvest. Using `get_metadata` you can retrieve the `metadata.json` for a particular harvest. This can be useful if, for example, you want to re-run a harvest at a later date – you can just grab the `query_parameters` and feed them into a new `Harvester` instance.

In [None]:
# TEST GET_METADATA

# Prepare query parameters
query_params = prepare_query(
    "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=Western%20Australia&l-illustrationType=Photo",
    api_key=API_KEY,
    text=True,
)

# Initialise the harvester
# (no data_dir or harvest_dir, so results go in a timestamped directory inside 'data')
harvester = Harvester(
    query_params=query_params,
    text=True,
)

# Start the harvest
harvester.harvest()

# Get the most recent harvest
harvest = get_harvest()

# Get the metadata
metadata = get_metadata(harvest)
# Obscure key
metadata["query_parameters"]["key"] = "########"
display(metadata)

# ---TESTS---
assert metadata["query_parameters"]["q"] == "wragge"
assert metadata["text"] is True
# 'harvested' is added to the metadata file when the harvest finishes
assert metadata["harvested"] == harvester.harvested

# Clean up
shutil.rmtree(Path("data"))

  0%|          | 0/130 [00:00<?, ?article/s]

{'query_parameters': {'q': 'wragge',
  'l-state': ['Western Australia'],
  'l-illustrated': 'true',
  'l-illtype': ['Photo'],
  'include': ['articleText'],
  'zone': 'newspaper',
  'key': '########',
  'encoding': 'json',
  'reclevel': 'full',
  'bulkHarvest': 'true'},
 'harvest_directory': 'data/20220921091131',
 'max': 130,
 'text': True,
 'pdf': False,
 'image': False,
 'include_linebreaks': False,
 'date_started': '2022-09-21T09:11:32.000609+00:00',
 'harvester': 'trove_newspaper_harvester v0.0.1',
 'harvested': 130}

### Results

There will be at least two files created for each harvest:

* `results.ndjson` – a newline-delimited JSON file containing the details of all harvested articles
* `metadata.json` – a JSON file which stores all the details of the harvest

The `results.ndjson` stores the API results from Trove *as is*, with a couple of exceptions:

* if the `text` parameter has been set to `True`, the `articleText` field will contain the path to a `.txt` file containing the OCRd text contents of the article (rather than containing the text itself)
* similarly, if PDFs and images are requested, the `pdf` and `images` fields in the `ndjson` file will point to the saved files.

You'll probably find it easier to work with the results in CSV format. The `Harvester.save_csv()` method flattens the `ndjson` file and renames some columns to make them compatible with previous versions of the harvester. It produces a `results.csv` file, which is a plain text CSV (Comma Separated Values) file. You can open it with any spreadsheet program. The details recorded for each article are:

* `article_id` – a unique identifier for the article
* `title` – the title of the article
* `date` – in ISO format, YYYY-MM-DD
* `page` – page number (of course), but might also indicate the page is part of a supplement or special section
* `newspaper_id` – a unique identifier for the newspaper or gazette title (this can be used to retrieve more information or build a link to the web interface)
* `newspaper_title` – the name of the newspaper (or gazette)
* `category` – one of ‘Article’, ‘Advertising’, ‘Detailed lists, results, guides’, ‘Family Notices’, or ‘Literature’
* `words` – number of words in the article
* `illustrated` – is it illustrated (values are y or n)
* `edition` – edition of newspaper (rarely used)
* `supplement` – supplement of newspaper (rarely used)
* `section` – section of newspaper (rarely used)
* `url` – the persistent url for the article
* `page_url` – the persistent url of the page on which the article is published
* `snippet` – short text sample
* `relevance` – search relevance score of this result
* `corrections` – number of text corrections
* `last_corrected` – date of last correction
* `tags` – number of attached tags
* `comments` – number of attached comments
* `lists` – number of lists this article is included in
* `text` – path to text file
* `pdf` – path to PDF file
* `images` – paths to image files (separated by `|` if there's more than one)

If you’ve asked for text files, PDFs, or images, there will be additional directories containing those files. Files containing the OCRd text of the articles will be saved in a directory named `text`. These are just plain text files, stripped of any HTML. These files include some basic metadata in their file titles – the date of the article, the id number of the newspaper, and the id number of the article. So, for example, the filename `19460104-1002-206680758.txt` tells you:

* `19460104` – the article was published on 4 January 1946 (YYYYMMDD)
* `1002` – the article was published in [*The Tribune*](https://trove.nla.gov.au/newspaper/title/1002)
* `206680758` – the [article's unique identifier](http://nla.gov.au/nla.news-article206680758)

As you can see, you can use the newspaper and article ids to create direct links into Trove:

* to a newspaper or gazette `https://trove.nla.gov.au/newspaper/title/[newspaper id]`
* to an article `http://nla.gov.au/nla.news-article[article id]`

Similarly, if you've asked for copies of the articles as images, they'll be in a directory named `image`. The image file names are similar to the text files, but with an extra id number for the page from which the image was extracted. So, for example, the image filename `19250411-460-140772994-11900413.jpg` tells you:

* `19250411` – the article was published on 11 April 1925 (YYYYMMDD)
* `460` – the article was published in [*The Australasian*](https://trove.nla.gov.au/newspaper/title/460)
* `140772994` – the [article's unique identifier](http://nla.gov.au/nla.news-article140772994)
* `11900413` – the [page's unique identifier](https://trove.nla.gov.au/newspaper/page/11900413) (some articles can be split over multiple pages)

In [None]:
#| hide
import nbdev

nbdev.nbdev_export()

----

Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).