In [88]:
import os
import re
import time
from pathlib import Path

import html2text
import pandas as pd
import requests

In [89]:
# Read the Trove API key from the TROVE_API_KEY environment variable so the key
# doesn't have to be saved in the notebook; if the variable isn't set, fall back
# to the placeholder (which you can still replace with your own key)
API_KEY = os.environ.get('TROVE_API_KEY', 'YOUR API KEY')

In [108]:
# Base parameters sent with every Trove API request
params = dict(
    key=API_KEY,
    zone='newspaper',
    include='articleText',
    n=20,
    encoding='json',
    # 'l-state' isn't a valid Python identifier, so it's unpacked from a dict
    **{'l-state': 'Victoria'},
)

In [109]:
# Street names to combine with house numbers when searching
streets = ['little lonsdale street', 'little bourke street']

## Harvest from Trove and save results as CSV files

In [110]:
def clean_text(html):
    """
    Convert an article's HTML text into plain text using html2text.

    Parameters:
        html: the articleText value of a record. Anything that is not a
            string (for example None or NaN when the field is empty, as with
            the Australian Women's Weekly) is treated as having no text.

    Returns:
        The converted text, or an empty string if there was no text to convert.
    """
    # Check for missing text up front rather than catching AttributeError,
    # which would also hide genuine errors raised while converting real HTML
    if not isinstance(html, str):
        return ''
    return html2text.html2text(html)

# For each street, search Trove for house numbers 1 to 10 and save the first
# page of matching articles as '<street_slug>/<num>_<street_slug>.csv'
for street in streets:
    # Slugify street names to use in paths and filenames
    street_slug = street.replace(' ', '_')

    # pathlib makes working with files and directories easier
    street_path = Path(street_slug)
    street_path.mkdir(exist_ok=True)

    for num in range(1, 11):

        # Use text: and ~0 to make the search as exact as possible
        params['q'] = f'text:"{num} {street}"~0'

        # Get the data from the API
        # The timeout stops a stalled connection from hanging the harvest indefinitely
        response = requests.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=60)

        # Stop with a clear HTTP error on a 4xx/5xx response rather than
        # failing later while trying to decode or index the body
        response.raise_for_status()
        data = response.json()

        # Use .get() so that a result without an 'article' list is treated as no results
        articles = data['response']['zone'][0]['records'].get('article', [])

        if articles:

            # Load the articles into Pandas
            # json_normalize will flatten the title and relevance fields into multiple columns
            df = pd.json_normalize(articles)

            # If none of the articles included any text there'll be no articleText column at all
            if 'articleText' not in df.columns:
                df['articleText'] = None

            # Strip html codes out of text
            df['text'] = df['articleText'].apply(clean_text)

            # Drop the original articleText column
            df = df.drop(columns='articleText')

            # Save the dataframe as a CSV file
            df.to_csv(Path(street_path, f'{num}_{street_slug}.csv'), index=False)

        # Pause briefly between requests
        time.sleep(0.2)
        

## Explore the results

In [111]:
# For each street and house number, print every snippet of harvested article
# text in which the number is closely followed by the street name
for street in streets:
    # Slugify street names to use in paths and filenames
    street_slug = street.replace(' ', '_')

    # pathlib makes working with files and directories easier
    street_path = Path(street_slug)

    # I noticed that there are sometimes hyphens before 'street'
    # So we'll allow patterns with hyphens to match
    # This only depends on the street name, so it's built once per street
    # (a raw string avoids invalid escape sequence warnings for \s)
    words = street.split()
    street_with_hyphens = r'\s*-?\s*'.join(re.escape(word) for word in words)

    for num in range(1, 11):
        print(f'\n\n{num} {street}')

        csv_path = Path(street_path, f'{num}_{street_slug}.csv')

        # The harvest only saves a CSV when a search returns articles,
        # so there might not be a file for this number
        if not csv_path.exists():
            print('No harvested results')
            continue

        # Open up the harvested CSV file
        df = pd.read_csv(csv_path)

        # Match the number, up to 5 characters of anything, then the street name,
        # with up to 50 characters of context on either side
        pattern = re.compile(fr'.{{0,50}}{num}.{{0,5}}{street_with_hyphens}.{{0,50}}', flags=re.I | re.DOTALL)

        # Rows without any text can't be searched, so skip them
        for row in df.loc[df['text'].notnull()].itertuples():
            matches = pattern.findall(row.text)
            for match in matches:
                # Put each snippet on a single line
                match = match.replace('\n', ' ')
                print(f'{row.id} - ...{match}...')



1 little lonsdale street
10649082 - ...land being the obstacle Could not that portion of 1 Little Lonsdale-street be tween Swanston and Russell streets, he  handed...
10764063 - ...kiriil firing, a final polishing. not Him just in 1 little- Lonsdale street Due of tin in. writing He was ntsiirniinl to line...
10705105 - ...zaar Hotel little Bourke street Letters mm Hotel, 1 little Lonsdale street Railway Hotel under on street, and the Exchange H...
242866630 - ... STOLEN.  Tho shop of Joseph Linnos, storekeeper, 1 1 Little Lonsdale street, was broken into and rubbed between 8 and 10 o'cl...
8643417 - ...maintained by the association at I stroke street, 1 Little Lonsdale street, and 1 debt bourne street, and the desire of the ...
221745138 - ... If a tight fitting bodice be sent to 0, Growley, 1 Little Lonsdale street west, Melbourne, a stand will be returned, the up...
4801246 - ...On the 28th inst., at her father's residence, No. 1?? Little Lonsdale-street east, Mary, third daughter 