In [1]:
# default_exp parser

# Trove query parser

> Use `parse_query` to convert a search query from the Trove web interface into a set of parameters that the API will understand.

In [2]:
#hide
from nbdev.showdoc import *

## Functions

In [87]:
#export
from urllib.parse import urlparse, parse_qsl, parse_qs
import requests
import arrow

def format_date(date, start=False):
    '''
    Convert a date from the web interface (YYYY-MM-DD) into the form used in API date queries (YYYY-MM-DDT00:00:00Z).

    Parameters:  
    * `date` – the date string to convert; the wildcard `*` is returned unchanged
    * `start` – set to `True` if this is the start of a date range. The start date in an API query needs to be the day before the one you want, so the date is moved back by a day.

    Returns:  
    * the reformatted date string
    '''
    # An open-ended range marker has nothing to reformat.
    if date == '*':
        return date
    moment = arrow.get(date)
    if start:
        # Step back a day so the requested start date falls inside the range.
        moment = moment.shift(days=-1)
    # The time component is always written as midnight.
    midnight = moment.format('YYYY-MM-DDT00:00:00')
    return '{}Z'.format(midnight)

def parse_query(query):
    '''
    Converts the parameters of a search using the Trove web interface into a form the API will understand.

    Parameters:  
    * `query` – the url of a search in the Trove newspapers & gazettes category. If an API url is supplied instead, its parameters are returned without further processing (every value in a list).

    Returns:  
    * a dict containing the parameters (multiple values will be in a list)
    '''
    parsed_url = urlparse(query)
    # Look for the API hostname only in the host/path portion of the url. Searching the
    # whole string would wrongly treat a web search as an API request whenever the
    # search terms themselves mention the hostname.
    if 'api.trove.nla.gov.au' in parsed_url.netloc + parsed_url.path:
        # If it's an API url, no further processing of parameters needed
        return parse_qs(parsed_url.query)

    # These params can be accepted as is.
    safe = ['l-category', 'l-title', 'l-decade', 'l-year', 'l-month', 'l-state', 'l-word', 'include']
    # Advanced search facets and the API facet each one is renamed to.
    advanced_facets = {
        'l-advstate': 'l-state',
        'l-advcategory': 'l-category',
        'l-advtitle': 'l-title'
    }
    # Article type values recognised from the web interface and the API zone each selects.
    zones = {'newspapers': 'newspaper', 'gazette': 'gazette'}
    new_params = {}
    dates = {}
    keywords = []
    # Loop through all the parameters
    for key, value in parse_qsl(parsed_url.query):
        if key in safe:
            new_params.setdefault(key, []).append(value)
        elif key in advanced_facets:
            new_params.setdefault(advanced_facets[key], []).append(value)
        elif key == 'l-advWord':
            # Unlike the other facets, this one is stored as a plain string rather than a list.
            new_params['l-word'] = value
        elif key in ['l-illustrationType', 'l-advIllustrationType']:
            # An illustration type is always sent together with the illustrated flag.
            new_params['l-illustrated'] = 'true'
            new_params.setdefault('l-illtype', []).append(value)
        elif key == 'date.from':
            dates['from'] = value
        elif key == 'date.to':
            dates['to'] = value
        elif key == 'keyword':
            new_params['q'] = value
        elif key == 'keyword.phrase':
            keywords.append('"{}"'.format(value))
        elif key == 'keyword.not':
            keywords.append('NOT ({})'.format(' OR '.join(value.split())))
        elif key == 'keyword.any':
            keywords.append('({})'.format(' OR '.join(value.split())))
        elif key in ['l-ArtType', 'l-advArtType', 'l-artType'] and value in zones:
            new_params['zone'] = zones[value]

    # Add the phrase / not / any clauses to the main keywords, joined with AND.
    if keywords:
        clauses = ' AND '.join(keywords)
        if 'q' in new_params:
            new_params['q'] += ' AND {}'.format(clauses)
        else:
            new_params['q'] = clauses

    # A date range becomes part of the query string; a missing end of the range is left open with '*'.
    if dates:
        date_query = 'date:[{} TO {}]'.format(
            format_date(dates.get('from', '*'), True),
            format_date(dates.get('to', '*'))
        )
        if 'q' in new_params:
            new_params['q'] += ' {}'.format(date_query)
        else:
            new_params['q'] = date_query

    # Fall back to a blank query and both zones if the url didn't specify them.
    new_params.setdefault('q', ' ')
    new_params.setdefault('zone', 'newspaper,gazette')
    return new_params

## Basic usage

Here's the url of a search in Trove's newspapers: https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon

If we feed this url to `parse_query()` we get back a dict with the query parameters translated into a form the Trove API understands.

In [89]:
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon')
params

{'q': 'wragge',
 'zone': 'newspaper',
 'l-state': ['Queensland'],
 'l-category': ['Article'],
 'l-illustrated': 'true',
 'l-illtype': ['Cartoon']}

If you want to use this to get data back from the Trove API, you'll need to add a `key` parameter with your Trove API key. You might also want to change the `encoding` of the results to 'json'. Then you can just give the parameters as `params` to `requests`. For example:

``` python
params = parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-state=Queensland&l-category=Article&l-illustrationType=Cartoon')
params['key'] = 'mYApiKEY'
params['encoding'] = 'json'
response = requests.get('https://api.trove.nla.gov.au/v2/result', params=params)
data = response.json()
```

Assuming your API key is valid, this will return the following results:

``` python
{'response': {'query': 'wragge',
  'zone': [{'name': 'newspaper',
    'records': {'s': '*',
     'n': '3',
     'total': '3',
     'article': [{'id': '76672882',
       'url': '/newspaper/76672882',
       'heading': 'THE POLITICAL STIGER YORTEX.',
       'category': 'Article',
       'title': {'id': '274',
        'value': 'The Charleville Times (Brisbane, Qld. : 1896 - 1954)'},
       'date': '1901-10-12',
       'page': 4,
       'pageSequence': 4,
       'relevance': {'score': '250.99701', 'value': 'very relevant'},
       'snippet': 'PREMIER PHILP: "I think that\'s a better shot than Wragge\'s."',
       'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/76672882?searchTerm=wragge'},
      {'id': '50294024',
       'url': '/newspaper/50294024',
       'heading': 'We nearly broke the drought  (. . . WE THINK)',
       'category': 'Article',
       'title': {'id': '12',
        'value': 'The Courier-Mail (Brisbane, Qld. : 1933 - 1954)'},
       'date': '1952-02-16',
       'page': 2,
       'pageSequence': 2,
       'relevance': {'score': '12.74085', 'value': 'very relevant'},
       'snippet': 'WE were determined to try our hand at rainmaking, and',
       'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/50294024?searchTerm=wragge'},
      {'id': '76372015',
       'url': '/newspaper/76372015',
       'heading': 'Digest What YOU Eat.',
       'category': 'Article',
       'title': {'id': '266',
        'value': 'The Western Champion and General Advertiser for the Central-Western Districts (Barcaldine, Qld. : 1892 - 1922)'},
       'date': '1906-01-08',
       'page': 5,
       'pageSequence': 5,
       'relevance': {'score': '5.734701', 'value': 'very relevant'},
       'snippet': "The reason why any wholesome food is not properly digested is because the stomach lacks some important element of digestion. Some stomach' lack peptone, others are deficient in gastric juice or hydrochloric",
       'troveUrl': 'https://trove.nla.gov.au/ndp/del/article/76372015?searchTerm=wragge'}]}}]}}
```

Note that the API accepts some additional parameters, such as `reclevel` and `include`, which you can add to the dict in the same way. Have a look at the [Trove API Console](https://troveconsole.herokuapp.com/) for examples.

## Tests

### Simple search with facets

Multiple keywords are passed along as is, without any operators being added; the search combines them with a boolean `AND`. This is the same in both the Simple and Advanced search.

In [21]:
assert {'q': 'wragge weather', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20weather')

Multiple keywords with `OR` are passed along as is.

In [22]:
assert {'q': 'wragge OR weather', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20OR%20weather')

Phrase search passed along as is.

In [24]:
assert {'q': '"inclement wragge"', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=%22inclement%20wragge%22')

More complex queries such as date ranges should be passed along as is.

In [49]:
parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%20date%3A%5B1901%20TO%201903%5D&l-artType=newspapers')

{'q': 'wragge date:[1901 TO 1903]', 'zone': 'newspaper'}

Limit to gazettes using facets.

In [27]:
assert {'q': 'wragge', 'zone': 'gazette'} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=gazette')

Limit state to NSW using facets.

In [36]:
assert {'q': 'wragge', 'l-state': ['New South Wales'], 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-state=New%20South%20Wales')

Limit newspaper to SMH using facets.

In [40]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-title': ['35']} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-title=35')

Limit to 'Article' category using facets.

In [42]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-category': ['Article']} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-category=Article')

Limit to specific decade using facets.

In [46]:
parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-decade=190')

{'q': 'wragge', 'zone': 'newspaper', 'l-decade': ['190']}

Limit to specific year using facets.

In [47]:
parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-decade=190&l-year=1903')

{'q': 'wragge', 'zone': 'newspaper', 'l-decade': ['190'], 'l-year': ['1903']}

Limit to articles with illustration type of 'Photo' with facets.

In [57]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-illustrated': 'true', 'l-illtype': ['Photo']} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-illustrationType=Photo')

Limit to articles containing more than 1,000 words using facets.

In [72]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-word': ['1000+ Words']} == parse_query('https://trove.nla.gov.au/search/category/newspapers?keyword=wragge&l-artType=newspapers&l-word=1000%2B%20Words')

### Advanced search

Multiple keywords in 'Any of these words' box.

In [31]:
assert {'q': '(wragge OR weather)', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.any=wragge%20weather')

Multiple keywords in 'The phrase' box.

In [32]:
assert {'q': '"inclement wragge"', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.phrase=inclement%20wragge')

Keywords in 'All of these words' and 'Without these words' boxes.

In [33]:
assert {'q': 'wragge AND NOT (weather)', 'zone': 'newspaper,gazette'} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword.not=weather&keyword=wragge')

Limit to a specific date range.

In [71]:
assert {'q': 'wragge date:[1899-12-31T00:00:00Z TO 1900-02-04T00:00:00Z]', 'zone': 'newspaper'} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&date.from=1900-01-01&date.to=1900-02-04&l-advArtType=newspapers')

Limit to a specific state.

In [83]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-state': ['Queensland']} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advstate=Queensland')

Limit to specific newspapers.

In [76]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-title': ['16', '1055']} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advtitle=16&l-advtitle=1055')

Limit to a specific category.

In [80]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-category': ['Family Notices']} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advcategory=Family%20Notices')

Limit to a specific illustration type.

In [82]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-illustrated': 'true', 'l-illtype': ['Photo']} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advIllustrationType=Photo')

Limit to a specific number of words.

In [85]:
assert {'q': 'wragge', 'zone': 'newspaper', 'l-word': '100 - 1000 Words'} == parse_query('https://trove.nla.gov.au/search/advanced/category/newspapers?keyword=wragge&l-advArtType=newspapers&l-advWord=100%20-%201000%20Words')