# Introduction

This notebook downloads the JSON metadata for all bird sound recordings from the United States on xeno-canto, and then the recordings themselves. The site's [REST API](https://www.xeno-canto.org/article/153) is documented online.

## Download the JSON

In [8]:
import json
import os

import httplib2

In [45]:
# HTTP client reused for the metadata (JSON) requests in the cells below.
h = httplib2.Http()
# NOTE(review): per the httplib2 docs, False is already the default (only
# GET/HEAD redirects are followed) -- confirm; this line just makes it explicit.
h.follow_all_redirects = False

In [9]:
# Query the recordings endpoint with cnt:"United States" (URL-encoded). No page
# parameter is given, so this is presumably the first page of results.
url = 'https://www.xeno-canto.org/api/2/recordings?query=cnt:%22United+States%22'
response, content = h.request(url, 'GET')
# Parse the JSON body; later cells read data['numPages'] and data['recordings'].
data = json.loads(content)

In [13]:
# Number of result pages, as reported in the first response.
num_pages = data['numPages']

In [15]:
# List of parsed result pages, seeded with the page fetched above; the
# remaining pages (2..num_pages) are appended by the loop in the next cell.
dataset = [data]

In [17]:
from tqdm import tqdm

# Fetch the remaining result pages; the first page is already in dataset[0].
# Drop anything a previous run of this cell appended first, so that re-running
# the cell does not store every page (and therefore every recording) twice.
del dataset[1:]

for i in tqdm(range(2, num_pages + 1)):
    url = 'https://www.xeno-canto.org/api/2/recordings?query=cnt:%22United+States%22&page={}'.format(i)
    _, content = h.request(url, 'GET')
    dataset.append(json.loads(content))

100%|██████████████████████████████████████████████████████████████████████████████████| 82/82 [02:08<00:00,  1.56s/it]


## Save the JSON locally

In [24]:
# Write each result page to ./json/<n>.json, numbering the files from 1.
if not os.path.isdir('./json'):
    # Create the output directory on a fresh run instead of failing in open().
    os.makedirs('./json')

# Wrap the list itself (not the enumerate iterator) so tqdm knows the total
# and can render a real progress bar.
for i, d in enumerate(tqdm(dataset)):
    fname = './json/{}.json'.format(i + 1)
    # json.dump writes the serialized text straight to the file. The previous
    # json.dumps(d, f) passed the file handle as a positional option (a
    # TypeError on Python 3) and wrote text to a file opened in binary mode.
    with open(fname, 'w') as f:
        json.dump(d, f)

83it [00:00, 200.00it/s]


# Get the download URLs and output file names of each recording

In [34]:
def get_recording_download_info(item):
    """Build the download descriptor for a single recording entry.

    ``item['file']`` is prefixed with ``https:`` to form the download URL, and
    ``item['id']`` is used to name the local mp3 file under ``./mp3/``.

    Returns a dict with the keys ``'url'`` and ``'mp3'``.
    """
    return dict(
        url='https:{}'.format(item['file']),
        mp3='./mp3/{}.mp3'.format(item['id']),
    )
    
def get_recording_download_infos(dataset):
    """Flatten a list of result pages into one list of download descriptors.

    Every element of ``dataset`` must provide a ``'recordings'`` list. Each
    entry in it is converted with ``get_recording_download_info``; page order
    and the order within each page are preserved.
    """
    return [
        get_recording_download_info(entry)
        for page in dataset
        for entry in page['recordings']
    ]

In [35]:
# One {'url', 'mp3'} descriptor per recording, across all downloaded pages.
download_info = get_recording_download_infos(dataset)

Check that we have 41,291 recording download entries. Also check that the output paths are unique.

In [36]:
# Total number of download entries.
len(download_info)

41291

In [37]:
# Number of distinct output paths; it matches the total only if no two
# recordings map to the same mp3 file name.
len(set([info['mp3'] for info in download_info]))

41291

In [38]:
# Inspect the first entry to eyeball the URL and output path format.
download_info[0]

{'mp3': './mp3/316537.mp3',
 'url': 'https://www.xeno-canto.org/316537/download'}

# Download the mp3 files

In [91]:
from httplib2 import RelativeURIError

def _fetch_to_file(h, url, mp3):
    """GET ``url`` with client ``h`` and write the body to ``mp3``.

    Returns True only when the response status is 200 and the file was written.
    """
    response, content = h.request(url, 'GET')
    if response.status != 200:
        # Do not save an error page under an .mp3 file name.
        return False
    with open(mp3, 'wb') as f:
        f.write(content)
    return True


def download_save(item):
    """Download one recording and save it to disk.

    Parameters
    ----------
    item : dict
        Download descriptor with the keys ``'url'`` (where to fetch the
        recording) and ``'mp3'`` (where to write it).

    Returns
    -------
    int
        1 if the recording was downloaded and written, 0 otherwise. A
        successful first attempt now returns 1 (it used to fall through and
        return None), and a non-200 response returns 0 without writing a file.
    """
    # A client is created per call rather than reusing a shared one.
    # NOTE(review): presumably because this function runs on several threads.
    h = httplib2.Http()
    url = item['url']
    mp3 = item['mp3']
    try:
        return int(_fetch_to_file(h, url, mp3))
    except RelativeURIError as e:
        # NOTE(review): presumably raised when the server redirects to a URL
        # without a scheme. The target is taken from the text after '=' in the
        # exception message and retried once with an explicit https: prefix.
        s = str(e)
        url = 'https:{}'.format(s[s.find('=') + 1:].strip())
        try:
            return int(_fetch_to_file(h, url, mp3))
        except RelativeURIError:
            return 0

In [94]:
# Try a single download before launching the full batch.
download_save(download_info[0])

1

In [None]:
from joblib import Parallel, delayed

# Run download_save over every entry with 5 jobs on joblib's threading
# backend; `results` collects the per-item return values.
results = Parallel(n_jobs=5, backend='threading', verbose=5)(delayed(download_save)(item) for item in download_info)

[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    3.5s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   24.7s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:  1.3min
