# Downloading Data in Bulk

There are a lot of things that can go wrong when downloading in bulk. Not only can requests time out, but a lot of entries are incorrect. Sometimes the API says it gave you a CSV link when it really gave you XML. Or maybe the API didn't say how big the file is and it turns out to be a 2 GB census file. Or maybe it is a CSV file but it is zipped up.

Below is code that uses `asyncio` to peek at our downloads and abort any files that are not real CSVs. This code also allows downloading in parallel and caches the data into subfolders to avoid re-downloading anything we already have.

In [1]:
import aiohttp, asyncio, pandas, os, requests, collections, urllib, shutil
from time import sleep

# One semaphore per host name, created lazily on first lookup with a single
# permit, so at most one request per host is in flight at any time.
host_semaphores = collections.defaultdict(lambda: asyncio.Semaphore(1))

def _filename_from_content_disposition(header):
    """Return the bare file name advertised by a Content-Disposition header.

    Surrounding quotes and trailing header parameters are dropped, and any
    directory components are stripped: the name comes from the remote server,
    so it must not be able to point outside the download folder.
    """
    name = header.split('; filename=')[-1]
    name = name.split(';')[0].strip().strip('"\'')
    return os.path.basename(name.replace('\\', '/'))


async def _stream_to_file(content, dst, first_bytes, max_size, chunk_size=64 * 1024):
    """Write first_bytes plus the rest of the stream `content` to `dst`.

    Returns True when the whole body was written. Returns False (and deletes
    the partial file) as soon as more than `max_size` bytes have been received,
    which protects against servers that do not report a Content-Length.
    """
    received = len(first_bytes)
    with open(dst, 'wb') as output:
        output.write(first_bytes)
        while True:
            # Bounded reads keep memory use flat; read() with no argument
            # would buffer the entire remaining body before returning.
            chunk = await content.read(chunk_size)
            if not chunk:
                return True
            received += len(chunk)
            if received > max_size:
                break
            output.write(chunk)
    os.remove(dst)
    return False


async def _save_response_if_csv(resp, folder, max_readline, max_size):
    """Inspect an open response and store its body in `folder` if it is a CSV.

    Returns the destination path on success and False when the response is
    rejected (bad status, no file name, too large, not a CSV, zip, ...).
    """
    if resp.status != 200:
        print(folder, "got non 200 response:", resp.status)
        return False
    # Expected header shape: "content-disposition: attachment; filename=foo.pdf"
    if 'CONTENT-DISPOSITION' not in resp.headers:
        print(folder, "no content disposition found")
        return False
    filename = _filename_from_content_disposition(resp.headers['CONTENT-DISPOSITION'])
    if not filename:
        print(folder, "no filename in content disposition")
        return False
    dst = os.path.join(folder, filename)
    print(folder, "Destination:", dst)
    if os.path.exists(dst):
        return dst
    # A missing Content-Length is treated as 0 (unknown); the real limit is
    # then enforced while streaming.
    size = int(resp.headers.get('CONTENT-LENGTH', 0))
    if size > max_size:
        print(folder, "too big of a file")
        return False
    if dst.endswith('.csv'):
        # Peek at the start of the body: a real CSV should contain a line
        # break within the first max_readline bytes.
        read_bytes = await resp.content.read(max_readline)
        if b'\n' not in read_bytes:
            print(folder, "no new line found, probably not a csv")
            return False
        if not await _stream_to_file(resp.content, dst, read_bytes, max_size):
            print(folder, "too big of a file")
            return False
        try:
            # Final sanity check: pandas must be able to parse the first rows.
            pandas.read_csv(dst, nrows=3)
        except Exception as error:
            print(folder, "Not a CSV:", error)
            return False
        return dst
    if dst.endswith('.zip'):
        # TODO figure out folder structure to download and unzip archives
        print(folder, "zipfile found, skipping")
        return False
    print(folder, "Unrecognized filetype:", dst)
    return False


async def peek_if_csv_and_write(url, folder, max_readline=1024*12, max_size=1024*1024*10): #max_size = 10MB
    """Download `url` into `folder`, keeping the file only if it looks like a CSV.

    The existence of `folder` acts as the cache marker: if it already exists
    nothing is downloaded. Requests are serialised per host name through
    `host_semaphores`.

    Parameters:
        url: address of the resource to download.
        folder: directory to create for this download.
        max_readline: number of leading bytes inspected for a line break.
        max_size: maximum number of bytes accepted, whether announced via
            Content-Length or actually received.

    Returns:
        True if `folder` already existed, the path of the saved file on
        success, and False on any failure. After a timeout or a dropped
        connection the folder is removed so a later run can retry; other
        rejections leave the folder in place so the URL is not fetched again.
    """
    if os.path.exists(folder):
        return True
    os.mkdir(folder)
    # A bare "import urllib" does not guarantee the parse submodule is loaded.
    import urllib.parse
    host = urllib.parse.urlparse(url).netloc
    # "async with" releases the permit even if session setup itself fails.
    async with host_semaphores[host]:
        print(folder, "Downloading:", url)
        async with aiohttp.ClientSession() as session:
            try:
                # The 5 second limit covers receiving the response headers only.
                resp_with_timer = await asyncio.wait_for(session.get(url), 5.0)
                async with resp_with_timer as resp:
                    return await _save_response_if_csv(resp, folder, max_readline, max_size)
            except asyncio.TimeoutError:
                print(folder, "timed out")
                shutil.rmtree(folder, True)
                return False
            # Top-level name: the aiohttp.errors module no longer exists in
            # aiohttp 2.0 and later.
            except (aiohttp.ServerDisconnectedError, ConnectionResetError) as error:
                print(folder, "angry server", error)
                shutil.rmtree(folder, True)
                return False

# Dedicated event loop for the downloads issued below.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

try:
    #download all the csvs (or the first 10 pages...)
    for i in range(10): #int(13460/10) + 1):
        offset = i * 10
        raw_response = requests.get('http://catalog.data.gov/api/search/dataset', params={'all_fields':1, 'res_format':'CSV', 'offset': offset})
        try:
            response = raw_response.json()
        except ValueError:
            # Body was not JSON: report the HTTP status, back off, and move on
            # to the next page.
            print("ERROR", raw_response.status_code)
            sleep(5)
            continue
        results = response['results']
        # For each dataset pick the resource URL at the position where its
        # format list says 'CSV', paired with the dataset id (used as folder).
        urls = [(dataset['res_url'][dataset['res_format'].index('CSV')], dataset['id']) for dataset in results]
        futures = [loop.create_task(peek_if_csv_and_write(url, did)) for url, did in urls]
        # Download the whole page concurrently before requesting the next one.
        loop.run_until_complete(asyncio.gather(*futures))
finally:
    # Close the loop even if a page fails part-way through.
    loop.close()

9bbedbcc-06b9-4762-a214-feed76df94d8 Downloading: https://data.illinois.gov/api/views/rry2-usaj/rows.csv?accessType=DOWNLOAD
9bbedbcc-06b9-4762-a214-feed76df94d8 Destination: 9bbedbcc-06b9-4762-a214-feed76df94d8/05to09_Iquery_Hep_Cchronic_Data.csv
4923637b-0eca-4820-ad7b-ef8678fd6777 Downloading: https://data.illinois.gov/api/views/5cng-jyjz/rows.csv?accessType=DOWNLOAD
4923637b-0eca-4820-ad7b-ef8678fd6777 Destination: 4923637b-0eca-4820-ad7b-ef8678fd6777/05to12_Iquery_Crypto_Data.csv
c08f27d3-9ebc-4ef4-8303-1789f4d7676c Downloading: https://data.illinois.gov/api/views/rjxh-tv66/rows.csv?accessType=DOWNLOAD
c08f27d3-9ebc-4ef4-8303-1789f4d7676c Destination: c08f27d3-9ebc-4ef4-8303-1789f4d7676c/05to12_Iquery_Hep_AData.csv
fd0ddbe4-4187-4ad0-8a7b-2ab24e6ef763 Downloading: https://data.illinois.gov/api/views/pwfa-6r2g/rows.csv?accessType=DOWNLOAD
fd0ddbe4-4187-4ad0-8a7b-2ab24e6ef763 Destination: fd0ddbe4-4187-4ad0-8a7b-2ab24e6ef763/05to12_Iquery_Hep_Bchronic_Data.csv
46d2b61c-7384-4a50-a85