In [1]:
import pandas as pd
import requests
import os
from google.cloud import bigquery

In [2]:
def getCounts(endpoint:str)->int:
    count_str=f'{endpoint}?$select=count(1) as nrows'
    cnt_req=requests.get(count_str).json()
    return int(cnt_req[0]['nrows'])

In [3]:
def makeRequests(endpoint:str, nrows:int, date_field:str, limit=5*10**5) -> list:

    offsets=list(range(0, nrows, limit))

    return [f'{endpoint}?$limit={limit}&$offset={o}&$order={date_field} DESC' for o in offsets]

In [4]:
def readRequest(request:str) -> pd.DataFrame:
    print(f'reading request for: {request}')
    resp=requests.get(request).json()
    print('returning dataframe')
    return pd.DataFrame(resp)

In [5]:
def writeBq(request_list:list, schema:str, table_name:str, write_type='WRITE_APPEND'):
    
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'secret.json'

    bq_client = bigquery.Client()
    
    table_id=f'pluto-panel.{schema}.{table_name}'

    job_config = bigquery.LoadJobConfig(
            autodetect=True,
            write_disposition=write_type)
    
    writeRequest=lambda x: bq_client.load_table_from_dataframe(readRequest(x), table_id, job_config=job_config)
    
    all(map(writeRequest, request_list))

In [10]:
ep='https://data.cityofnewyork.us/resource/ipu4-2q9a.json'

nr=getCounts(ep)

reqs=makeRequests(ep, nr, 'dobrundate')
reqs

['https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=0&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=500000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=1000000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=1500000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=2000000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=2500000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=3000000&$order=dobrundate DESC',
 'https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=3500000&$order=dobrundate DESC']

In [12]:
writeBq(reqs, 'real_estate', 'raw_dob_permits')

reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=0&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=500000&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=1000000&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=1500000&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=2000000&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=2500000&$order=dobrundate DESC
returning dataframe
reading request for: https://data.cityofnewyork.us/resource/ipu4-2q9a.json?$limit=500000&$offset=3000000&$order=dobrund