In [70]:
#imports
import os
from datetime import datetime

import pandas as pd
import requests
from google.cloud import bigquery,storage

In [71]:
# Path to the service-account key file.
# Export GOOGLE_APPLICATION_CREDENTIALS to point at your own key; the literal
# below is only the fallback used when that variable is not set, so the
# notebook no longer depends on one machine's desktop path.
gcpcreds = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/TK/Desktop/neat-geode-386415-ca51a6c03e2f.json",
)

# init the clients (both authenticate with the same key file)
client = bigquery.Client.from_service_account_json(json_credentials_path=gcpcreds)
storageclient = storage.Client.from_service_account_json(json_credentials_path=gcpcreds)

# bucket that receives the per-run log files
bucket = storageclient.get_bucket('msds432_project_data')

# Ask the warehouse for the newest application date it already holds; the
# incremental fetch further down starts from this date.
job = client.query('SELECT MAX(APPLICATION_START_DATE)  FROM `neat-geode-386415.data.building_permit_data`')

In [72]:
# The query selects a single MAX() aggregate; keep the value of the last row
# returned (there should be exactly one).
rows = list(job.result())
latest_start = rows[-1][0] if rows else None

# MAX() over an empty table is NULL, which arrives here as None. Fail with a
# clear message instead of an AttributeError on None.strftime.
if latest_start is None:
    raise ValueError(
        "building_permit_data returned no MAX(APPLICATION_START_DATE); "
        "the table looks empty, so there is no date to resume from."
    )

# formatted as YYYY-MM-DD for use in the API date filter
currentmaxdate = latest_start.strftime("%Y-%m-%d")

In [73]:
# Echo the date string that the incremental fetch will filter on.
currentmaxdate

'2023-05-12'

In [74]:
# SODA app token sent with every request to the data portal.
# Export SODA_APP_TOKEN to supply your own; the literal is only the fallback.
# NOTE(review): a token committed in a notebook should be treated as exposed —
# consider rotating it and dropping the fallback.
soda_token = os.environ.get('SODA_APP_TOKEN', '85GaAh9i4LaVP9BRVhapXCPao')

In [75]:
# Run identifier (local wall-clock time) embedded in the data and log file names.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

In [85]:
# Fields selected from each API record, in output order.
columns_to_keep = [
    # permit identity and lifecycle
    'id', 'permit_', 'permit_type', 'review_type',
    'application_start_date', 'issue_date', 'processing_time',
    # address and description
    'street_number', 'street_direction', 'street_name', 'suffix',
    'work_description',
    # fees paid
    'building_fee_paid', 'zoning_fee_paid', 'other_fee_paid', 'subtotal_paid',
    # fees unpaid
    'building_fee_unpaid', 'zoning_fee_unpaid', 'other_fee_unpaid', 'subtotal_unpaid',
    # fees waived
    'building_fee_waived', 'zoning_fee_waived', 'other_fee_waived', 'subtotal_waived',
    # totals
    'total_fee', 'reported_cost',
    # geography
    'community_area', 'census_tract', 'ward',
    'xcoordinate', 'ycoordinate', 'latitude', 'longitude',
]

In [86]:
# Columns that get coerced to numeric values after each chunk is fetched.
floatcols = [
    'processing_time',
    # fees paid
    'building_fee_paid', 'zoning_fee_paid', 'other_fee_paid', 'subtotal_paid',
    # fees unpaid
    'building_fee_unpaid', 'zoning_fee_unpaid', 'other_fee_unpaid', 'subtotal_unpaid',
    # fees waived
    'building_fee_waived', 'zoning_fee_waived', 'other_fee_waived', 'subtotal_waived',
    # totals
    'total_fee', 'reported_cost',
    # geography
    'ward', 'xcoordinate', 'ycoordinate', 'latitude', 'longitude',
]

In [96]:
# some key info
nameof = 'permits'
web_api = 'ydr8-5enu'
limit = 500000
i = 0
offset_counter = 0
pickle = True  # loop flag: set to False once paging should stop

# seconds to wait for the portal before a request is abandoned
request_timeout = 300

# dict for logging
log_dict = {}

# cleaned chunks; concatenated into `df` once paging is finished so that
# later cells see every fetched row, not just the last page
chunk_frames = []

# base url for the request
baseurl = ("https://data.cityofchicago.org/resource/"
        f"{web_api}.json?")


def _log_chunk(chunk_no, status, offset, records, success, mindate=None, maxdate=None):
    """Record one chunk in ``log_dict`` and rewrite this run's CSV log.

    Args:
        chunk_no: index of the chunk being logged.
        status: HTTP status code of the response.
        offset: ``$offset`` used for the request.
        records: number of rows kept from the chunk.
        success: whether the chunk was fetched and written.
        mindate: smallest application_start_date in the chunk, if any.
        maxdate: largest application_start_date in the chunk, if any.

    Returns:
        The log rebuilt as a DataFrame (one row per logged chunk).
    """
    log_dict[f'run_{chunk_no}'] = {
        'records': records,
        'chunk': chunk_no,
        'status': status,
        'offset': offset,
        'success': success,
        'mindate': mindate,
        'maxdate': maxdate,
    }
    log_frame = pd.DataFrame.from_dict(log_dict,orient='index')
    log_frame.to_csv(f'./logs/{nameof}_{timestamp}_log.csv')
    return log_frame


while pickle:
    #set our params
    # NOTE(review): the filter is ">= currentmaxdate", so rows dated exactly
    # currentmaxdate are requested again on every run — confirm duplicates are
    # handled downstream.
    # $order=:id gives the pages a stable ordering; without an explicit order
    # offset-based paging can repeat or skip rows between requests.
    params = (
            f"$where=application_start_date%20%3E%3D%20%27{currentmaxdate}%27&"
            "$order=:id&"
            f"$limit={limit}&"
            f"$offset={offset_counter}&"
            f"$$app_token={soda_token}")

    # make the request (the timeout keeps a stalled connection from hanging the cell)
    api_response = requests.get(baseurl+params, timeout=request_timeout)

    if api_response.status_code != 200:
        # Failed request: log it without touching `df`, which may not exist
        # yet or may hold a previous chunk.
        print(f"Chunk {i}: HTTP {api_response.status_code} from the portal; stopping.")
        log_df = _log_chunk(i, api_response.status_code, offset_counter, 0, False)
        pickle = False
        break

    try:
        # convert to json
        response_data = api_response.json()
        if not response_data:
            # an empty page means every matching row has been fetched
            pickle = False
            break

        # make it a df; reindex (rather than df[cols]) so a field missing from
        # every record in this chunk becomes an empty column instead of a KeyError
        chunk = pd.json_normalize(response_data).reindex(columns=columns_to_keep)
        chunk['application_start_date'] = pd.to_datetime(chunk['application_start_date']).dt.date
        chunk['issue_date'] = pd.to_datetime(chunk['issue_date']).dt.date
        chunk[floatcols] = chunk[floatcols].apply(pd.to_numeric,errors='coerce',downcast="float")
        chunk = chunk.rename(columns={'permit_':'permit_num'})

        # write df
        chunk.to_json(f'./data/{nameof}_{timestamp}_{i}.json',orient='records', lines=True)

        # log
        log_df = _log_chunk(
            i,
            api_response.status_code,
            offset_counter,
            chunk.shape[0],
            True,
            mindate=chunk['application_start_date'].min(),
            maxdate=chunk['application_start_date'].max(),
        )
        chunk_frames.append(chunk)

        # increment the chunk count
        i += 1

        # increment the offset
        offset_counter += limit

    except Exception as exc:
        # Stop paging as before, but report and log the failure instead of
        # swallowing it silently.
        print(f"Chunk {i}: processing failed ({type(exc).__name__}: {exc}); stopping.")
        log_df = _log_chunk(i, api_response.status_code, offset_counter, 0, False)
        pickle = False

# Expose everything fetched in this run as `df` for the cells below. When
# nothing was fetched, `df` is an empty frame with the output column names so
# a stale frame from an earlier run cannot be picked up by mistake.
if chunk_frames:
    df = pd.concat(chunk_frames, ignore_index=True)
else:
    df = pd.DataFrame(
        columns=['permit_num' if col == 'permit_' else col for col in columns_to_keep])





In [80]:
# Coerce the float columns of `df` to numeric (unparseable values become NaN).
# NOTE(review): the fetch-loop cell applies this same conversion to each chunk,
# so this cell looks like a leftover from interactive work — confirm and remove.
df[floatcols] = df[floatcols].apply(pd.to_numeric,errors='coerce',downcast="float")

In [97]:
# Inspect the column labels of the cleaned frame ('permit_' appears as 'permit_num').
df.columns

Index(['id', 'permit_num', 'permit_type', 'review_type',
       'application_start_date', 'issue_date', 'processing_time',
       'street_number', 'street_direction', 'street_name', 'suffix',
       'work_description', 'building_fee_paid', 'zoning_fee_paid',
       'other_fee_paid', 'subtotal_paid', 'building_fee_unpaid',
       'zoning_fee_unpaid', 'other_fee_unpaid', 'subtotal_unpaid',
       'building_fee_waived', 'zoning_fee_waived', 'other_fee_waived',
       'subtotal_waived', 'total_fee', 'reported_cost', 'community_area',
       'census_tract', 'ward', 'xcoordinate', 'ycoordinate', 'latitude',
       'longitude'],
      dtype='object')

In [91]:
# Copy this run's local CSV log into the bucket under permit_update_logs/.
# Despite its name, the variable holds the blob handle returned by the bucket.
object_name_in_gcs_bucket = bucket.blob(
    blob_name=f'permit_update_logs/{nameof}_{timestamp}_log.csv',
)
object_name_in_gcs_bucket.upload_from_filename(
    filename=f'./logs/{nameof}_{timestamp}_log.csv',
)

In [98]:
# Append the fetched permits to the warehouse table.
table_id = 'neat-geode-386415.data.building_permit_data'

# The previous config declared a placeholder "my_string" schema and was never
# passed to the load call. State the intent (append) explicitly and actually
# hand the config to the job.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
)

if df.empty:
    # nothing was fetched, so there is nothing to append
    print("No new permit rows to load.")
else:
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    # The call above only submits the job; result() waits for completion and
    # raises if the load failed, so errors are not missed.
    job.result()
    print(f"Loaded {job.output_rows} rows into {table_id}")