In [10]:
import pandas as pd
import os
import pluto_parquet as pp
from zips import unzip_to_temp
from shutil import rmtree
from functools import partial
import warnings
from google.cloud import storage
import tempfile
import logging
from logging.config import fileConfig
from utils import cpu_use
import fiona
import geopandas as gpd

In [5]:
# Silence any warning whose message matches the Parquet "initial implementation" notice.
warnings.filterwarnings(action='ignore', message='.*initial implementation of Parquet.*')

# Service-account key file, given relative to the working directory. This variable is the
# one Google's Application Default Credentials lookup reads; any value already present in
# the environment is overwritten here.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='secret.json')

In [6]:
# Logging is configured from an ini file kept alongside the log output in `logs/`.
loginipath = 'logs/logging_config.ini'

log_file = 'extract_load.log'
log_path = os.path.join('logs', log_file)

# `log_path` is exposed to the ini file as the `logfilename` default
# (presumably interpolated into a file handler's target - confirm in the ini).
fileConfig(fname=loginipath, defaults=dict(logfilename=log_path))

# Logger named 'sLogger'; NOTE(review): expected to be declared in the ini - confirm.
logger = logging.getLogger('sLogger')

In [7]:
# Client() receives no explicit credentials or project, so it depends on whatever the
# environment provides (GOOGLE_APPLICATION_CREDENTIALS is set in an earlier cell).
client = storage.Client()

# Handle to the 'raw-pluto' bucket; the upload cell creates its blobs through this object.
bucket = client.bucket(bucket_name='raw-pluto')

In [8]:
# Load data/zip_links.csv and keep only its final row (still a one-row DataFrame),
# so the loop over `zip_links` processes a single entry.
zip_links = pd.read_csv("data/zip_links.csv").tail(1)

In [7]:
for _, row in zip_links.iterrows():
    # Each row supplies the archive location (`path`) and its `year`; the year is kept
    # as a string because it is later embedded in a blob name.
    zip_path = row["path"]
    pluto_year = str(row["year"])

    # Unpack the archive, then let pluto_parquet choose which extracted paths to use.
    # NOTE(review): `tmp_dir` is presumably a freshly created temporary directory - confirm
    # in zips.unzip_to_temp.
    tmp_dir = unzip_to_temp(zip_path)
    paths = pp.filter_valid_pluto(tmp_dir)
    
    

INFO - Downloading ZIPFile https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_mappluto_21v1_arc_shp.zip
INFO - Unzipping https://www1.nyc.gov/assets/planning/download/zip/data-maps/open-data/nyc_mappluto_21v1_arc_shp.zip


In [None]:
# NOTE(review): `geopandas.datasets` was deprecated in GeoPandas 0.14 and removed in 1.0,
# so this lookup only works on older GeoPandas releases. `world` is not referenced by any
# other visible cell - this looks like a scratch cell.
naturalearth_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(naturalearth_path)

In [None]:
    # NOTE(review): this cell is indented one level and reads `paths`, `pluto_year` and
    # `tmp_dir`, all of which are assigned inside the `for` loop of an earlier cell, so it
    # appears to be the remainder of that loop body. A cell that starts with indentation
    # raises IndentationError when executed on its own - merge it back under the loop
    # before running.

    # Materialise `paths` once so it can be iterated regardless of its container type.
    root_paths = list(paths)

    try:
        # Call pp.agg_from_path on every path with year=pluto_year.
        aggregate_year = partial(pp.agg_from_path, year=pluto_year)
        gdf_list = [aggregate_year(root_path) for root_path in root_paths]

        logger.info(f'{cpu_use()} aggregating files: {len(gdf_list)} from year: {pluto_year}')
        df = pd.concat(gdf_list)
        # Drop the list of per-path frames as soon as the combined frame exists.
        del gdf_list
        logger.info(cpu_use())
    finally:
        # Remove the extracted files even when aggregation fails, so a failed run does
        # not leave the temporary directory behind.
        rmtree(tmp_dir)
        logger.info(f'{cpu_use()} cleaning temporary directory')

    with tempfile.NamedTemporaryFile() as fp:
        logger.info(f'{cpu_use()} aggregating files: writing aggregated file to tempfile')
        df.to_parquet(fp)
        # The parquet bytes were written through the open handle `fp`, while the upload
        # below re-opens the file by name. Flush the write buffer first so the bytes on
        # disk are complete (otherwise the tail of the file may be missing).
        fp.flush()
        # Remember the shape for the final log line before releasing the frame.
        data_shape = df.shape
        del df
        logger.info(f'{cpu_use()} deleted in memory dataframe')
        blob_name = f'pluto_{pluto_year}.parquet'
        logger.info(f'making blob: {blob_name}')
        blob = bucket.blob(blob_name)
        # Must happen inside the `with`: the temporary file is deleted when it closes.
        blob.upload_from_filename(fp.name)
        logger.info(f'succesfully loaded df with rows: {data_shape[0]} and columns: {data_shape[1]} to file: {blob_name}')

In [None]:
# Display `paths` (assigned from pp.filter_valid_pluto in the loop cell) as the cell output.
paths