In [None]:
import os
import re
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
from shutil import copy, rmtree
import logging
import logging.config

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session instead of the default 'warn'.
pd.options.mode.chained_assignment = None


In [None]:
# logger = logging.getLogger(__name__)
# logging.config.fileConfig('logging.ini', disable_existing_loggers=False)

In [None]:
def files_from_url(url=None, year=None):
    """
    List the members of the zip archive served at ``url``.

    The response body is read fully into memory and opened as a zip file
    from there. The result is a DataFrame with one row per archive member
    and three columns: ``path`` (the member name as stored in the archive),
    ``year`` and ``url`` (the two arguments, repeated on every row).
    """
    with urlopen(url) as response:
        payload = BytesIO(response.read())
        # Download logging is disabled here: no logger is configured.
        with ZipFile(payload) as archive:
            listing = pd.DataFrame(archive.namelist(), columns=['path'])
            return listing.assign(year=year, url=url)

In [None]:
def get_all_pluto_files(zip_links:pd.DataFrame, out_path:str = 'all_pluto_files.csv'):
    """
    List the members of every MapPluto zip archive, caching the result as CSV.

    Parameters
    ----------
    zip_links : pd.DataFrame
        One row per archive. The ``path`` column is handed to
        ``files_from_url`` as the url and the ``year`` column as the year.
    out_path : str
        Location of the CSV cache. When the file already exists it is read
        and returned, and nothing is downloaded; otherwise the listing is
        built and written there (without the index).

    Returns
    -------
    pd.DataFrame
        The frames returned by ``files_from_url`` stacked row-wise with a
        fresh 0..n-1 index, i.e. the same index a later read of the cache
        produces.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``zip_links`` has no rows and no cache
        exists yet.
    """
    if os.path.exists(out_path):
        print(f"{out_path} already exists")
        return pd.read_csv(out_path)

    listings = [
        files_from_url(url=link.path, year=link.year)
        for link in zip_links.itertuples(index=False)
    ]

    # ignore_index: each per-archive frame starts at 0, so a plain concat
    # would return repeated labels on the first run but a clean RangeIndex
    # once the cache is read back.
    all_files = pd.concat(listings, ignore_index=True)
    all_files.to_csv(out_path, index=False)
    return all_files

In [None]:
# The per-year download loop lives inside main(). `d4` is a local variable of
# main() and is not defined at notebook scope, so repeating the loop in this
# cell failed with NameError on a fresh kernel (Restart & Run All).

In [None]:
def _pick_metadata_files(files):
    """
    Pick, for each year, the shortest-named pdf/htm/html file that is not a readme.

    ``files`` needs ``path`` and ``year`` columns. The result keeps every
    input column and adds ``ext`` (lower-cased text after the last '.') and
    ``len`` (length of ``path``); it has one row per year, or no rows when
    nothing qualifies.
    """
    # The regex captures whatever follows the last '.'; paths without a '.'
    # yield NaN rather than raising, and simply fail the isin() test below.
    ext = files['path'].str.extract(r'\.([^.]*)$', expand=False).str.lower()
    docs = files.assign(ext=ext)
    docs = docs[docs['ext'].isin(['pdf', 'htm', 'html'])]
    docs = docs[~docs['path'].str.contains('readme', case=False)]
    docs = docs.assign(len=docs['path'].str.len()).reset_index(drop=True)

    if docs.empty:
        return docs

    # idxmin returns index labels, so select with .loc rather than .iloc.
    shortest = docs.groupby('year')['len'].idxmin()
    return docs.loc[shortest].reset_index(drop=True)


def main():
    """
    Read 'zip_links.csv', list the archive contents and download one file per year.

    The listing comes from ``get_all_pluto_files``. For every year the
    shortest pdf/htm/html path that does not contain 'readme' is passed to
    ``download_metadata`` together with its url, year and extension.

    NOTE(review): ``download_metadata`` is not defined in the cells shown
    here - confirm it is available before running.
    """
    zip_links = pd.read_csv('zip_links.csv')
    all_files = get_all_pluto_files(zip_links)

    selected = _pick_metadata_files(all_files)

    for index, row in selected.iterrows():
        download_metadata(zip_url = row.url, year = row.year, ext = row.ext, path = row.path)

In [None]:
# Entry point: run the whole pipeline.
if __name__ == "__main__":
    main()