In [162]:
import csv
import logging
import io
import json
import re
import shutil
import subprocess
import tempfile
import zipfile
from collections import OrderedDict
from datetime import datetime as dt
from pathlib import Path
from typing import Iterator, TextIO, Union, List, Tuple
from urllib.parse import urljoin, urlsplit

import requests

In [47]:
# Configure the root logger first (INFO and above), then grab the named
# logger used by this notebook.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("PurePythonEtl")

In [31]:
# --- Configuration -----------------------------------------------------------
# Current calendar year as a 4-digit string.
# NOTE(review): `tax_year` is not used below; `remote` hard-codes 2019 instead.
tax_year = dt.now().strftime("%Y")
# Base URL every remote path in this notebook is resolved against.
domain = "https://pdata.hcad.org"
# Server-side path of the archive that the pipeline downloads.
remote = Path("/data/cama/2019/Hearing_files.zip")
# Fetch the layout description and fail fast on an HTTP error: without the
# status check an error page would be parsed into an empty (or bogus) layout
# and every table would silently end up with no field names.
layout_response = requests.get(urljoin(domain, "/Desc/Layout_and_Length.txt"), timeout=60)
layout_response.raise_for_status()
# List of 3-tuples of strings (word, word, digits). `get_fields` treats the
# first element as the table name and the second as the field name; the third
# is presumably the field length -- TODO confirm against the layout file.
dictionary = re.findall(r"(\w+)\s+(\w+)\s+(\d+)\s?", layout_response.text)
# Local root directory under which downloaded files are stored.
samples = Path("samples/hcad")

In [109]:
def get_fields(table: str, layout: Union[List[Tuple[str, str, str]], None] = None) -> List[str]:
    """Return the field names of every layout entry whose table name occurs in ``table``.

    Args:
        table: Text to match against, typically a file name such as
            ``"some_table.txt"``.
        layout: Layout entries whose first item is the table name and whose
            second item is the field name. Defaults to the module-level
            ``dictionary``, looked up at call time.

    Returns:
        Field names in layout order; an empty list when nothing matches.

    Note:
        Matching is a substring test (``entry[0] in table``), so a table whose
        name is contained in another table's file name contributes its fields
        as well.
    """
    entries = dictionary if layout is None else layout
    return [entry[1] for entry in entries if entry[0] in table]

In [153]:
def download(*urls: str) -> Iterator[Path]:
    """Download each URL below ``samples`` and yield the local file path.

    Only the path component of each URL is used: the file is fetched from
    ``domain`` + path and stored at
    ``samples/<parent dir name>/<file stem>/<file name>``.

    Args:
        *urls: URLs (or URL paths) of the files to fetch.

    Yields:
        The local ``Path`` of each file, after it has been fully written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never written to disk as if it were the archive.
    """
    # Reuse a shared session when an earlier cell created one; otherwise fall
    # back to the plain `requests` API so the function also works on a fresh
    # kernel where `sess` was never defined.
    try:
        client = sess
    except NameError:
        client = requests
    for url in urls:
        print(url)
        remote_path = Path(urlsplit(url).path)
        dst = samples.joinpath(remote_path.parent.name, remote_path.stem, remote_path.name)
        dst.parent.mkdir(parents=True, exist_ok=True)
        # The timeout bounds connect/read stalls, not the total transfer time.
        response = client.get(urljoin(domain, remote_path.as_posix()), timeout=60)
        response.raise_for_status()
        dst.write_bytes(response.content)
        yield dst

In [179]:
def process_zip(*files: Path) -> Iterator[TextIO]:
    """Yield a text stream for every member of each given zip archive.

    Each archive path is printed before it is opened. Members are yielded in
    ``namelist()`` order, decoded as ISO-8859-1 with newline translation
    disabled (``newline=""``).
    """
    for archive_path in files:
        print(archive_path)
        with zipfile.ZipFile(archive_path) as archive:
            yield from (
                io.TextIOWrapper(archive.open(member), encoding="iso-8859-1", newline="")
                for member in archive.namelist()
            )

def read_csv(*files: TextIO) -> Iterator[OrderedDict]:
    """Parse each tab-delimited stream and yield its rows as dicts.

    Each stream is printed first. Column names are not read from the data:
    they come from ``get_fields`` applied to the stream's ``name``.
    """
    for stream in files:
        print(stream)
        yield from csv.DictReader(
            stream,
            fieldnames=get_fields(stream.name),
            dialect="excel-tab",
        )

def process_csv(*args: OrderedDict) -> Iterator[OrderedDict]:
    """Pass every row through unchanged, in order (placeholder transform stage)."""
    yield from args

def write_csv(*rows: OrderedDict, fields: List[str], dst: Path) -> None:
    """Write ``rows`` to ``dst`` as a CSV file with a header line.

    Args:
        *rows: Row dicts; keys must be a subset of ``fields``.
        fields: Column names, in output order.
        dst: Destination file; created or truncated.

    Raises:
        ValueError: If a row contains a key that is not in ``fields``
            (``csv.DictWriter`` default behaviour).
    """
    # newline="" is required by the csv module so it controls line endings
    # itself; the context manager guarantees the file is flushed and closed.
    with dst.open("w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        writer.writerows(rows)

start = dt.now()
print("start=%s" % start)
# Run download -> unzip -> parse -> transform once and keep the rows, so the
# output columns can be derived from the rows themselves.
pipeline_rows = list(process_csv(*read_csv(*process_zip(*download(urljoin(domain, remote.as_posix()))))))
# Ordered union of all row keys: the archive members may have different
# layouts, and csv.DictWriter rejects rows with keys missing from `fieldnames`.
pipeline_fields = list(OrderedDict.fromkeys(key for row in pipeline_rows for key in row))
# Write the combined CSV next to the downloaded archive (that directory was
# created by `download`).
pipeline_dst = samples.joinpath(remote.parent.name, remote.stem, remote.stem + ".csv")
# `write_csv` requires keyword-only `fields`/`dst` and returns None, so it is
# called directly rather than wrapped in `next()`.
write_csv(*pipeline_rows, fields=pipeline_fields, dst=pipeline_dst)
end = dt.now()
print("start=%s,end=%s,elapsed=%s" % (start, end, end - start))