In [1]:
import csv
import json
import logging
import math
import os
import re
import sys
from datetime import datetime as dt
from pathlib import Path
from urllib.parse import urlparse, urljoin
from zipfile import ZipFile

import dateutil
import tqdm
import requests
import pandas as pd
from pyspark.shell import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/

Using Python version 3.7.3 (default, Jul  1 2019 21:52:21)
SparkSession available as 'spark'.


In [2]:
# Configure the root handler first, then grab the logger used by this notebook.
# Records are rendered as "<timestamp>:<logger name>:<function> - <message>".
logging_format = "%(asctime)s:%(name)s:%(funcName)s - %(message)s"
logging.basicConfig(format=logging_format, level=logging.INFO)
log = logging.getLogger("hcad.etl")

In [3]:
# vars
# Remote side: host, path of the layout/dictionary file, and the archive folder,
# which is keyed by tax year (taken from the current calendar year).
domain = "https://pdata.hcad.org"
dictionary = "/Desc/Layout_and_Length.txt"
tax_year = format(dt.now(), "%Y")
archives = "/data/cama/{}/".format(tax_year)

# Local side: directory that downloads are written under, and the archive
# file names to fetch from `archives`.
sources = ["Hearing_files.zip"]
landing = Path("./samples")

In [4]:
def extract():
    """Download the layout dictionary and the configured archives into ``landing``.

    Builds one URL for ``dictionary`` and one per entry of ``sources`` (under
    ``archives``) on ``domain``. Each URL is mirrored to
    ``landing/<url path>``; parent directories are created as needed.

    A file is (re)downloaded when it is missing locally or when its size on
    disk differs from the ``Content-Length`` reported by a HEAD request. If the
    server reports no ``Content-Length`` the remote size is treated as 0, so an
    existing non-empty file is refreshed.

    Yields:
        pathlib.Path: the local path of each file, in URL order.

    Raises:
        requests.HTTPError: if a download responds with a 4xx/5xx status, so
            that an error page is never written to disk as data.
    """
    # URL paths always use forward slashes; os.path.join would emit
    # backslashes on Windows, so join the segments explicitly.
    remote_paths = [dictionary, *(f"{archives.rstrip('/')}/{s}" for s in sources)]
    urls = [urljoin(domain, p) for p in remote_paths]

    # One session for all requests so the connection can be reused.
    with requests.session() as sess:
        for url in urls:
            dst = landing.joinpath(urlparse(url).path.lstrip("/"))
            dst.parent.mkdir(parents=True, exist_ok=True)

            # A single HEAD serves both the log line and the size check.
            headers = sess.head(url).headers
            log.info(json.dumps(dict(headers), indent='    '))

            needs_download = True
            if dst.exists():
                remote_size = int(headers.get("Content-Length") or 0)
                local_size = dst.stat().st_size
                log.debug(f"src={remote_size}, dst={local_size}")
                # Exact comparison: the previous isclose(..., rel_tol=1.0) was
                # true for any two non-negative sizes, so stale or truncated
                # files were never refreshed.
                needs_download = remote_size != local_size

            if needs_download:
                r = sess.get(url)
                r.raise_for_status()
                dst.write_bytes(r.content)

            yield dst

In [5]:
# Materialize the generator once and keep the resulting list: binding
# `extracts` to the generator itself and then calling list() on it would leave
# an exhausted iterator behind for any later cell that reads `extracts`.
extracts = list(extract())
extracts

2019-12-14 16:52:03,871:hcad.etl:extract - {
    "Server": "nginx/1.14.0 (Ubuntu)",
    "Date": "Sat, 14 Dec 2019 16:52:03 GMT",
    "Content-Type": "text/plain",
    "Content-Length": "132512",
    "Connection": "keep-alive",
    "Last-Modified": "Sun, 27 Oct 2019 20:41:13 GMT",
    "Accept-Ranges": "bytes",
    "ETag": "\"5bcf67df68dd51:0\"",
    "X-Powered-By": "ASP.NET"
}
2019-12-14 16:52:04,248:hcad.etl:extract - {
    "Server": "nginx/1.14.0 (Ubuntu)",
    "Date": "Sat, 14 Dec 2019 16:52:04 GMT",
    "Content-Type": "application/x-zip-compressed",
    "Content-Length": "13699272",
    "Connection": "keep-alive",
    "Last-Modified": "Mon, 09 Dec 2019 02:37:07 GMT",
    "Accept-Ranges": "bytes",
    "ETag": "\"8fdd9d8c39aed51:0\"",
    "X-Powered-By": "ASP.NET"
}


[PosixPath('samples/Desc/Layout_and_Length.txt'),
 PosixPath('samples/data/cama/2019/Hearing_files.zip')]