In [101]:
import bz2
import csv
import gzip
import io
import re
import tarfile
import zipfiles
import shutil
from pathlib import Path
from typing import Iterable
from urllib.parse import urlsplit
import requests

## Compress staged files

These files are big, very big. We can save 50% of the space consumed if we compress them. 

In [102]:
samples = Path("samples")

In [103]:
remote = "http://pdata.hcad.org/data/cama/2019/Hearing_files.zip"

In [106]:
dictionary = re.findall(
    r"(\w+)\s+(\w+)\s+(\d+)",
    requests.get("https://pdata.hcad.org/Desc/Layout_and_Length.txt").text
)

In [109]:
dictionary[:5]

[('arb_hearings_pp', 'acct', '7'),
 ('arb_hearings_pp', 'Tax_Year', '2'),
 ('arb_hearings_pp', 'Personal', '1'),
 ('arb_hearings_pp', 'Hearing_Type', '1'),
 ('arb_hearings_pp', 'State_Class_Code', '2')]

In [40]:
def scaffold(*paths: Path):
    for path in paths:
        try:
            path.mkdir()
            print("Created %s" % path)
        except FileExistsError:
            print("%s already exists, skipping." % path)

In [45]:
scaffold(samples.joinpath("zip"), samples.joinpath("txt"), samples.joinpath("gz"))

samples/zip already exists, skipping.
samples/txt already exists, skipping.
Created samples/gz


In [70]:
def source(*urls):
    for url in urls:
        print("Downloading %s" % url)
        yield requests.get(url)
        print("Downloaded %s" % url)

In [92]:
def land(*sources: requests.Response) -> Iterable[Path]:
    for src in sources:
        print("Landing %s" % src.url)
        fn = Path(urlsplit(remote).path).name
        dst = samples.joinpath('zip').joinpath(fn)
        dst.write_bytes(src.content)
        yield dst
        print("Landed %s" % dst)

In [81]:
def process_zip(*files):
    for file in files:
        print("Processing %s" % file)
        with zipfile.ZipFile(file) as zip_file:
            zip_file.printdir()
            for name in zip_file.namelist():
                yield io.TextIOWrapper(zip_file.open(name), encoding="iso-8859-1", newline="")

In [115]:
def process_txt(*files):
    for file in files:
        print("Processing %s" % file.name)
        dst = samples.joinpath("csv").joinpath(file.name).with_suffix(".csv")
        dst.parent.mkdir(parents=True, exist_ok=True)
        field_names = [i[1] for i in dictionary if i[0] in file.name]
        reader = csv.DictReader(file, fieldnames=field_names, dialect="excel-tab")
        writer = csv.DictWriter(dst.open("w+"), fieldnames=field_names)
        try:
            writer.writeheader()
            for row in reader:
                writer.writerow(row)
            print("Processed %s" % file)
        except csv.Error as csv_error:
            print(csv_error)

In [116]:
process_txt(*process_zip(*land(*source(remote))))

Downloading http://pdata.hcad.org/data/cama/2019/Hearing_files.zip
Downloaded http://pdata.hcad.org/data/cama/2019/Hearing_files.zip
Landing https://pdata.hcad.org/data/cama/2019/Hearing_files.zip
Landed samples/zip/Hearing_files.zip
Processing samples/zip/Hearing_files.zip
File Name                                             Modified             Size
arb_hearings_pp.txt                            2019-12-22 15:52:36      1503419
arb_hearings_real.txt                          2019-12-22 15:52:40     45698132
arb_protest_pp.txt                             2019-12-22 15:52:40       363036
arb_protest_real.txt                           2019-12-22 15:52:42     13179840
Processing arb_hearings_pp.txt
Processed <_io.TextIOWrapper name='arb_hearings_pp.txt' encoding='iso-8859-1'>
Processing arb_hearings_real.txt
Processed <_io.TextIOWrapper name='arb_hearings_real.txt' encoding='iso-8859-1'>
Processing arb_protest_pp.txt
Processed <_io.TextIOWrapper name='arb_protest_pp.txt' encoding='iso-88

In [135]:
def compress(*files):
    for file in files:
        with file.open("rb") as fin:
            print("Compressing %s" % fin)
            with gzip.open(file.with_suffix(f"{file.suffix}.gz"), 'wb') as fout:
                shutil.copyfileobj(fin, fout)
                print("Compressed %s" % fout)
        file.unlink()

In [136]:
files_to_compress = list(samples.rglob("*.txt"))

In [137]:
compress(*files_to_compress)

Compressing <_io.BufferedReader name='samples/txt/arb_hearings_pp.txt'>
Compressed <gzip _io.BufferedWriter name='samples/txt/arb_hearings_pp.txt.gz' 0x7f921feebb38>
Compressing <_io.BufferedReader name='samples/txt/arb_protest_real.txt'>
Compressed <gzip _io.BufferedWriter name='samples/txt/arb_protest_real.txt.gz' 0x7f921fefe550>
Compressing <_io.BufferedReader name='samples/txt/arb_hearings_real.txt'>
Compressed <gzip _io.BufferedWriter name='samples/txt/arb_hearings_real.txt.gz' 0x7f921feebb38>
Compressing <_io.BufferedReader name='samples/txt/arb_protest_pp.txt'>
Compressed <gzip _io.BufferedWriter name='samples/txt/arb_protest_pp.txt.gz' 0x7f921fefe1d0>
