In [2]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import zipfile
import os


BASE_URL = "https://static.nhtsa.gov/nhtsa/downloads/FARS/{year}/National/FARS{year}NationalCSV.zip"
ROOT = Path("..")
DATA_DIR_ZIP = ROOT / "fars_data_zip"
DATA_DIR_UNZIP = ROOT / "fars_data_unzipped"
YEARS = range(2018, 2023)

In [13]:
def unzip_fars(zip_path: Path, dest_root: Path):
    """
    Unzips the `zip_path` zip file into a directory under `dest_root`.
    Ensures that the contents of zip file end up under dest_root / zip_stem / ....
    whether or not the zip itself contains the data files under a folder name.
    """

    zip_stem = zip_path.stem  # e.g. FARS2018National
    target_dir = dest_root / zip_stem

    with zipfile.ZipFile(zip_path, "r") as zf:
        members: list[str] = zf.namelist()
        common_prefix = Path(members[0]).parts[0]

        # Check if all members of the zip file have the same prefix. If so the prefix will
        # be removed to save just the file name
        all_under_same_prefix: bool = all(
            Path(m).parts[0] == common_prefix for m in members
        )

        if all_under_same_prefix:
            for member in members:
                member_path = Path(member)
                parts = member_path.parts[1:]  # skip top-level folder
                if parts:
                    out_path = target_dir.joinpath(
                        *parts
                    )  # e.g. FARS2018National/accidents.csv
                    out_path.parent.mkdir(parents=True, exist_ok=True)

                    with zf.open(member) as source, open(out_path, "wb") as target:
                        target.write(source.read())

        else:
            zf.extractall(path=target_dir)

In [18]:
zip_files = DATA_DIR_ZIP.glob("*.zip")


zip_files

<map at 0x118357820>

In [19]:
for zip_file in zip_files:
    unzip_fars(zip_file, DATA_DIR_UNZIP)