# 02 Census data cleaning

This notebook removes unnecessary columns from the original files and creates new files with relevant information.

In [None]:
from glob import glob
from pathlib import Path

import pandas as pd

In [None]:
# Folder holding the preprocessed census tables and the folder that receives
# their cleaned counterparts
input_dir = Path("/data/uscuni-restricted/01_preprocessed_census")
output_dir = Path("/data/uscuni-restricted/02_cleaned_preprocessed_census")

# List every entry of the input folder
files = glob(str(input_dir / "*"))

# Load the table describing the columns; it is indexed by "File/Column Name"
names = pd.read_csv(input_dir / "_col_names.csv").set_index("File/Column Name")

# Rows flagged with use == 0 name the columns to remove; the helper columns
# "use" and "name" are dropped because only the index is needed below
drop = names[names["use"] == 0].drop(columns=["use", "name"])

# Make sure the output folder exists, otherwise to_csv fails on a fresh setup
output_dir.mkdir(parents=True, exist_ok=True)

# Process all files
for file in files:
    path = Path(file)
    # Files starting with "_" (such as _col_names.csv) are metadata, not data
    if path.stem.startswith("_"):
        continue
    # Open data
    data = pd.read_csv(path)
    # errors="ignore" because each file holds only a subset of the listed columns
    data_cleaned = data.drop(columns=drop.index, errors="ignore")
    # Save the cleaned table next to the others, marked with a _cleaned suffix
    data_cleaned.to_csv(output_dir / f"{path.stem}_cleaned.csv", index=False)

In [None]:
# Collect the paths of everything stored in the cleaned-census folder.
# NOTE(review): glob() returns paths in arbitrary order, while the cells below
# pick files by their position in this list - confirm the order before relying
# on the hard-coded indices.
cleaned_pattern = "/data/uscuni-restricted/02_cleaned_preprocessed_census/*"
cleaned_files = glob(cleaned_pattern)

In [None]:
# Display the file list so the positional indices used by the cells below can
# be looked up
cleaned_files

In [None]:
# Names of the common columns, kept in their original order
# (written two per row; the final entry stands alone)
# fmt: off
common_columns = [
    "NUTS_2", "naz_oblast",
    "NUTS_3", "naz_kraj",
    "kod_okres", "naz_okres",
    "kod_orp", "naz_orp",
    "kod_obec", "naz_obec",
    "kod_mco", "nazev_mco",
    "nadzsjd",
]
# fmt: on

In [None]:
# Load the household table, remove the columns listed below and save the
# result to a new folder.
# NOTE(review): index 11 depends on the order of `cleaned_files` - confirm it
# still points at the household file.
households_dropped_columns = [
    "Hospodařící domácnosti v bytech - počet členů domácnosti: 1",
    "Hospodařící domácnosti v bytech - počet členů domácnosti: 2",
    "Hospodařící domácnosti v bytech - počet členů domácnosti: 3",
    "Hospodařící domácnosti v bytech - počet členů domácnosti: 4",
    "Hospodařící domácnosti v bytech - počet členů domácnosti: 5 a více",
    "Hospodařící domácnosti v bytech - typ HD: rodinné domácnosti - počet členů domácnosti: 2",
    "Hospodařící domácnosti v bytech - typ HD: rodinné domácnosti - počet členů domácnosti: 3",
    "Hospodařící domácnosti v bytech - typ HD: rodinné domácnosti - počet členů domácnosti: 4",
    "Hospodařící domácnosti v bytech - typ HD: rodinné domácnosti - počet členů domácnosti: 5 a více",
]
households = pd.read_csv(cleaned_files[11]).drop(
    columns=households_dropped_columns
)

# NOTE(review): this file name ends in "_" before the extension, unlike the
# other exports in this notebook - confirm that is intended.
households_target = (
    "/data/uscuni-restricted/03_ready_census/nadzsjd_households_2021_.csv"
)
households.to_csv(households_target, index=False)

In [None]:
# Copy the three housing tables into the ready-census folder.
# NOTE(review): indices 8, 6 and 1 depend on the order of `cleaned_files` -
# confirm they still point at the intended files.
housing_targets = {
    "size_facilities": "/data/uscuni-restricted/03_ready_census/nadzsjd_housing_size_facilities_2021.csv",
    "houses": "/data/uscuni-restricted/03_ready_census/nadzsjd_housing_houses_2021.csv",
    "flats": "/data/uscuni-restricted/03_ready_census/nadzsjd_housing_flats_2021.csv",
}

housing1 = pd.read_csv(cleaned_files[8])
housing1.to_csv(housing_targets["size_facilities"], index=False)

housing2 = pd.read_csv(cleaned_files[6])
housing2.to_csv(housing_targets["houses"], index=False)

housing3 = pd.read_csv(cleaned_files[1])
housing3.to_csv(housing_targets["flats"], index=False)

In [None]:
# Load the education table and save it to a new folder.
# NOTE(review): index 14 depends on the order of `cleaned_files` - confirm it
# still points at the education file.
education_target = "/data/uscuni-restricted/03_ready_census/nadzsjd_education_2021.csv"
education = pd.read_csv(cleaned_files[14])
education.to_csv(education_target, index=False)

In [None]:
# Copy the three employment tables into the ready-census folder.
# NOTE(review): indices 16, 2 and 7 depend on the order of `cleaned_files` -
# confirm they still point at the intended files.
emp_targets = {
    "employed": "/data/uscuni-restricted/03_ready_census/nadzsjd_emp_employed_2021.csv",
    "type_age": "/data/uscuni-restricted/03_ready_census/nadzsjd_emp_type_age_2021.csv",
    "ea_age": "/data/uscuni-restricted/03_ready_census/nadzsjd_emp_ea_age_2021.csv",
}

# Old column name -> new column name, applied to the first table only
emp_column_renames = {
    "nuts_2_regs": "NUTS_2",
    "nazev_regionu_soudrznosti": "naz_oblast",
    "nuts3_kraj": "NUTS_3",
    "nazev_kraje": "naz_kraj",
    "nazev_okresu": "naz_okres",
    "nazev_orp": "naz_orp",
    "nazev_obec": "naz_obec",
}

emp1 = pd.read_csv(cleaned_files[16]).rename(columns=emp_column_renames)
emp1.to_csv(emp_targets["employed"], index=False)

emp2 = pd.read_csv(cleaned_files[2])
emp2.to_csv(emp_targets["type_age"], index=False)

emp3 = pd.read_csv(cleaned_files[7])
emp3.to_csv(emp_targets["ea_age"], index=False)

In [None]:
# Copy the six population tables into the ready-census folder.
# NOTE(review): indices 0, 3, 4, 5, 12 and 15 depend on the order of
# `cleaned_files` - confirm they still point at the intended files.
pop_targets = {
    "age_gender": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_age_gender_2021.csv",
    "nationality": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_nationality_2021.csv",
    "ea_gender": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_ea_gender_2021.csv",
    "status_gender": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_status_gender_2021.csv",
    "religion_gender": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_religion_gender_2021.csv",
    "residence_gender": "/data/uscuni-restricted/03_ready_census/nadzsjd_pop_residence_gender_2021.csv",
}

# Old column name -> new column name, applied to the nationality table only
pop_column_renames = {
    "nuts_2_regs": "NUTS_2",
    "nazev_regionu_soudrznosti": "naz_oblast",
    "nuts3_kraj": "NUTS_3",
    "nazev_kraje": "naz_kraj",
    "nazev_okresu": "naz_okres",
    "nazev_orp": "naz_orp",
    "nazev_obec": "naz_obec",
}

pop1 = pd.read_csv(cleaned_files[0])
pop1.to_csv(pop_targets["age_gender"], index=False)

pop2 = pd.read_csv(cleaned_files[3]).rename(columns=pop_column_renames)
pop2.to_csv(pop_targets["nationality"], index=False)

pop3 = pd.read_csv(cleaned_files[4])
pop3.to_csv(pop_targets["ea_gender"], index=False)

pop4 = pd.read_csv(cleaned_files[5])
pop4.to_csv(pop_targets["status_gender"], index=False)

pop5 = pd.read_csv(cleaned_files[12])
pop5.to_csv(pop_targets["religion_gender"], index=False)

pop6 = pd.read_csv(cleaned_files[15])
pop6.to_csv(pop_targets["residence_gender"], index=False)