# Census data cleaning

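This notebook removes unnecessary columns from the original census files, saves the cleaned copies, and then combines them into thematic datasets (households, housing, education, employed population, and population).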

In [None]:
from glob import glob
import pandas as pd
from pathlib import Path

In [None]:
# Collect every entry in the preprocessed census folder
files = glob("/data/uscuni-restricted/preprocessed_census/*")

# Load the column catalogue, indexed by "File/Column Name"
names = pd.read_csv(
    "/data/uscuni-restricted/preprocessed_census/_col_names.csv"
).set_index("File/Column Name")

# Catalogue rows flagged with use == 0 name the columns to discard
drop = names[names["use"] == 0]

# Only the index (the column names) is used below, so the remaining
# catalogue columns are removed
drop = drop.drop(columns=["use", "name"])

# DataFrame.to_csv does not create missing folders, so make sure the output
# folder exists before the first file is written
cleaned_dir = Path("/data/uscuni-restricted/cleaned_preprocessed_census")
cleaned_dir.mkdir(parents=True, exist_ok=True)

# Process all files
for file in files:
    path = Path(file)
    # Entries whose name starts with "_" (e.g. _col_names.csv) are not data
    if path.stem.startswith("_"):
        continue
    data = pd.read_csv(path)
    # errors="ignore": labels that do not occur in this particular file are
    # skipped instead of raising KeyError
    data_cleaned = data.drop(columns=drop.index, errors="ignore")
    # Save the cleaned copy as <original stem>_cleaned.csv
    data_cleaned.to_csv(cleaned_dir / f"{path.stem}_cleaned.csv", index=False)

In [None]:
# Load cleaned files
# NOTE(review): the cells below pick their inputs by position in this list
# (e.g. cleaned_files[11]). glob() does not promise any particular ordering,
# so those positions are only valid while the folder listing stays exactly
# the same — consider selecting the files by name instead.
cleaned_files = glob("/data/uscuni-restricted/cleaned_preprocessed_census/*")

In [None]:
# Key columns used as the join keys (`on=common_columns`) by the merge cells.
# Written as groups (presumably a code column followed by its name column for
# each territorial level — confirm) and flattened in the same order.
_key_groups = (
    ("NUTS_2", "naz_oblast"),
    ("NUTS_3", "naz_kraj"),
    ("kod_okres", "naz_okres"),
    ("kod_orp", "naz_orp"),
    ("kod_obec", "naz_obec"),
    ("kod_mco", "nazev_mco"),
    ("nadzsjd",),
)
common_columns = [column for group in _key_groups for column in group]

In [None]:
# Load the households table and save it to the ready folder
households = pd.read_csv(cleaned_files[11])

# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing
ready_dir = Path("/data/uscuni-restricted/ready_census")
ready_dir.mkdir(parents=True, exist_ok=True)
households.to_csv(ready_dir / "nadzsjd_households_2021.csv", index=False)

In [None]:
# Read the three cleaned tables that make up the housing dataset.
# NOTE(review): position 1 is also read as pop1 in the population cell —
# confirm that the same file really belongs to both datasets.
housing1, housing2, housing3 = (
    pd.read_csv(cleaned_files[position]) for position in (8, 6, 1)
)

In [None]:
# Combine the three housing tables on the shared key columns. Outer joins
# keep rows whose key combination appears in only some of the tables.
outer_on_keys = {"on": common_columns, "how": "outer"}
housing4 = housing1.merge(housing2, **outer_on_keys)
housing5 = housing3.merge(housing4, **outer_on_keys)

In [None]:
# Save merged housing data.
# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing
ready_dir = Path("/data/uscuni-restricted/ready_census")
ready_dir.mkdir(parents=True, exist_ok=True)
housing5.to_csv(ready_dir / "nadzsjd_housing_2021.csv", index=False)

In [None]:
# Load the education table and save it to the ready folder
education = pd.read_csv(cleaned_files[14])

# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing
ready_dir = Path("/data/uscuni-restricted/ready_census")
ready_dir.mkdir(parents=True, exist_ok=True)
education.to_csv(ready_dir / "nadzsjd_education_2021.csv", index=False)

In [None]:
# Read the three cleaned tables that make up the employed-population dataset
emp1, emp2, emp3 = (
    pd.read_csv(cleaned_files[position]) for position in (16, 2, 7)
)

In [None]:
# Bring emp1's header names in line with the shared key columns so that it
# can be merged with the other employment tables
emp1_header_map = {
    "nuts_2_regs": "NUTS_2",
    "nazev_regionu_soudrznosti": "naz_oblast",
    "nuts3_kraj": "NUTS_3",
    "nazev_kraje": "naz_kraj",
    "nazev_okresu": "naz_okres",
    "nazev_orp": "naz_orp",
    "nazev_obec": "naz_obec",
}
emp1 = emp1.rename(emp1_header_map, axis="columns")

In [None]:
# Combine the three employment tables on the shared key columns. Outer joins
# keep rows whose key combination appears in only some of the tables.
outer_on_keys = {"on": common_columns, "how": "outer"}
emp4 = emp1.merge(emp2, **outer_on_keys)
emp5 = emp3.merge(emp4, **outer_on_keys)

In [None]:
# Save merged employment data.
# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing
ready_dir = Path("/data/uscuni-restricted/ready_census")
ready_dir.mkdir(parents=True, exist_ok=True)
emp5.to_csv(ready_dir / "nadzsjd_employed_2021.csv", index=False)

In [None]:
# Read the six cleaned tables that make up the population dataset.
# NOTE(review): position 1 is also read as housing3 in the housing cell —
# confirm that the same file really belongs to both datasets.
pop1, pop2, pop3, pop4, pop5, pop6 = (
    pd.read_csv(cleaned_files[position]) for position in (1, 3, 4, 5, 12, 15)
)

In [None]:
# Bring pop2's header names in line with the shared key columns so that it
# can be merged with the other population tables
pop2_header_map = {
    "nuts_2_regs": "NUTS_2",
    "nazev_regionu_soudrznosti": "naz_oblast",
    "nuts3_kraj": "NUTS_3",
    "nazev_kraje": "naz_kraj",
    "nazev_okresu": "naz_okres",
    "nazev_orp": "naz_orp",
    "nazev_obec": "naz_obec",
}
pop2 = pop2.rename(pop2_header_map, axis="columns")

In [None]:
# Combine the six population tables on the shared key columns: first in
# pairs, then the pairs with each other. Outer joins keep rows whose key
# combination appears in only some of the tables.
outer_on_keys = {"on": common_columns, "how": "outer"}
pop7 = pop1.merge(pop2, **outer_on_keys)
pop8 = pop3.merge(pop4, **outer_on_keys)
pop9 = pop5.merge(pop6, **outer_on_keys)
pop10 = pop7.merge(pop8, **outer_on_keys)
pop11 = pop9.merge(pop10, **outer_on_keys)

In [None]:
# Save merged population data.
# DataFrame.to_csv does not create missing folders, so make sure the target
# folder exists before writing
ready_dir = Path("/data/uscuni-restricted/ready_census")
ready_dir.mkdir(parents=True, exist_ok=True)
pop11.to_csv(ready_dir / "nadzsjd_population_2021.csv", index=False)