# Census data merging

This notebook merges the preprocessed and cleaned census data from CSU with the ZSJ polygons and saves one Parquet file with geometries per input table.

In [None]:
from glob import glob
import pandas as pd
import geopandas as gpd
from pathlib import Path

In [None]:
# Load the ZSJ polygon layer straight from the zipped archive
polygons_path = (
    "/data/uscuni-restricted/Data_SLDB_2021/zsj_sldb_2021_-7697413790081074647.zip"
)
polygons = gpd.read_file(polygons_path)
polygons.head(2)

In [None]:
# Trim the polygon layer down to what the merge needs:
# keep the columns at positions 0, 1 and -1 and rename the ZSJ code column
# to "kod_zsj" so it can be used as the join key.
polygons = polygons.iloc[:, [0, 1, -1]].rename(columns={"KOD_ZSJ_P": "kod_zsj"})
# Flatten the geometries to two dimensions
flat_geometry = polygons.geometry.force_2d()
polygons["geometry"] = flat_geometry
polygons.head(2)

In [None]:
# Load the spreadsheet that maps zsj codes to nadzsj codes
converter_path = "/data/uscuni-restricted/Data_SLDB_2021/zsj_data/ZSJD_slouceneZSJD/sloucene/_prevodnik_zsjd_nadzsjd.xlsx"
converter = pd.read_excel(converter_path)

In [None]:
# Turn the raw spreadsheet into a lookup table ready for merging
# The row at position 2 supplies the column names
header_row = converter.iloc[2]
converter.columns = header_row
# Skip the first four rows and the first 18 columns in a single slice
converter = converter.iloc[4:, 18:]
# Store the nadzsj code as an integer
converter = converter.assign(kod_nadzsj_d=converter["kod_nadzsj_d"].astype(int))
converter.head(5)

In [None]:
# Collect the paths of all processed census tables (CSV)
census_pattern = "/data/uscuni-restricted/ready_census/*.csv"
files = glob(census_pattern)

In [None]:
# Process all files: attach the zsj codes and polygons to every census table,
# dissolve the polygons by "kod_nadzsj_d" and save one parquet file per table.

# The lookup columns are identical for every file, so slice them only once.
zsj_lookup = converter[["kod_zsj", "kod_nadzsj_d"]]

# Create the destination up front; writing fails if the folder is missing.
output_dir = Path("/data/uscuni-restricted/geometries")
output_dir.mkdir(parents=True, exist_ok=True)

# Sorted so that the processing order is reproducible between runs.
for file in sorted(files):
    # Read path of the files
    path = Path(file)
    # Open data
    data = pd.read_csv(path)
    # Attach every zsj code that belongs to the table's "nadzsjd" code
    data_c = pd.merge(
        data,
        zsj_lookup,
        left_on="nadzsjd",
        right_on="kod_nadzsj_d",
        how="left",
    )
    # The left merge keeps rows without a match, but dissolve() below drops
    # groups whose key is missing by default - report them instead of losing
    # them silently.
    n_unmatched = int(data_c["kod_nadzsj_d"].isna().sum())
    if n_unmatched:
        print(
            f"{path.name}: {n_unmatched} row(s) have no match in the converter "
            "and will be dropped when dissolving"
        )
    # Attach the polygon of every zsj
    data_df = pd.merge(data_c, polygons, on="kod_zsj", how="left")
    # Convert to gdf
    data_gdf = gpd.GeoDataFrame(data_df, geometry="geometry", crs=polygons.crs)
    # Dissolve the merged zsj: one geometry per "kod_nadzsj_d", attributes
    # taken from the first row of each group
    data_dissolved = data_gdf.dissolve(
        by="kod_nadzsj_d", aggfunc="first", as_index=False
    )
    # Save the new dataset under the same name as the input table
    data_dissolved.to_parquet(output_dir / f"{path.stem}.parquet", index=False)