# 03 Census data merging

This notebook merges the preprocessed and cleaned census data from CSU with their polygon geometries and saves the result as Parquet files.

In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import pandas as pd

In [None]:
# Load the polygon layer from the 2021 census (SLDB) shapefile folder.
polygons = gpd.read_file(
    "/data/uscuni-restricted/Data_SLDB_2021/d_zsj_010122",
    driver="ESRI Shapefile",
)
# Preview the first two rows to inspect the available columns.
polygons.head(2)

In [None]:
# Prepare the polygons for merging: reduce every geometry to 2D coordinates.
# All attribute columns are kept as they are.
polygons["geometry"] = polygons["geometry"].force_2d()
polygons.head(2)

In [None]:
# Load the lookup table used to translate zsj codes into nadzsj codes.
converter = pd.read_excel(
    io="/data/uscuni-restricted/Data_SLDB_2021/zsj_data/ZSJD_slouceneZSJD/sloucene/_prevodnik_zsjd_nadzsjd.xlsx"
)

In [None]:
# Tidy the raw sheet for merging:
#   * the row at position 2 holds the column names,
#   * the first four rows are dropped,
#   * only the columns from position 18 onwards are kept.
# The codes keep whatever dtype `read_excel` produced (no integer cast).
# NOTE: this cell rebinds `converter`, so re-run the read cell before
# executing it a second time.
converter = converter.set_axis(converter.iloc[2], axis="columns").iloc[4:, 18:]
converter.head(5)

In [None]:
# Lookup Series: index = zsj code, value = nadzsj code.
mapping = converter.set_index("kod_zsj_d").loc[:, "kod_nadzsj_d"]

In [None]:
# Peek at the first five zsj -> nadzsj pairs.
mapping.head(5)

In [None]:
# Translate each polygon's zsj code into its nadzsj code via the lookup;
# codes missing from the lookup become NaN.
polygons["kod_nadzsj_d"] = polygons.loc[:, "KOD_ZSJ_D"].map(mapping)

In [None]:
# Number of polygons that did not receive a nadzsj code.
polygons["kod_nadzsj_d"].isnull().sum()

In [None]:
# Show the polygons that are still without a nadzsj code.
polygons.loc[polygons["kod_nadzsj_d"].isna()]

In [None]:
# Manually assign the nadzsj code to the polygon with index label 158.
# NOTE(review): presumably this is the row listed by the missing-value check
# in the previous cell. The label 158 depends on the row order of the source
# shapefile, so confirm it still points at the intended polygon whenever the
# input data changes.
polygons.loc[158, "kod_nadzsj_d"] = "53806003"

In [None]:
# Merge all polygons sharing a nadzsj code into one geometry per code;
# the code becomes the index of the resulting GeoDataFrame.
nadzjs_polygons = polygons.dissolve(by="kod_nadzsj_d")

In [None]:
# Plot the dissolved polygons as a quick visual check of the result.
nadzjs_polygons.plot()

In [None]:
# Persist the dissolved nadzsj geometries (index = nadzsj code) as Parquet.
nadzjs_polygons.to_parquet(
    path="/data/uscuni-restricted/geometries/nadzsj_d.parquet"
)

In [None]:
# Show (rows, columns) of the dissolved GeoDataFrame.
nadzjs_polygons.shape

## Link geometries to data files

In [None]:
# Collect every prepared census table. `glob` returns entries in arbitrary
# filesystem order, so sort them to keep the listing and the processing order
# reproducible between runs and machines.
files = sorted(glob("/data/uscuni-restricted/03_ready_census/*"))
files

In [None]:
# Attach the nadzsj geometries to every prepared census table and store each
# result as a Parquet file named after its source file.
output_dir = Path("/data/uscuni-restricted/04_spatial_census")
# Create the target folder up front so `to_parquet` does not fail when the
# folder does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    path = Path(file)

    # Read the table; the join key is forced to string so it is not parsed as
    # a number.
    data = pd.read_csv(path, dtype={"nadzsjd": str})
    # Column names may carry leading spaces; remove them.
    data.columns = data.columns.str.lstrip(" ")

    # Left join keeps every census row; rows whose code has no polygon get a
    # missing geometry.
    data_df = pd.merge(
        data,
        nadzjs_polygons[["geometry"]],
        left_on="nadzsjd",
        right_index=True,
        how="left",
    )

    # Convert to a GeoDataFrame using the CRS of the source polygons.
    data_gdf = gpd.GeoDataFrame(data_df, geometry="geometry", crs=polygons.crs)

    # Report how many rows are about to be discarded instead of dropping them
    # silently.
    n_unmatched = int(data_gdf["geometry"].isna().sum())
    if n_unmatched:
        print(
            f"{path.name}: dropping {n_unmatched} of {len(data_gdf)} rows "
            "without a matching polygon"
        )
    data_gdf = data_gdf.dropna(subset="geometry")

    # Save the new dataset
    data_gdf.to_parquet(output_dir / f"{path.stem}.parquet", index=False)