# Data merging

This notebook merges all datasets from the census and assigns each unit its cluster.

In [None]:
from glob import glob

import geopandas as gpd
import pandas as pd

In [None]:
# Collect every census parquet file whose name contains "nadzsj".
# glob() gives no ordering guarantee, so sort the matches: the order decides
# which table ends up first in the merge and which of several same-named
# columns is kept later on.
files = sorted(glob("/data/uscuni-restricted/04_spatial_census/*nadzsj*.parquet"))

In [None]:
# Leave this one file out of the merge.
# Filtering (instead of list.remove) keeps the cell safe to re-run: it does not
# raise ValueError when the entry has already been removed or was never matched.
excluded_file = (
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_statni_obcanstvi_narodnost_2021.parquet"
)
files = [path for path in files if path != excluded_file]

In [None]:
# Display the list of input files that will be read and merged
files

In [None]:
# Read every input parquet file into its own frame, keeping the order of `files`
dfs = [gpd.read_parquet(path) for path in files]

In [None]:
# Re-set the active geometry of every frame from its own `.geometry`
dfs = [frame.set_geometry(frame.geometry) for frame in dfs]

# Only the first frame keeps its "geometry" column; it is dropped from all
# the others so the column-wise concatenation does not repeat it
for position, frame in enumerate(dfs[1:], start=1):
    dfs[position] = frame.drop(columns=["geometry"])

# Put all tables side by side (pandas aligns the rows on the index)
concat_df = pd.concat(dfs, axis="columns")

# Wrap the combined table as a GeoDataFrame using the first frame's geometry
concat_df = gpd.GeoDataFrame(concat_df, geometry=dfs[0].geometry)

In [None]:
# Concatenate all frames in `dfs` side by side.
# NOTE(review): this rebinds `concat_df`, replacing whatever it held before
# this cell runs — confirm whether this cell is still needed.
concat_df = pd.concat(dfs, axis="columns")

In [None]:
# Keep only the first occurrence of each column name
duplicate_mask = concat_df.columns.duplicated(keep="first")
merged_df = concat_df.loc[:, ~duplicate_mask]

In [None]:
# Write the merged table to a parquet file next to the input files
output_path = "/data/uscuni-restricted/04_spatial_census/_merged_census_2021.parquet"
merged_df.to_parquet(output_path)

In [None]:
# List the name of every column in the merged table, one per line
for column_name in merged_df.columns.tolist():
    print(column_name)