# Extract Prague data

This notebook extracts geometries for the Prague metropolitan area and creates census files with geometries.

In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import pandas as pd

In [None]:
# Load data
# Two inputs are read here: the "PMO ITI" layer, used below as the Prague
# metropolitan area boundary, and the parquet file of unit geometries.
pmo_iti_path = "/data/uscuni-restricted/PMO ITI/"
unit_geometries_path = "/data/uscuni-restricted/geometries/nadzsj_d.parquet"

prague = gpd.read_file(pmo_iti_path)
geom = gpd.read_parquet(unit_geometries_path)

In [None]:
# Create one polygon
# union_all() merges every geometry of the loaded layer into a single geometry.
# Note that `prague` is rebound here: from this point on it is that single
# geometry, not the table that was read from disk.
prague = prague.union_all()

In [None]:
# Keep only the units whose centroid falls inside the Prague metropolitan area.
# The spatial index is built on the centroids, and querying it with
# predicate="contains" returns the integer positions of the centroids that the
# `prague` geometry contains; those positions are then used to pick rows of `geom`.
unit_centroids = geom.centroid
inside_positions = unit_centroids.sindex.query(prague, predicate="contains")
prague_geom = geom.iloc[inside_positions]

## Merge Prague metropolitan area geometries with census data

In [None]:
# List every entry in the 03_ready_census folder; each one is read as a CSV below.
# glob() returns plain string paths in no guaranteed order.
files = glob("/data/uscuni-restricted/03_ready_census/*")

In [None]:
# Process all files
# For every census file: read it, attach the geometry of the matching Prague
# unit, drop the rows that have no geometry, and write the result as parquet.

# The geometry-only view of the Prague units is the same for every file,
# so it is built once here instead of on each iteration.
prague_geometry = prague_geom[["geometry"]]

# Make sure the output folder exists; otherwise the first write would fail.
output_dir = Path("/data/uscuni-restricted/prague")
output_dir.mkdir(parents=True, exist_ok=True)

for file in files:
    # Read path of the files
    path = Path(file)
    # Open data, reading the "nadzsjd" column as strings
    # (presumably so it matches the index of prague_geom - confirm)
    data = pd.read_csv(path, dtype={"nadzsjd": str})
    # Strip leading spaces from the column names
    data.columns = data.columns.str.lstrip(" ")
    # Merge data: match "nadzsjd" against the index of the Prague geometries.
    # The left join keeps every census row; rows without a matching unit
    # end up with a missing geometry and are removed below.
    data_df = pd.merge(
        data,
        prague_geometry,
        left_on="nadzsjd",
        right_index=True,
        how="left",
    )
    # Convert to gdf, reusing the CRS of the source geometries
    data_gdf = gpd.GeoDataFrame(data_df, geometry="geometry", crs=prague_geom.crs)
    # Keep only the rows that received a geometry
    data_gdf = data_gdf.dropna(subset="geometry")

    # Save the new dataset, named after the input file, without the index
    data_gdf.to_parquet(output_dir / f"prague_{path.stem}.parquet", index=False)