In [12]:
import numpy as np
import pandas as pd
from shapely import wkt
import geopandas as gpd
from pathlib import Path
import matplotlib.pyplot as plt
from shapely import wkt
from shapely.geometry import Polygon
import json
from tqdm import tqdm
import os

In [None]:
RAW_DIR = "../../../data/tm/1-raw/"
PROCESSED_DIR = "../../../data/tm/2-processed/"

# Crop buildings from quadkey files

This notebook does the following steps: 

(1) determines which batch files contain project-included cities (link [here](https://sites.research.google/open-buildings/))

(2) extracts buildings from parts intersecting with city boundaries

(3) combines all city building extracts to a single file.

For Google Open Bldgs:

1. 339_buildings.csv.gz 
Navotas City
Mandaluyong City
Muntinlupa City
Dagupan City
Palayan City


2. 33b_buildings.csv.gz 
Legazpi City
Iloilo City
Mandaue City

3. 331_buildings.csv.gz 
Tacloban City

4. 325_buildings.csv.gz 
Zamboanga City

5. 32f_buildings.csv.gz 
Cagayan de Oro City
Davao City

In [None]:
filepath = f"{RAW_DIR}google-open-buildings/raw/"
fn_list = os.listdir(filepath)
len(fn_list)

## 1. Extract buildings from parts intersecting with city boundaries


In [46]:
# Read cities bounds
bounds_gdf = gpd.read_file(
    "../../../data/1-admin-bounds/target_admin_bounds.shp"
).set_crs(4326)

Unnamed: 0,ADM1_EN,ADM1_PCODE,ADM2_EN,ADM2_PCODE,ADM3_EN,ADM3_PCODE,ADM4_EN,ADM4_PCODE,geometry
0,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Lomboy,PH015518016,"POLYGON ((120.32742 16.05423, 120.32719 16.053..."
1,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Tapuac,PH015518031,"POLYGON ((120.33380 16.03974, 120.33389 16.039..."
2,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Pantal,PH015518022,"POLYGON ((120.34737 16.06009, 120.34761 16.060..."
3,Region I,PH010000000,Pangasinan,PH015500000,Dagupan City,PH015518000,Barangay I (T. Bugallon),PH015518024,"POLYGON ((120.34054 16.04489, 120.34054 16.044..."
4,Region III,PH030000000,Nueva Ecija,PH034900000,Palayan City,PH034919000,Imelda Valley,PH034919017,"POLYGON ((121.12250 15.58028, 121.12687 15.579..."


In [44]:
# Loop over all quadkey files, append and save part of batches within a city
google_gdf_list = []
for fn in tqdm(fn_list):
    google_gdf = gpd.read_file(f"{filepath}{fn}")
    google_gdf["geometry"] = google_gdf["geometry"].apply(wkt.loads)
    google_gdf = google_gdf.set_geometry("geometry")
    google_gdf = google_gdf.set_crs(4326)

    bldgs_gdf = gpd.sjoin(google_gdf, bounds_gdf, predicate="intersects")
    bldgs_gdf = bldgs_gdf[
        ["ADM3_EN", "ADM4_PCODE", "area_in_meters", "confidence", "geometry"]
    ]

    if len(bldgs_gdf) > 0:
        print(f"Quadkey file {fn} building count: {len(bldgs_gdf)}")
        google_gdf_list.append(bldgs_gdf)

Unnamed: 0,latitude,longitude,area_in_meters,confidence,geometry,full_plus_code
0,6.174536,125.151687,86.3767,0.8381,"POLYGON((125.151728732943 6.17449456203432, 12...",6QR755F2+RM7H
1,6.637727,125.340554,105.2245,0.8311,"POLYGON((125.340603340237 6.63768720794224, 12...",6QR7J8QR+36V4
2,7.852184,125.053792,65.4915,0.7008,"POLYGON((125.053839696371 7.85220414855485, 12...",6QV7V323+VGF7
3,7.107919,125.627632,28.9986,0.7041,"POLYGON((125.627670623458 7.10792916199382, 12...",6QV74J5H+536J
4,6.944953,124.87706,79.0381,0.7258,"POLYGON((124.877107790642 6.94497167814368, 12...",6QR6WVVG+XRM5


## 2. Combine all city building extracts to a single file

In [None]:
all_google_gdf = pd.concat(google_gdf_list)
all_google_gdf.info()

In [48]:
city_names = bldgs_gdf["ADM3_EN"].unique()
short_city_names = [
    "".join([s.lower() for s in c.split(" ") if len(s) > 4]) for c in city_names
]
short_city_names = ["cdo" if s == "cagayan" else s for s in short_city_names]
city_names, short_city_names

(array(['Davao City', 'Cagayan de Oro City'], dtype=object), ['davao', 'cdo'])

## 3. Save

In [37]:
for short_city_name, city_name in tqdm(zip(short_city_names, city_names)):
    save_gdf = bldgs_gdf[bldgs_gdf["ADM3_EN"] == city_name]
    print(f"Number of bldgs for {short_city_name}: {len(save_gdf)}")
    save_gdf.to_file(
        f"../../../data/tm/2-processed/google-open-buildings/google_lacuna_cities_bldgs_{short_city_name}.csv",
        driver="GeoJSON",
        index=False,
    )

0it [00:00, ?it/s]

Number of bldgs for muntinlupa: 104383


1it [00:13, 13.30s/it]

Number of bldgs for palayan: 18772


2it [00:15,  6.97s/it]

Number of bldgs for mandaluyong: 33089


3it [00:19,  5.66s/it]

Number of bldgs for navotas: 29995


4it [00:23,  4.93s/it]

Number of bldgs for dagupan: 63044


5it [00:31,  6.28s/it]
