In [2]:
import numpy as np
import pandas as pd
from shapely import wkt
import geopandas as gpd
from pathlib import Path
import matplotlib.pyplot as plt
from shapely import wkt
from shapely.geometry import Polygon
import json
from tqdm import tqdm
import os

In [None]:
# Root folders used by this notebook, given relative to the notebook itself.
# Both values keep their trailing slash because later cells build sub-paths
# from them with plain f-string concatenation.
PROCESSED_DIR = "../../../data/tm/2-processed/"  # destination of the per-city extracts
RAW_DIR = "../../../data/tm/1-raw/"  # location of the downloaded quadkey files

# Crop buildings from quadkey files

This notebook does the following steps: 

(1) determines which quadkey files contain project-included cities (see link [here](https://minedbuildings.blob.core.windows.net/global-buildings/dataset-links.csv), or [here](https://github.com/microsoft/GlobalMLBuildingFootprints))

(2) extracts buildings from parts intersecting with city boundaries

(3) combines all city building extracts into a single GeoDataFrame and saves one GeoJSON file per city.

## 1. Check which quadkey files contain project cities

In [4]:
# Folder that holds the downloaded quadkey tiles (one GeoJSON file per quadkey).
filepath = f"{RAW_DIR}ms-open-buildings/raw/"
# os.listdir returns every entry of the folder in arbitrary order, so keep only
# the GeoJSON tiles and sort them: the extraction loop then runs in a
# reproducible order and never tries to read a stray entry such as `.DS_Store`
# or `.ipynb_checkpoints`.
fn_list = sorted(
    fn for fn in os.listdir(filepath) if fn.lower().endswith(".geojson")
)
# Number of quadkey files that will be scanned.
len(fn_list)

144

In [7]:
# Read cities bounds
bounds_gdf = gpd.read_file("../../../data/1-admin-bounds/target_admin_bounds.shp")
# Bring the bounds to EPSG:4326, the CRS the building tiles are set to below.
# `set_crs` only *labels* coordinates and raises if the file already declares a
# different CRS, so it is used only when no CRS is present; otherwise the
# geometries are reprojected (a no-op when the file is already in EPSG:4326).
if bounds_gdf.crs is None:
    bounds_gdf = bounds_gdf.set_crs(4326)
else:
    bounds_gdf = bounds_gdf.to_crs(4326)

## 2. Extract buildings from parts intersecting with city boundaries


In [9]:
# Scan every quadkey file and collect the buildings that intersect the target
# admin bounds; files without any match contribute nothing to the list.
# NOTE(review): bounds_gdf carries an ADM4_PCODE column, so it presumably holds
# several polygons per city. With predicate="intersects", a building touching
# more than one polygon would come back once per polygon — confirm whether such
# duplicates are intended downstream.
ms_gdf_list = []
for fn in tqdm(fn_list):
    # Load the tile, make sure "geometry" is the active geometry column and
    # that the frame is labelled EPSG:4326 before joining.
    ms_gdf = (
        gpd.read_file(f"{filepath}{fn}")
        .set_geometry("geometry")
        .set_crs(4326)
    )

    # Spatial join against the bounds, keeping only the city name, the ADM4
    # code and the building footprint.
    bldgs_gdf = gpd.sjoin(ms_gdf, bounds_gdf, predicate="intersects")[
        ["ADM3_EN", "ADM4_PCODE", "geometry"]
    ]

    # Nothing from this tile falls on a target area: skip it.
    if bldgs_gdf.empty:
        continue

    print(f"Quadkey file {fn} building count: {len(bldgs_gdf)}")
    ms_gdf_list.append(bldgs_gdf)

  0%|          | 0/144 [00:00<?, ?it/s]

  1%|          | 1/144 [00:00<00:18,  7.68it/s]

Quadkey file 132301122.geojson building count: 0
Quadkey file 132301123.geojson building count: 0


  2%|▏         | 3/144 [00:19<16:38,  7.08s/it]

Quadkey file 132301213.geojson building count: 0


  3%|▎         | 4/144 [00:49<35:09, 15.07s/it]

Quadkey file 132301231.geojson building count: 0


  3%|▎         | 5/144 [01:14<42:37, 18.40s/it]

Quadkey file 132301233.geojson building count: 0


  4%|▍         | 6/144 [01:14<29:05, 12.65s/it]

Quadkey file 132301300.geojson building count: 0
Quadkey file 132301301.geojson building count: 0


  6%|▌         | 8/144 [01:25<21:12,  9.35s/it]

Quadkey file 132301302.geojson building count: 0


  6%|▋         | 9/144 [01:33<20:03,  8.91s/it]

Quadkey file 132301303.geojson building count: 0


  7%|▋         | 10/144 [01:47<23:26, 10.49s/it]

Quadkey file 132301320.geojson building count: 0


  8%|▊         | 11/144 [02:04<27:39, 12.48s/it]

Quadkey file 132301321.geojson building count: 0


  8%|▊         | 12/144 [02:27<33:50, 15.39s/it]

Quadkey file 132301322.geojson building count: 0


  9%|▉         | 13/144 [02:52<40:33, 18.58s/it]

Quadkey file 132301323.geojson building count: 0


 10%|▉         | 14/144 [02:53<28:17, 13.06s/it]

Quadkey file 132301330.geojson building count: 0


 10%|█         | 15/144 [02:53<19:57,  9.28s/it]

Quadkey file 132301332.geojson building count: 0


 11%|█         | 16/144 [03:06<21:54, 10.27s/it]

Quadkey file 132303010.geojson building count: 0


 12%|█▏        | 17/144 [04:14<58:40, 27.72s/it]

Quadkey file 132303011.geojson building count: 25652


 12%|█▎        | 18/144 [04:24<46:53, 22.33s/it]

Quadkey file 132303012.geojson building count: 0


 13%|█▎        | 19/144 [06:02<1:34:02, 45.14s/it]

Quadkey file 132303013.geojson building count: 0


 14%|█▍        | 20/144 [06:10<1:10:21, 34.04s/it]

Quadkey file 132303030.geojson building count: 0


 15%|█▍        | 21/144 [07:50<1:50:13, 53.77s/it]

Quadkey file 132303031.geojson building count: 1033


 15%|█▌        | 22/144 [08:35<1:43:38, 50.97s/it]

Quadkey file 132303033.geojson building count: 0


 16%|█▌        | 23/144 [08:57<1:25:31, 42.41s/it]

Quadkey file 132303100.geojson building count: 0


 17%|█▋        | 24/144 [09:05<1:04:22, 32.19s/it]

Quadkey file 132303101.geojson building count: 0


 17%|█▋        | 25/144 [09:37<1:03:22, 31.95s/it]

Quadkey file 132303102.geojson building count: 12358


 18%|█▊        | 26/144 [09:37<44:08, 22.45s/it]  

Quadkey file 132303103.geojson building count: 0


 19%|█▉        | 27/144 [11:09<1:24:25, 43.29s/it]

Quadkey file 132303120.geojson building count: 5470


 19%|█▉        | 28/144 [11:12<1:00:15, 31.17s/it]

Quadkey file 132303121.geojson building count: 0


 20%|██        | 29/144 [13:41<2:07:45, 66.66s/it]

Quadkey file 132303122.geojson building count: 51909


 22%|██▏       | 31/144 [13:48<1:04:16, 34.13s/it]

Quadkey file 132303123.geojson building count: 0
Quadkey file 132303130.geojson building count: 0


 22%|██▏       | 32/144 [13:59<50:38, 27.13s/it]  

Quadkey file 132303132.geojson building count: 0


 23%|██▎       | 33/144 [14:00<35:38, 19.26s/it]

Quadkey file 132303133.geojson building count: 0


 24%|██▎       | 34/144 [14:01<25:02, 13.65s/it]

Quadkey file 132303210.geojson building count: 0


 24%|██▍       | 35/144 [14:08<21:27, 11.81s/it]

Quadkey file 132303211.geojson building count: 0


 25%|██▌       | 36/144 [14:12<17:01,  9.45s/it]

Quadkey file 132303213.geojson building count: 0


 26%|██▌       | 37/144 [14:13<12:06,  6.79s/it]

Quadkey file 132303223.geojson building count: 0


 26%|██▋       | 38/144 [14:14<09:14,  5.23s/it]

Quadkey file 132303230.geojson building count: 0


 27%|██▋       | 39/144 [14:15<06:36,  3.78s/it]

Quadkey file 132303231.geojson building count: 0


 28%|██▊       | 40/144 [14:15<04:53,  2.82s/it]

Quadkey file 132303232.geojson building count: 0


 28%|██▊       | 41/144 [14:15<03:31,  2.06s/it]

Quadkey file 132303233.geojson building count: 0


 29%|██▉       | 42/144 [15:01<25:49, 15.19s/it]

Quadkey file 132303300.geojson building count: 0


 30%|██▉       | 43/144 [15:12<23:11, 13.78s/it]

Quadkey file 132303301.geojson building count: 0


 31%|███       | 44/144 [15:23<21:56, 13.16s/it]

Quadkey file 132303302.geojson building count: 0


 31%|███▏      | 45/144 [15:25<15:56,  9.66s/it]

Quadkey file 132303303.geojson building count: 0


 32%|███▏      | 46/144 [15:35<16:00,  9.80s/it]

Quadkey file 132303310.geojson building count: 0


 33%|███▎      | 47/144 [16:14<30:02, 18.58s/it]

Quadkey file 132303311.geojson building count: 0


 33%|███▎      | 48/144 [16:16<21:28, 13.42s/it]

Quadkey file 132303312.geojson building count: 0


 34%|███▍      | 49/144 [16:30<21:52, 13.81s/it]

Quadkey file 132303313.geojson building count: 19689


 35%|███▍      | 50/144 [16:37<18:22, 11.73s/it]

Quadkey file 132303320.geojson building count: 0


 35%|███▌      | 51/144 [16:42<15:00,  9.68s/it]

Quadkey file 132303321.geojson building count: 0


 36%|███▌      | 52/144 [16:42<10:31,  6.86s/it]

Quadkey file 132303322.geojson building count: 0


 37%|███▋      | 53/144 [16:51<11:18,  7.45s/it]

Quadkey file 132303323.geojson building count: 0


 38%|███▊      | 54/144 [16:52<08:22,  5.58s/it]

Quadkey file 132303330.geojson building count: 0


 38%|███▊      | 55/144 [17:00<09:06,  6.14s/it]

Quadkey file 132303331.geojson building count: 0


 39%|███▉      | 56/144 [17:23<16:36, 11.32s/it]

Quadkey file 132303332.geojson building count: 0


 40%|███▉      | 57/144 [17:29<14:03,  9.70s/it]

Quadkey file 132303333.geojson building count: 0


 40%|████      | 58/144 [17:30<10:07,  7.06s/it]

Quadkey file 132312022.geojson building count: 0


 41%|████      | 59/144 [17:37<10:05,  7.12s/it]

Quadkey file 132312200.geojson building count: 0


 42%|████▏     | 60/144 [17:51<12:42,  9.07s/it]

Quadkey file 132312202.geojson building count: 11733


 42%|████▏     | 61/144 [17:52<09:15,  6.69s/it]

Quadkey file 132312203.geojson building count: 0
Quadkey file 132312212.geojson building count: 0


 44%|████▍     | 63/144 [18:00<07:20,  5.44s/it]

Quadkey file 132312220.geojson building count: 0


 44%|████▍     | 64/144 [18:11<09:02,  6.78s/it]

Quadkey file 132312221.geojson building count: 0


 45%|████▌     | 65/144 [18:20<09:34,  7.28s/it]

Quadkey file 132312222.geojson building count: 0


 46%|████▌     | 66/144 [18:40<14:17, 11.00s/it]

Quadkey file 132312223.geojson building count: 34343


 47%|████▋     | 67/144 [18:45<11:36,  9.05s/it]

Quadkey file 132312230.geojson building count: 0


 47%|████▋     | 68/144 [18:49<09:42,  7.67s/it]

Quadkey file 132312232.geojson building count: 0


 48%|████▊     | 69/144 [18:50<07:21,  5.89s/it]

Quadkey file 132320131.geojson building count: 0


 49%|████▊     | 70/144 [18:51<05:19,  4.31s/it]

Quadkey file 132320132.geojson building count: 0


 49%|████▉     | 71/144 [18:56<05:25,  4.46s/it]

Quadkey file 132320133.geojson building count: 0


 50%|█████     | 72/144 [18:56<03:58,  3.31s/it]

Quadkey file 132320310.geojson building count: 0
Quadkey file 132320333.geojson building count: 0


 51%|█████▏    | 74/144 [18:58<02:40,  2.29s/it]

Quadkey file 132321001.geojson building count: 0


 52%|█████▏    | 75/144 [19:00<02:27,  2.13s/it]

Quadkey file 132321002.geojson building count: 0


 53%|█████▎    | 76/144 [19:02<02:30,  2.21s/it]

Quadkey file 132321003.geojson building count: 0


 53%|█████▎    | 77/144 [19:03<02:07,  1.90s/it]

Quadkey file 132321010.geojson building count: 0


 54%|█████▍    | 78/144 [19:04<01:36,  1.46s/it]

Quadkey file 132321012.geojson building count: 0


 55%|█████▍    | 79/144 [19:11<03:21,  3.10s/it]

Quadkey file 132321020.geojson building count: 0


 56%|█████▌    | 80/144 [19:12<02:38,  2.47s/it]

Quadkey file 132321100.geojson building count: 0


 56%|█████▋    | 81/144 [19:23<05:21,  5.10s/it]

Quadkey file 132321101.geojson building count: 0


 57%|█████▋    | 82/144 [19:24<03:57,  3.83s/it]

Quadkey file 132321103.geojson building count: 0


 58%|█████▊    | 83/144 [20:11<16:45, 16.48s/it]

Quadkey file 132321110.geojson building count: 54522


 58%|█████▊    | 84/144 [20:25<15:53, 15.89s/it]

Quadkey file 132321111.geojson building count: 0


 59%|█████▉    | 85/144 [20:39<15:02, 15.29s/it]

Quadkey file 132321112.geojson building count: 0


 60%|█████▉    | 86/144 [21:07<18:26, 19.08s/it]

Quadkey file 132321113.geojson building count: 0


 60%|██████    | 87/144 [21:08<12:49, 13.51s/it]

Quadkey file 132321120.geojson building count: 0


 61%|██████    | 88/144 [21:17<11:22, 12.19s/it]

Quadkey file 132321130.geojson building count: 0


 62%|██████▏   | 89/144 [21:35<12:48, 13.97s/it]

Quadkey file 132321131.geojson building count: 0


 62%|██████▎   | 90/144 [21:36<09:06, 10.12s/it]

Quadkey file 132321132.geojson building count: 0


 63%|██████▎   | 91/144 [21:48<09:22, 10.61s/it]

Quadkey file 132321133.geojson building count: 0


 64%|██████▍   | 92/144 [21:48<06:32,  7.56s/it]

Quadkey file 132321202.geojson building count: 0


 65%|██████▌   | 94/144 [21:48<03:09,  3.80s/it]

Quadkey file 132321220.geojson building count: 0
Quadkey file 132321222.geojson building count: 0


 66%|██████▌   | 95/144 [21:49<02:12,  2.70s/it]

Quadkey file 132321231.geojson building count: 0


 67%|██████▋   | 96/144 [21:50<01:50,  2.29s/it]

Quadkey file 132321233.geojson building count: 0


 67%|██████▋   | 97/144 [21:51<01:30,  1.92s/it]

Quadkey file 132321301.geojson building count: 0


 68%|██████▊   | 98/144 [21:59<02:47,  3.64s/it]

Quadkey file 132321303.geojson building count: 32396


 69%|██████▉   | 99/144 [22:13<05:11,  6.93s/it]

Quadkey file 132321310.geojson building count: 0


 69%|██████▉   | 100/144 [22:37<08:51, 12.07s/it]

Quadkey file 132321311.geojson building count: 0


 70%|███████   | 101/144 [22:45<07:41, 10.74s/it]

Quadkey file 132321312.geojson building count: 1125


 71%|███████   | 102/144 [22:51<06:33,  9.37s/it]

Quadkey file 132321313.geojson building count: 0


 72%|███████▏  | 103/144 [22:51<04:33,  6.66s/it]

Quadkey file 132321320.geojson building count: 0


 72%|███████▏  | 104/144 [23:15<07:46, 11.66s/it]

Quadkey file 132321321.geojson building count: 111847


 73%|███████▎  | 105/144 [23:23<06:48, 10.48s/it]

Quadkey file 132321322.geojson building count: 0


 74%|███████▎  | 106/144 [23:23<04:44,  7.48s/it]

Quadkey file 132321323.geojson building count: 0


 74%|███████▍  | 107/144 [23:23<03:18,  5.37s/it]

Quadkey file 132323003.geojson building count: 0


 75%|███████▌  | 108/144 [23:25<02:33,  4.28s/it]

Quadkey file 132323010.geojson building count: 0


 76%|███████▌  | 109/144 [23:27<02:03,  3.52s/it]

Quadkey file 132323011.geojson building count: 0


 76%|███████▋  | 110/144 [23:27<01:26,  2.55s/it]

Quadkey file 132323012.geojson building count: 0


 77%|███████▋  | 111/144 [23:46<04:07,  7.49s/it]

Quadkey file 132330000.geojson building count: 0


 78%|███████▊  | 112/144 [24:12<06:56, 13.01s/it]

Quadkey file 132330001.geojson building count: 0


 78%|███████▊  | 113/144 [25:04<12:48, 24.78s/it]

Quadkey file 132330002.geojson building count: 33430


 79%|███████▉  | 114/144 [25:15<10:19, 20.64s/it]

Quadkey file 132330003.geojson building count: 0


 80%|███████▉  | 115/144 [25:19<07:31, 15.56s/it]

Quadkey file 132330010.geojson building count: 0


 81%|████████  | 116/144 [25:24<05:45, 12.33s/it]

Quadkey file 132330012.geojson building count: 0


 81%|████████▏ | 117/144 [25:25<04:04,  9.07s/it]

Quadkey file 132330013.geojson building count: 0


 82%|████████▏ | 118/144 [25:39<04:28, 10.32s/it]

Quadkey file 132330020.geojson building count: 0


 83%|████████▎ | 119/144 [25:41<03:20,  8.01s/it]

Quadkey file 132330021.geojson building count: 0


 83%|████████▎ | 120/144 [25:46<02:51,  7.14s/it]

Quadkey file 132330022.geojson building count: 0


 84%|████████▍ | 121/144 [26:13<04:59, 13.02s/it]

Quadkey file 132330023.geojson building count: 96652


 85%|████████▍ | 122/144 [26:23<04:23, 11.98s/it]

Quadkey file 132330030.geojson building count: 0


 85%|████████▌ | 123/144 [26:27<03:23,  9.68s/it]

Quadkey file 132330031.geojson building count: 0


 86%|████████▌ | 124/144 [26:46<04:08, 12.43s/it]

Quadkey file 132330032.geojson building count: 0


 87%|████████▋ | 125/144 [26:55<03:38, 11.48s/it]

Quadkey file 132330033.geojson building count: 0


 88%|████████▊ | 126/144 [27:26<05:09, 17.21s/it]

Quadkey file 132330200.geojson building count: 0


 88%|████████▊ | 127/144 [27:50<05:28, 19.31s/it]

Quadkey file 132330201.geojson building count: 9440


 89%|████████▉ | 128/144 [28:07<04:57, 18.58s/it]

Quadkey file 132330202.geojson building count: 0


 90%|████████▉ | 129/144 [28:40<05:43, 22.93s/it]

Quadkey file 132330203.geojson building count: 0


 90%|█████████ | 130/144 [28:48<04:18, 18.49s/it]

Quadkey file 132330210.geojson building count: 0


 91%|█████████ | 131/144 [29:02<03:42, 17.08s/it]

Quadkey file 132330211.geojson building count: 0


 92%|█████████▏| 132/144 [30:05<06:11, 30.95s/it]

Quadkey file 132330212.geojson building count: 277159


 92%|█████████▏| 133/144 [30:22<04:53, 26.72s/it]

Quadkey file 132330213.geojson building count: 0


 93%|█████████▎| 134/144 [30:32<03:36, 21.61s/it]

Quadkey file 132330220.geojson building count: 0


 94%|█████████▍| 135/144 [31:15<04:13, 28.21s/it]

Quadkey file 132330221.geojson building count: 0


 94%|█████████▍| 136/144 [31:16<02:41, 20.14s/it]

Quadkey file 132330222.geojson building count: 0


 95%|█████████▌| 137/144 [31:33<02:13, 19.07s/it]

Quadkey file 132330223.geojson building count: 0


 96%|█████████▌| 138/144 [31:53<01:55, 19.20s/it]

Quadkey file 132330230.geojson building count: 12966


 97%|█████████▋| 139/144 [31:58<01:15, 15.14s/it]

Quadkey file 132330231.geojson building count: 0


 97%|█████████▋| 140/144 [32:11<00:58, 14.52s/it]

Quadkey file 132330232.geojson building count: 0


 98%|█████████▊| 141/144 [32:12<00:30, 10.26s/it]

Quadkey file 132330233.geojson building count: 0
Quadkey file 132330300.geojson building count: 0


 99%|█████████▉| 143/144 [32:12<00:05,  5.68s/it]

Quadkey file 132330302.geojson building count: 0


100%|██████████| 144/144 [32:13<00:00, 13.43s/it]

Quadkey file 132332010.geojson building count: 0





## 3. Combine all city building extracts into a single GeoDataFrame

In [10]:
# Stack the per-tile extracts collected above into one frame. Row labels are
# kept as they come from each extract, so they are not renumbered here.
all_ms_gdf = pd.concat(objs=ms_gdf_list)
# Summary of the combined frame: row count, columns, dtypes, memory usage.
all_ms_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 791724 entries, 143 to 199647
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   ADM3_EN     791724 non-null  object  
 1   ADM4_PCODE  791724 non-null  object  
 2   geometry    791724 non-null  geometry
dtypes: geometry(1), object(2)
memory usage: 24.2+ MB


In [11]:
# Cities that actually received buildings, in order of first appearance.
city_names = all_ms_gdf["ADM3_EN"].unique()
# Build a short, filename-friendly tag per city: keep only the words longer
# than 4 characters, lower-cased and concatenated
# (e.g. "City of Navotas" -> "navotas").
short_city_names = [
    "".join([s.lower() for s in c.split(" ") if len(s) > 4]) for c in city_names
]
# "Cagayan de Oro City" reduces to "cagayan"; use the abbreviation "cdo" instead.
short_city_names = ["cdo" if s == "cagayan" else s for s in short_city_names]
# The tags are used to build output file names in the save step, so an empty or
# repeated tag would make one city's file silently overwrite another's. Fail
# loudly here instead (the heuristic drops every word of 4 characters or fewer).
assert all(short_city_names), "a city name produced an empty short name"
assert len(set(short_city_names)) == len(
    short_city_names
), "two cities share the same short name"
print(city_names, short_city_names)

['Dagupan City' 'City of Navotas' 'Palayan City' 'City of Muntinlupa'
 'City of Mandaluyong' 'Legazpi City' 'Tacloban City' 'Iloilo City'
 'Zamboanga City' 'Mandaue City' 'Cagayan de Oro City' 'Davao City'] ['dagupan', 'navotas', 'palayan', 'muntinlupa', 'mandaluyong', 'legazpi', 'tacloban', 'iloilo', 'zamboanga', 'mandaue', 'cdo', 'davao']


In [12]:
# Distinct city names present in the admin bounds, shown for a visual
# comparison with the cities found in the extracted buildings.
bounds_gdf.loc[:, "ADM3_EN"].unique()

array(['Dagupan City', 'Palayan City', 'Legazpi City', 'Iloilo City',
       'Mandaue City', 'Tacloban City', 'Zamboanga City',
       'Cagayan de Oro City', 'Davao City', 'City of Mandaluyong',
       'City of Navotas', 'City of Muntinlupa'], dtype=object)

## 4. Save one file per city

In [13]:
# Write one GeoJSON file per city, named after the city's short tag.
# NOTE(review): the extracts come from the `ms-open-buildings` raw folder but
# are written under `google-open-buildings/`; the path is left unchanged here
# because other notebooks may read from it — confirm the intended folder.
out_dir = f"{PROCESSED_DIR}google-open-buildings/within_city/"
# Create the destination folder if needed so the first write cannot fail on a
# missing directory.
os.makedirs(out_dir, exist_ok=True)
# `zip` has no length, so pass `total` explicitly to get a real progress bar
# (percentage and ETA) instead of a bare iteration counter.
for short_city_name, city_name in tqdm(
    zip(short_city_names, city_names), total=len(city_names)
):
    # Rows of the combined extract that belong to this city.
    save_gdf = all_ms_gdf[all_ms_gdf["ADM3_EN"] == city_name]
    print(f"Number of bldgs for {short_city_name}: {len(save_gdf)}")
    save_gdf.to_file(
        f"{out_dir}ms_lacuna_cities_bldgs_{short_city_name}_v2.geojson",
        driver="GeoJSON",
        index=False,
    )

0it [00:00, ?it/s]

Number of bldgs for dagupan: 25652


1it [00:02,  2.64s/it]

Number of bldgs for navotas: 6503


2it [00:03,  1.50s/it]

Number of bldgs for palayan: 12358


3it [00:04,  1.38s/it]

Number of bldgs for muntinlupa: 41306


4it [00:08,  2.49s/it]

Number of bldgs for mandaluyong: 10603


5it [00:09,  2.02s/it]

Number of bldgs for legazpi: 31422


6it [00:13,  2.40s/it]

Number of bldgs for tacloban: 34343


7it [00:16,  2.79s/it]

Number of bldgs for iloilo: 54522


8it [00:22,  3.67s/it]

Number of bldgs for zamboanga: 145368


9it [00:36,  7.07s/it]

Number of bldgs for mandaue: 33430


10it [00:40,  6.01s/it]

Number of bldgs for cdo: 106092


11it [00:51,  7.45s/it]

Number of bldgs for davao: 290125


12it [01:19,  6.67s/it]
