In [2]:
import numpy as np
import pandas as pd
from shapely import wkt
import geopandas as gpd
from pathlib import Path
import matplotlib.pyplot as plt
from shapely import wkt
from shapely.geometry import Polygon
import json
from tqdm import tqdm
import os

In [1]:
RAW_DIR = "../../../data/tm/1-raw/"
PROCESSED_DIR = "../../../data/tm/2-processed/"

# Crop buildings from country file

This notebook does the following steps: 

(1) crops the single country-wide buildings `geojsonl` (link to file [here](https://github.com/microsoft/IdMyPhBuildingFootprints)) to small batches of geojson files

(2) extracts intersecting buildings within project-included city bounds, and finally 

(3) combines all city building extracts to a single file.

## 1. Create small batches

In [3]:
filepath = f"{RAW_DIR}ms-open-buildings/philippines.geojsonl"
total_lines = sum(1 for line in open(filepath))
batch_size = 500000
nbatches = int(total_lines / batch_size + 1)
print(f"Total lines: {total_lines}, Number of batches: {nbatches}")

Total lines: 17421764, Number of batches: 35


In [6]:
with open(filepath, "r") as f:
    counter = 0
    batch = []
    for line in tqdm(f, total=total_lines):
        feature = json.loads(line)
        batch.append(feature)
        counter += 1
        if counter % batch_size == 0:
            batch_id = int(counter / batch_size)
            print(f"Saving batch {batch_id}...", end="")
            gdf = gpd.GeoDataFrame.from_features(batch)
            output_file = (
                f"{PROCESSED_DIR}ms-open-buildings/batch/batch_{batch_id}.geojson"
            )
            gdf.to_file(output_file, driver="GeoJSON")
            # clear batch content
            batch = []
            print("DONE!")
    # for last batch
    if batch:
        gdf = gpd.GeoDataFrame.from_features(batch)
        output_file = f"{PROCESSED_DIR}ms-open-buildings/batch/batch_{int((counter-1)/batch_size)+1}.geojson"
        gdf.to_file(output_file, driver="GeoJSON")

  3%|▎         | 498656/17421764 [00:07<02:33, 109933.26it/s]

Saving batch 1...


  6%|▌         | 998560/17421764 [01:21<02:30, 108978.98it/s]

Saving batch 2...


  9%|▊         | 1490024/17421764 [02:33<03:05, 86035.90it/s] 

Saving batch 3...


 11%|█▏        | 1993892/17421764 [03:48<07:24, 34725.86it/s] 

Saving batch 4...


 14%|█▍        | 2495504/17421764 [05:01<02:22, 104596.36it/s]

Saving batch 5...


 17%|█▋        | 2992131/17421764 [06:15<06:32, 36716.85it/s] 

Saving batch 6...


 20%|██        | 3495554/17421764 [07:30<04:19, 53642.26it/s] 

Saving batch 7...


 23%|██▎       | 3996541/17421764 [08:46<02:15, 98718.53it/s] 

Saving batch 8...


 26%|██▌       | 4488186/17421764 [10:03<02:08, 100634.63it/s]

Saving batch 9...


 29%|██▊       | 4996990/17421764 [11:19<03:00, 68703.74it/s] 

Saving batch 10...


 32%|███▏      | 5498560/17421764 [12:33<02:22, 83667.45it/s] 

Saving batch 11...


 34%|███▍      | 5992614/17421764 [14:04<01:51, 102331.90it/s]

Saving batch 12...


 37%|███▋      | 6489187/17421764 [15:24<01:44, 104327.10it/s]

Saving batch 13...


 40%|████      | 6999871/17421764 [16:41<03:20, 52103.49it/s] 

Saving batch 14...


 43%|████▎     | 7499913/17421764 [17:58<01:39, 99551.17it/s] 

Saving batch 15...


 46%|████▌     | 7998731/17421764 [19:16<01:30, 104134.31it/s]

Saving batch 16...


 49%|████▉     | 8497888/17421764 [20:37<01:39, 89501.68it/s] 

Saving batch 17...


 52%|█████▏    | 8998859/17421764 [21:57<02:33, 54841.02it/s] 

Saving batch 18...


 54%|█████▍    | 9492552/17421764 [23:18<01:13, 108081.12it/s]

Saving batch 19...


 57%|█████▋    | 9991122/17421764 [24:37<03:24, 36389.18it/s] 

Saving batch 20...


 60%|██████    | 10497829/17421764 [26:00<01:27, 79312.38it/s] 

Saving batch 21...


 63%|██████▎   | 10996641/17421764 [27:24<01:01, 103784.26it/s]

Saving batch 22...


 66%|██████▌   | 11499754/17421764 [28:43<03:16, 30121.35it/s] 

Saving batch 23...


 69%|██████▉   | 11994490/17421764 [29:57<00:55, 97381.27it/s] 

Saving batch 24...


 72%|███████▏  | 12487807/17421764 [31:14<01:56, 42354.08it/s] 

Saving batch 25...


 75%|███████▍  | 12998860/17421764 [32:28<01:12, 60980.73it/s] 

Saving batch 26...


 77%|███████▋  | 13496741/17421764 [33:49<00:38, 102911.55it/s]

Saving batch 27...


 80%|████████  | 13985966/17421764 [34:51<01:11, 47937.28it/s] 

Saving batch 28...


 83%|████████▎ | 14492330/17421764 [36:03<00:48, 60576.77it/s] 

Saving batch 29...


 86%|████████▌ | 14992377/17421764 [37:07<00:22, 109671.89it/s]

Saving batch 30...


 89%|████████▉ | 15494802/17421764 [38:04<00:17, 111046.95it/s]

Saving batch 31...


 92%|█████████▏| 15995463/17421764 [39:03<00:38, 36696.38it/s] 

Saving batch 32...


 95%|█████████▍| 16489727/17421764 [40:01<00:07, 117013.67it/s]

Saving batch 33...


 98%|█████████▊| 16995633/17421764 [41:01<00:06, 63538.26it/s] 

Saving batch 34...


100%|██████████| 17421764/17421764 [41:57<00:00, 6921.58it/s]  


## 2. Check which batch files are within included cities

In [4]:
# Read cities bounds
bounds_gdf = gpd.read_file(
    "../../../data/1-admin-bounds/target_admin_bounds.shp"
).set_crs(4326)

In [5]:
# Loop over all batch, append and save part of batches within a city
ms_gdf_list = []
for batch in tqdm(np.arange(1, nbatches + 1)):
    ms_gdf = gpd.read_file(
        f"{PROCESSED_DIR}ms-open-buildings/batch/batch_{batch}.geojson"
    )
    ms_gdf = ms_gdf.set_geometry("geometry")
    ms_gdf = ms_gdf.set_crs(4326)

    bldgs_gdf = gpd.sjoin(ms_gdf, bounds_gdf, predicate="intersects")
    bldgs_gdf = bldgs_gdf[["ADM3_EN", "ADM4_PCODE", "geometry"]]
    print(f"Batch {batch} building count: {len(bldgs_gdf)}")

    if len(bldgs_gdf) > 0:
        ms_gdf_list.append(bldgs_gdf)

  3%|▎         | 1/35 [00:49<28:02, 49.50s/it]

Batch 1 building count: 4630


  6%|▌         | 2/35 [01:41<28:11, 51.25s/it]

Batch 2 building count: 174282


  9%|▊         | 3/35 [02:31<26:54, 50.47s/it]

Batch 3 building count: 0


 11%|█▏        | 4/35 [03:17<25:09, 48.69s/it]

Batch 4 building count: 0


 14%|█▍        | 5/35 [04:05<24:12, 48.42s/it]

Batch 5 building count: 0


 17%|█▋        | 6/35 [04:51<22:57, 47.51s/it]

Batch 6 building count: 0


 20%|██        | 7/35 [05:36<21:52, 46.87s/it]

Batch 7 building count: 1522


 23%|██▎       | 8/35 [06:22<20:59, 46.65s/it]

Batch 8 building count: 0


 26%|██▌       | 9/35 [07:07<19:59, 46.15s/it]

Batch 9 building count: 852


 29%|██▊       | 10/35 [07:54<19:17, 46.31s/it]

Batch 10 building count: 96976


 31%|███▏      | 11/35 [08:39<18:22, 45.95s/it]

Batch 11 building count: 0


 34%|███▍      | 12/35 [09:22<17:14, 45.00s/it]

Batch 12 building count: 0


 37%|███▋      | 13/35 [10:05<16:15, 44.33s/it]

Batch 13 building count: 9338


 40%|████      | 14/35 [10:50<15:34, 44.48s/it]

Batch 14 building count: 101109


 43%|████▎     | 15/35 [11:32<14:39, 43.95s/it]

Batch 15 building count: 26511


 46%|████▌     | 16/35 [12:17<13:58, 44.14s/it]

Batch 16 building count: 64920


 49%|████▊     | 17/35 [13:01<13:11, 43.96s/it]

Batch 17 building count: 94309


 51%|█████▏    | 18/35 [13:43<12:21, 43.59s/it]

Batch 18 building count: 19106


 54%|█████▍    | 19/35 [14:26<11:32, 43.26s/it]

Batch 19 building count: 6566


 57%|█████▋    | 20/35 [15:08<10:43, 42.91s/it]

Batch 20 building count: 0


 60%|██████    | 21/35 [15:51<10:03, 43.11s/it]

Batch 21 building count: 17282


 63%|██████▎   | 22/35 [16:36<09:24, 43.43s/it]

Batch 22 building count: 34703


 66%|██████▌   | 23/35 [17:21<08:49, 44.10s/it]

Batch 23 building count: 21902


 69%|██████▊   | 24/35 [18:08<08:13, 44.85s/it]

Batch 24 building count: 5002


 71%|███████▏  | 25/35 [18:54<07:30, 45.09s/it]

Batch 25 building count: 12359


 74%|███████▍  | 26/35 [19:39<06:47, 45.30s/it]

Batch 26 building count: 5239


 77%|███████▋  | 27/35 [20:26<06:06, 45.78s/it]

Batch 27 building count: 0


 80%|████████  | 28/35 [21:12<05:21, 45.90s/it]

Batch 28 building count: 0


 83%|████████▎ | 29/35 [21:57<04:33, 45.55s/it]

Batch 29 building count: 1784


 86%|████████▌ | 30/35 [22:40<03:44, 44.89s/it]

Batch 30 building count: 33685


 89%|████████▊ | 31/35 [23:23<02:56, 44.23s/it]

Batch 31 building count: 0


 91%|█████████▏| 32/35 [24:06<02:11, 43.83s/it]

Batch 32 building count: 54830


 94%|█████████▍| 33/35 [24:48<01:26, 43.40s/it]

Batch 33 building count: 7910


 97%|█████████▋| 34/35 [25:32<00:43, 43.38s/it]

Batch 34 building count: 0


100%|██████████| 35/35 [26:08<00:00, 44.81s/it]

Batch 35 building count: 0





In [6]:
all_ms_gdf = pd.concat(ms_gdf_list)
all_ms_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 794817 entries, 437436 to 372916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   ADM3_EN     794817 non-null  object  
 1   ADM4_PCODE  794817 non-null  object  
 2   geometry    794817 non-null  geometry
dtypes: geometry(1), object(2)
memory usage: 24.3+ MB


## 3. Read filtered files and save by city

In [7]:
city_names = all_ms_gdf["ADM3_EN"].unique()
short_city_names = [
    "".join([s.lower() for s in c.split(" ") if len(s) > 4]) for c in city_names
]
short_city_names = ["cdo" if s == "cagayan" else s for s in short_city_names]
city_names, short_city_names

(array(['Zamboanga City', 'Davao City', 'City of Navotas',
        'Cagayan de Oro City', 'Legazpi City', 'Tacloban City',
        'Dagupan City', 'City of Mandaluyong', 'City of Muntinlupa',
        'Palayan City', 'Mandaue City', 'Iloilo City'], dtype=object),
 ['zamboanga',
  'davao',
  'navotas',
  'cdo',
  'legazpi',
  'tacloban',
  'dagupan',
  'mandaluyong',
  'muntinlupa',
  'palayan',
  'mandaue',
  'iloilo'])

In [10]:
for short_city_name, city_name in tqdm(zip(short_city_names, city_names)):
    save_gdf = all_ms_gdf[all_ms_gdf["ADM3_EN"] == city_name]
    print(f"Number of bldgs for {short_city_name}: {len(save_gdf)}")
    save_gdf.to_file(
        f"{PROCESSED_DIR}ms-open-buildings/within_city/ms_lacuna_cities_bldgs_{short_city_name}_v1.geojson",
        driver="GeoJSON",
        index=False,
    )

0it [00:00, ?it/s]

Number of bldgs for zamboanga: 146792


1it [00:17, 17.46s/it]

Number of bldgs for davao: 290748


2it [00:52, 27.53s/it]

Number of bldgs for navotas: 6524


3it [00:52, 15.33s/it]

Number of bldgs for cdo: 106234


4it [01:05, 14.34s/it]

Number of bldgs for legazpi: 31503


5it [01:09, 10.53s/it]

Number of bldgs for tacloban: 34485


6it [01:13,  8.43s/it]

Number of bldgs for dagupan: 25672


7it [01:17,  6.74s/it]

Number of bldgs for mandaluyong: 10629


8it [01:18,  5.05s/it]

Number of bldgs for muntinlupa: 41356


9it [01:23,  5.05s/it]

Number of bldgs for palayan: 12359


10it [01:25,  3.95s/it]

Number of bldgs for mandaue: 33685


11it [01:29,  3.97s/it]

Number of bldgs for iloilo: 54830


12it [01:35,  7.97s/it]
