In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import os
from shapely.geometry import LineString, MultiLineString, Polygon
from shapely.geometry.base import GeometrySequence

In [3]:
# Load every meshblock GeoJSON file into one GeoDataFrame.
# The directory listing is sorted so the row order is reproducible between runs
# (os.listdir returns names in arbitrary order), and hidden entries such as
# .DS_Store or .ipynb_checkpoints are skipped because they are not meshblock data.
MB_GEOJSON_DIR = '../data/core/mb-geojson'
GDF : gpd.GeoDataFrame = gpd.GeoDataFrame(pd.concat(
  [
    gpd.read_file(f'{MB_GEOJSON_DIR}/{file}')
    for file in sorted(os.listdir(MB_GEOJSON_DIR))
    if not file.startswith('.')
  ],
  ignore_index=True,
))
# 20 - 30 secs per file. About 5 - 15 - 20 minutes total.

  GDF : gpd.GeoDataFrame = gpd.GeoDataFrame(pd.concat([gpd.read_file(f'../data/core/mb-geojson/{file}') for file in os.listdir('../data/core/mb-geojson')], ignore_index=True))


In [None]:
# Keep the meshblock code, SA4 code and geometry, then index the frame by the
# meshblock code under the name 'id'.
GDF = (
  GDF[['MB_CODE21', 'SA4_CODE21', 'geometry']]
  .rename(columns={'MB_CODE21': 'id'})
  .set_index('id')
)
# Make sure the GeoJSON output directory exists.
os.makedirs('../data/mb-geojson', exist_ok=True)

In [8]:

# Write one GeoJSON file per SA4 region, containing the geometry plus the frame
# index (index=True).
# An explicit loop is used instead of groupby.apply: the export is a pure side
# effect, and apply would collect and display a throwaway result of None values.
for sa4_code, sa4_meshblocks in GDF.groupby('SA4_CODE21'):
  sa4_meshblocks[['geometry']].to_file(
    f'../data/mb-geojson/MB-SA4-{sa4_code}.geojson',
    driver='GeoJSON',
    index=True,
  )
# 10m - 20m

In [None]:
# One all-string DataFrame per CSV in mb-info, keyed by the file name up to its
# first '.'.
DFS = {
  csv_name.split('.')[0]: pd.read_csv(f'../data/core/mb-info/{csv_name}', dtype=str)
  for csv_name in os.listdir('../data/core/mb-info')
}

In [None]:
# Cast every area column to np.float64.
# Any column whose name contains "AREA" must be AREA_ALBERS_SQKM itself or
# contain "AREA_ALBERS_SQKM_"; otherwise the assertion fails with the table name.
for table_name, table in DFS.items():
  area_columns = [name for name in table.columns if 'AREA' in name]
  for name in area_columns:
    assert (name == 'AREA_ALBERS_SQKM' or 'AREA_ALBERS_SQKM_' in name), table_name
    table[name] = table[name].astype(np.float64)

In [None]:
# The first export above renames MB_CODE21 to 'id' and moves it into the index.
# Later cells read MB_CODE21 as an ordinary column, so restore it here if needed;
# otherwise this selection raises KeyError when the notebook is run top to bottom.
if 'MB_CODE21' not in GDF.columns:
  GDF = GDF.reset_index().rename(columns={'id': 'MB_CODE21'})
# SA4_CODE21 is kept because the following cells select it and group on it.
GDF = GDF[['MB_CODE21', 'SA4_CODE21', 'geometry']]

In [None]:
# Narrow the frame to the meshblock code, SA4 code and geometry columns.
wanted_columns = ['MB_CODE21', 'SA4_CODE21', 'geometry']
GDF = GDF[wanted_columns]

In [None]:
# Make sure the GeoJSON output directory exists (no error if it already does).
os.makedirs(name='../data/mb-geojson', exist_ok=True)

In [None]:
# Write one GeoJSON file per SA4 region with the meshblock code and geometry.
# An explicit loop is used instead of groupby.apply: the export is a pure side
# effect, and apply would collect and display a throwaway result of None values.
for sa4_code, sa4_meshblocks in GDF.groupby('SA4_CODE21'):
  sa4_meshblocks[['MB_CODE21', 'geometry']].to_file(
    f'../data/mb-geojson/MB-SA4-{sa4_code}.geojson',
    driver='GeoJSON',
  )
# 10m - 20m


In [None]:
# Discard rows that have no geometry; the later cells need one on every row.
GDF = GDF.dropna(subset=['geometry'])

In [None]:
# Replace each geometry with its boundary.
# NOTE(review): presumably the inputs are polygons, so the boundaries are rings;
# the next cell handles both LineString and MultiLineString results.
GDF['geometry'] = GDF['geometry'].boundary
# 10s

In [None]:
def _line_coords(boundary):
  """Return the coordinate sequences of ``boundary`` as a list, one per line.

  A MultiLineString contributes one entry per member line; any other geometry
  contributes a single entry holding its own ``coords``.
  """
  if isinstance(boundary, MultiLineString):
    return [part.coords for part in boundary.geoms]
  return [boundary.coords]

# From here on the 'geometry' column holds lists of coordinate sequences rather
# than shapely geometries.
GDF['geometry'] = GDF['geometry'].apply(_line_coords)
# 30s - 1m

In [None]:
# Number the lines of each row 0..n-1 so the order survives the explode below.
GDF['line_order'] = GDF['geometry'].apply(lambda lines: [position for position in range(len(lines))])

In [None]:
# One row per line: explode the line numbers and the coordinate sequences together.
# The frame is converted to a plain DataFrame first. GeoDataFrame.explode is a
# geometry-specific override, and the 'geometry' column holds coordinate
# sequences (not geometries) by this point, so the pandas multi-column explode
# is the one that applies.
GDF = pd.DataFrame(GDF).explode(['line_order', 'geometry'])

In [None]:
def _endpoints_match(coords):
  """Return True when the first and last entries of ``coords`` are equal."""
  return coords[0] == coords[-1]


def _endpoints_differ(coords):
  """Return True when the first and last entries of ``coords`` are different."""
  return coords[0] != coords[-1]


# Every line must be a closed loop: its first point is repeated as its last.
is_self_loop = GDF['geometry'].apply(_endpoints_match).all()
# 10s - 20s
assert is_self_loop

# Drop the repeated closing point from every line.
GDF['geometry'] = GDF['geometry'].apply(lambda ring: ring[:-1])
# 40s - 2m

# After trimming, no line may still start and end on the same point.
not_self_loop = GDF['geometry'].apply(_endpoints_differ).all()
# 10s - 20s

assert not_self_loop

In [None]:
# Number the points of each line 0..n-1 so the order survives the explode below.
GDF['point_order'] = GDF['geometry'].apply(lambda points: [position for position in range(len(points))])
# 10s - 20s

In [None]:
# One row per point: explode the point numbers and the coordinates together.
# The frame is converted to a plain DataFrame first so the pandas multi-column
# explode is used rather than GeoDataFrame's geometry-specific override; the
# 'geometry' column holds coordinate lists, not geometries, at this stage.
GDF = pd.DataFrame(GDF).explode(['point_order', 'geometry'])
# 3m - 5m - 10m

In [None]:
# Latitude is taken from the second entry of each coordinate.
GDF['lat'] = GDF['geometry'].map(lambda point: point[1])
# 1m 30s - 3m

In [None]:
# Longitude is taken from the first entry of each coordinate.
GDF['lon'] = GDF['geometry'].map(lambda point: point[0])
# 1m 30s - 3m

In [None]:
# The coordinates now live in 'lat' / 'lon', so the raw column can go.
GDF = GDF.drop(columns=['geometry'])
# 1m 30s

In [None]:
# Build a per-line identifier of the form "<MB_CODE21>-<line_order>".
GDF['id'] = GDF['MB_CODE21'].add('-').add(GDF['line_order'].astype(str))
# 1m 30s

In [None]:
# Both columns are now encoded in 'id', so they are no longer needed.
GDF = GDF.drop(columns=['MB_CODE21', 'line_order'])
# 40s - 1m

In [None]:
# Make sure the per-line CSV output directory exists (no error if it already does).
os.makedirs(name='../data/mb-edges', exist_ok=True)

In [None]:
# Write one CSV per line id, holding that line's points (order, lat, lon).
# An explicit loop is used instead of groupby.apply: the export is a pure side
# effect, and apply would collect and display a throwaway result of None values.
for line_id, line_points in GDF.groupby('id'):
  line_points[['point_order', 'lat', 'lon']].to_csv(f'../data/mb-edges/{line_id}.csv', index=False)
# 45m - 1h 30m
# Compressed down from 1GB to 800MB
# -> Too time consuming to compress, and probably not worth it.