# Explore UK Buildings

In [None]:
from pathlib import Path
import zipfile
import io
import tempfile

import geopandas as gpd
import shapely
import numpy as np
import pandas as pd
import requests
import requests_cache

%matplotlib inline

In [None]:
# Local folders with the UKBuildings data. The POLYGONS folder is searched for
# '<production block>*.shp' files further below; the POINTS folder is not used
# in the cells that follow directly.
UKBUILDINGS_POINTS_FOLDER_PATH = Path('./data/ukbuildings/POINTS/')
UKBUILDINGS_POLYGONS_FOLDER_PATH = Path('./data/ukbuildings/POLYGONS/')
# Zip archive that is downloaded and extracted to obtain the borough shapes.
LONDON_BOUNDARY_FILE_URL = 'https://files.datapress.com/london/dataset/statistical-gis-boundary-files-london/2016-10-03T13:52:28/statistical-gis-boundaries-london.zip'

# Path of the borough shapefile relative to the folder the archive is extracted to.
BOROUGH_SHAPE_FILE_PATH = Path('./statistical-gis-boundaries-london/ESRI/London_Borough_Excluding_MHW.shp')

In [None]:
# Install a requests-cache cache named '../build/cache' so that repeated HTTP
# requests made through `requests` can be answered from the cache instead of
# hitting the network again.
requests_cache.install_cache('../build/cache')

## Helper Functions

In [None]:
# Extent of the HB production block reference in OS national grid coordinates.
HB_MIN_X = 500000
HB_MAX_X = 600000
HB_MIN_Y = 100000
HB_MAX_Y = 200000
# Edge length of a single production block, in the same units as the extent.
HB_BLOCK_SIZE = 5000


def production_blocks(minx, miny, maxx, maxy):
    """Generator of GeoInformationGroup production blocks.

    Based on a rectangular bounding box defined in OS national grid,
    this generator will yield all GeoInformationGroup production blocks
    that are touched by the box.

    Supports only bounding boxes entirely in the HB production block
    reference.

    Parameters:
        * minx, miny, maxx, maxy: the parameters of the bounding box
                                  defined in OS national grid

    Yields:
        The string name of each production block: 'HB' followed by the
        1-based, two-digit column (x) index and the 1-based, two-digit
        row (y) index, e.g. 'HB0201'. Blocks are produced column by column.

    Raises:
        ValueError: if the bounding box is not entirely inside the HB
                    reference. As this is a generator, the check runs when
                    the first block is requested, not when it is called.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    # Written as "not (all inside)" so that NaN coordinates are rejected too.
    inside_hb = (
        minx >= HB_MIN_X and miny >= HB_MIN_Y
        and maxx <= HB_MAX_X and maxy <= HB_MAX_Y
    )
    if not inside_hb:
        raise ValueError(
            'Bounding box ({}, {}, {}, {}) is not entirely inside the HB '
            'production block reference ({}, {}, {}, {}).'.format(
                minx, miny, maxx, maxy,
                HB_MIN_X, HB_MIN_Y, HB_MAX_X, HB_MAX_Y
            )
        )
    # Coordinates are truncated to integers before computing the 1-based
    # index of the block they fall into.
    first_column = (int(minx) - HB_MIN_X) // HB_BLOCK_SIZE + 1
    last_column = (int(maxx) - HB_MIN_X) // HB_BLOCK_SIZE + 1
    first_row = (int(miny) - HB_MIN_Y) // HB_BLOCK_SIZE + 1
    last_row = (int(maxy) - HB_MIN_Y) // HB_BLOCK_SIZE + 1
    # NOTE(review): a box ending exactly on HB_MAX_X / HB_MAX_Y yields index
    # 21 -- confirm whether such a block exists in the data set.
    for column in range(first_column, last_column + 1):
        for row in range(first_row, last_row + 1):
            yield 'HB{:0>2}{:0>2}'.format(column, row)

# Self-checks for production_blocks: each entry pairs a bounding box with the
# set of production blocks it is expected to touch.
_PRODUCTION_BLOCK_CASES = [
    # boxes lying within a single block
    ((500000, 100000, 500001, 100001), {'HB0101'}),
    ((500000, 100000, 500000.1, 100000.1), {'HB0101'}),
    ((505000, 100000, 505001, 100001), {'HB0201'}),
    ((500000, 105000, 500001, 105001), {'HB0102'}),
    # boxes whose upper edge lies exactly on a block border
    ((500000, 100000, 505000, 100001), {'HB0101', 'HB0201'}),
    ((500000, 100000, 500001, 105000), {'HB0101', 'HB0102'}),
    # boxes straddling a block border
    ((504999, 100000, 505001, 100001), {'HB0101', 'HB0201'}),
    ((504999.9, 100000, 505001, 100001), {'HB0101', 'HB0201'}),
]
for _bbox, _expected_blocks in _PRODUCTION_BLOCK_CASES:
    assert set(production_blocks(*_bbox)) == _expected_blocks

In [None]:
def ukbuildings_polygon_file(production_blocks, folder=None):
    """Generator of file paths of UKBuilding production blocks.

    Parameters:
        * production_blocks: an iterable of production block names
        * folder: the folder to search for shapefiles; defaults to
                  UKBUILDINGS_POLYGONS_FOLDER_PATH

    Yields:
        * file path of the file containing the production block, i.e. the
          alphabetically first file in the folder matching
          '<production block>*.shp'

    Raises:
        FileNotFoundError: if no shapefile exists for a production block.
                           As this is a generator, the error is raised when
                           that block is reached during iteration.
    """
    if folder is None:
        folder = UKBUILDINGS_POLYGONS_FOLDER_PATH
    folder = Path(folder)
    for production_block in production_blocks:
        pattern = '{}*.shp'.format(production_block)
        # Sort so the selected file does not depend on directory listing order.
        matches = sorted(folder.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                'No shapefile matching {!r} found in {}'.format(pattern, folder)
            )
        yield matches[0]

## Read in Haringey Buildings

### Read in Haringey shape

In [None]:
# Download the zipped London boundary files.
r = requests.get(LONDON_BOUNDARY_FILE_URL)
# Fail right here on an HTTP error instead of trying to unzip an error page.
r.raise_for_status()
# Unpack the archive into a temporary folder that is removed again once the
# borough shapefile has been read into memory; the archive is closed as well.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with tempfile.TemporaryDirectory(prefix='london-boundary-files') as tmpdir:
        z.extractall(path=tmpdir)
        borough_file = Path(tmpdir) / BOROUGH_SHAPE_FILE_PATH
        borough_data = gpd.read_file(borough_file.as_posix())
borough_data.plot()

In [None]:
# Pick the geometry of the (first) borough row named 'Haringey'.
is_haringey = borough_data.NAME == 'Haringey'
haringey = borough_data[is_haringey].geometry.iloc[0]

In [None]:
# Last expression of the cell: shows the boundary of the Haringey geometry.
haringey.boundary

### Read all UKBuilding files that include Haringey buildings

In theory we could read all UKBuildings files, but reading and especially merging them takes too long. Instead, let's read only the files of those production blocks that are touched by Haringey's bounding box and skip all others.

In [None]:
# Read the shapefile of every production block touched by Haringey's bounding
# box and combine them into one frame. All frames are collected first and
# concatenated once: `DataFrame.append` was removed in pandas 2.0 and copied
# the whole accumulated frame on every iteration.
shape_file_frames = []
for shape_file_path in ukbuildings_polygon_file(production_blocks(*haringey.bounds)):
    print('Reading {}'.format(shape_file_path))
    shape_file_frames.append(gpd.read_file(shape_file_path.as_posix()))
# As before, the row labels of the individual files are kept as they are.
# Raises ValueError if not a single file was read.
ukb_data = pd.concat(shape_file_frames)

In [None]:
# Target dtype for each UKBuildings attribute column, applied with `astype`.
# `np.bool_` replaces the alias `np.bool8`, which was deprecated in NumPy 1.24
# and removed in NumPy 2.0; both denote the same boolean type.
col_types = {
    'BASE': np.bool_,
    'BEC': np.int8,
    'BUNG': np.bool_,
    'DOR': np.int16,
    'DPS': np.int16,
    'GET': 'category',
    'MBN': 'category',
    'NAB': 'category',
    'RBCA': 'category',
    'RBCAT': 'category',
    'RBCC': 'category',
    'RBCS': np.bool_,
    'RBCT': 'category',
    'RBCTT': 'category',
    'RBN': np.int8,
    # TODO RBQ ??
    # TODO KBD ??
    'RDT': 'category',
    'RDTT': 'category',
    'RNR': 'category',
    'RRN': np.int8,
    'RRT': 'category',
    'RRTT': 'category',
    'RWN': np.int8,
    'RWT': 'category',
    'RWTT': 'category',
    'SBC': 'category'
}

In [None]:
# Cast the attribute columns to their target dtypes and wrap the result in a
# GeoDataFrame again.
ukb_data = gpd.GeoDataFrame(ukb_data.astype(col_types))

### Cut out Haringey

The files read above contain all buildings of every GeoInformationGroup production block in which Haringey buildings are present, so they also include buildings outside the borough. Let's keep only the buildings that lie within Haringey.

In [None]:
from shapely.prepared import prep
# Prepare the Haringey geometry once, as `contains` is called on it for every
# single building below.
haringey_prep = prep(haringey) # improves performance for the next step
# Boolean value per building: True if the building geometry is contained in Haringey.
in_haringey_mask = ukb_data.geometry.map(haringey_prep.contains)

In [None]:
# Keep only the buildings contained in Haringey. Reuses the mask computed in
# the previous cell instead of running the costly containment test on every
# building a second time.
ukb_data = ukb_data[in_haringey_mask]

### Tests

In [None]:
# Combine the geometries of all remaining buildings into a single MultiPolygon.
building_geometries = list(ukb_data.geometry)
ukb_poly = shapely.geometry.MultiPolygon(building_geometries)

In [None]:
# The convex hull of the buildings and the convex hull of the borough must
# nearly coincide: each one-sided difference, divided by 1000000, has to stay
# below 2.
ukb_hull = ukb_poly.convex_hull
haringey_hull = haringey.convex_hull
buildings_outside_borough = ukb_hull.difference(haringey_hull)
borough_outside_buildings = haringey_hull.difference(ukb_hull)
assert buildings_outside_borough.area / 1000000 < 2
assert borough_outside_buildings.area / 1000000 < 2

The convex hull of all Haringey buildings in the UKBuildings dataset and the convex hull of the borough boundary differ by less than 2km<sup>2</sup> in each direction, i.e. the part of either hull lying outside the other is smaller than 2km<sup>2</sup>. _(Arbitrarily chosen to be small enough.)_

In [None]:
# Number of rows (buildings) left after filtering for Haringey.
len(ukb_data)

## Explore Data Set