In [1]:
# Datasets (in general)
# 1. How comprehensive are these datasets?
# 2. How recent are they? 
# 3. Can any aspects be independently validated/verified?
# 4. What makes these datasets authoritative?

# LBSM
# How accurate is the London Building Stock Model?
# What social indicators to use? Why?

# EPC 
# How reliable/variable is the EPC data? 

# Methodology
# Any critical consideration?

In [2]:
# LBSMv2 is the source with the most data. EPC is the source we can rely on for data quality (cross-checking where it overlaps with LBSMv2) +
# refreshing the data from EPC (because its records are more recent)
# LBSMv2 is a Greater London Authority product built in partnership (the UCL Energy Institute appears in the descriptions) and used for city energy-efficiency programmes
# (https://data.london.gov.uk/dataset/london-building-stock-model-lbsm-296oy/) - which programmes?
# Limitations:
#   (1) The data is a snapshot as of a specific date (which one?)

# EPC is the government register of energy performance certificates, with public access and documentation - who maintains it? Government sources are authoritative.
# (https://epc.opendatacommunities.org/)
# Limitations:
#   (1) Data exists only for some of the buildings (which ones?)

# Camden Open Data is a government open-data source for the borough - and who maintains this one?
# (https://opendata.camden.gov.uk/)

# Verification strategies:
# 1) compare LBSMv2 and EPC on the overlapping buildings to find out:
#   (1) what percentage is original data and what percentage is modelled
#   (2) for which buildings the conclusions rest on modelling
# 2) compare the conservation areas with the areas flagged in LBSMv2:
#   (1) how many buildings are in conservation areas
#   (2) how accurate the LBSMv2 data is

# which buildings exactly do we leave untouched in a conservation area?
# Camden Retrofit at Scale - to read (https://camden.moderngov.co.uk/mgConvert2PDF.aspx?ID=122232)

In [3]:
# Topic? Why this topic in particular? - done

# Why these features in particular? - we are here - we have listed them but not yet explained them

# Why this methodology in particular?
# Why these data in particular?

# Is retrofitting in conservation areas really difficult? - done

In [4]:
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd 
import numpy as np
import hdbscan

In [5]:
# Load the LBSMv2 extract (file name suggests the Camden subset) and preview the first rows.
lbsm = pd.read_csv('../data/row/lbsmv2_camden.csv')
lbsm.head(3)

Unnamed: 0,uprn,os_topo_toid,easting,northing,postcode_locator,administrative_area,oa21cd,lsoa21cd,lsoa21nm,lsoa11cd,...,avg_tilt,imd19_national_decile,imd19_income_decile,loac_supergroup,loac_group,fuel_poverty,heat_risk_quintile,listed_building_grade,conservation_area_flag,conservation_area_site_id
0,200125425,1000004055135,524239.02,185679.53,NW2 2RT,Camden,E00000684,E01000143,Barnet 041D,E01000143,...,35.0,3,2,F - Young Families and Mainstream Employment,F2 - Social Rented Sector and Diverse Origins,11.4,2,Unknown,not in conservation area,not in conservation area
1,200125422,1000004055130,524218.9,185697.6,NW2 2RT,Camden,E00000684,E01000143,Barnet 041D,E01000143,...,35.0,3,2,F - Young Families and Mainstream Employment,F2 - Social Rented Sector and Diverse Origins,11.4,2,Unknown,not in conservation area,not in conservation area
2,200125427,1000004055137,524243.74,185671.32,NW2 2RT,Camden,E00000684,E01000143,Barnet 041D,E01000143,...,35.0,3,2,F - Young Families and Mainstream Employment,F2 - Social Rented Sector and Diverse Origins,11.4,2,Unknown,not in conservation area,not in conservation area


In [6]:
# EPC certificate fields loaded here:
#   CURRENT_ENERGY_RATING / POTENTIAL_ENERGY_RATING         - letter band, e.g. 'C'
#   CURRENT_ENERGY_EFFICIENCY / POTENTIAL_ENERGY_EFFICIENCY - number from 1 to 100
#   UPRN                                                    - unique building identifier; can be joined against LBSM
# The file also carries CO2_EMISSIONS_CURRENT / CO2_EMISSIONS_POTENTIAL (nice to have), which are not loaded.
epc = pd.read_csv(
    '../data/row/epc_camden/certificates.csv',
    usecols=[
        'UPRN',
        'CURRENT_ENERGY_RATING',
        'POTENTIAL_ENERGY_RATING',
        'CURRENT_ENERGY_EFFICIENCY',
        'POTENTIAL_ENERGY_EFFICIENCY',
    ],
)
epc.head(3)

Unnamed: 0,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,UPRN
0,C,C,69,77,5086054.0
1,C,C,74,80,5143505.0
2,D,B,64,81,5198075.0


In [7]:
# Retrofit candidates: records outside conservation areas whose EPC band is in the low list below.
low_epc = ['D', 'E', 'F', 'G', 'F-G']

lbsm_cand = lbsm.loc[
    lbsm['conservation_area_flag'].eq('not in conservation area')
    & lbsm['epc_rating'].isin(low_epc)
].copy()

# Several UPRNs can share one building footprint (OS Topo TOID), so both counts are reported.
print(f'URPN num: {lbsm_cand["uprn"].nunique()}, Topo TOID num: {lbsm_cand["os_topo_toid"].nunique()}')

# One point per building: the median easting/northing of the UPRNs that share its TOID.
bldg_pts = lbsm_cand.groupby('os_topo_toid')[['easting', 'northing']].median()

X = bldg_pts.loc[:, ['easting', 'northing']].to_numpy()

# Density-based clustering of the building points; the label -1 is treated as noise below.
clusterer = hdbscan.HDBSCAN(
    cluster_selection_method='eom',
    metric='euclidean',
    min_samples=10,
    min_cluster_size=25,
)

labels = clusterer.fit_predict(X)
bldg_pts['cluster_id'] = labels

# Cluster stats
unique_clusters = sorted(set(labels).difference({-1}))
print("Clusters from Topo TOID:", len(unique_clusters))
# Share of building points (one per TOID) left outside every cluster; about 16% in the recorded run.
print("Noise share:", np.mean(labels == -1))

# Turn the table into point geometries (EPSG:27700) and export the clustered buildings.
bldg_pts = gpd.GeoDataFrame(
    bldg_pts,
    geometry=gpd.points_from_xy(bldg_pts['easting'], bldg_pts['northing']),
    crs="EPSG:27700",
)

bldg_pts.reset_index().to_file('clustered_buildings.geojson', driver='GeoJSON')

URPN num: 21206, Topo TOID num: 7102
Clusters from Topo TOID: 45
Noise share: 0.15784286116586876


In [8]:
# Attach each candidate record's building-level cluster label.
# `os_topo_toid` is a column on the left and the index level name on the right;
# records whose TOID has no match keep a missing cluster_id (left join).
lbsm_clustered = lbsm_cand.merge(bldg_pts[['cluster_id']], on='os_topo_toid', how='left')

In [9]:
lbsm_archetypes = lbsm_clustered.copy()

# Fabric profiling: a "need" flag is True when the element is of the given type
# and its insulation is recorded as "uninsulated".
walls_uninsulated = lbsm_archetypes["wall_insulation"].eq("uninsulated")
roof_uninsulated = lbsm_archetypes["roof_insulation"].eq("uninsulated")

lbsm_archetypes["wall_need_solid"] = lbsm_archetypes["wall_type"].eq("solid") & walls_uninsulated
lbsm_archetypes["wall_need_cavity"] = lbsm_archetypes["wall_type"].eq("cavity") & walls_uninsulated

lbsm_archetypes["roof_need_pitched"] = lbsm_archetypes["roof_type"].eq("pitched") & roof_uninsulated
lbsm_archetypes["roof_need_flat"] = lbsm_archetypes["roof_type"].eq("flat") & roof_uninsulated

# Glazing flags depend on the glazing type alone.
lbsm_archetypes["glazing_need_single"] = lbsm_archetypes["glazing_type"].eq("single/partial")
lbsm_archetypes["glazing_need_secondary"] = lbsm_archetypes["glazing_type"].eq("secondary")

# Heating profiling: readable "<heat type> | <fuel type>" label per record.
lbsm_archetypes["heating_system"] = (
    lbsm_archetypes["main_heat_type"].astype(str)
    .str.cat(lbsm_archetypes["main_fuel_type"].astype(str), sep=" | ")
)

def heat_bucket(row):
    """Map one record's fuel and heat type onto a coarse heating category.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "main_fuel_type" and "main_heat_type".

    Returns
    -------
    str
        "Gas-based heating" for mains gas; for electricity either
        "Electric storage/resistance" (heat type mentions "storage",
        case-insensitively) or "Electric heating"; otherwise "Other / unknown".
    """
    fuel_type = row["main_fuel_type"]
    # str() keeps missing heat types (NaN/None) from breaking the substring test.
    heat_type_text = str(row["main_heat_type"]).lower()

    if fuel_type == "mains gas":
        bucket = "Gas-based heating"
    elif fuel_type == "electricity":
        bucket = "Electric storage/resistance" if "storage" in heat_type_text else "Electric heating"
    else:
        bucket = "Other / unknown"
    return bucket

# Coarse heating category per record (row-wise classification).
lbsm_archetypes["heating_bucket"] = lbsm_archetypes.apply(heat_bucket, axis=1)

# EPC gap profiling: potential minus current EPC score.
lbsm_archetypes["efficiency_gap"] = lbsm_archetypes["potential_epc_score"].sub(lbsm_archetypes["epc_score"])

# Right-closed bands: (-1, 4] Low, (4, 9] Medium, (9, 100] High; gaps outside the bins become NaN.
lbsm_archetypes["gap_band"] = pd.cut(
    lbsm_archetypes["efficiency_gap"],
    bins=[-1, 4, 9, 100],
    labels=["Low", "Medium", "High"],
)

# measures bundling
def measure_bundle(r):
    """List the retrofit measures suggested by one record's need flags.

    Parameters
    ----------
    r : mapping-like (e.g. a DataFrame row)
        Must provide the six fabric/glazing need flags and "heating_bucket".

    Returns
    -------
    list of str
        Measure labels in a fixed order (walls, roof, glazing), followed by a
        "Heating pathway: ..." entry when the heating bucket is gas-based or
        electric storage/resistance.
    """
    flag_to_measure = (
        ("wall_need_solid", "Solid wall insulation"),
        ("wall_need_cavity", "Cavity wall insulation"),
        ("roof_need_pitched", "Loft/roof insulation (pitched)"),
        ("roof_need_flat", "Roof insulation (flat)"),
        ("glazing_need_single", "Glazing upgrade (single/partial)"),
        ("glazing_need_secondary", "Improve/replace secondary glazing"),
    )
    measures = [label for flag, label in flag_to_measure if r[flag]]

    # Only these two heating buckets contribute a heating entry to the bundle.
    if r["heating_bucket"] in ("Gas-based heating", "Electric storage/resistance"):
        measures.append(f"Heating pathway: {r['heating_bucket']}")
    return measures

# Record-level list of suggested measures (one Python list per row).
lbsm_archetypes["bundle"] = lbsm_archetypes.apply(measure_bundle, axis=1)

In [10]:
# Per-cluster retrofit profile. Noise (-1) and unmatched records are excluded.
# The share_* columns are the fraction of the cluster's records with each need flag set.
cluster_profile = (
    lbsm_archetypes
    .loc[lbsm_archetypes["cluster_id"] >= 0]
    .groupby("cluster_id")
    .agg(
        n=pd.NamedAgg(column="uprn", aggfunc="nunique"),
        share_solid_wall=pd.NamedAgg(column="wall_need_solid", aggfunc="mean"),
        share_cavity_wall=pd.NamedAgg(column="wall_need_cavity", aggfunc="mean"),
        share_roof_pitched=pd.NamedAgg(column="roof_need_pitched", aggfunc="mean"),
        share_roof_flat=pd.NamedAgg(column="roof_need_flat", aggfunc="mean"),
        share_glazing_single=pd.NamedAgg(column="glazing_need_single", aggfunc="mean"),
        share_glazing_secondary=pd.NamedAgg(column="glazing_need_secondary", aggfunc="mean"),
        median_gap=pd.NamedAgg(column="efficiency_gap", aggfunc="median"),
        # Fraction of records whose gap falls in the "High" band.
        share_gap_high=pd.NamedAgg(column="gap_band", aggfunc=lambda bands: (bands == "High").mean()),
        # Most frequent heating bucket in the cluster, and the share of records it covers.
        top_heating=pd.NamedAgg(column="heating_bucket", aggfunc=lambda buckets: buckets.value_counts().index[0]),
        top_heating_share=pd.NamedAgg(
            column="heating_bucket",
            aggfunc=lambda buckets: buckets.value_counts(normalize=True).iloc[0],
        ),
    )
    .reset_index()
)

In [11]:
# How many households in each cluster are affected by each issue (shares per cluster)
cluster_profile.head(3)

Unnamed: 0,cluster_id,n,share_solid_wall,share_cavity_wall,share_roof_pitched,share_roof_flat,share_glazing_single,share_glazing_secondary,median_gap,share_gap_high,top_heating,top_heating_share
0,0,72,0.388889,0.111111,0.027778,0.069444,0.5,0.041667,10.0,0.541667,Gas-based heating,0.736111
1,1,216,0.12963,0.328704,0.046296,0.125,0.013889,0.0,6.0,0.37037,Gas-based heating,0.833333
2,2,291,0.024055,0.034364,0.006873,0.061856,0.474227,0.0,6.0,0.347079,Gas-based heating,0.993127


In [12]:
# Work on a copy so the columns added below do not alter cluster_profile.
cp = cluster_profile.copy()

# A measure joins a cluster's bundle when at least this share of its records needs it.
THRESH = 0.5

# Display name of each measure -> cluster-profile column holding the share of records that need it.
measure_cols = {
    "Solid wall insulation (uninsulated solid walls)": "share_solid_wall",
    "Cavity wall insulation (uninsulated cavity walls)": "share_cavity_wall",
    "Loft/roof insulation (pitched, uninsulated)": "share_roof_pitched",
    "Roof insulation (flat, uninsulated)": "share_roof_flat",
    "Glazing upgrade (single/partial)": "share_glazing_single",
    "Glazing improvements (secondary glazing)": "share_glazing_secondary",
}


def build_bundle(row, thresh=THRESH):
    """Return the measures whose share in `row` is at or above `thresh`.

    Parameters
    ----------
    row : mapping-like (e.g. a cluster-profile row)
        Must provide every share column named in `measure_cols`.
    thresh : float, default THRESH
        Inclusive lower bound on the share for a measure to be selected.

    Returns
    -------
    list of str
        Selected measure names, in the order of `measure_cols`.
    """
    selected = []
    for measure_name, share_col in measure_cols.items():
        if row[share_col] >= thresh:
            selected.append(measure_name)
    return selected

# Measures that dominate each cluster, as a list and as a single display label.
cp["bundle_list"] = cp.apply(build_bundle, axis=1)
cp["bundle"] = [
    "; ".join(measures) if measures else "Mixed / no dominant bundle"
    for measures in cp["bundle_list"]
]

# Number of clusters per bundle label.
cp['bundle'].value_counts()

bundle
Solid wall insulation (uninsulated solid walls)                                      26
Mixed / no dominant bundle                                                            9
Solid wall insulation (uninsulated solid walls); Glazing upgrade (single/partial)     5
Glazing upgrade (single/partial)                                                      4
Cavity wall insulation (uninsulated cavity walls)                                     1
Name: count, dtype: int64

In [13]:
# Preview the cluster profiles together with their assigned bundles.
cp.head(3)

Unnamed: 0,cluster_id,n,share_solid_wall,share_cavity_wall,share_roof_pitched,share_roof_flat,share_glazing_single,share_glazing_secondary,median_gap,share_gap_high,top_heating,top_heating_share,bundle_list,bundle
0,0,72,0.388889,0.111111,0.027778,0.069444,0.5,0.041667,10.0,0.541667,Gas-based heating,0.736111,[Glazing upgrade (single/partial)],Glazing upgrade (single/partial)
1,1,216,0.12963,0.328704,0.046296,0.125,0.013889,0.0,6.0,0.37037,Gas-based heating,0.833333,[],Mixed / no dominant bundle
2,2,291,0.024055,0.034364,0.006873,0.061856,0.474227,0.0,6.0,0.347079,Gas-based heating,0.993127,[],Mixed / no dominant bundle


In [14]:
# Qualitative indicator: the measure profile (from bundle) - 5 of them - What?
# Socio-economic indicator: how much does the population need these measures? - Who?
# How radical can the energy-efficiency improvements be? - How much?

In [15]:
# Per-cluster socio-economic medians (noise and unmatched records excluded).
soc = (
    lbsm_archetypes[lbsm_archetypes["cluster_id"] >= 0]
    .groupby("cluster_id")
    .agg(
        imd_income_median=("imd19_income_decile", "median"),
        fuel_poverty_median=("fuel_poverty", "median"),
    )
    .reset_index()
)

# Drop columns left over from a previous run before merging. Without this, re-running
# the cell yields imd_income_median_x/_y and the lookups below raise KeyError.
cp = (
    cp.drop(columns=["imd_income_median", "fuel_poverty_median"], errors="ignore")
      .merge(soc, on="cluster_id", how="left")
)

# Income decile rescaled so that decile 1 maps to 1 (worst) and decile 10 maps to 0 (best).
cp["imd_norm"] = (10 - cp["imd_income_median"]) / 9
# Percentile rank across clusters: higher = more fuel poverty; values lie in (0, 1], never exactly 0.
cp["fuel_pov_norm"] = cp["fuel_poverty_median"].rank(pct=True)

# Equal-weight blend of the two components; higher = greater social need.
cp["social_index"] = 0.5*cp["imd_norm"] + 0.5*cp["fuel_pov_norm"]
# Optional High/Low split, currently disabled:
# cp['social_index_category'] = cp.apply(lambda row: 'High' if row['social_index'] > 0.5 else 'Low', axis=1)

In [16]:
# Potential minus current EPC score per record.
lbsm_archetypes['efficiency_gap'] = lbsm_archetypes['potential_epc_score'] - lbsm_archetypes['epc_score']

# Median gap per cluster (noise and unmatched records excluded).
eff_gap = (
    lbsm_archetypes[lbsm_archetypes["cluster_id"] >= 0]
    .groupby("cluster_id")
    .agg(
        efficiency_gap_median=("efficiency_gap", "median"),
    )
    .reset_index()
)

# Drop a column left over from a previous run before merging. Without this, re-running
# the cell yields efficiency_gap_median_x/_y and the rank below raises KeyError.
cp = (
    cp.drop(columns=["efficiency_gap_median"], errors="ignore")
      .merge(eff_gap, on="cluster_id", how="left")
)

# Percentile rank across clusters: 1 = largest median gap; values lie in (0, 1], never exactly 0.
cp["efficiency_index"] = cp["efficiency_gap_median"].rank(pct=True)
# Optional High/Low split, currently disabled:
# cp['efficiency_index_category'] = cp.apply(lambda row: 'High' if row['efficiency_index'] > 0.5 else 'Low', axis=1)

In [17]:
# Cluster-level attributes to carry onto every building point.
cp_join = cp.loc[
    :,
    ['cluster_id', 'bundle', 'social_index', 'efficiency_gap_median', 'efficiency_index'],
].copy()

# Left join keeps every building point; clusters absent from cp_join (e.g. noise, -1) get missing attributes.
bldg_pts_enriched = bldg_pts.reset_index().merge(cp_join, on='cluster_id', how='left')

In [18]:
# Export the building points with their cluster-level attributes as GeoJSON.
bldg_pts_enriched.to_file('clustered_buildings_enriched.geojson', driver='GeoJSON')

In [19]:
gdf = bldg_pts_enriched.copy()

# Defensive: make sure we are working with a GeoDataFrame before using spatial methods.
if not isinstance(gdf, gpd.GeoDataFrame):
    gdf = gpd.GeoDataFrame(gdf, geometry="geometry", crs="EPSG:27700")

# Keep clustered points only (noise is labelled -1).
gdf0 = gdf[gdf["cluster_id"] >= 0].copy()

# One convex hull per cluster. The hull series is named "geometry" *before* the frame is
# built, so the active geometry column really exists under that name. Renaming column 0
# afterwards with DataFrame.rename does not update the GeoDataFrame's active geometry name
# and leaves the frame without a usable geometry column.
cluster_hulls = gpd.GeoDataFrame(
    gdf0.dissolve(by="cluster_id").convex_hull.rename("geometry").reset_index(),
    geometry="geometry",
    crs=gdf0.crs,
)
cluster_hulls.head()

Unnamed: 0,cluster_id,geometry
0,0,"POLYGON ((531096 181642, 531084 181805, 531099..."
1,1,"POLYGON ((530332 182517, 530079 182740, 530491..."
2,2,"POLYGON ((529893 184157, 529806 184159, 529780..."
3,3,"POLYGON ((529325 182441, 529240 182473, 529117..."
4,4,"POLYGON ((528995 182421, 528894 182440, 528890..."


In [20]:
# Attach the cluster-level attributes to each hull.
# cluster_id is already a regular column here, so no reset_index() is needed; calling it
# would only add a meaningless `index` column that ends up in the exported file.
cluster_hulls_enriched = cluster_hulls.merge(cp_join, on='cluster_id', how='left')

In [22]:
# Re-assert the geometry column and CRS on the merged table, pad every hull by 30 CRS units
# (metres if gdf0 is in EPSG:27700), then export.
# NOTE(review): the buffer is applied in place, so re-running this cell alone pads the hulls again.
cluster_hulls_enriched = gpd.GeoDataFrame(cluster_hulls_enriched, geometry='geometry', crs=gdf0.crs)
cluster_hulls_enriched['geometry'] = cluster_hulls_enriched.geometry.buffer(30)
cluster_hulls_enriched.to_file('cluster_hulls_enriched.geojson', driver='GeoJSON')