### Import Libraries

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import sys;sys.path.append('../')
from shapely.wkt import loads
import constant as c
from pyproj import CRS
from sklearn.neighbors import NearestNeighbors
from util.distance import sphere_distance
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from libpysal.weights import Queen
import math

import warnings; warnings.filterwarnings('ignore')

### Acquire Data

In [2]:
def _read_grid_frame(csv_path):
    """Read one preprocessed yearly CSV and parse its WKT `geometry_grids` column."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame['geometry_grids'] = frame['geometry_grids'].apply(loads)
    return frame


def _rows_in(frame, sigungu_names):
    """Keep the rows of `frame` whose `sigungunm` value is in `sigungu_names`."""
    return frame[frame['sigungunm'].isin(sigungu_names)]


# Plain yearly tables; `geometry_grids` holds shapely geometries after parsing.
df_2020 = _read_grid_frame('../asset/preprocess/df_2020.csv')
df_2021 = _read_grid_frame('../asset/preprocess/df_2021.csv')
df_2022 = _read_grid_frame('../asset/preprocess/df_2022.csv')

# Geo-enabled versions of the yearly tables, all in the same CRS.
crs = 'EPSG:5179'
gdf_2020 = gpd.GeoDataFrame(df_2020, geometry=df_2020['geometry_grids'], crs=crs)
gdf_2021 = gpd.GeoDataFrame(df_2021, geometry=df_2021['geometry_grids'], crs=crs)
gdf_2022 = gpd.GeoDataFrame(df_2022, geometry=df_2022['geometry_grids'], crs=crs)

# Per-year SCBD / GBD / YBD subsets, selected by the name lists in `constant`.
gdf_2020_scbd = _rows_in(gdf_2020, c.SCBD_NMS)
gdf_2020_gbd = _rows_in(gdf_2020, c.GBD_NMS)
gdf_2020_ybd = _rows_in(gdf_2020, c.YBD_NMS)
gdf_2021_scbd = _rows_in(gdf_2021, c.SCBD_NMS)
gdf_2021_gbd = _rows_in(gdf_2021, c.GBD_NMS)
gdf_2021_ybd = _rows_in(gdf_2021, c.YBD_NMS)
gdf_2022_scbd = _rows_in(gdf_2022, c.SCBD_NMS)
gdf_2022_gbd = _rows_in(gdf_2022, c.GBD_NMS)
gdf_2022_ybd = _rows_in(gdf_2022, c.YBD_NMS)

#### 1. Get Boundary

In [3]:
# Feature columns (X1..X13) fed to the scaler and to the similarity
# calculation, listed in X-order and grouped by column-name prefix.
SIM_CAL_COLS = [
    # pp_*: X1-X2
    'pp_pop', 'pp_od',
    # bs_*: X3-X5
    'bs_ebit', 'bs_gas', 'bs_elct',
    # bd_*: X6-X13
    'bd_platarea', 'bd_archarea', 'bd_totarea', 'bd_totflrcnt',
    'bd_elvtent', 'bd_height', 'bd_vintage', 'bd_ilp',
]

- YBD

- Data preparation (compute the nearest grid_idx for each grid and apply StandardScaler)

In [6]:
# Working frame for the within-boundary similarity analysis: the 2022 YBD
# grids, re-wrapped so that `geometry_grids` is the active geometry column.
crs = CRS.from_epsg(5179)
gdf = (
    gpd.GeoDataFrame(gdf_2022_ybd, geometry='geometry_grids')
    .set_crs(crs)
    # NOTE(review): dropna() removes a row when ANY column is null, not only
    # the SIM_CAL_COLS features -- confirm that is intended.
    .dropna()
)

In [7]:
# Map the prepared working frame as a visual sanity check.
gdf.explore()

In [109]:
# # Get the rows of the DataFrame as tuples
# rows = list(gdf[['grid_idx', 'x', 'y']].iterrows())

# # Calculate the combinations of 2 for the rows
# combinations_ = list(combinations(rows, 2))
# a_grids = []
# b_grids = []
# dists = []
# for i in range(len(combinations_)):
#     a, b = combinations_[i]
#     dist = sphere_distance((a[1].x, a[1].y), (b[1].x, b[1].y))
#     a_grids.append(a[1].grid_idx)
#     b_grids.append(b[1].grid_idx)
#     dists.append(dist)
# a_grids = [np.int64(x) for x in a_grids]
# b_grids = [np.int64(x) for x in b_grids]

# grid_dist1 = pd.DataFrame({'a_grid_idx': a_grids, 'b_grid_idx': b_grids, 'distance': dists})
# grid_dist2 = pd.DataFrame({'a_grid_idx': b_grids, 'b_grid_idx': a_grids, 'distance': dists})
# gdf_grids_dist = pd.concat([grid_dist1, grid_dist2])

# kijun_idxs = []
# nearest_idxs = []
# for grid_idx in list(gdf_grids_dist.a_grid_idx.unique()):
#     filtered_df = gdf_grids_dist[gdf_grids_dist['a_grid_idx'] == grid_idx]
#     nearest_idx = filtered_df.sort_values('distance', ascending = True).iloc[0].b_grid_idx
#     kijun_idxs.append(int(grid_idx))
#     nearest_idxs.append(int(nearest_idx))
# gdf_near_idx_pair = pd.DataFrame({'grid_idx': kijun_idxs, 'near_grid_idx': nearest_idxs})

In [110]:
# gdf_near_idx_pair.head(5)

- Normalize the similarity features (StandardScaler)

In [90]:
# Standardize the similarity features; every other column of `gdf` is carried
# over unchanged into `gdf_normalized`.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(gdf[SIM_CAL_COLS])

gdf_normalized = gdf.copy()
gdf_normalized[SIM_CAL_COLS] = normalized_data


- Grow the cluster from a seed grid

In [91]:
# def within_boundary_similarity(gdf, sim_cal_cols) -> float:
#     pairwise_distances = []
#     for row1, row2 in combinations(gdf.iterrows(), 2):
#         distance = sum(abs(row1[1][col] - row2[1][col]) for col in SIM_CAL_COLS)
#         pairwise_distances.append(distance)

#     # average_distance = sum(pairwise_distances) / len(pairwise_distances)
#     average_distance = sum(pairwise_distances) / (len(gdf))
#     # boundary_similarity = math.exp(-(average_distance/100))
#     return average_distance

In [138]:
def find_neighbors_within_radius(gdf, grid_idx, radius=200):
    """Return the `grid_idx` values of every grid within `radius` of a target grid.

    Parameters
    ----------
    gdf : GeoDataFrame
        Must have a `grid_idx` column and a `geometry` column whose values
        support `.distance(...)`.
    grid_idx :
        Identifier of the target grid. If several rows match, the first is used.
    radius : float, default 200
        Maximum distance (inclusive), in the units of the `geometry` column's
        CRS (metres for the EPSG:5179 frames used in this notebook).

    Returns
    -------
    list
        `grid_idx` values of all rows whose geometry lies within `radius` of
        the target. The target itself is included, because its distance to
        itself is 0.

    Raises
    ------
    IndexError
        If `grid_idx` does not occur in `gdf`.
    """
    matches = gdf.loc[gdf['grid_idx'] == grid_idx, 'geometry']
    if len(matches) == 0:
        raise IndexError(f'grid_idx {grid_idx!r} not found in gdf')
    target_geometry = matches.values[0]

    # Distance from the target geometry to every geometry in the frame.
    distances = gdf['geometry'].distance(target_geometry)
    return gdf.loc[distances <= radius, 'grid_idx'].tolist()

In [92]:
def within_boundary_similarity(gdf, sim_cal_cols) -> float:
    """Root-mean-square pairwise L1 distance between the rows of `gdf`.

    For every unordered pair of rows, the L1 (sum of absolute differences)
    distance over `sim_cal_cols` is computed; the result is the square root
    of the mean of the squared pair distances. Despite the name this is a
    dissimilarity: a smaller value means the rows are more alike.

    Parameters
    ----------
    gdf : DataFrame or GeoDataFrame
        Rows to compare; must contain every column in `sim_cal_cols`.
    sim_cal_cols : sequence of str
        Numeric feature columns used for the distance.

    Returns
    -------
    float
        The RMS pairwise distance, or 0.0 when `gdf` has fewer than two rows
        (there is no pair to compare).
    """
    values = gdf[list(sim_cal_cols)].to_numpy(dtype=float)
    n_rows = len(values)
    if n_rows < 2:
        return 0.0

    # Compare each row only with the rows after it, so every unordered pair
    # is visited exactly once.
    sq_sum = 0.0
    for i in range(n_rows - 1):
        l1 = np.abs(values[i + 1:] - values[i]).sum(axis=1)
        sq_sum += float(np.dot(l1, l1))

    n_pairs = n_rows * (n_rows - 1) // 2
    return math.sqrt(sq_sum / n_pairs)

In [93]:
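# # gbd seed grids (commented out, but the Queen-contiguity cell further down still references gbd_init_grid_idx)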
# gbd_init_grid_idx = [
#     31409, #삼성전자빌딩
#     31408,
#     31410,
#     31365,
#     31366,
#     31367,
#     31318,
#     31319,
#     31320,
#     31364,
#     36026,
#     36025,
#     36024,
#     31446
#     ]

In [145]:
# Seed the cluster with the grid that has the largest (scaled) bd_height.
centroid_gdf_idx = gdf_normalized['bd_height'].idxmax()
centroid_row = gdf_normalized.loc[centroid_gdf_idx]
print(f'Centroid Building is {centroid_row.bdnm}\ngrid_idx is {centroid_row.grid_idx}')

init_grid_idx = [centroid_row.grid_idx]
# Independent copies: the growth loop extends `cluster`, and that must not
# silently change `init_grid_idx` or the frontier list.
cluster = list(init_grid_idx)
prev_grid_idxs = list(init_grid_idx)

Centroid Building is 삼성전자빌딩
grid_idx is 31409


In [146]:

# Grow the cluster outward from the seed: in each phase, add every grid that
# lies within the neighbour radius of a grid reached in the previous phase.
N_PHASES = 5

cluster_set = set(cluster)
frontier = set(prev_grid_idxs)

for phase in range(N_PHASES):
    reached = set()
    for grid_idx in frontier:
        reached.update(find_neighbors_within_radius(gdf_normalized, grid_idx))

    # Only grids not yet in the cluster need expanding in the next phase;
    # re-expanding visited grids cannot add anything new.
    frontier = reached - cluster_set
    cluster_set |= reached

    # New list objects each phase, so lists that aliased `cluster` before the
    # loop are never mutated.
    cluster = sorted(cluster_set)
    prev_grid_idxs = sorted(frontier)

    new_gdf = gdf_normalized[gdf_normalized['grid_idx'].isin(cluster)]
    # Printed as 'Simil', but the value is an RMS pairwise distance:
    # lower means the grids in the cluster are more alike.
    sim = within_boundary_similarity(new_gdf, SIM_CAL_COLS)
    print(f'Phase: {phase}\n Simil: {sim}')

    if not frontier:
        # The cluster can no longer grow; later phases would repeat this one.
        print(f'No new grids reached in phase {phase}; stopping early.')
        break

Phase: 0
 Simil: 21.22414595893081
Phase: 1
 Simil: 16.35942084216686
Phase: 2
 Simil: 15.783896307324293
Phase: 3
 Simil: 14.313647273592572
Phase: 4
 Simil: 12.741712125522861


In [147]:
# Map the grids currently held in `new_gdf`.
new_gdf.explore()

In [100]:
# Alternative cluster growth using Queen contiguity weights instead of a
# distance radius.
#
# NOTE(review): this cell converts between DataFrame index labels and weights
# ids by subtracting/adding `first_gdf_idx`. That only holds if the index
# labels are consecutive integers; `gdf` went through a boolean filter and
# dropna(), so gaps are possible -- confirm before trusting the mapping.
# NOTE(review): which ids Queen.from_dataframe assigns (positions vs. index
# labels) depends on the libpysal version -- verify against the installed one.
first_gdf_idx    = list(gdf_normalized.index)[0] # 650(for gbd)
centroid_gdf_idx = gdf_normalized['bd_height'].idxmax()
centroid_w_idx = centroid_gdf_idx - first_gdf_idx
print(f'Centroid Building is {gdf_normalized.loc[centroid_gdf_idx].bdnm}\ngdf_idx is {centroid_gdf_idx}\nw_idx is {centroid_w_idx}')
init_w = Queen.from_dataframe(gdf_normalized)

# NOTE(review): `gbd_init_grid_idx` is only defined in a commented-out cell in
# this notebook, so on a fresh kernel this line presumably raises NameError.
init_gdf_idxs = list(gdf[gdf['grid_idx'].isin(gbd_init_grid_idx)].index)
init_w_idxs = [x - first_gdf_idx for x in list(init_gdf_idxs)]

# `cluster`, `prev_w_idxs` and `init_w_idxs` all name the same list object
# here, so the first `cluster.extend(...)` below also grows `init_w_idxs`.
cluster = init_w_idxs
prev_w_idxs = cluster

for phase in range(5):
    tba_w_idxs = []
    for w_idx in prev_w_idxs:
        tba_idxs = init_w.neighbors[w_idx]
        if len(tba_idxs) == 0:
            # Grid with no contiguous neighbour: look up its nearest grid.
            # NOTE(review): `gdf_near_idx_pair` is only built in a
            # commented-out cell; `gdf_near_grid_idx` is a Series, not a
            # scalar, when compared below; and `tba_idx` is printed but never
            # added to `tba_w_idxs`, so this branch adds nothing to the cluster.
            print('zero1!!')
            gdf_idx = w_idx + first_gdf_idx
            gdf_grid_idx = gdf.loc[gdf_idx].grid_idx
            gdf_near_grid_idx = gdf_near_idx_pair[gdf_near_idx_pair['grid_idx'] == gdf_grid_idx].near_grid_idx
            tba_idx = gdf[gdf['grid_idx'] == gdf_near_grid_idx].index - first_gdf_idx
            print(tba_idx)
        else:
            tba_w_idxs.extend(tba_idxs) # tba: to be added
    # De-duplicate the neighbours found this phase; they become the next frontier.
    tba_w_idxs = list(set(tba_w_idxs))
    prev_w_idxs = tba_w_idxs
    assert len(tba_w_idxs) >= 1
    cluster.extend(tba_w_idxs)
    cluster = list(set(cluster))
    # Map weights ids back to index labels (same offset assumption as above).
    cluster_gdf_idxs = [x + first_gdf_idx for x in list(cluster)]
    new_gdf = gdf_normalized.loc[cluster_gdf_idxs]
    print(list(new_gdf.grid_idx.values))
    # `sim` is the value returned by within_boundary_similarity for the
    # current cluster rows.
    sim = within_boundary_similarity(new_gdf, SIM_CAL_COLS)
    # print('bdnms', new_gdf.bdnm.values)
    print('round', phase)
    print('sim', sim)

Centroid Building is 삼성전자빌딩
gdf_idx is 2300
w_idx is 598
[35972, 35983, 31258, 31290, 31318, 31319, 31320, 36016, 36024, 36025, 36026, 31360, 31364, 31365, 31366, 31367, 31368, 31369, 31378, 36027, 31405, 31408, 31409, 31410, 31425, 31446, 31466, 36069, 36070, 36071, 36072, 36073, 31476]
round 0
sim 21.75721024110638
[36117, 36118, 36119, 36120, 36121, 36122, 36123, 35972, 35983, 35984, 31258, 31290, 31318, 31319, 31320, 31321, 36016, 36024, 36025, 36026, 31360, 31364, 31365, 31366, 31367, 31368, 31369, 31378, 36027, 36028, 31379, 31405, 31408, 31409, 31410, 31425, 31435, 31445, 31446, 31466, 31330, 31467, 36068, 36069, 36070, 36071, 36072, 36073, 31476, 31335, 36074, 31497, 31503, 31524]
round 1
sim 19.582227690274113
[36116, 36117, 36118, 36119, 36120, 36121, 36122, 36123, 36124, 31258, 31268, 36168, 36169, 36170, 36171, 31283, 31288, 31289, 31290, 31318, 31319, 31320, 31321, 36172, 36173, 36174, 36175, 36176, 31323, 31360, 31364, 31365, 31366, 31367, 31368, 31369, 31378, 31379, 3139

In [99]:
# Map a hard-coded set of grid_idx values; the list matches the one printed
# for round 0 of the Queen-contiguity run.
gdf[gdf['grid_idx'].isin([35972, 35983, 31258, 31290, 31318, 31319, 31320, 36016, 36024, 36025, 36026, 31360, 31364, 31365, 31366, 31367, 31368, 31369, 31378, 36027, 31405, 31408, 31409, 31410, 31425, 31446, 31466, 36069, 36070, 36071, 36072, 36073, 31476])].explore()

In [72]:
# Map whatever cluster `new_gdf` currently holds (depends on which growth cell ran last).
new_gdf.explore()

In [206]:
# Baseline: the same measure computed over every grid in the scaled working frame.
within_boundary_similarity(gdf_normalized, SIM_CAL_COLS)

0.950468266890957

In [207]:
# Spot check: the same measure on a hand-picked subset of four grid_idx values.
foo = gdf_normalized[gdf_normalized['grid_idx'].isin([12556, 12481, 12528, 12503])]
within_boundary_similarity(foo, SIM_CAL_COLS)

0.8028004500225211

#### 2. CBD Index

In [10]:
# For each grid, calculate the CBD Index. The CBD Index captures two things:
# 1. CBD-ness
# 2. How similar each grid is to the other grids inside the CBD boundary

# For example, suppose there is a very tall building with sufficient population and business factors (condition 1 is satisfied),
# but the building is not similar to the buildings around it (condition 2 is not satisfied): the CBD Index must then be low
# (like Lionel Messi playing in an amateur league rather than in Ligue 1).