### Import Lib

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.wkt import loads
import constant as c
from pyproj import CRS
import sys;sys.path.append('../')
from itertools import combinations
from sklearn.preprocessing import StandardScaler
import math
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
import warnings; warnings.filterwarnings('ignore')

### Acquire Data

In [2]:
# then read the data
df_2020 = pd.read_csv('../asset/preprocess/df_2020.csv', index_col=0)
df_2020['geometry_grids'] = df_2020['geometry_grids'].apply(lambda x: loads(x))
df_2021 = pd.read_csv('../asset/preprocess/df_2021.csv', index_col=0)
df_2021['geometry_grids'] = df_2021['geometry_grids'].apply(lambda x: loads(x))
df_2022 = pd.read_csv('../asset/preprocess/df_2022.csv', index_col=0)
df_2022['geometry_grids'] = df_2022['geometry_grids'].apply(lambda x: loads(x))

crs = 'EPSG:5179'  # Specify the coordinate reference system
gdf_2020 = gpd.GeoDataFrame(df_2020, geometry=df_2020['geometry_grids'], crs=crs)
gdf_2021 = gpd.GeoDataFrame(df_2021, geometry=df_2021['geometry_grids'], crs=crs)
gdf_2022 = gpd.GeoDataFrame(df_2022, geometry=df_2022['geometry_grids'], crs=crs)

gdf_2020_scbd = gdf_2020[gdf_2020['sigungunm'].isin(c.SCBD_NMS)]
gdf_2020_gbd = gdf_2020[gdf_2020['sigungunm'].isin(c.GBD_NMS)]
gdf_2020_ybd = gdf_2020[gdf_2020['sigungunm'].isin(c.YBD_NMS)]
gdf_2021_scbd = gdf_2021[gdf_2021['sigungunm'].isin(c.SCBD_NMS)]
gdf_2021_gbd = gdf_2021[gdf_2021['sigungunm'].isin(c.GBD_NMS)]
gdf_2021_ybd = gdf_2021[gdf_2021['sigungunm'].isin(c.YBD_NMS)]
gdf_2022_scbd = gdf_2022[gdf_2022['sigungunm'].isin(c.SCBD_NMS)]
gdf_2022_gbd = gdf_2022[gdf_2022['sigungunm'].isin(c.GBD_NMS)]
gdf_2022_ybd = gdf_2022[gdf_2022['sigungunm'].isin(c.YBD_NMS)]

In [3]:
# Setting COLS
SIM_CAL_COLS = c.SIM_CAL_COLS
STANDARDIZE_COLS = c.STANDARDIZE_COLS
PP_COLS = c.PP_COLS
BS_COLS = c.BS_COLS
BD_COLS = c.BD_COLS

### Experiment Settings

In [40]:
# experiment settings
YEAR = 2020
CBD_NM = "scbd"

In [41]:
# Construct the DataFrame variable name
df_nm = f'gdf_{YEAR}_{CBD_NM}'
gdf = gpd.GeoDataFrame(locals()[df_nm], geometry='geometry_grids')
MAX_GRIDS_N = int(len(gdf) * 0.1)
print(f"Total Grids: {len(gdf)}\nNum Max grids: {MAX_GRIDS_N}")

# Set the CRS for the GeoDataFrame
crs = CRS.from_epsg(5179)
gdf = gdf.set_crs(crs)
gdf = gdf.dropna() # drop any data have null value

# and normailze
# Initialize the StandardScaler object
scaler = StandardScaler()
# Fit the scaler to the data
scaler.fit(gdf[STANDARDIZE_COLS])
# Transform the data using the scaler
normalized_data = scaler.transform(gdf[STANDARDIZE_COLS])
gdf_normalized = gdf.copy()
gdf_normalized[STANDARDIZE_COLS] = normalized_data

Total Grids: 1038
Num Max grids: 103


#### OURS

In [42]:
# First, get the result data(phase_results)
result_fpath = f"../asset/experiment/cluster/{CBD_NM}_{YEAR}.csv"
phase_results = pd.read_csv(result_fpath)[['grid_idx', 'phase']]
phase_results['grid_idx'] = phase_results['grid_idx'].astype(int)
phase_results['phase'] = phase_results['phase'].astype(int)
cluster_idxs = list(phase_results.grid_idx)

gdf_ours = gdf_normalized.copy()
gdf_ours['cluster'] = np.where(gdf_ours['grid_idx'].isin(cluster_idxs), 1, 0)
num_grids = len(gdf_ours[gdf_ours['cluster'] == 1])
# gdf_ours.explore(column='cluster', cmap='Blues') 

In [43]:
model_score = silhouette_score(gdf_ours[SIM_CAL_COLS], gdf_ours.cluster)
print("Score: ", model_score)
print("NUM GRIDS: ", num_grids)
gdf_ours.explore(column='cluster', cmap='Blues') 

Score:  0.2421132877062531
NUM GRIDS:  103


#### Comparables(KNN)

In [44]:
# KNN with n_clusters = 10으로 지정
# 그 후 centroid cluster 대다수가 속한 cluster를 1, 아닌 클러스터를 0으로 지정

In [45]:
init_cluster_grid_idxs = phase_results[phase_results['phase'] == 0].grid_idx

In [46]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
gdf_knn = gdf_normalized.copy()
print(len(gdf_knn))
# Choose the number of clusters
n_clusters = 10 # for example

kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(gdf_knn[SIM_CAL_COLS])
gdf_knn['cluster'] = kmeans.labels_
# gdf_knn.explore(column='cluster', cmap='viridis') 
gdf_knn[gdf_knn['grid_idx'].isin(init_cluster_grid_idxs)].cluster


def get_cbd_label(init_cluster_grid_idxs, cluster_df):
    lst = cluster_df[cluster_df['grid_idx'].isin(init_cluster_grid_idxs)].cluster
    counts = {}
    for num in lst:
        counts[num] = counts.get(num, 0) + 1

    max_count = max(counts.values())
    most_common = [num for num, count in counts.items() if count == max_count]
    result = most_common[0]
    return result

cbd_label = get_cbd_label(init_cluster_grid_idxs, gdf_knn)
gdf_knn_ = gdf_knn.copy() # gdf_knn_ is for assigning new cluster(seperating cbd and non cbd)
gdf_knn_['cluster'] = gdf_knn_['cluster'].apply(lambda x: 1 if x == cbd_label else 0)
num_grids = len(gdf_knn_[gdf_knn_['cluster'] == 1])

974


In [47]:
model_score = silhouette_score(gdf_knn_[SIM_CAL_COLS], gdf_knn_.cluster)
print("Score: ", model_score)
print("NUM GRIDS: ", num_grids)
gdf_knn_.explore(column='cluster', cmap='Blues') 

Score:  0.15302894408327372
NUM GRIDS:  99


In [48]:
# gdf_knn_.explore(column='cluster', cmap='Blues') 
