# Subset Generation

In [1]:
import numpy as np
import pandas as pd
import os
import geopandas as gpd
from tqdm import tqdm

## Interpolated Subset (Subset-296)

songdo_traffic_core.dataset.nodelink 모듈을 통해 생성한 데이터를 기반으로 Interpolation 및 subset 생성

In [2]:
from songdo_traffic_core.dataset.interpolator import (
    IterativeRandomForestInterpolator,
    LinearInterpolator,
    SplineInterpolator,
)
from songdo_traffic_core.dataset.metr_imc.generator import MetrImcSubsetGenerator

In [3]:
def extend_nans_around_zeros(series: pd.Series) -> pd.Series:
    """Return a copy of ``series`` in which zero runs touching a NaN become NaN.

    A maximal run of consecutive ``0`` values is blanked out when the element
    immediately before its first value, or immediately after its last value,
    is missing in the input. Zero runs that do not touch a missing value and
    all non-zero values are left unchanged. The input series is not modified.

    All work is done by position rather than by label, so the function also
    handles indexes that contain duplicate labels (label lookups would return
    a slice or mask there instead of a single position).

    Args:
        series: Values to clean.

    Returns:
        A new series with the same index as ``series``.
    """
    result = series.copy()

    nan_mask = result.isna().to_numpy(dtype=bool)
    if not nan_mask.any():
        # Nothing is missing, so no zero run can touch a NaN.
        return result

    # Missing entries count as "not zero" so they never join a zero run.
    zero_mask = result.eq(0).fillna(False).to_numpy(dtype=bool)

    n = len(result)
    # Pad with False on both sides so every run has a rising and a falling edge.
    padded = np.concatenate(([False], zero_mask, [False])).astype(np.int8)
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)  # position of the first zero of each run
    run_ends = np.flatnonzero(edges == -1)  # one past the last zero of each run

    # A run is selected when its left or right neighbour is an original NaN.
    touches_left = (run_starts > 0) & nan_mask[np.maximum(run_starts - 1, 0)]
    touches_right = (run_ends < n) & nan_mask[np.minimum(run_ends, n - 1)]
    selected = touches_left | touches_right

    # Mark the half-open span [start, end) of every selected run with a
    # difference array, then turn it into a boolean mask via a running sum.
    delta = np.zeros(n + 1, dtype=np.int64)
    delta[run_starts[selected]] += 1
    delta[run_ends[selected]] -= 1
    to_blank = np.cumsum(delta[:-1]) > 0

    if to_blank.any():
        result.iloc[np.flatnonzero(to_blank)] = np.nan

    return result

In [4]:
# Load the METR-IMC traffic table from its HDF5 file and display it.
# The missing-value counts in the following cell are computed from this frame.
df_imc: pd.DataFrame = pd.read_hdf("../datasets/metr-imc/metr-imc.h5")
df_imc

Unnamed: 0,1680254606,1650051800,1670030907,1610002406,1650374201,1650038700,1690052500,1650054600,1640002100,1660001406,...,1680093100,1610104400,1650230200,1630168900,1640247500,1610120700,1610028300,1650352801,1630026000,1660003900
2023-01-01 00:00:00,0.0,,,,,,12.0,,9.0,,...,91.0,,,,20.0,,,,,409.0
2023-01-01 01:00:00,0.0,,,,,,12.0,,25.0,,...,97.0,,,,9.0,,,,,370.0
2023-01-01 02:00:00,0.0,,,,,,12.0,,0.0,,...,54.0,,,,0.0,,,,,236.0
2023-01-01 03:00:00,0.0,,,,,,12.0,,0.0,,...,38.0,,,,0.0,,,,,219.0
2023-01-01 04:00:00,0.0,,,,,,12.0,,0.0,,...,31.0,,,,0.0,,,,,166.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-06 19:00:00,,149.0,315.0,0.0,6.0,15.0,19.0,99.0,166.0,219.0,...,452.0,,0.0,,,,0.0,129.0,789.0,1182.0
2023-12-06 20:00:00,,99.0,266.0,0.0,0.0,3.0,16.0,50.0,122.0,172.0,...,358.0,,0.0,,,,0.0,86.0,636.0,994.0
2023-12-06 21:00:00,,89.0,230.0,0.0,0.0,0.0,12.0,15.0,125.0,195.0,...,326.0,,0.0,,,,0.0,37.0,517.0,818.0
2023-12-06 22:00:00,,46.0,135.0,0.0,0.0,0.0,10.0,0.0,68.0,148.0,...,233.0,,0.0,,,,0.0,9.0,319.0,617.0


현재는 결측치가 너무 많은 데이터는 제외하고 실험

In [5]:
# Count the missing values per column once and reuse the result for every
# threshold instead of rescanning the whole frame three times.
missing_counts = df_imc.isnull().sum()

no_missing_columns = df_imc.columns[missing_counts == 0].to_list()
less_500_missing_columns = df_imc.columns[missing_counts < 500].to_list()  # currently selected target group
less_750_missing_columns = df_imc.columns[missing_counts < 750].to_list()

['1680254606',
 '1680061201',
 '1680060802',
 '1610006401',
 '1640050201',
 '1640318700',
 '1610087700',
 '1680009510',
 '1680257000',
 '1630175301',
 '1660003801',
 '1630175201',
 '1650057200',
 '1680061400',
 '1680255000',
 '1650069000',
 '1610087600',
 '1670004000',
 '1640137900',
 '1660048601',
 '1640321901',
 '1660003802',
 '1650044800',
 '1610088000',
 '1650055800',
 '1650015703',
 '1680061202',
 '1660000401',
 '1680256902',
 '1680020917',
 '1660001002',
 '1680519701',
 '1670002800',
 '1660034301',
 '1680022501',
 '1680061003',
 '1680063600',
 '1650004900',
 '1630017402',
 '1650021400',
 '1670002500',
 '1680257100',
 '1650041400',
 '1680008503',
 '1680094300',
 '1680093400',
 '1640099900',
 '1630044300',
 '1650071800',
 '1650068900',
 '1660033202',
 '1663132600',
 '1680008501',
 '1680061602',
 '1650006900',
 '1663189301',
 '1663187501',
 '1680256501',
 '1680022402',
 '1680496600',
 '1680254601',
 '1680061102',
 '1680235100',
 '1680258100',
 '1610006403',
 '1640318800',
 '16401005

In [6]:
# Build the subset generator from the node-link, IMCRTS and METR-IMC directories.
# NOTE(review): the expected contents of each directory are defined by
# MetrImcSubsetGenerator, which is not visible in this notebook — confirm there.
generator = MetrImcSubsetGenerator(
    nodelink_dir="../datasets/metr-imc/nodelink",
    imcrts_dir="../datasets/metr-imc/imcrts",
    metr_imc_dir="../datasets/metr-imc/",
)

결측치 주변의 값은 대부분 0으로 기록되어 있다. 결측이 발생한 뒤 측정이 재개되더라도 정상화되기까지 시간이 걸려 그동안 0으로 측정된다는 가설하에, 결측치에 연속해서 인접한 0은 모두 결측치로 처리하였다.

In [7]:
# Sort by index first: the zero-run extension looks at positionally
# neighbouring rows, so the rows must be in index order.
df = generator.metr_imc_df
df = df.sort_index()

# Register tqdm's pandas integration and actually use it: plain ``apply``
# ignores the registration, ``progress_apply`` shows one tick per column.
tqdm.pandas()
generator.metr_imc_df = df.progress_apply(extend_nans_around_zeros)

# The original run of this cell was measured at roughly 30 seconds on a Mac.

Interpolation 방법은 여러 가지가 있을 수 있으며, 현재는 아래의 Interpolator를 사용하는 것으로 정의한다.

In [8]:
# Interpolation strategy that is handed to generator.generate_subset for Subset-296.
interpolator = SplineInterpolator()

### Interpolation 완료한 Subset 생성

In [9]:
# Output directory of the interpolated Subset-296 dataset.
SUBSET_296_TARGET_DIR = "../datasets/metr-imc/subsets/metr-imc-296-int"

In [10]:
# Register tqdm's pandas hooks before generating.
# NOTE(review): presumably generate_subset relies on progress_apply internally — confirm.
tqdm.pandas()
# Generate the subset for the columns with fewer than 500 missing values,
# writing into SUBSET_296_TARGET_DIR and passing the chosen interpolator.
generator.generate_subset(less_500_missing_columns, SUBSET_296_TARGET_DIR, interpolator)

2024/07/22 12:56:17 songdo_traffic_core.dataset.metr_imc.generator [INFO] Start generating subset...
2024/07/22 12:56:17 songdo_traffic_core.dataset.metr_imc.generator [INFO] Generating ../datasets/metr-imc/subsets/metr-imc-296-int/metr-imc.h5...
2024/07/22 12:56:17 songdo_traffic_core.dataset.metr_imc.generator [INFO] Interpolating...


In [None]:
# Export the generated subset to Excel next to its HDF5 file. Both paths are
# derived from SUBSET_296_TARGET_DIR so they cannot drift from the directory
# used by the generation step.
subset_296_h5_path = os.path.join(SUBSET_296_TARGET_DIR, "metr-imc.h5")
subset_296_xlsx_path = os.path.join(SUBSET_296_TARGET_DIR, "metr-imc-296-int.xlsx")
pd.read_hdf(subset_296_h5_path).to_excel(subset_296_xlsx_path)

생성된 데이터를 Shapefile로 시각화

In [None]:
from songdo_traffic_core.dataset.metr_imc.converter.graph_sensor_locations import SensorNetworkView

# Input files and the output directory all live inside the Subset-296 directory.
SUBSET_TARGET_DISTANCE_FILE_PATH = os.path.join(SUBSET_296_TARGET_DIR, "distances_imc_2023.csv")
SUBSET_TARGET_SENSOR_FILE_PATH = os.path.join(SUBSET_296_TARGET_DIR, "graph_sensor_locations.csv")
SUBSET_TARGET_NETWORK_DIR = os.path.join(SUBSET_296_TARGET_DIR, "miscellaneous")

# Build the view from the distance and sensor-location files, then write it out.
sensor_network_view = SensorNetworkView(
    SUBSET_TARGET_DISTANCE_FILE_PATH,
    SUBSET_TARGET_SENSOR_FILE_PATH,
)
sensor_network_view.to_file(SUBSET_TARGET_NETWORK_DIR)

## Small Group (Subset-37)

앞에서 선택한 결측치 500개 미만의 데이터 중에서 임의로 Clustering한 37개의 도로를 추가로 추출하여 작은 그룹(Subset-37)을 만들었다. 이 그룹은 적절한 그래프 구조와 적은 결측치를 가지고 있어 테스트 용도로 적합할 것으로 판단하였다.

아래는 QGIS를 통해 추출한 Subset 대상 목록의 Shapefile이다.

In [None]:
from numpy import dtype  # NOTE(review): not referenced in the visible cells — confirm before removing


SUBSET_37_TARGET_DIR = "../datasets/metr-imc/subsets/metr-imc-37"

# Read the shapefile that lists the roads selected for Subset-37 and preview it.
selected_roads_path = os.path.join(SUBSET_37_TARGET_DIR, "selected_roads.shp")
subset_37_targets: gpd.GeoDataFrame = gpd.read_file(selected_roads_path)
subset_37_targets.head()

In [None]:
# Quick sanity check of the selection: table shape and distinct values per column.
print(f"Shape: {subset_37_targets.shape}")
print("Uniques:", subset_37_targets.nunique(), sep="\n")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# NODE_ID values are cast to str before they are used as column labels.
# NOTE(review): presumably the traffic frame's column labels are strings — confirm.
target_columns = [str(value) for value in subset_37_targets["NODE_ID"].to_list()]
sub_37_df = generator.metr_imc_df[target_columns]

# Lay the per-road line plots out in a grid with two plots per row.
num_cols = len(sub_37_df.columns)
num_graphs_per_row = 2
# Ceiling division; keep at least one row so the figure can always be created.
num_rows = max(1, (num_cols + num_graphs_per_row - 1) // num_graphs_per_row)

# squeeze=False always returns a 2-D axes array, even when only one row is needed.
fig, axes = plt.subplots(
    num_rows,
    num_graphs_per_row,
    figsize=(18, 4 * num_rows),
    squeeze=False,
)
flat_axes = axes.ravel()

# Iterate columns by position (items) so duplicated labels still yield one Series each.
for ax, (col_name, col_series) in zip(flat_axes, sub_37_df.items()):
    sns.lineplot(ax=ax, data=col_series)
    ax.set_title(f"{col_name} Road Traffic Data")
    ax.set_xlabel("Date Time")
    ax.set_ylabel("Value")

# Remove the grid cells that did not receive a plot.
for unused_ax in flat_axes[num_cols:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Use a fresh spline interpolator for the small subset.
interpolator = SplineInterpolator()
# NOTE(review): presumably generate_subset relies on tqdm's pandas hooks — confirm.
tqdm.pandas()
# Generate Subset-37 for the selected NODE_ID columns into SUBSET_37_TARGET_DIR.
generator.generate_subset(target_columns, SUBSET_37_TARGET_DIR, interpolator)