# EDA for Incheon Data

## Environment Setup
Import necessary libraries including pandas, numpy, matplotlib, seaborn, and specialized libraries like geopandas and missingno. Set up visualization parameters and define utility paths.

### Import Necessary Libraries

In [None]:
import os
from typing import Tuple

import geopandas as gpd
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from metr.components import TrafficData

### Define and Create Utility Paths 

In [None]:
# Input files: sensor map shapefile, raw traffic table and its metadata.
MAP_DATA_OF_SENSORS = "../datasets/metr-imc/nodelink/imc_link.shp"
TRAFFIC_RAW_PATH = "../datasets/metr-imc/metr-imc.h5"
METADATA_RAW_PATH = "../datasets/metr-imc/metadata.h5"

# Output directories, one per named stage.
OUTLIER_OUTPUT_DIR = "./output/outlier_processed"
INTERPOLATED_OUTPUT_DIR = "./output/interpolated"
FINAL_OUTPUT_DIR = "./output/final"
PREDICTION_OUTPUT_DIR = "./output/prediction"

# Create every output directory up front; existing directories are left as-is.
for _output_dir in (
    OUTLIER_OUTPUT_DIR,
    INTERPOLATED_OUTPUT_DIR,
    FINAL_OUTPUT_DIR,
    PREDICTION_OUTPUT_DIR,
):
    os.makedirs(_output_dir, exist_ok=True)

### Etc.

In [None]:
# Matplotlib defaults applied to every figure drawn after this cell.
# NOTE(review): "AppleGothic" is presumably chosen for Korean glyph support and
# may not be installed outside macOS - confirm on other platforms.
plt.rcParams.update(
    {
        "font.family": "AppleGothic",
        "axes.unicode_minus": False,  # Prevent negative sign rendering issues
    }
)

## Data Loading and Preprocessing
Load traffic data from HDF files, inspect initial data structure, and perform basic filtering to remove sensors with excessive missing values.

In [None]:
# Load the raw traffic data from the HDF file via the project's TrafficData helper
raw = TrafficData.import_from_hdf(TRAFFIC_RAW_PATH)
# Table of readings exposed by the loaded object
raw_df = raw.data
# Cell output: preview of the first five columns
raw_df.iloc[:, :5]

In [None]:
# Load the metadata table stored in the HDF file
# NOTE(review): later cells read LINK_ID, MAX_SPD and LANES columns from it
metadata_df = pd.read_hdf(METADATA_RAW_PATH)
# Cell output: display the metadata table
metadata_df

In [None]:
# Quick look at the raw table: dimensions, leading column labels and a few rows
print("Initial Data Shape:", raw_df.shape)
print("Initial Data Columns:", raw_df.columns[:5])  # first five column labels only
print("Initial Data Sample:")
_preview = raw_df[raw_df.columns[:4]].head()
print(_preview, end="\n\n")

# Total number of missing cells across the whole table
missing_values_count = raw_df.isna().sum().sum()
print(f"Total Missing Values: {missing_values_count}")

In [None]:
# Missing-rate setting (a fraction between 0 and 1) used when filtering sensors
max_missing_rate = 0.5

In [None]:
# Keep only sensors whose share of missing readings is at most max_missing_rate.
# The limit is applied to the per-sensor missing fraction directly:
# dropna(thresh=...) expects the required number of NON-missing values, so
# feeding it n_rows * max_missing_rate is only correct when the rate is 0.5.
missing_rate_per_sensor = raw_df.isnull().mean()
df = raw_df.loc[:, missing_rate_per_sensor <= max_missing_rate]

# Display the shape of the filtered data
print("Filtered Data Shape:", df.shape)
print(f"Removed Sensors: {raw_df.shape[1] - df.shape[1]}")

## Statistical Analysis and Summaries
Calculate summary statistics for the dataset, analyze the distribution of values, and identify general patterns in the data.

In [None]:
# Descriptive statistics per sensor (transposed so each sensor is one row)
summary_stats = df.describe().T
print("Summary Statistics:", summary_stats, sep="\n")

In [None]:
low_limit = 1000

# Flatten the readings of all sensors into one Series, then split it at low_limit
value_distribution = df.stack().reset_index(drop=True)
values_low = value_distribution[value_distribution < low_limit]
values_high = value_distribution[value_distribution >= low_limit]

# Two stacked panels: values below the limit on top, the rest underneath
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

_panels = (
    (
        ax1,
        values_low,
        {"bins": 150, "kde": True, "color": "blue"},
        f"Traffic Volume Distribution (< {low_limit})",
    ),
    (
        ax2,
        values_high,
        {"bins": 200, "color": "red"},
        f"Traffic Volume Distribution (≥ {low_limit})",
    ),
)
for _axis, _values, _hist_kwargs, _panel_title in _panels:
    sns.histplot(_values, ax=_axis, **_hist_kwargs)
    _axis.set_title(_panel_title)
    _axis.set_xlabel("Traffic Volume")
    _axis.set_ylabel("Frequency")
    _axis.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Mean reading per sensor, plus the sensors with the largest and smallest mean
sensor_means = df.mean()
highest_mean_sensor, lowest_mean_sensor = sensor_means.idxmax(), sensor_means.idxmin()
print(f"Sensor with Highest Mean Traffic Volume: {highest_mean_sensor} ({sensor_means.loc[highest_mean_sensor]:.2f})")
print(f"Sensor with Lowest Mean Traffic Volume: {lowest_mean_sensor} ({sensor_means.loc[lowest_mean_sensor]:.2f})")

# Bar chart of all sensor means, largest first
_, _mean_ax = plt.subplots(figsize=(12, 6))
_ranked_means = sensor_means.sort_values(ascending=False)
_ranked_means.plot(kind="bar", color="blue", alpha=0.7, ax=_mean_ax)
_mean_ax.set_title("Mean Traffic Volume by Sensor")
_mean_ax.set_xlabel("Sensors")
_mean_ax.set_ylabel("Mean Traffic Volume")
_mean_ax.set_xticks([])  # hide the x-axis tick labels
_mean_ax.grid(axis="y")
_mean_ax.set_ylim(top=3000)  # cap the y-axis; taller bars are clipped
plt.tight_layout()
plt.show()

In [None]:
# Share of missing readings per sensor, in percent
missing_percentage = df.isnull().mean() * 100
_sorted_missing = missing_percentage.sort_values(ascending=False)
print("Percentage of Missing Values by Sensor:")
print(_sorted_missing)

# Bar chart of the same percentages, worst sensors first
_, _missing_ax = plt.subplots(figsize=(12, 6))
_sorted_missing.plot(kind="bar", color="red", alpha=0.7, ax=_missing_ax)
_missing_ax.set_title("Percentage of Missing Values by Sensor")
_missing_ax.set_xlabel("Sensors")
_missing_ax.set_ylabel("Missing Percentage (%)")
_missing_ax.set_xticks([])  # hide the x-axis tick labels
_missing_ax.grid(axis="y")
plt.tight_layout()
plt.show()

A histogram analysis is also performed for a single sensor.

In [None]:
# Generate histograms to examine the distribution of traffic volumes
def plot_histogram(
    df: pd.DataFrame,
    column: str,
    title: str,
    bins: int = 50,
    x_max: float = None,
    exclude_zero: bool = False,
):
    """
    Draw a histogram with a KDE overlay for the non-missing values of one column.

    Parameters:
    - df: DataFrame containing the data.
    - column: Label of the column to plot.
    - title: Title of the plot.
    - bins: Number of histogram bins.
    - x_max: Optional right limit of the x-axis; left unset when None.
    - exclude_zero: When True, values equal to 0 are removed before plotting.
    """
    series = df[column].dropna()
    if exclude_zero:
        series = series.loc[series.ne(0)]

    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(series, bins=bins, kde=True, color="blue", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Traffic Volume")
    ax.set_ylabel("Frequency")
    ax.grid(True)

    # Only the right edge is constrained; the left edge stays automatic
    if x_max is not None:
        ax.set_xlim(right=x_max)

    plt.show()

In [None]:
# Use the first sensor column as the example
sensor_id = df.columns[0]
plot_histogram(
    df=df,
    column=sensor_id,
    title=f"Histogram for Sensor {sensor_id}",
    bins=50,
    x_max=2000,
    exclude_zero=True,
)

## Temporal Pattern Analysis
Visualize traffic patterns by time of day, day of week, and over longer time periods. Create functions to plot mean values by hour and compare trends across different time periods.

In [None]:
# Define functions for temporal pattern analysis


def plot_mean_by_hour(df: pd.DataFrame, title: str, figsize: Tuple[int, int] = (10, 6)):
    """
    Draw a line chart of the average traffic volume for each hour of the day.

    Rows are grouped by ``df.index.hour``; every column is averaged within each
    hour, and those per-column means are then averaged across columns.

    Parameters:
    - df: DataFrame whose index exposes an ``hour`` attribute.
    - title: Title of the plot.
    - figsize: Figure size passed to matplotlib.
    """
    per_column_hourly = df.groupby(df.index.hour).mean()
    hourly_means = per_column_hourly.mean(axis=1)

    _, ax = plt.subplots(figsize=figsize)
    sns.lineplot(x=hourly_means.index, y=hourly_means.values, marker="o", ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Hour of Day")
    ax.set_ylabel("Mean Traffic Volume")
    ax.grid(True)
    ax.set_xticks(range(0, 24))  # one tick per hour
    plt.show()


def plot_mean_by_day_of_week(
    df: pd.DataFrame, title: str, figsize: Tuple[int, int] = (10, 6)
):
    """
    Plot the mean traffic volume by day of the week as a bar chart.

    Rows are grouped by ``df.index.dayofweek`` (0 = Monday ... 6 = Sunday);
    every column is averaged within each weekday, and those per-column means
    are then averaged across columns.

    Parameters:
    - df: DataFrame whose index exposes a ``dayofweek`` attribute.
    - title: Title of the plot.
    - figsize: Figure size passed to matplotlib.
    """
    day_of_week_means = df.groupby(df.index.dayofweek).mean().mean(axis=1)
    day_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    plt.figure(figsize=figsize)
    sns.barplot(
        x=day_of_week_means.index,
        y=day_of_week_means.values,
        # Pin all seven weekday slots. Without this the bars are positioned by
        # the categories actually present, so data that lacks a weekday would
        # shift the bars under the wrong tick labels set below.
        order=list(range(7)),
        palette="viridis",
    )
    plt.title(title)
    plt.xlabel("Day of Week")
    plt.ylabel("Mean Traffic Volume")
    plt.xticks(ticks=range(7), labels=day_labels)
    plt.grid(axis="y")
    plt.show()


def plot_mean_by_month(
    df: pd.DataFrame, title: str, figsize: Tuple[int, int] = (10, 6)
):
    """
    Draw a line chart of the average traffic volume for each calendar month.

    Rows are grouped by ``df.index.month``; every column is averaged within each
    month, and those per-column means are then averaged across columns.

    Parameters:
    - df: DataFrame whose index exposes a ``month`` attribute.
    - title: Title of the plot.
    - figsize: Figure size passed to matplotlib.
    """
    per_column_monthly = df.groupby(df.index.month).mean()
    monthly_means = per_column_monthly.mean(axis=1)

    _, ax = plt.subplots(figsize=figsize)
    sns.lineplot(
        x=monthly_means.index,
        y=monthly_means.values,
        marker="o",
        color="green",
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Month")
    ax.set_ylabel("Mean Traffic Volume")
    ax.grid(True)
    ax.set_xticks(range(1, 13))  # one tick per month
    plt.show()


# Render each temporal view for the filtered data
for _plot_fn, _plot_title in (
    (plot_mean_by_hour, "Mean Traffic Volume by Hour of Day"),
    (plot_mean_by_day_of_week, "Mean Traffic Volume by Day of Week"),
    (plot_mean_by_month, "Mean Traffic Volume by Month"),
):
    _plot_fn(df, _plot_title)

## Outlier Detection and Analysis
Implement outlier detection using statistical methods (z-score, IQR) and domain knowledge about theoretical road capacities. Visualize outliers and assess their impact on the dataset.

In [None]:
# Define functions for outlier detection and visualization
def detect_outliers_zscore(df: pd.DataFrame, threshold: float = 3.0) -> pd.DataFrame:
    """
    Flag values whose absolute z-score exceeds a threshold.

    The z-score is computed column by column: each value is centered on its
    own column's mean and scaled by its own column's standard deviation.
    Cells whose z-score is NaN (for example missing values) are not flagged.

    Parameters:
    - df: DataFrame containing the data.
    - threshold: Z-score threshold for identifying outliers.

    Returns:
    - DataFrame with boolean values indicating outliers (True for outliers).
    """
    column_mean = df.mean()
    column_std = df.std()
    standardized = df.sub(column_mean, axis="columns").div(column_std, axis="columns")
    return standardized.abs().gt(threshold)


def detect_outliers_iqr(df: pd.DataFrame, multiplier: float = 1.5) -> pd.DataFrame:
    """
    Flag values that fall outside the IQR fences of their column.

    Quartiles are computed column by column. A value is flagged when it lies
    below ``Q1 - multiplier * IQR`` or above ``Q3 + multiplier * IQR`` of its
    own column.

    Parameters:
    - df: DataFrame containing the data.
    - multiplier: Multiplier for the IQR range to identify outliers.

    Returns:
    - DataFrame with boolean values indicating outliers (True for outliers).
    """
    first_quartile = df.quantile(0.25)
    third_quartile = df.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_fence = first_quartile - multiplier * spread
    upper_fence = third_quartile + multiplier * spread
    return df.lt(lower_fence) | df.gt(upper_fence)


# Road capacity-based outlier detection
def detect_outliers_road_caps(
    df: pd.DataFrame,
    metadata: pd.DataFrame,
    adjustment_rate: float = 1.5,
) -> pd.DataFrame:
    """
    Detect outliers based on theoretical road capacities using domain knowledge.

    For every metadata row the capacity limit is
    ``(2200 - 10 * (100 - MAX_SPD)) * LANES * adjustment_rate``. A value in
    ``df`` is flagged when it is strictly greater than the limit of its column.
    Columns without a matching ``LINK_ID`` in the metadata are never flagged.
    If a ``LINK_ID`` appears more than once, its last row is used.

    Parameters:
    - df: DataFrame containing the data; column labels are matched to LINK_ID.
    - metadata: Metadata with LINK_ID, MAX_SPD and LANES columns.
    - adjustment_rate: Multiplier for the theoretical road capacity.

    Returns:
    - DataFrame with boolean values indicating outliers (True for outliers).
    """
    # No metadata rows means no limits at all, so nothing can be flagged
    if metadata.empty:
        return pd.DataFrame(False, index=df.index, columns=df.columns)

    # Vectorized capacity per metadata row (replaces a row-by-row loop)
    base_capacity = (2200 - 10 * (100 - metadata["MAX_SPD"])) * metadata["LANES"]
    capacity = pd.Series(
        (base_capacity * adjustment_rate).to_numpy(),
        index=metadata["LINK_ID"].to_numpy(),
    )
    # Later rows win for repeated link IDs
    capacity = capacity[~capacity.index.duplicated(keep="last")]

    # Align the limits with the data columns; unmatched columns get NaN, and a
    # comparison against NaN is False, so those columns stay unflagged
    limits = capacity.reindex(df.columns)
    return df.gt(limits, axis="columns")

In [None]:
def visualize_outliers_in_sensor(
    sensor_series: pd.Series, outlier: pd.Series, title: str, alpha: float = 0.7
):
    """
    Plot one sensor's series as a line and overlay its outliers as red dots.

    Parameters:
    - sensor_series: Original Series data of the target sensor.
    - outlier: Boolean Series marking the outliers of the target sensor; it is
      used to index ``sensor_series``, so its index should match that series.
    - title: Title of the plot.
    - alpha: Opacity of the line.
    """
    sensor_name = sensor_series.name
    flagged = sensor_series[outlier]

    _, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        sensor_series.index,
        sensor_series,
        label=f"Sensor {sensor_name}",
        alpha=alpha,
    )
    # x: index (time) of each outlier, y: the outlier value itself
    ax.scatter(
        flagged.index,
        flagged.values,
        color="red",
        label=f"Outliers {sensor_name}",
        s=10,
    )

    ax.set_title(title)
    ax.set_xlabel("Time")
    ax.set_ylabel("Traffic Volume")
    ax.legend()
    ax.grid(True)
    plt.show()

In [None]:
# Flag z-score outliers for every sensor
zscore_outliers = detect_outliers_zscore(df, threshold=3.0)

# Use the first sensor column (and its outlier flags) as the worked example
target_sensor_data, target_sensor_outliers = df.iloc[:, 0], zscore_outliers.iloc[:, 0]

visualize_outliers_in_sensor(
    sensor_series=target_sensor_data,
    outlier=target_sensor_outliers,
    title="Outliers Detected by Z-Score Method",
)

In [None]:
# Flag IQR outliers for every sensor
iqr_outliers = detect_outliers_iqr(df, multiplier=1.5)

# Use the first sensor column (and its outlier flags) as the worked example
target_sensor_data, target_sensor_outliers = df.iloc[:, 0], iqr_outliers.iloc[:, 0]

visualize_outliers_in_sensor(
    sensor_series=target_sensor_data,
    outlier=target_sensor_outliers,
    title="Outliers Detected by IQR Method",
)

In [None]:
# Detect outliers using domain knowledge (theoretical road capacity);
# adjustment_rate=2 overrides the function's default multiplier of 1.5
road_cap_outliers = detect_outliers_road_caps(df, metadata_df, adjustment_rate=2)

In [None]:
# Keep only the sensors (columns) that contain at least one flagged value
cols_with_outliers = road_cap_outliers.any()
road_cap_outliers_filtered = road_cap_outliers.loc[:, cols_with_outliers]

# Report the total number of sensors and how many of them have outliers
_n_sensors_total = road_cap_outliers.shape[1]
_n_sensors_flagged = road_cap_outliers_filtered.shape[1]
print(f"전체 센서 수: {_n_sensors_total}개")
print(f"Road Cap. 이상치가 있는 센서 수: {_n_sensors_flagged}개")

# Cell output: number of flagged values per affected sensor
road_cap_outliers_filtered.sum()

In [None]:
# Hand-picked example sensor: per the author's note it contains only clear-cut
# outliers. Other candidates noted: 1660033301, 1640049000.
_example_sensor_id = "1640048000"
target_sensor_data = df[_example_sensor_id]
target_sensor_outliers = road_cap_outliers[_example_sensor_id]

# Visualize outliers detected by domain knowledge
visualize_outliers_in_sensor(
    sensor_series=target_sensor_data,
    outlier=target_sensor_outliers,
    title="Outliers Detected by Road Capacity Method",
)

In [None]:
# Assess the impact of outliers on the dataset
def assess_outlier_impact(df: pd.DataFrame, outliers: pd.DataFrame):
    """
    Print how many values were flagged as outliers and their share of the data.

    The share is the number of True flags divided by ``df.size``, i.e. by the
    total number of cells in ``df`` (missing cells included), in percent.

    Parameters:
    - df: Original DataFrame containing the data.
    - outliers: DataFrame with boolean values indicating outliers.
    """
    total_outliers = outliers.sum().sum()
    outlier_percentage = (total_outliers / df.size) * 100
    print(
        f"Total Outliers: {total_outliers}\n"
        f"Percentage of Outliers: {outlier_percentage:.2f}%"
    )

In [None]:
# Report the outlier share for each detection method in turn
_impact_reports = (
    ("Impact of Z-Score Outliers:", zscore_outliers),
    ("\nImpact of IQR Outliers:", iqr_outliers),
    ("\nImpact of Domain Knowledge Outliers:", road_cap_outliers),
)
for _heading, _flags in _impact_reports:
    print(_heading)
    assess_outlier_impact(df, _flags)

## Missing Data Patterns
Analyze missing data patterns using visualization tools like missingno. Calculate missing data statistics by sensor and time period, and identify systematic patterns in data missingness.

- Analyze missing data patterns using visualization tools like missingno.
- Calculate missing data statistics by sensor and time period.
- Identify systematic patterns in data missingness.

In [None]:
# Work on a random subset of at most 500 sensors, sampled without replacement
sample_columns = np.random.choice(
    df.columns,
    min(500, df.shape[1]),
    replace=False,
)
sampled_df = df[sample_columns]

# Visualize missing data patterns using missingno.
# The figure size is passed through missingno's own `figsize` argument: a
# preceding plt.figure(figsize=...) does not size these plots and only leaves
# an extra empty figure behind.
msno.matrix(sampled_df, sparkline=False, figsize=(12, 6))
plt.title("Missing Data Matrix")
plt.show()

msno.heatmap(sampled_df, cmap="viridis", figsize=(12, 6))
plt.title("Missing Data Correlation Heatmap")
plt.show()

msno.bar(sampled_df, color="blue", fontsize=12, figsize=(12, 6))
plt.title("Missing Data Bar Chart")
plt.show()

In [None]:
# Missing-value count and percentage per sensor, worst sensors first
missing_by_sensor = df.isnull().sum()
missing_percentage_by_sensor = (missing_by_sensor / df.shape[0]) * 100
missing_stats = (
    missing_by_sensor.to_frame(name="Missing Count")
    .assign(**{"Missing Percentage (%)": missing_percentage_by_sensor})
    .sort_values(by="Missing Percentage (%)", ascending=False)
)

print("Missing Data Statistics by Sensor:")
print(missing_stats.head(10))  # the 10 sensors with the most missing data

In [None]:
# Missing-value count and percentage per row (time step), in index order
missing_by_time = df.isnull().sum(axis=1)
missing_percentage_by_time = (missing_by_time / df.shape[1]) * 100
missing_time_stats = pd.concat(
    {
        "Missing Count": missing_by_time,
        "Missing Percentage (%)": missing_percentage_by_time,
    },
    axis=1,
).sort_index()

print("Missing Data Statistics by Time Period:")
print(missing_time_stats.head(10))  # the first 10 rows in index order

In [None]:
# Line chart of the missing percentage at each time step
_, _time_ax = plt.subplots(figsize=(12, 6))
_time_ax.plot(
    missing_time_stats.index,
    missing_time_stats["Missing Percentage (%)"],
    label="Missing Percentage",
    color="red",
)
_time_ax.set_title("Missing Data Percentage by Time Period")
_time_ax.set_xlabel("Time")
_time_ax.set_ylabel("Missing Percentage (%)")
_time_ax.grid(True)
_time_ax.legend()
plt.show()

In [None]:
# Correlate the 0/1 missingness indicators of every pair of sensors
missing_patterns = df.isna().astype(int)
correlation_matrix = missing_patterns.corr()

_, _corr_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, cbar=True, ax=_corr_ax)
plt.title("Correlation of Missing Data Between Sensors")
plt.show()

## Spatial Data Visualization
Use geopandas to visualize the geographical distribution of sensors, highlighting those with different characteristics (high missing rates, outliers, etc.).

TODO: This section still needs further revision.

In [None]:
# Load geographical data for sensors from the shapefile
gdf_raw = gpd.read_file(MAP_DATA_OF_SENSORS)
# Cell output: display the loaded GeoDataFrame
gdf_raw


In [None]:
# Attach each sensor's missing percentage to the map data.
# NOTE: this rebinds `missing_stats` (a DataFrame in an earlier cell) to a Series.
missing_stats = df.isnull().mean() * 100
missing_stats_df = missing_stats.rename_axis("LINK_ID").reset_index(
    name="Missing Percentage"
)
# Left join keeps every map row; rows without a matching sensor get NaN
gdf = gdf_raw.merge(missing_stats_df, on="LINK_ID", how="left")
gdf

In [None]:
# Cell output: interactive map of the merged GeoDataFrame
gdf.explore()

In [None]:
# Merge geographical data with outlier statistics
outlier_counts = road_cap_outliers.sum()  # Count outliers per sensor
outlier_stats_df = pd.DataFrame({
    "LINK_ID": outlier_counts.index,
    "Outlier Count": outlier_counts.values
})
# Drop an "Outlier Count" column left by an earlier run of this cell first.
# Otherwise re-running the cell merges a second copy and pandas renames the
# pair to "Outlier Count_x" / "Outlier Count_y", breaking later cells.
gdf = gdf.drop(columns=["Outlier Count"], errors="ignore").merge(
    outlier_stats_df, on="LINK_ID", how="left"
)

In [None]:
# Sensors with more than 50% missing readings.
# The "Missing Percentage" column of `gdf` is computed from `df`, which only
# holds sensors that already passed the missing-rate filter, so selecting
# "> 50" on it finds nothing. Take the rates from the unfiltered `raw_df`.
_raw_missing_df = (
    (raw_df.isnull().mean() * 100)
    .rename_axis("LINK_ID")
    .reset_index(name="Missing Percentage")
)
_high_missing_df = _raw_missing_df[_raw_missing_df["Missing Percentage"] > 50]
# Inner join keeps only map rows that belong to a high-missing sensor
high_missing_sensors = gdf_raw.merge(_high_missing_df, on="LINK_ID", how="inner")
high_missing_sensors.explore()

In [None]:
# Map rows whose sensor has at least one road-capacity outlier, drawn in red
_has_outliers = gdf["Outlier Count"].gt(0)
high_outlier_sensors = gdf.loc[_has_outliers]
high_outlier_sensors.explore(color="red")

In [None]:
# Visualize sensors with at least one outlier (Outlier Count > 0).
# NOTE(review): this cell repeats the selection and map of the preceding cell.
high_outlier_sensors = gdf[gdf["Outlier Count"] > 0]
high_outlier_sensors.explore(color="red")

In [None]:
# Static map of all links colored by their missing percentage.
# The axes are created here and handed to GeoDataFrame.plot: without `ax` it
# opens its own figure, which leaves a plt.figure() empty and ignores its size.
_, _map_ax = plt.subplots(figsize=(12, 8))
gdf.plot(
    column="Missing Percentage",
    cmap="Oranges",
    legend=True,
    edgecolor="black",
    ax=_map_ax,
)
_map_ax.set_title("Geographical Plots")
plt.show()