# 지하철 혼잡도 종합 분석 시각화

2호선, 4호선, 5호선 혼잡도와 도시 특성 간의 연관성을 시각화합니다.

**출력**: PNG 이미지 파일 (`../output/visualization/` 폴더)

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
import os
import warnings
warnings.filterwarnings('ignore')

# Both paths are relative, so they resolve against the current working directory.
DB_PATH = "../db/subway.db"
OUTPUT_DIR = "../output/visualization"

# Create the image output folder up front; an already-existing folder is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Image export helper
def save_fig(fig, filename, width=1200, height=700):
    """Write ``fig`` into ``OUTPUT_DIR`` as a static image and print the path.

    Args:
        fig: Object exposing ``write_image(path, width=..., height=..., scale=...)``;
            every caller in this notebook passes a Plotly figure.
        filename: File name (extension included) placed directly under
            ``OUTPUT_DIR``.
        width: Width forwarded to ``write_image``.
        height: Height forwarded to ``write_image``.
    """
    target = f"{OUTPUT_DIR}/{filename}"
    # scale=2 is forwarded unchanged — presumably for a higher-resolution export.
    fig.write_image(target, width=width, height=height, scale=2)
    print(f"Saved: {target}")

# Confirm the setup cell ran and show where images will be written.
print("Setup complete", f"Output directory: {OUTPUT_DIR}", sep="\n")

Setup complete
Output directory: ../output/visualization


In [2]:
# Load and preprocess the data
conn = sqlite3.connect(DB_PATH)

# Station metadata for the three target lines, joined through Station_Routes
df_stations = pd.read_sql("""
SELECT s.station_id, s.station_name_kr, sr.station_code, sr.admin_dong_code, l.line_name
FROM Stations s
JOIN Station_Routes sr ON s.station_id = sr.station_id
JOIN Lines l ON sr.line_id = l.line_id
WHERE l.line_name IN ('2호선', '4호선', '5호선')
""", conn)

target_ids = df_stations['station_id'].unique().tolist()
target_codes = df_stations['station_code'].unique().tolist()
# First 8 characters of every non-null admin-dong code; the dong-level tables
# loaded later in this cell are filtered and joined on this shortened key.
dong_shorts = list({str(d)[:8] for d in df_stations['admin_dong_code'].dropna().unique()})

# Congestion rows for the target stations.
# The station codes are bound as parameters instead of being spliced into the
# SQL text, so a code containing a quote character cannot break the statement.
# They are passed as strings because the previous query compared against
# quoted string literals.
# NOTE(review): one placeholder per code — fine for a few hundred stations, but
# older SQLite builds cap bound variables at 999; confirm if the scope grows.
code_placeholders = ','.join('?' * len(target_codes))
df_cong = pd.read_sql(
    f"""
SELECT * FROM Station_Congestion
WHERE station_code IN ({code_placeholders})
""",
    conn,
    params=[str(c) for c in target_codes],
)

def cat_time(slot):
    """Map a 30-minute slot index to one of six time-of-day bucket labels.

    Slot 0 begins at 05:30 and each further slot adds 30 minutes; the hour
    wraps around at midnight. Buckets are keyed by the slot's starting hour:
    ``[0, 6)``, ``[6, 11)``, ``[11, 14)``, ``[14, 17)``, ``[17, 21)`` and
    everything from 21 onward.

    Args:
        slot: Slot index counted from the 05:30 slot.

    Returns:
        One of ``'00_06'``, ``'06_11'``, ``'11_14'``, ``'14_17'``,
        ``'17_21'``, ``'21_24'``.
    """
    start_minute = 5 * 60 + 30 + slot * 30
    hour = (start_minute // 60) % 24
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in (
        (6, '00_06'),
        (11, '06_11'),
        (14, '11_14'),
        (17, '14_17'),
        (21, '17_21'),
    ):
        if hour < upper_bound:
            return label
    return '21_24'

# Tag every congestion row with its time-of-day bucket, then attach
# station_id through the station_code -> station_id pairs.
df_cong['time_period'] = df_cong['time_slot'].map(cat_time)
code_to_id = df_stations[['station_id','station_code']].drop_duplicates()
df_cong = df_cong.merge(code_to_id, on='station_code')

# Mean congestion per station and bucket, restricted to day_of_week == 0
# (the original comment labels this subset as weekdays).
weekday_rows = df_cong[df_cong['day_of_week'] == 0]
cong_piv = (
    weekday_rows
    .groupby(['station_id','time_period'])['congestion_level']
    .mean()
    .unstack()
)
cong_piv.columns = [f'cong_{c}' for c in cong_piv.columns]
# cong_avg is the unweighted mean of the per-bucket means.
cong_piv['cong_avg'] = cong_piv.mean(axis=1)
cong_piv = cong_piv.reset_index()

# Buildings in the catchment of each target station
id_list = ','.join(map(str, target_ids))
df_bldg = pd.read_sql(f"""
SELECT * FROM Station_Catchment_Buildings
WHERE station_id IN ({id_list})
""", conn)

def cat_usage(u):
    """Bucket a building usage value into a coarse category by keyword.

    Missing values map to ``'etc'``. Otherwise the value is converted to a
    string and checked against keyword groups in priority order: residential,
    then commercial, then office. Anything that matches none is ``'etc'``.

    Args:
        u: Usage description (any scalar; may be missing).

    Returns:
        ``'residential'``, ``'commercial'``, ``'office'`` or ``'etc'``.
    """
    if pd.isna(u):
        return 'etc'
    text = str(u)
    # The first rule with any keyword present in the text wins.
    keyword_rules = (
        (('주택',), 'residential'),
        (('근린생활', '판매'), 'commercial'),
        (('업무',), 'office'),
    )
    for keywords, category in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return category
    return 'etc'

df_bldg['usage_cat'] = df_bldg['usage_type'].map(cat_usage)

# Per-station building aggregates: row count, mean height, summed floor area
# and summed households.
bldg_stats = df_bldg.groupby('station_id').agg(
    bldg_count=('id', 'count'),
    bldg_height=('height', 'mean'),
    bldg_area=('floor_area', 'sum'),
    bldg_house=('households', 'sum'),
)
# Number of buildings per usage category, one column per category (0 if none).
usage_piv = df_bldg.groupby(['station_id','usage_cat']).size().unstack(fill_value=0)
bldg_stats = bldg_stats.join(usage_piv).reset_index()


def _load_dong_table(query):
    """Run ``query``, add the 8-character ``dong_short`` key, keep target dongs only."""
    frame = pd.read_sql(query, conn)
    frame['dong_short'] = frame['admin_dong_code'].astype(str).str[:8]
    return frame[frame['dong_short'].isin(dong_shorts)]


# Estimated revenue: summed over the latest quarter present, divided by 1e8
df_rev = _load_dong_table('SELECT * FROM Dong_Estimated_Revenue')
latest_q = df_rev['quarter_code'].max()
latest_rev = df_rev[df_rev['quarter_code'] == latest_q]
dong_rev = (
    latest_rev.groupby('dong_short')['month_sales_amt']
    .sum()
    .div(1e8)
    .rename('sales')
    .reset_index()
)

# Floating population: mean over all rows per dong, divided by 10,000
df_float = _load_dong_table('SELECT * FROM Dong_Floating_Population')
dong_float = (
    df_float.groupby('dong_short')['total_floating_pop']
    .mean()
    .div(10000)
    .rename('floating')
    .reset_index()
)

# Workplace population: sum over all rows per dong, divided by 1,000
df_work = _load_dong_table('SELECT * FROM Dong_Workplace_Population')
dong_work = (
    df_work.groupby('dong_short')['total_pop']
    .sum()
    .div(1000)
    .rename('workplace')
    .reset_index()
)

# Combine everything into one row per station.
# drop_duplicates keeps the first row per station_id, so a station that appears
# on several lines keeps only the first line_name / admin_dong_code returned.
station_dong = (
    df_stations[['station_id','station_name_kr','admin_dong_code','line_name']]
    .drop_duplicates(subset='station_id')
    .assign(dong_short=lambda frame: frame['admin_dong_code'].astype(str).str[:8])
)

df = station_dong[['station_id','station_name_kr','dong_short','line_name']].copy()
for right, key in (
    (cong_piv, 'station_id'),
    (bldg_stats, 'station_id'),
    (dong_rev, 'dong_short'),
    (dong_float, 'dong_short'),
    (dong_work, 'dong_short'),
):
    df = df.merge(right, on=key, how='left')
# Stations without congestion or building data are dropped.
df = df.dropna(subset=['cong_avg','bldg_count'])

print(f"Data loaded: {len(df)} stations")
conn.close()

Data loaded: 119 stations


## 1. 상관관계 히트맵

In [3]:
# Correlation between urban features and congestion per time period.
# Labels are looked up by column name so they stay aligned with the data even
# when some columns are missing from df (slicing a positional label list would
# shift every label after the first missing column).
feature_label_map = {
    'bldg_count': '건물수', 'bldg_height': '평균높이', 'bldg_area': '총연면적',
    'bldg_house': '세대수', 'residential': '주거건물', 'commercial': '상업건물',
    'office': '업무건물', 'sales': '추정매출', 'floating': '유동인구',
    'workplace': '직장인구',
}
cong_label_map = {
    'cong_06_11': '출근(06-11)', 'cong_11_14': '점심(11-14)',
    'cong_14_17': '오후(14-17)', 'cong_17_21': '퇴근(17-21)',
    'cong_21_24': '야간(21-24)', 'cong_avg': '평균',
}

features = [f for f in feature_label_map if f in df.columns]
cong_cols = [c for c in cong_label_map if c in df.columns]

corr_matrix = df[features + cong_cols].corr()
corr_data = corr_matrix.loc[features, cong_cols]

# Display labels, in the same order as the rows/columns of corr_data
feature_labels = [feature_label_map[f] for f in features]
cong_labels = [cong_label_map[c] for c in cong_cols]

fig = go.Figure(data=go.Heatmap(
    z=corr_data.values,
    x=cong_labels,
    y=feature_labels,
    colorscale='RdBu_r',
    zmid=0,  # centre the diverging colour scale on zero correlation
    text=np.round(corr_data.values, 2),
    texttemplate='%{text}',
    textfont={'size': 12, 'color': 'black'},
    colorbar=dict(title='상관계수', tickfont=dict(size=14))
))

fig.update_layout(
    title=dict(text='시간대별 혼잡도와 도시 특성 상관관계', font=dict(size=20)),
    xaxis=dict(title='시간대별 혼잡도', tickfont=dict(size=14)),
    yaxis=dict(title='도시 특성', tickfont=dict(size=14)),
    width=900,
    height=600,
    template='plotly_white'
)

fig.show()
save_fig(fig, '01_correlation_heatmap.png', width=900, height=600)

Saved: ../output/visualization/01_correlation_heatmap.png


## 2. 건물 용도별 시간대 혼잡도 패턴

In [4]:
# Correlation of each building-usage count with congestion, per time period.
# Plotly renders line breaks in labels only for <br>, not for "\n".
period_labels = {
    '06_11': '출근<br>(06-11)',
    '11_14': '점심<br>(11-14)',
    '14_17': '오후<br>(14-17)',
    '17_21': '퇴근<br>(17-21)',
    '21_24': '야간<br>(21-24)',
}
# Keep only periods whose congestion column exists, with labels aligned to them.
time_periods = [t for t in period_labels if f'cong_{t}' in df.columns]
time_labels = [period_labels[t] for t in time_periods]

patterns = {}
for usage in ['residential', 'commercial', 'office']:
    if usage in df.columns:
        patterns[usage] = [df[usage].corr(df[f'cong_{t}']) for t in time_periods]

fig = go.Figure()

colors = {'residential': '#3498db', 'commercial': '#e74c3c', 'office': '#2ecc71'}
names = {'residential': '주거 건물', 'commercial': '상업 건물', 'office': '업무 건물'}

for usage, corrs in patterns.items():
    fig.add_trace(go.Scatter(
        x=time_labels,
        y=corrs,
        mode='lines+markers',
        name=names[usage],
        line=dict(width=3, color=colors[usage]),
        marker=dict(size=12, symbol='circle')
    ))

# The axis shows [0, 0.5] by default and only widens when a correlation falls
# outside that window, so negative or stronger correlations are never clipped.
finite_corrs = [c for corrs in patterns.values() for c in corrs if np.isfinite(c)]
y_pad = 0.05
lowest = min(finite_corrs + [0.0])
highest = max(finite_corrs + [0.5])
y_range = [
    lowest - y_pad if lowest < 0 else 0,
    highest + y_pad if highest > 0.5 else 0.5,
]

fig.update_layout(
    title=dict(text='건물 용도별 시간대 혼잡도 상관관계 패턴', font=dict(size=20)),
    xaxis=dict(title='시간대', tickfont=dict(size=12)),
    yaxis=dict(title='상관계수 (r)', tickfont=dict(size=12),
               range=y_range),
    legend=dict(font=dict(size=14), x=0.02, y=0.98),
    template='plotly_white',
    width=900,
    height=500
)

# Reference line at zero correlation
fig.add_hline(y=0, line_dash='dash', line_color='gray', opacity=0.5)

fig.show()
save_fig(fig, '02_time_pattern_by_building.png', width=900, height=500)

Saved: ../output/visualization/02_time_pattern_by_building.png


## 3. 주요 변수별 혼잡도 산점도

In [5]:
# Scatter plots of four key variables against average congestion (2x2 grid)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=['업무 건물 수 vs 평균 혼잡도', '총 연면적 vs 평균 혼잡도',
                    '유동인구 vs 평균 혼잡도', '직장인구 vs 평균 혼잡도'],
    horizontal_spacing=0.12,
    vertical_spacing=0.15
)

# (column in df, axis/hover label, marker colour)
scatter_vars = [
    ('office', '업무 건물 수', '#2ecc71'),
    ('bldg_area', '총 연면적 (m²)', '#9b59b6'),
    ('floating', '유동인구 (만명)', '#e67e22'),
    ('workplace', '직장인구 (천명)', '#1abc9c')
]

positions = [(1,1), (1,2), (2,1), (2,2)]

for (var, label, color), (row, col) in zip(scatter_vars, positions):
    if var not in df.columns:
        continue

    corr = df[var].corr(df['cong_avg'])

    fig.add_trace(
        go.Scatter(
            x=df[var],
            y=df['cong_avg'],
            mode='markers',
            marker=dict(size=8, color=color, opacity=0.6),
            text=df['station_name_kr'],
            hovertemplate='%{text}<br>' + label + ': %{x:.1f}<br>혼잡도: %{y:.1f}<extra></extra>',
            showlegend=False
        ),
        row=row, col=col
    )

    # Linear trend line. A degree-1 fit needs at least two points with
    # distinct x values, so the line is skipped when the data cannot support it.
    valid = df[[var, 'cong_avg']].dropna()
    if len(valid) >= 2 and valid[var].nunique() > 1:
        slope, intercept = np.polyfit(valid[var], valid['cong_avg'], 1)
        x_line = np.linspace(valid[var].min(), valid[var].max(), 100)

        fig.add_trace(
            go.Scatter(
                x=x_line, y=slope * x_line + intercept,
                mode='lines',
                line=dict(color='red', dash='dash', width=2),
                showlegend=False
            ),
            row=row, col=col
        )

    # Correlation coefficient in the top-right corner of the subplot.
    # row/col select the subplot's axes; the " domain" suffix makes x/y
    # fractions of that subplot's plotting area rather than data values.
    fig.add_annotation(
        x=0.95, y=0.95,
        xref='x domain',
        yref='y domain',
        text=f'r = {corr:.3f}',
        showarrow=False,
        font=dict(size=14, color='red'),
        bgcolor='white',
        row=row, col=col
    )

fig.update_layout(
    title=dict(text='주요 변수와 평균 혼잡도 관계', font=dict(size=20)),
    template='plotly_white',
    width=1000,
    height=800,
    showlegend=False
)

fig.update_yaxes(title_text='평균 혼잡도')

fig.show()
save_fig(fig, '03_scatter_main_variables.png', width=1000, height=800)

Saved: ../output/visualization/03_scatter_main_variables.png


## 4. 변수 중요도 (Random Forest)

In [7]:
# Random Forest feature importance for average congestion
rf_features = ['bldg_count', 'bldg_height', 'bldg_area', 'residential', 'commercial',
               'office', 'sales', 'floating', 'workplace']
rf_features = [f for f in rf_features if f in df.columns]

# Missing feature values are imputed with 0 before fitting.
X = df[rf_features].fillna(0)
y = df['cong_avg']

# oob_score=True additionally scores each sample using only the trees that did
# not see it during training. The title reports that out-of-bag R² because the
# in-sample rf.score(X, y) overstates how well the model predicts.
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=5, oob_score=True)
rf.fit(X, y)

# Ascending sort so the most important feature ends up at the top of the
# horizontal bar chart (Plotly draws the first category at the bottom).
importance_df = pd.DataFrame({
    'feature': rf_features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=True)

# Korean display labels
label_map = {
    'bldg_count': '건물 수', 'bldg_height': '평균 높이', 'bldg_area': '총 연면적',
    'residential': '주거 건물', 'commercial': '상업 건물', 'office': '업무 건물',
    'sales': '추정매출', 'floating': '유동인구', 'workplace': '직장인구'
}
importance_df['label'] = importance_df['feature'].map(label_map)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=importance_df['importance'],
    y=importance_df['label'],
    orientation='h',
    marker=dict(
        color=importance_df['importance'],
        colorscale='Blues',
        line=dict(color='darkblue', width=1)
    ),
    text=importance_df['importance'].round(3),
    textposition='outside',
    textfont=dict(size=12)
))

fig.update_layout(
    title=dict(text=f'혼잡도 예측 변수 중요도 (OOB R² = {rf.oob_score_:.3f})', font=dict(size=20)),
    xaxis=dict(title='중요도', tickfont=dict(size=12)),
    yaxis=dict(title='', tickfont=dict(size=13)),
    template='plotly_white',
    width=800,
    height=500
)

fig.show()
save_fig(fig, '04_feature_importance.png', width=800, height=500)

Saved: ../output/visualization/04_feature_importance.png


## 5. 호선별 혼잡도 분포

In [8]:
# Box plot of average congestion for each line
line_colors = {'2호선': '#3CB44B', '4호선': '#00A1E9', '5호선': '#8B50A4'}

# One box per line, in fixed order; boxmean=True also marks the mean.
box_traces = [
    go.Box(
        y=df.loc[df['line_name'] == line, 'cong_avg'],
        name=line,
        marker_color=line_colors[line],
        boxmean=True,
    )
    for line in ('2호선', '4호선', '5호선')
]
fig = go.Figure(data=box_traces)

fig.update_layout(
    title={'text': '호선별 평균 혼잡도 분포', 'font': {'size': 20}},
    yaxis={'title': '평균 혼잡도', 'tickfont': {'size': 12}},
    xaxis={'tickfont': {'size': 14}},
    template='plotly_white',
    width=700,
    height=500,
    showlegend=False,
)

fig.show()
save_fig(fig, '05_congestion_by_line.png', width=700, height=500)

Saved: ../output/visualization/05_congestion_by_line.png


## 6. 역 클러스터링 시각화

In [9]:
# K-means clustering of stations on building mix and average congestion
N_CLUSTERS = 4

cluster_features = ['residential', 'commercial', 'office', 'cong_avg']
cluster_features = [f for f in cluster_features if f in df.columns]

df_cluster = df.dropna(subset=cluster_features).copy()

# Share of residential / office buildings among all buildings near the station
df_cluster['res_ratio'] = df_cluster['residential'] / df_cluster['bldg_count']
df_cluster['off_ratio'] = df_cluster['office'] / df_cluster['bldg_count']

X_cluster = df_cluster[['res_ratio', 'off_ratio', 'cong_avg']].fillna(0)
# Standardise so the ratio columns and the congestion level weigh comparably
X_scaled = StandardScaler().fit_transform(X_cluster)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_cluster['cluster'] = kmeans.fit_predict(X_scaled)

cluster_means = df_cluster.groupby('cluster')[['res_ratio', 'off_ratio', 'cong_avg']].mean()

# Name each cluster from its mean profile; the first matching rule wins.
# Iterating the groupby index (instead of a hard-coded range) keeps this in
# step with N_CLUSTERS. Several clusters can receive the same name, in which
# case they share one colour and legend entry in the chart.
cluster_names = {}
for c, means in cluster_means.iterrows():
    if means['off_ratio'] > 0.03:
        cluster_names[c] = '업무 밀집'
    elif means['res_ratio'] > 0.5:
        cluster_names[c] = '주거 중심'
    elif means['cong_avg'] > 40:
        cluster_names[c] = '고혼잡'
    else:
        cluster_names[c] = '혼합형'

df_cluster['cluster_name'] = df_cluster['cluster'].map(cluster_names)

fig = px.scatter(
    df_cluster,
    x='res_ratio',
    y='cong_avg',
    color='cluster_name',
    size='office',
    hover_name='station_name_kr',
    color_discrete_sequence=px.colors.qualitative.Set2,
    labels={'res_ratio': '주거 건물 비율', 'cong_avg': '평균 혼잡도', 'cluster_name': '역 유형'}
)

# Marker size is driven by the office-building count, so a station with zero
# office buildings would get a zero-size (invisible) marker without a floor.
fig.update_traces(marker_sizemin=4)

fig.update_layout(
    title=dict(text='역 유형별 클러스터링', font=dict(size=20)),
    template='plotly_white',
    width=900,
    height=600,
    legend=dict(font=dict(size=13))
)

fig.show()
save_fig(fig, '06_station_clustering.png', width=900, height=600)

Saved: ../output/visualization/06_station_clustering.png


## 7. Top 혼잡 역 비교

In [11]:
# Top 15 stations by average congestion
top_stations = df.nlargest(15, 'cong_avg')[['station_name_kr', 'cong_avg', 'line_name']].copy()
# Ascending order puts the most congested station at the top of the
# horizontal bar chart (Plotly draws the first category at the bottom).
top_stations = top_stations.sort_values('cong_avg', ascending=True)

fig = go.Figure()

fig.add_trace(go.Bar(
    x=top_stations['cong_avg'],
    y=top_stations['station_name_kr'],
    orientation='h',
    marker=dict(
        color=[line_colors.get(line_name, 'gray') for line_name in top_stations['line_name']],
        line=dict(width=1, color='white')
    ),
    text=top_stations['cong_avg'].round(1),
    textposition='outside',
    textfont=dict(size=11),
    # The bars are coloured per station; the legend is built from the proxy
    # traces below, so this unnamed trace must not show up as "trace 0".
    showlegend=False
))

fig.update_layout(
    title=dict(text='혼잡도 상위 15개 역', font=dict(size=20)),
    xaxis=dict(title='평균 혼잡도',  tickfont=dict(size=12)),
    yaxis=dict(tickfont=dict(size=12)),
    template='plotly_white',
    width=800,
    height=600
)

# Legend entries: one empty marker trace per line, carrying only name and colour
for line, color in line_colors.items():
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=15, color=color),
        name=line
    ))

fig.update_layout(legend=dict(orientation='h', yanchor='bottom', y=1.02, font=dict(size=12)))

fig.show()
save_fig(fig, '07_top_congestion_stations.png', width=800, height=600)

Saved: ../output/visualization/07_top_congestion_stations.png


## 8. 시간대별 혼잡도 패턴

In [13]:
# Average congestion across all stations for each time period.
# Labels are keyed by column so they stay aligned when a period column is
# missing, and use <br> because Plotly does not break lines on "\n".
period_col_labels = {
    'cong_00_06': '심야<br>(00-06)',
    'cong_06_11': '출근<br>(06-11)',
    'cong_11_14': '점심<br>(11-14)',
    'cong_14_17': '오후<br>(14-17)',
    'cong_17_21': '퇴근<br>(17-21)',
    'cong_21_24': '야간<br>(21-24)',
}
time_cols = [c for c in period_col_labels if c in df.columns]
# Same length and order as time_cols, so slicing by len(time_cols) is a no-op.
time_labels = [period_col_labels[c] for c in time_cols]

avg_by_time = df[time_cols].mean()

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=time_labels,
    y=avg_by_time.values,
    mode='lines+markers+text',
    line=dict(width=4, color='#3498db'),
    marker=dict(size=15, symbol='circle', color='#3498db'),
    text=[f'{v:.1f}' for v in avg_by_time.values],
    textposition='top center',
    textfont=dict(size=14, color='#2c3e50'),
    fill='tozeroy',
    fillcolor='rgba(52, 152, 219, 0.2)'
))

fig.update_layout(
    title=dict(text='시간대별 평균 혼잡도 패턴', font=dict(size=20)),
    xaxis=dict(title='시간대', tickfont=dict(size=13)),
    # 20% headroom above the peak leaves room for the value labels
    yaxis=dict(title='평균 혼잡도', tickfont=dict(size=12),
               range=[0, avg_by_time.max() * 1.2]),
    template='plotly_white',
    width=900,
    height=500
)

fig.show()
save_fig(fig, '08_daily_congestion_pattern.png', width=900, height=500)

Saved: ../output/visualization/08_daily_congestion_pattern.png


## 9. 종합 대시보드

In [14]:
# Summary dashboard: 2x2 grid reusing results computed in earlier cells
# (time_labels, time_cols, avg_by_time, line_colors, importance_df)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        '시간대별 평균 혼잡도',
        '호선별 혼잡도 분포',
        '업무 건물 vs 혼잡도',
        '변수 중요도 Top 5'
    ],
    specs=[
        [{"type": "scatter"}, {"type": "box"}],
        [{"type": "scatter"}, {"type": "bar"}]
    ],
    horizontal_spacing=0.12,
    vertical_spacing=0.15
)

# 1. Average congestion by time period
fig.add_trace(
    go.Scatter(
        x=time_labels[:len(time_cols)],
        y=avg_by_time.values,
        mode='lines+markers',
        line=dict(width=3, color='#3498db'),
        marker=dict(size=10),
        showlegend=False
    ),
    row=1, col=1
)

# 2. Congestion distribution per line
for line in ['2호선', '4호선', '5호선']:
    line_data = df[df['line_name'] == line]['cong_avg']
    fig.add_trace(
        go.Box(y=line_data, name=line, marker_color=line_colors[line], showlegend=False),
        row=1, col=2
    )

# 3. Office-building count vs average congestion
if 'office' in df.columns:
    fig.add_trace(
        go.Scatter(
            x=df['office'], y=df['cong_avg'],
            mode='markers',
            marker=dict(size=6, color='#2ecc71', opacity=0.6),
            showlegend=False
        ),
        row=2, col=1
    )

# 4. Top 5 features by importance.
# nlargest returns them in descending order, but Plotly draws the first
# category of a horizontal bar chart at the bottom; re-sorting ascending puts
# the most important feature on top, matching the standalone importance chart.
top5 = importance_df.nlargest(5, 'importance').sort_values('importance', ascending=True)
fig.add_trace(
    go.Bar(
        x=top5['importance'],
        y=top5['label'],
        orientation='h',
        marker_color='#9b59b6',
        showlegend=False
    ),
    row=2, col=2
)

fig.update_layout(
    title=dict(text='지하철 혼잡도 분석 대시보드 (2, 4, 5호선)', font=dict(size=22)),
    template='plotly_white',
    width=1200,
    height=900,
    showlegend=False
)

fig.show()
save_fig(fig, '09_dashboard.png', width=1200, height=900)

Saved: ../output/visualization/09_dashboard.png


In [15]:
# Final summary: banner, output location, and every PNG found in OUTPUT_DIR
banner = "=" * 60
print(banner)
print("시각화 완료!")
print(banner)
print(f"\n저장 위치: {OUTPUT_DIR}")
print("\n생성된 파일:")
png_names = sorted(name for name in os.listdir(OUTPUT_DIR) if name.endswith('.png'))
for png_name in png_names:
    print(f"  - {png_name}")

시각화 완료!

저장 위치: ../output/visualization

생성된 파일:
  - 01_correlation_heatmap.png
  - 02_time_pattern_by_building.png
  - 03_scatter_main_variables.png
  - 04_feature_importance.png
  - 05_congestion_by_line.png
  - 06_station_clustering.png
  - 07_top_congestion_stations.png
  - 08_daily_congestion_pattern.png
  - 09_dashboard.png
