In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm
import seaborn as sns
import pandas as pd
from datetime import datetime as dt
import numpy as np
import os

In [42]:
# 데이터 불러오기
df = pd.read_pickle('/content/서울시.pkl')
print(df.head())
df.dtypes

   침수면적(1,000㎡)  총강수량(mm)  기간(일)   태풍  평균기온(℃)  최다강수1H  최대풍속(m/s)         일자  \
2           0.4     284.5      5  0.0     24.3    25.0        5.5 2013-07-13   
3           1.8     410.5      8  0.0     23.6    37.5        6.9 2002-08-08   
4           4.8     306.5      4  0.0     23.2    72.5        4.9 2011-07-28   
5           7.1     340.0      5  0.0     23.6    39.0        5.7 2003-08-25   
6          15.0     764.5     13  0.0     25.0    46.5        9.1 1999-07-29   

  행정구역  
2  도봉구  
3  도봉구  
4  도봉구  
5  도봉구  
6  도봉구  


Unnamed: 0,0
"침수면적(1,000㎡)",float64
총강수량(mm),float64
기간(일),int64
태풍,float64
평균기온(℃),float64
최다강수1H,float64
최대풍속(m/s),float64
일자,datetime64[ns]
행정구역,object


# 예측 전 전처리
- 예측에 쓰이지 않는 데이터를 드롭합니다.
  + 일자 같은 경우에는 장마철에 침수가 주로 일어난다는 것은 알았으나, 침수가 자주 일어나는 7~9월을 원핫인코딩을 진행하여 예측을 하기에는 장마철이 일관성있지 않게 매년 빠르거나 느리게 오기 때문에 '일자' 컬럼은 예측에서 고려하지 않겠습니다.
  + 행정구역은 원핫인코딩을 진행해서 하기에는 가장 많은 침수피해 데이터를 가진 서대문구 조차도 25회의 침수피해로 상당히 적은 수의 데이터를 가지고 있습니다. 그러므로 원핫인코딩을 진행하기보다는 '행정구역' 컬럼을 제외하고 서울시 전체의 데이터를 기준으로 진행하겠습니다.
    - 평균기온과 풍속의 경우에는 서울시 전체를 기준으로 하였을 때, 침수피해와 낮은 상관계수를 보이고 있습니다. 서울시 전체를 기준으로 예측을 진행할 것이기 때문에 '평균기온(℃)'컬럼과 '최대풍속(m/s)'컬럼을 제외하겠습니다.
  + 태풍은 어느 행정구역이든 침수피해와는 낮은 상관관계를 지녔습니다. 그렇기에 '태풍'컬럼 또한 제외하고 진행하도록 하겠습니다.


In [51]:
df.drop(columns=['일자', '태풍', '행정구역', '평균기온(℃)', '최대풍속(m/s)'], inplace=True)

In [52]:
df.head(3)

Unnamed: 0,"침수면적(1,000㎡)",총강수량(mm),기간(일),최다강수1H
2,0.4,284.5,5,25.0
3,1.8,410.5,8,37.5
4,4.8,306.5,4,72.5


In [11]:
df.to_pickle('서울시_예측.pkl')

In [12]:
df.to_csv("서울시_예측.csv", index=False, encoding="utf-8-sig")

# 선형회귀

In [13]:
X = df[['총강수량(mm)', '기간(일)', '최다강수1H']]
y = df['침수면적(1,000㎡)']

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [15]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_test)

print("실제값:", y_test.values)
print("예측값:", y_pred)

실제값: [4.6000e+00 2.4000e+00 7.4100e+01 1.1800e+01 4.9870e+02 1.2255e+03
 2.4000e+00 5.8400e+01 1.3600e+01 8.0500e+01 3.8000e+00 1.2600e+01
 2.6600e+01 4.0000e-01 2.1920e+02 1.4000e+00 4.3200e+01 4.0800e+02
 1.1920e+02 4.0000e-01 7.2000e+00 2.4000e+00 1.4000e+00 3.2000e+00
 7.5000e+00 9.0000e-01 4.3500e+01 8.9000e+00 4.7500e+01 1.4000e+01
 1.2300e+01 1.4292e+03 9.1900e+01 1.0000e-01 4.3850e+02 2.0300e+01
 2.3000e+00 9.0000e-01 6.0560e+02 2.0820e+02 3.2900e+01 1.1940e+02
 8.2000e+00 2.9000e+00 4.0400e+02 7.1000e+00 4.0000e-01 2.8000e+00
 1.0752e+03 9.0000e-01 3.8000e+00 1.7200e+01 3.7500e+01 1.3280e+02
 4.3000e+00 2.3500e+01 5.0000e-01 4.7000e+00 2.6700e+01 1.0140e+02
 5.1100e+01 3.8000e+00 3.9000e+00 5.4000e+00 1.4250e+02 1.5600e+01
 5.2000e+00 3.3460e+02 2.4100e+01 4.8750e+02 1.3450e+02 3.1300e+01
 2.1460e+02 9.0000e-01 5.7000e+00 2.4000e+00 5.0000e-01 3.8800e+01
 7.0100e+01 6.0960e+02 2.8000e+00 4.4000e+00 3.9100e+01 9.8000e+00
 9.0000e-01 1.4000e+00 1.4000e+01 1.4970e+02 3.3000e+00 9

In [41]:
len(df)

318

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²  :", r2_score(y_test, y_pred))

MAE : 166.99586546555236
RMSE: 231.37814083936323
R²  : 0.21932421117559442


## 결과
평균적으로 침수면적(1,000㎡) 당 166㎡의 오차 발생

# 랜덤포레스트

In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [19]:
# 입력 변수(X)와 타겟(y) 분리
X = df[['총강수량(mm)', '기간(일)', '최다강수1H']]
y = df['침수면적(1,000㎡)']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [21]:
rf = RandomForestRegressor(
    n_estimators=200,     # 트리 개수
    max_depth=None,       # 트리 깊이 제한 없음
    random_state=42
)

rf.fit(X_train, y_train)

In [22]:
y_pred = rf.predict(X_test)

In [23]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")
print(f"R²   : {r2:.3f}")


MAE  : 91.893
RMSE : 196.979
R²   : 0.510


## 결과
평균적으로 침수면적(1,000㎡) 당 91.9㎡의 오차 발생
- 선형회귀보다 개선된 예측

# 딥러닝

In [53]:
df.dtypes

Unnamed: 0,0
"침수면적(1,000㎡)",float64
총강수량(mm),float64
기간(일),int64
최다강수1H,float64


In [54]:
df.head()

Unnamed: 0,"침수면적(1,000㎡)",총강수량(mm),기간(일),최다강수1H
2,0.4,284.5,5,25.0
3,1.8,410.5,8,37.5
4,4.8,306.5,4,72.5
5,7.1,340.0,5,39.0
6,15.0,764.5,13,46.5


In [56]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# ===============================
# 1. df 준비
# ===============================

# 타깃 / 피처 분리
X = df.drop(columns=['침수면적(1,000㎡)'])
y = df['침수면적(1,000㎡)']

# ===============================
# 2. 데이터 분할 (랜덤)
# ===============================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# ===============================
# 3. 스케일링
# ===============================
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

# ===============================
# 4. 모델 생성 함수
# ===============================
def build_model(input_dim, hidden_units, dropout_rate, lr):
    model = Sequential()
    model.add(Dense(hidden_units, activation='relu', input_dim=input_dim))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units // 2, activation='relu'))
    model.add(Dense(1))  # 회귀

    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss='mse'
    )
    return model

# ===============================
# 5. 하이퍼파라미터 후보
# ===============================
hidden_units_list = [32, 64, 128]
dropout_list = [0.1, 0.2, 0.3]
learning_rate_list = [0.001, 0.0005]

best_score = np.inf
best_result = None
results = []

# ===============================
# 6. 하이퍼파라미터 탐색
# ===============================
for hidden_units in hidden_units_list:
    for dropout_rate in dropout_list:
        for lr in learning_rate_list:
            print(f"\n[실험] units={hidden_units}, dropout={dropout_rate}, lr={lr}")

            model = build_model(
                input_dim=X_train.shape[1],
                hidden_units=hidden_units,
                dropout_rate=dropout_rate,
                lr=lr
            )

            es = EarlyStopping(
                monitor='val_loss',
                patience=20,
                restore_best_weights=True,
                verbose=0
            )

            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=300,
                batch_size=16,
                callbacks=[es],
                verbose=0
            )

            # ===============================
            # 수정 1: 검증 RMSE 계산
            # ===============================
            val_pred = model.predict(X_val).ravel()
            val_mse = mean_squared_error(y_val, val_pred)
            rmse = np.sqrt(val_mse)

            results.append({
                'hidden_units': hidden_units,
                'dropout': dropout_rate,
                'lr': lr,
                'val_rmse': rmse
            })

            if rmse < best_score:
                best_score = rmse
                best_result = {
                    'model': model,
                    'params': (hidden_units, dropout_rate, lr)
                }

# ===============================
# 7. 최적 모델 평가 (Test)
# ===============================
best_model = best_result['model']

test_pred = best_model.predict(X_test).ravel()

mae = mean_absolute_error(y_test, test_pred)

# ===============================
# 2: 테스트 RMSE 계산
# ===============================
test_mse = mean_squared_error(y_test, test_pred)
rmse = np.sqrt(test_mse)

r2 = r2_score(y_test, test_pred)

print("\n===== 최종 테스트 성능 =====")
print(f"MAE  : {mae:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"R²   : {r2:.4f}")

print("\n최적 하이퍼파라미터:")
print(f"Hidden Units: {best_result['params'][0]}")
print(f"Dropout     : {best_result['params'][1]}")
print(f"LR          : {best_result['params'][2]}")

# ===============================
# 8. 결과 정리
# ===============================
results_df = pd.DataFrame(results).sort_values('val_rmse')
results_df.head()



[실험] units=32, dropout=0.1, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 46ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step

[실험] units=32, dropout=0.1, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



[실험] units=32, dropout=0.2, lr=0.001
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

[실험] units=32, dropout=0.2, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

[실험] units=32, dropout=0.3, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step

[실험] units=32, dropout=0.3, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step

[실험] units=64, dropout=0.1, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step

[실험] units=64, dropout=0.1, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step

[실험] units=64, dropout=0.2, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

[실험] units=64, dropout=0.2, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

[실험] units=64, dropout=0.3, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step

[실험] units=64, dropout=0.3, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step

[실험] units=128, dropout=0.1, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step

[실험] units=128, dropout=0.1, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

[실험] units=128, dropout=0.2, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step

[실험] units=128, dropout=0.2, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step

[실험] units=128, dropout=0.3, lr=0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step

[실험] units=128, dropout=0.3, lr=0.0005


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step

===== 최종 테스트 성능 =====
MAE  : 149.1778
RMSE : 287.7869
R²   : 0.2455

최적 하이퍼파라미터:
Hidden Units: 32
Dropout     : 0.3
LR          : 0.0005


Unnamed: 0,hidden_units,dropout,lr,val_rmse
5,32,0.3,0.0005,168.476153
3,32,0.2,0.0005,168.537046
7,64,0.1,0.0005,168.598503
1,32,0.1,0.0005,168.612439
6,64,0.1,0.001,168.653831


## 결과
평균적으로 침수면적(1,000㎡) 당 149.2㎡의 오차 발생

# 시계열
- 해당 데이터가 꾸준히 업데이트 되어온 것이 아니므로 시계열 예측 및 순석이 불가능하다고 판단됩니다.