본 대회의 의도는 추론하고자 하는 기간의 10분 전에, 한강의 주요 지점(다리)에 대한 수위를 예측하는 것이다.

[팔당댐 홍수 안전운영에 따른 한강 수위예측 AI 경진대회, DACON](https://dacon.io/competitions/official/235949/data)

In [None]:
import pandas as pd
import numpy as np

from glob import glob
from tqdm import tqdm
from scipy import interpolate

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, AveragePooling1D, GlobalAveragePooling1D

'''#GPU 자원이 부족한 경우 아래 코드를 이용하세요
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)'''

'#GPU 자원이 부족한 경우 아래 코드를 이용하세요\nfrom tensorflow.compat.v1 import ConfigProto\nfrom tensorflow.compat.v1 import InteractiveSession\n\nconfig = ConfigProto()\nconfig.gpu_options.allow_growth = True\nsession = InteractiveSession(config=config)'

### 파일 받아오기

In [None]:
import zipfile
from zipfile import ZipFile

# Archive with the competition data (presumably on a mounted Google Drive).
path = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/competition_data.zip'

# Unpack everything under /content/competition_data and keep the member
# listing in `namelist` for inspection.
with ZipFile(path, mode='r') as obj:
  namelist = obj.namelist()
  obj.extractall(path='/content/competition_data')

In [None]:
# Collect the water-level and rainfall CSV paths; sorting makes the order
# deterministic.
# NOTE(review): the lists are later paired by position, which presumably
# relies on both folders using matching file names — confirm.
w_list = sorted(glob("competition_data/water_data/*.csv"))
rf_list = sorted(glob("competition_data/rf_data/*.csv"))

### dataframe으로 합쳐서 변환(연도 순서대로)

In [None]:
# Build one frame from all files: each water-level CSV is merged with the
# rainfall CSV at the same list position (pd.merge joins on the columns the
# two files share), then the per-file frames are stacked.
# pd.concat replaces the row-by-row DataFrame.append, which was removed in
# pandas 2.0 and re-copied the growing frame on every iteration.
merged_frames = [
    pd.merge(pd.read_csv(w), pd.read_csv(rf))
    for w, rf in zip(w_list, rf_list)
]
df = pd.concat(merged_frames)
# Sort by timestamp first and renumber afterwards, so index labels always
# equal row positions (later cells use both .index and .iloc).
df = df.sort_values(by=['ymdhm'], axis=0)
df = df.reset_index(drop=True)
df.tail(3)

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630,rf_10184100,rf_10184110,rf_10184140
276333,2022-07-18 23:30,25.04,259.37,212.86,31.14,259.37,475.0,0.0,387.55,0.0,,0.0,-843.37,0.0,1039.9,0.0,0.0,0.0
276334,2022-07-18 23:40,25.04,259.13,212.86,31.14,259.13,458.0,0.0,454.91,0.0,,0.0,-1023.37,0.0,1073.46,0.0,0.0,0.0
276335,2022-07-18 23:50,25.04,258.16,212.86,31.14,258.16,442.0,0.0,582.21,0.0,,0.0,-1049.44,0.0,1090.45,0.0,0.0,0.0


In [None]:
# Match units: scale 'sfw' by 1e4 and 'ecpc' by 1e6.
# NOTE(review): the source units are not stated in this notebook — confirm
# the factors against the data description.
df['sfw'] = df['sfw'].mul(10000)
df['ecpc'] = df['ecpc'].mul(1000000)

In [None]:
# Row/column count of the combined frame (sanity check).
df.shape

(276336, 18)

### test dataset 따로 저장하기
데이터 누수 제거

In [None]:
# The dates listed in sample_submission are the test period and must be
# excluded from df. Load the file, ordered by timestamp, so its first row
# gives the start date of the test data.
submission_df = pd.read_csv('competition_data/sample_submission.csv')
submission_df = submission_df.sort_values(by=['ymdhm'], axis=0)
print(submission_df.shape)
submission_df.head(3)

(6912, 5)


Unnamed: 0,ymdhm,wl_1018662,wl_1018680,wl_1018683,wl_1019630
0,2022-06-01 00:00,0,0,0,0
1,2022-06-01 00:10,0,0,0,0
2,2022-06-01 00:20,0,0,0,0


In [None]:
# Store the test dataset and drop those rows from the training frame.
# The split point is located as a row POSITION (not an index label) so the
# .iloc slices below are correct even if df's index labels are not 0..n-1.
first_test_ymdhm = submission_df.ymdhm.iloc[0]
match_positions = np.flatnonzero((df['ymdhm'] == first_test_ymdhm).to_numpy())
if match_positions.size == 0:
  raise ValueError(f"first submission timestamp {first_test_ymdhm!r} not found in df['ymdhm']")
start_data = int(match_positions[0])
print(start_data)
# Keep one extra row just before the test period (original note: "includes
# the previous data"); the clamp keeps the slice valid when start_data is 0.
test_df = df.iloc[max(start_data - 1, 0):]
test_df = test_df.reset_index(drop=True)
df = df.iloc[:start_data]
print(df.shape, test_df.shape)
test_df.head(3)

269424
(269424, 18) (6913, 18)


Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630,rf_10184100,rf_10184110,rf_10184140
0,2022-05-31 23:50,25.19,140.56,2183600.0,25640000.0,140.56,173.0,278.7,269.4,276.2,,275.0,70.09,270.3,482.26,0.0,0.0,0.0
1,2022-06-01 00:00,25.19,140.6,2183600.0,25640000.0,140.6,162.0,0.0,269.4,0.0,,0.0,8.23,0.0,471.08,0.0,0.0,0.0
2,2022-06-01 00:10,25.19,140.78,2183600.0,25640000.0,140.78,151.0,0.0,280.22,0.0,,0.0,28.82,0.0,449.12,0.0,0.0,0.0


In [None]:
# Last rows of the training frame after the test rows were split off.
df.tail(3)

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630,rf_10184100,rf_10184110,rf_10184140
269421,2022-05-31 23:30,25.18,141.01,2179900.0,26010000.0,141.01,196.0,278.7,269.4,277.2,,277.0,425.89,272.3,505.03,0.0,0.0,0.0
269422,2022-05-31 23:40,25.19,755.75,2183600.0,25640000.0,140.75,184.0,277.7,264.07,276.2,,276.0,198.19,271.3,493.58,0.0,0.0,0.0
269423,2022-05-31 23:50,25.19,140.56,2183600.0,25640000.0,140.56,173.0,278.7,269.4,276.2,,275.0,70.09,270.3,482.26,0.0,0.0,0.0


### NULL값 idx 저장 및 보강

na_idxes에 null값이 있던 idx를 저장한다. 이후에 label에 null값이 있다면 그것은 train dataset에서 제거할 예정이다.

In [None]:
print(f'shape : {df.shape}')
print(f'null : {df.isna().sum()}')
# Remember, per column, which index labels were NaN before anything is filled.
na_idxes = {col: df.index[df[col].isna().to_numpy()] for col in df.columns}
# Fill the gaps with the interpolated values; non-missing cells are untouched.
df = df.fillna(df.interpolate())
print(f'null : {df.isna().sum()}')

In [None]:
# Rainfall columns as listed in the original note: rf_10184100 (Namyangju),
# rf_10184110 (Daegok Bridge, Songpa-gu), rf_10184140 (Songjeong-dong,
# Seongdong-gu, Seoul).
# Input columns: six general measurements followed by one fw_* column per
# water-level code; the label columns are the wl_* columns for the same codes.
features = ['swl', 'inf', 'sfw', 'ecpc', 'tototf', 'tide_level'] + [
    'fw_' + code for code in ('1018662', '1018680', '1018683', '1019630')
]
rainfall = ['rf_' + code for code in ('10184100', '10184110', '10184140')]
label = ['wl_' + code for code in ('1018662', '1018680', '1018683', '1019630')]


In [None]:
# Intersection of the NaN positions of the four water-level columns.
# (Original note: all four were NaN at the same rows, so the intersection
# came out equal to the union.)
unique_idx = set.intersection(
    *(set(na_idxes[wl_col])
      for wl_col in ('wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630'))
)
print(len(unique_idx))
ordered_idx = sorted(unique_idx)
print(ordered_idx)
# Smallest, second-largest and largest NaN index.
mmin_idx, mmax_idx, mmmax_idx = ordered_idx[0], ordered_idx[-2], ordered_idx[-1]

59
[140613, 140614, 140615, 140616, 140617, 140618, 140619, 140620, 140621, 140622, 140623, 140624, 140625, 140626, 140627, 140628, 140629, 140630, 140631, 140632, 140633, 140634, 140635, 140636, 140637, 140638, 140639, 140640, 140641, 140642, 140643, 140644, 140645, 140646, 140647, 140648, 140649, 140650, 140651, 140652, 140653, 140654, 140655, 140656, 140657, 140658, 140659, 140660, 140661, 140662, 140663, 140664, 140665, 140666, 140667, 140668, 140669, 140670, 236080]


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# fig,axs = plt.subplots(2,2)
# sns.lineplot(data=df,x='wl_1018662',y='fw_1018662',ax=axs[0,0])
# sns.lineplot(data=df,x='wl_1018680',y='fw_1018680',ax=axs[0,1])
# sns.lineplot(data=df,x='wl_1018683',y='fw_1018683',ax=axs[1,0])
# sns.lineplot(data=df,x='wl_1019630',y='fw_1019630',ax=axs[1,1])

In [None]:
# Repair the test dataset the same way; note this overwrites `na_idxes`
# with the NaN positions of test_df.
print(f'shape : {test_df.shape}')
na_idxes = {col: test_df.index[test_df[col].isna().to_numpy()] for col in test_df.columns}
test_df = test_df.fillna(test_df.interpolate())
print(f'null : {test_df.isna().sum()}')

shape : (6931, 19)
null : 0              6913
ecpc             18
fw_1018662       18
fw_1018680       18
fw_1018683       18
fw_1019630       18
inf              18
rf_10184100      18
rf_10184110      18
rf_10184140      18
sfw              18
swl              18
tide_level       18
tototf           18
wl_1018662       18
wl_1018680       18
wl_1018683       18
wl_1019630       18
ymdhm            18
dtype: int64


In [None]:
# Replace any NaN still present in test_df with 0, then show its shape.
test_df = test_df.fillna(0)
test_df.shape

(6913, 18)

## 모델링

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
# from sklearn.multioutput import MultiOutputRegressor
# import xgboost as xgb
from sklearn.linear_model import ElasticNet, Lasso

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the first seven feature columns plus the first rainfall
# column against the first label column, with no time shift.
# Inputs and labels go through ONE call so both receive the same row
# selection by construction (and a length mismatch raises) instead of relying
# on two separate calls sharing random_state.
train, val, label_train, label_val = train_test_split(
    df[features[:-3]+rainfall[:-2]], df[label[:-3]],
    train_size=0.80, test_size=0.20, random_state=2)

In [None]:
# Compare the row count with and without the last row.
df.iloc[:-1].shape, df.shape

((269423, 18), (269424, 18))

In [None]:
from sklearn.model_selection import train_test_split
# One-row (10-minute) shift: the inputs at row t are paired with the label
# at row t+1, so inputs drop the last row and labels drop the first.
shifted_inputs = df[features[:-3]+rainfall[:-2]].iloc[:-1]
shifted_labels = df[label[:-3]].iloc[1:]
# Split inputs and labels in ONE call so both get the same row selection by
# construction instead of relying on two calls sharing random_state.
# NOTE(review): train_test_split shuffles rows by default, so adjacent time
# steps end up on both sides of the split; validation scores are likely
# optimistic for this time series — consider a chronological split.
train, val, label_train, label_val = train_test_split(
    shifted_inputs, shifted_labels,
    train_size=0.80, test_size=0.20, random_state=2)
train.shape, val.shape, label_train.shape, label_val.shape

((215538, 8), (53885, 8), (215538, 1), (53885, 1))

In [None]:
from sklearn.metrics import mean_squared_error
def poly_pipe(alphas) :
  """Fit a degree-2 polynomial ElasticNet per alpha and print its validation RMSE.

  Args:
    alphas: iterable of ElasticNet ``alpha`` values to try.

  Reads the notebook-level ``train``/``label_train`` for fitting and
  ``val``/``label_val`` for scoring. Returns None; one RMSE is printed per alpha.
  """
  for alpha in alphas :
    poly_pipeline = make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        ElasticNet(alpha=alpha, l1_ratio=0.1)
    )
    poly_pipeline.fit(train, label_train)
    poly_pred = poly_pipeline.predict(val)
    # RMSE per output column, then averaged: the same number squared=False
    # produced, without the keyword newer scikit-learn no longer accepts.
    per_output_mse = mean_squared_error(label_val, poly_pred, multioutput='raw_values')
    print(np.mean(np.sqrt(per_output_mse)))
print([0.25, 0.1, 0.01, 0.05])
# Experiment log (validation RMSE of earlier runs):
#   first run: rmse : 11
#   shifted by 10 minutes (one row): rmse : 11
#   shifted, without polynomial features: rmse : 20
#   with polynomial features, all 4 labels: rmse : 8.19, 8.19,
# The sweep is deterministic, so it is run once; the original cell ran the
# identical call twice.
poly_pipe([0.25, 0.1, 0.01, 0.05])

In [None]:
def poly_lasso_pipe(alphas) :
  """Fit a degree-2 polynomial Lasso per alpha and print its validation RMSE.

  Args:
    alphas: iterable of Lasso ``alpha`` values to try.

  Reads the notebook-level ``train``/``label_train`` for fitting and
  ``val``/``label_val`` for scoring. Returns None; one RMSE is printed per alpha.
  """
  for alpha in alphas :
    lasso_pipeline = make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        Lasso(alpha=alpha)
    )
    lasso_pipeline.fit(train, label_train)
    poly_pred = lasso_pipeline.predict(val)
    # RMSE per output column, then averaged: the same number squared=False
    # produced, without the keyword newer scikit-learn no longer accepts.
    per_output_mse = mean_squared_error(label_val, poly_pred, multioutput='raw_values')
    print(np.mean(np.sqrt(per_output_mse)))
# Alpha values being swept; commented-out calls record earlier results.
print([0.25, 0.1, 0.01, 0.05])
# poly_lasso_pipe([0.25, 0.1, 0.01, 0.05])  # rmse : 11
# With the 10-minute shift
# poly_lasso_pipe([0.25, 0.1, 0.01, 0.05])  # rmse : 11.55
poly_lasso_pipe([0.25, 0.1, 0.01, 0.05])  # all 4 variables, rmse : 8.19,

[0.25, 0.1, 0.01, 0.05]


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


8.193855229398045


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


8.192389289555589


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


8.183479345195003


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive
  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


8.191665468591738


  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


In [None]:
import xgboost
def poly_xgboost_pipe(lr) :
  """Fit an XGBoost regressor per learning rate and print its validation RMSE.

  Args:
    lr: iterable of ``learning_rate`` values to try.

  Reads the notebook-level ``train``/``label_train`` for fitting and
  ``val``/``label_val`` for scoring. Returns None; one RMSE is printed per rate.
  The polynomial-feature step is currently disabled (kept commented out).
  """
  for lrr in lr :
    xgboost_pipeline = make_pipeline(
        # PolynomialFeatures(degree=2, include_bias=False),
        xgboost.XGBRegressor(n_estimators=500, learning_rate=lrr, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=7)
    )
    xgboost_pipeline.fit(train, label_train)
    poly_pred = xgboost_pipeline.predict(val)
    # RMSE per output column, then averaged: the same number squared=False
    # produced, without the keyword newer scikit-learn no longer accepts.
    per_output_mse = mean_squared_error(label_val, poly_pred, multioutput='raw_values')
    print(np.mean(np.sqrt(per_output_mse)))
# Experiment log: commented-out calls record earlier learning-rate sweeps and
# the RMSE each produced.
# poly_xgboost_pipe([0.25, 0.1, 0.01, 0.05])  # rmse : 1.28, 1.40, 123.55, 2.68
# poly_xgboost_pipe([0.25, 0.3, 0.5])  # rmse : 1.28, 1.32, 1.48
# poly_xgboost_pipe([0.25, 0.2, 0.15])  # rmse : 1.2838, 1.2861, 1.2763
# poly_xgboost_pipe([0.25, 0.2, 0.15, 0.125, 0.1])  # poly rmse : 1.6456, 1.5485, 1.4009, 1.3963
# With the 10-minute shift, 1 variable
poly_xgboost_pipe([0.25, 0.2, 0.15, 0.125, 0.1])  # xgboost rmse : 2.9467, 2.9468, 2.9380, 2.8844, 2.90

2.946770384408707
2.9468224047046694
2.9380591053730174
2.8844266962435507
2.903197742275271


In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split using every feature and rainfall column against all label
# columns, with the one-row shift (inputs drop the last row, labels the first).
# Inputs and labels go through ONE call so both receive the same row
# selection by construction instead of relying on two calls sharing
# random_state.
train, val, label_train, label_val = train_test_split(
    df[features+rainfall].iloc[:-1], df[label].iloc[1:],
    train_size=0.80, test_size=0.20, random_state=2)

In [None]:
from sklearn.ensemble import RandomForestRegressor
def ramdomforest_pipe(depth) :
  """Fit a random forest per max_depth and print its validation RMSE.

  Args:
    depth: iterable of ``max_depth`` values to try.

  Reads the notebook-level ``train``/``label_train`` for fitting and
  ``val``/``label_val`` for scoring. Returns None; one RMSE is printed per depth.
  """
  for dp in depth :
    rdf_pipeline = make_pipeline(
       RandomForestRegressor(max_depth=dp, random_state=0)
    )
    # Fit on the training split only. Fitting on all of df would include the
    # very rows in `val` that are scored below, making the RMSE meaningless.
    rdf_pipeline.fit(train, label_train)
    rdf_pred = rdf_pipeline.predict(val)
    # RMSE per output column, then averaged: the same number squared=False
    # produced, without the keyword newer scikit-learn no longer accepts.
    per_output_mse = mean_squared_error(label_val, rdf_pred, multioutput='raw_values')
    print(np.mean(np.sqrt(per_output_mse)))
# Experiment log (earlier results):
#   random forest rmse, 4 labels, 10-minute shift : 24.25 , 4.76, 2.95,
#   random forest rmse, 4 labels : 24.25
# random_state is fixed, so the sweep is run once; the original cell trained
# the same five forests twice.
ramdomforest_pipe([2, 8, 16, 32, 64])

In [None]:
# Final model: a depth-16 random forest fitted on every row of df, with the
# one-row shift between inputs and labels.
final_inputs = df[features+rainfall].iloc[:-1]
final_labels = df[label].iloc[1:]
rdf_pipeline = make_pipeline(RandomForestRegressor(max_depth=16, random_state=0))
rdf_pipeline.fit(final_inputs, final_labels)
# Predictions for the validation rows (those rows were part of the fit).
rdf_pred = rdf_pipeline.predict(val)

In [None]:
# Row/column count of the test frame before predicting.
test_df.shape

(6913, 18)

In [None]:
# Each input row predicts the labels of the following row, so the last test
# row has nothing to predict and is dropped.
test_inputs = test_df[features+rainfall].iloc[:-1]
rdf_pred = rdf_pipeline.predict(test_inputs)
rdf_pred.shape

(6912, 4)

### 저장

In [None]:
# Wrap earlier prediction arrays in DataFrames, but only when they exist at
# notebook scope. In the sweep helpers `poly_pred` is a local variable and
# `lasso_pred` is never assigned, so on a clean run the unconditional
# conversion raised NameError before the submission file was written.
_notebook_scope = globals()
for _pred_name in ('poly_pred', 'lasso_pred'):
  if _pred_name in _notebook_scope:
    _notebook_scope[_pred_name] = pd.DataFrame(_notebook_scope[_pred_name])

In [None]:
# Reload the submission template and fill each water-level column with the
# matching column of the prediction array (same column order as the template).
sample_submission = pd.read_csv("competition_data/sample_submission.csv")

for pred_col, wl_name in enumerate(["wl_1018662", "wl_1018680", "wl_1018683", "wl_1019630"]):
  sample_submission[wl_name] = rdf_pred[:, pred_col]

In [None]:
# Write the submission to rdf.csv without the DataFrame index column.
sample_submission.to_csv("rdf.csv", index = False)

In [None]:
# Peek at the first prediction row (one value per water-level column).
rdf_pred[0]

array([279.17592982, 272.22739921, 272.42953892, 270.00281624])