### 필요한 라이브러리 불러오기

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

from glob import glob
from tqdm import tqdm
from scipy import interpolate

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, AveragePooling1D, GlobalAveragePooling1D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pylab as plt


# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import InteractiveSession

# config = ConfigProto()
# config.gpu_options.allow_growth = True
# session = InteractiveSession(config=config)

### 데이터 전처리

In [3]:
w_list = sorted(glob("/content/drive/MyDrive/Study/Dacon/dam/water_data/*.csv"))
r_list = sorted(glob("/content/drive/MyDrive/Study/Dacon/dam/rf_data/*.csv"))
w_list

['/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2012.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2013.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2014.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2015.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2016.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2017.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2018.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2019.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2020.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2021.csv',
 '/content/drive/MyDrive/Study/Dacon/dam/water_data/data_2022.csv']

In [4]:
pd.read_csv(w_list[0]).shape

(26496, 15)

In [5]:
pd.read_csv(r_list[0]).shape

(26496, 4)

In [6]:
pd.read_csv(w_list[0]).head(4)

Unnamed: 0,ymdhm,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630
0,2012-05-01 00:00,24.8,555.0,219.07,24.93,555.0,445.0,310.7,469.05,300.2,0.0,290.0,729.8,275.3,540.18
1,2012-05-01 00:10,24.794,464.6,218.86,25.15,562.9,449.0,314.7,498.0,300.2,0.0,290.0,731.48,275.3,540.18
2,2012-05-01 00:20,24.789,478.1,218.69,25.31,576.4,451.0,313.7,490.68,301.2,0.0,290.0,726.42,275.3,540.18
3,2012-05-01 00:30,24.789,464.8,218.69,25.31,563.1,452.0,311.7,476.21,301.2,0.0,290.0,726.42,276.3,552.17


In [7]:
pd.read_csv(w_list[0]).describe()

Unnamed: 0,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630
count,26495.0,26495.0,26495.0,26495.0,26495.0,25720.0,26496.0,26496.0,26496.0,26496.0,26496.0,26496.0,26496.0,26496.0
mean,25.042453,674.198675,229.294549,14.706602,675.913304,354.248911,339.474909,826.061119,322.93547,0.0,318.738489,751.475707,302.936322,1089.279554
std,1.27775,881.191078,12.625652,12.625592,878.086197,171.386529,67.209232,912.423482,58.278871,0.0,58.698993,1094.658477,49.652259,1101.845511
min,0.0,-807.5,0.0,2.15,0.0,71.0,264.7,243.29,264.2,0.0,253.0,-3299.97,241.3,212.04
25%,25.03,156.05,227.08,10.78,143.4,204.0,295.7,368.42,283.2,0.0,278.0,257.1975,267.3,449.12
50%,25.12,364.9,230.33,13.67,359.8,332.0,314.7,498.0,302.2,0.0,300.0,495.38,288.3,706.48
75%,25.2,765.2,233.22,16.93,756.05,492.0,361.7,905.32,342.2,0.0,340.0,1021.3575,325.3,1304.91
max,25.439,7693.0,241.85,244.0,7837.7,793.0,761.7,9405.64,703.2,0.0,694.0,9762.54,638.3,13983.59


유입량의 최솟값이 음수  
fw_1018683의 최솟값이 음수 로 나타난다.

In [8]:
pd.read_csv(r_list[0]).describe()

Unnamed: 0,rf_10184100,rf_10184110,rf_10184140
count,26496.0,26496.0,26496.0
mean,0.050725,0.053555,0.057216
std,0.418925,0.453166,0.496421
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,14.0,19.0,21.0


In [9]:
# Scaler
# StandardScaler
def scaler1(tmp):
    scaler = StandardScaler()
    tmp_df = pd.DataFrame(data=tmp, columns=["swl", "inf", "sfw", "ecpc",
                                                       "tototf", "tide_level"])
    scaler.fit(tmp_df)
    tmp_scaled = scaler.transform(tmp_df)

    tmp_df_scaled = pd.DataFrame(data=tmp_scaled, columns=["swl", "inf", "sfw", "ecpc",
                                                       "tototf", "tide_level"])
    return tmp_df_scaled
    
# MinMaxScaler
def scaler2(tmp):
    scaler = MinMaxScaler()
    tmp_df = pd.DataFrame(data=tmp, columns=["swl", "inf", "sfw", "ecpc",
                                                       "tototf", "tide_level"])
    scaler.fit(tmp_df)
    tmp_scaled = scaler.transform(tmp_df)
    
    tmp_df_scaled = pd.DataFrame(data=tmp_scaled, columns=["swl", "inf", "sfw", "ecpc",
                                                       "tototf", "tide_level"])   
    return tmp_df_scaled

In [10]:
train_data = []
train_label = []
num = 0

for i in w_list[:-1]:
    
    tmp = pd.read_csv(i)
    tmp = tmp.replace(" ", np.nan)
    tmp = tmp.interpolate(method = 'values')
    tmp = tmp.fillna(0)
    tmp1 = scaler2(tmp)
    tmp = tmp.assign(swl1 = tmp1['swl'],
                   inf1 = tmp1['inf'],
                   sfw1 = tmp1['sfw'],
                   ecpc1 = tmp1['ecpc'],
                   tototf1 = tmp1['tototf'],
                   tide_level1 = tmp1['tide_level']
                   )
    
    for j in tqdm(range(len(tmp)-432)):
        train_data.append(np.array(tmp.loc[j:j + 431, ["swl1", "inf1", "sfw1", "ecpc1",
                                                       "tototf1", "tide_level1",
                                                       "fw_1018662", "fw_1018680",
                                                       "fw_1018683", "fw_1019630"
                                                       ]]).astype(float))
        
        train_label.append(np.array(tmp.loc[j + 432, ["wl_1018662", "wl_1018680",
                                                      "wl_1018683", "wl_1019630"]]).astype(float))

100%|██████████| 26064/26064 [00:28<00:00, 914.70it/s]
100%|██████████| 26064/26064 [00:27<00:00, 937.84it/s]
100%|██████████| 26064/26064 [00:27<00:00, 958.60it/s]
100%|██████████| 26064/26064 [00:27<00:00, 960.60it/s]
100%|██████████| 26064/26064 [00:27<00:00, 960.26it/s]
100%|██████████| 26064/26064 [00:27<00:00, 959.15it/s]
100%|██████████| 26064/26064 [00:27<00:00, 956.77it/s]
100%|██████████| 26064/26064 [00:27<00:00, 961.59it/s]
100%|██████████| 26064/26064 [00:27<00:00, 955.67it/s]
100%|██████████| 26064/26064 [00:27<00:00, 958.73it/s]


In [11]:
train_data = np.array(train_data)
train_label = np.array(train_label)

print(train_data.shape)
print(train_label.shape)

(260640, 432, 10)
(260640, 4)


### 모델링 및 모델 학습

In [12]:
input_shape = (train_data[0].shape[0], train_data[0].shape[1])

model = Sequential()
model.add(GRU(256, input_shape=input_shape))
# model.add(Dense(128, activation = 'relu'))
model.add(Dense(4, activation = 'relu'))

optimizer = tf.optimizers.RMSprop(0.001)


In [13]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
EPOCH = 20
BATCH_SIZE = 256
filename = 'checkpoint-epoch-{}-batch-{}-trial-001.h5'.format(EPOCH, BATCH_SIZE)
checkpoint = ModelCheckpoint(filename,             # file명을 지정합니다
                             monitor='loss',       # loss 값이 개선되었을때 호출됩니다
                             save_best_only=True,  # 가장 best 값만 저장합니다
                             mode='auto'           # auto는 알아서 best를 찾습니다. min/max
                            )

earlystopping = EarlyStopping(monitor='loss',  # 모니터 기준 설정 (val loss) 
                              patience=3,         # 10회 Epoch동안 개선되지 않는다면 종료
                             )


model.compile(optimizer=optimizer,loss='mse', metrics=['mae'])

In [14]:
model.fit(train_data, train_label,
          callbacks=[checkpoint, earlystopping], 
          epochs=EPOCH, 
          batch_size=BATCH_SIZE)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


<keras.callbacks.History at 0x7f91b7c27990>

### 추론 데이터셋 만들기

In [15]:
test_data = []
test_label = []

tmp = pd.read_csv(w_list[-1])
tmp = tmp.replace(" ", np.nan)
# 이전값을 사용
tmp = tmp.fillna(method = 'pad')
tmp = tmp.fillna(0)
    
#tmp.loc[:, ["wl_1018662", "wl_1018680", "wl_1018683", "wl_1019630"]] = tmp.loc[:, ["wl_1018662", "wl_1018680", "wl_1018683", "wl_1019630"]]*100
    
for j in tqdm(range(4032, len(tmp)-432)):
    test_data.append(np.array(tmp.loc[j:j + 431, ["swl", "inf", "sfw", "ecpc",
                                                    "tototf", "tide_level",
                                                    "fw_1018662", "fw_1018680",
                                                    "fw_1018683", "fw_1019630"]]).astype(float))
        
    test_label.append(np.array(tmp.loc[j + 432, ["wl_1018662", "wl_1018680",
                                                    "wl_1018683", "wl_1019630"]]).astype(float))

100%|██████████| 6912/6912 [00:07<00:00, 953.19it/s]


In [16]:
test_data = np.array(test_data)
test_label = np.array(test_label)

print(test_data.shape)
print(test_label.shape)

(6912, 432, 10)
(6912, 4)


### 제출 파일 만들기

In [17]:
pred = model.predict(test_data)

In [18]:
pred = pd.DataFrame(pred)

In [19]:
sample_submission = pd.read_csv("/content/drive/MyDrive/Study/Dacon/dam/sample_submission.csv")

sample_submission["wl_1018662"] = pred[0]
sample_submission["wl_1018680"] = pred[1]
sample_submission["wl_1018683"] = pred[2]
sample_submission["wl_1019630"] = pred[3]

In [20]:
import datetime
now=datetime.datetime.now()
file_name = 'submission' + str(now.month)+ str(now.day) + '.csv'
sample_submission.to_csv(file_name, index = False)

In [21]:
import datetime
print(str(now.month), str(now.day))

8 3


# <8월 3일 회고>
### 변경점
---
- **fw, wl 데이터를 제외하고 기존 데이터를 부분 Standard Scaling 처리했다**  
 -> 999등이 나옴 스케일링은 안하는게 더 좋을것 같다는 판단이 들었음
- **fw, wl 데이터를 제외하고 기존 데이터를 부분 MinMax Scaling 처리했다**  
 -> 이것도 999등이 나옴 스케일링이 오히려 악영향을 미친다는 생각이 듬  
- **콜백기능 추가!**
 ### 앞으로 할 것
- **강수량 데이터를 이용!**  
    -> 대부분 0의값을 가지거나 적은 값을 가지기 때문에 *10등의 증폭을 해서 데이터를 적용해볼 예정!
- **전체적으로 스케일링을 하고 딥러닝처럼 역전파로 다시 연산해서 구해보는 방식을 써보는 것도 좋을 것같다!**