In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler


In [25]:
train = pd.read_csv('use_this.csv')
train.head()

Unnamed: 0,stn4contest,v01,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval,year,month,day,hour
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,12
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,15
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,18
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,21
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,2,0


In [10]:
# 학습 데이터와 테스트 데이터 분할
train_data = train[train['stn4contest'] <= 16]
test_data = train[train['stn4contest'] > 15]

# 입력 특성 (v01 ~ v09)와 타겟 변수 (vv) 분리
X_train = train_data[['stn4contest', 'year', 'month', 'day', 'hour', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']]
y_train = train_data['vv']
X_test = test_data[['stn4contest', 'year', 'month', 'day', 'hour', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']]
y_test = test_data['vv']

# 데이터 표준화
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# MLP 모델 정의
model = Sequential([
    Dense(64, input_dim=14, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # 회귀 문제이므로 선형 활성화 함수 사용
])

# 모델 컴파일
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# 모델 학습
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2)

# 모델 예측
y_pred = model.predict(X_test)

# 성능 지표 계산
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 결과 저장
results = {
    'Window Size': 1,  # 현재 예제에서는 window size가 1로 고정되어 있음
    'RMSE': rmse,
    'MAE': mae,
    'MSE': mse,
    'R²': r2
}

# 결과 출력
print("Model Performance Metrics by Window Size:")
print(f"{'Window Size':<15}{'RMSE':<10}{'MAE':<10}{'MSE':<10}{'R²':<10}")
print(f"{'-'*45}")
print(f"{results['Window Size']:<15}{results['RMSE']:<10.4f}{results['MAE']:<10.4f}{results['MSE']:<10.4f}{results['R²']:<10.4f}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model Performance Metrics by Window Size:
Window Size    RMSE      MAE       MSE       R²        
---------------------------------------------
1              3.4160    0.7919    11.6690   0.3525    


In [11]:
test_data = pd.read_csv('test_stn+ef_label.csv')
test_data.head()

Unnamed: 0,stn4contest,ef_datetime,v01,v02,v03,v04,v05,v06,v07,v08,v09,year,month,day,hour,class_interval
0,31,2023-05-01 12:00:00,68.0,46.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,12,
1,31,2023-05-01 15:00:00,83.0,66.0,26.0,5.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,15,
2,31,2023-05-01 18:00:00,17.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,18,
3,31,2023-05-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,21,
4,31,2023-05-02 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,2,0,


In [14]:
features = test_data[['ef_datetime', 'stn4contest', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09', 'year', 'month', 'day', 'hour']]
target = test_data['class_interval']

features.set_index('ef_datetime', inplace=True)

# 데이터 정렬
features = features.sort_index()

# 데이터 정규화
scaler_features = MinMaxScaler(feature_range=(0, 1))

test_features_scaled = scaler_features.fit_transform(features)

In [15]:
# 예측
test_predict = model.predict(test_features_scaled)

# 예측 결과를 데이터프레임으로 변환
predicted_df = pd.DataFrame(test_predict, index=features.index, columns=['Predicted vv'])

# 예측 결과 출력
predicted_df



Unnamed: 0_level_0,Predicted vv
ef_datetime,Unnamed: 1_level_1
2023-05-01 12:00:00,0.441448
2023-05-01 12:00:00,-0.241992
2023-05-01 12:00:00,2.584950
2023-05-01 12:00:00,1.929838
2023-05-01 12:00:00,0.820360
...,...
2023-10-10 21:00:00,25.348650
2023-10-10 21:00:00,39.113647
2023-10-10 21:00:00,41.733391
2023-10-10 21:00:00,31.216543


In [16]:
def get_class_interval(rainfall):
    if rainfall < 0.1:
        return 0
    elif 0.1 <= rainfall < 0.2:
        return 1
    elif 0.2 <= rainfall < 0.5:
        return 2
    elif 0.5 <= rainfall < 1.0:
        return 3
    elif 1.0 <= rainfall < 2.0:
        return 4
    elif 2.0 <= rainfall < 5.0:
        return 5
    elif 5.0 <= rainfall < 10.0:
        return 6
    elif 10.0 <= rainfall < 20.0:
        return 7
    elif 20.0 <= rainfall < 30.0:
        return 8
    else:
        return 9
    
# 'Predicted vv' 열에 변환 함수 적용
predicted_df['class_interval'] = predicted_df['Predicted vv'].apply(get_class_interval)

predicted_df['class_interval'].value_counts()

class_interval
0    1881
5     987
7     921
6     901
4     596
3     374
8     339
2     272
9     116
1     113
Name: count, dtype: int64

In [17]:
# 'Predicted vv' 열에 변환 함수 적용하여 클래스 구간 계산
predicted_df['Class Interval'] = predicted_df['Predicted vv'].apply(get_class_interval)

# 원래의 test_data에 'Class Interval' 열 추가
test_data['class_interval'] = predicted_df['Class Interval'].values

In [19]:
test_data.head()

Unnamed: 0,stn4contest,ef_datetime,v01,v02,v03,v04,v05,v06,v07,v08,v09,year,month,day,hour,class_interval
0,31,2023-05-01 12:00:00,68.0,46.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,12,2
1,31,2023-05-01 15:00:00,83.0,66.0,26.0,5.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,15,0
2,31,2023-05-01 18:00:00,17.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,18,5
3,31,2023-05-01 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,1,21,4
4,31,2023-05-02 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2023,5,2,0,3


In [20]:
submit = pd.read_csv("rainfall_test2.csv")
realsubmit = pd.read_csv("rainfall_test.csv")
prediction_test = test_data.copy()

#submit-prediction 지점,date 맞추기
prediction_test['ef_datetime'] = pd.to_datetime(prediction_test['ef_datetime'])
submit['ef_year_temp'] = submit['ef_year'].replace({'D': '2023'})
submit['ef_datetime'] = pd.to_datetime(submit[['ef_year_temp', 'ef_month', 'ef_day', 'ef_hour']].astype(str).agg('-'.join, axis=1), format='%Y-%m-%d-%H')
stn_mapping = {'STN031': 31, 'STN032': 32, 'STN033': 33, 'STN034': 34, 'STN035': 35}
submit['stn4contest'] = submit['stn4contest'].replace(stn_mapping)

#test_data['class_interval'] = y_pred2
#test_data['class_interval'] = test_data['class_interval'].apply(classify_class)

    
merged = submit.merge(prediction_test[['ef_datetime', 'stn4contest', 'class_interval']], 
                      on=['ef_datetime', 'stn4contest'], 
                      how='left', 
                      suffixes=('', '_pred'))
merged['class_interval_pred']
    
#-999 index 저장
idx999 = realsubmit[realsubmit['rainfall_test.class_interval'].notnull()].index
merged.loc[idx999, 'class_interval_pred'] = -999

# 제출용 데이터에 class_interval 채우기
realsubmit['rainfall_test.class_interval'] = merged['class_interval_pred']

#저장
realsubmit.to_csv("240494.csv")  
print(realsubmit.shape)
realsubmit.head()

(122000, 21)


Unnamed: 0.1,Unnamed: 0,rainfall_test.fc_year,rainfall_test.fc_month,rainfall_test.fc_day,rainfall_test.fc_hour,rainfall_test.stn4contest,rainfall_test.dh,rainfall_test.ef_year,rainfall_test.ef_month,rainfall_test.ef_day,...,rainfall_test.v01,rainfall_test.v02,rainfall_test.v03,rainfall_test.v04,rainfall_test.v05,rainfall_test.v06,rainfall_test.v07,rainfall_test.v08,rainfall_test.v09,rainfall_test.class_interval
0,1,D,5,1,9,STN031,3,D,5,1,...,68,46,13,1,0,0,0,0,0,2
1,2,D,5,1,9,STN031,6,D,5,1,...,83,66,26,5,0,0,0,0,0,0
2,3,D,5,1,9,STN031,9,D,5,1,...,17,6,0,0,0,0,0,0,0,5
3,4,D,5,1,9,STN031,12,D,5,1,...,0,0,0,0,0,0,0,0,0,4
4,5,D,5,1,9,STN031,15,D,5,2,...,0,0,0,0,0,0,0,0,0,3


## MLP에 레이어 추가

In [26]:
# 학습 데이터와 테스트 데이터 분할
train_data = train[train['stn4contest'] <= 16]
test_data = train[train['stn4contest'] > 15]

# 입력 특성 (v01 ~ v09)와 타겟 변수 (vv) 분리
X_train = train_data[['stn4contest', 'year', 'month', 'day', 'hour', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']]
y_train = train_data['vv']
X_test = test_data[['stn4contest', 'year', 'month', 'day', 'hour', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09']]
y_test = test_data['vv']

# 데이터 표준화
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [29]:
# MLP 모델 정의
model_comp = Sequential([
    Dense(128, input_dim=14, activation='relu'),  # 입력층
    Dropout(0.3),  # 드롭아웃 추가
    Dense(64, activation='relu'),  # 첫 번째 은닉층
    Dropout(0.3),  # 드롭아웃 추가
    Dense(32, activation='relu'),  # 두 번째 은닉층
    Dropout(0.3),  # 드롭아웃 추가
    Dense(16, activation='relu'),  # 세 번째 은닉층 추가
    Dropout(0.3),  # 드롭아웃 추가
    Dense(1, activation='linear')  # 출력층
])

# 모델 컴파일
model_comp.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# 모델 학습
model_comp.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2)

# 모델 예측
y_pred = model_comp.predict(X_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [27]:
y_pred.shape

(6500, 1)

In [28]:
y_test.shape

(15600,)

In [30]:
# 성능 지표 계산
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 결과 저장
results = {
    'Window Size': 1,  # 현재 예제에서는 window size가 1로 고정되어 있음
    'RMSE': rmse,
    'MAE': mae,
    'MSE': mse,
    'R²': r2
}

# 결과 출력
print("Model Performance Metrics by Window Size:")
print(f"{'Window Size':<15}{'RMSE':<10}{'MAE':<10}{'MSE':<10}{'R²':<10}")
print(f"{'-'*45}")
print(f"{results['Window Size']:<15}{results['RMSE']:<10.4f}{results['MAE']:<10.4f}{results['MSE']:<10.4f}{results['R²']:<10.4f}")

Model Performance Metrics by Window Size:
Window Size    RMSE      MAE       MSE       R²        
---------------------------------------------
1              3.4188    0.8144    11.6884   0.3514    
