In [893]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.pyplot as plt             
import matplotlib as mpl                    
mpl.rc('font', family='Malgun Gothic')      
plt.rcParams['axes.unicode_minus']=False  

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow import keras     
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD, Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.callbacks import LambdaCallback

from sklearn import metrics
from tensorflow.keras.layers import LSTM, Dropout
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [895]:
np.random.seed(42)       
tf.random.set_seed(42)   

# 전력 예측
- 데이터: https://archive.ics.uci.edu/dataset/321/electricityloaddiagrams20112014
- 날짜와 특정지역코드로 구성
    - 행(Row): 140,256개의 타임스탬프(매 15분 단위)
    - 열(Column): 370개의 소비 지역(전력 미터 ID)
    - 형식: timestamp (DatetimeIndex) + 370개의 소비량 열

## 데이터 전처리

In [901]:
data=pd.read_csv("./Data/LD2011_2014.txt", sep=";", index_col=0, parse_dates=True, decimal=",")
data.shape

(140256, 370)

In [982]:
elec=data.copy()

- 데이터 로드
- 일변 소비량 합산 (평균)
- 평균을 기준으로 High/Low 라벨 생성
- 스케일 조절

In [985]:
elec.shape
elec.head(2)
elec.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 140256 entries, 2011-01-01 00:15:00 to 2015-01-01 00:00:00
Columns: 370 entries, MT_001 to MT_370
dtypes: float64(370)
memory usage: 397.0 MB


In [987]:
# 데이터 리샘플링 (일별 소비량 합산)
daily_elec=elec.resample("D").sum()   # S, T, M, D, W, M, Q, Y
daily_elec.shape

(1462, 370)

In [989]:
# daily_data.describe()

# 결측치
# daily_data.fillna(method="ffill", inplace=True)

In [991]:
# 임계값 계산 (전체 평균)
threshold=daily_elec.mean(axis=1).mean()
threshold

50704.3933356309

In [993]:
# 새로운 분류 라벨 추가  : "High" 1, "Low" 0
daily_elec['label']=np.where(daily_elec.mean(axis=1) > threshold, "High" , "Low")

In [995]:
daily_elec["label"].value_counts()

label
Low     742
High    720
Name: count, dtype: int64

In [997]:
daily_elec.head(2)

Unnamed: 0,MT_001,MT_002,MT_003,MT_004,MT_005,MT_006,MT_007,MT_008,MT_009,MT_010,...,MT_362,MT_363,MT_364,MT_365,MT_366,MT_367,MT_368,MT_369,MT_370,label
2011-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low
2011-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low


In [999]:
daily_x=daily_elec.drop(['label'], axis=1)
daily_y=daily_elec['label']
daily_x.shape, daily_y.shape

((1462, 370), (1462,))

In [1001]:
scaler=MinMaxScaler()
scaled=scaler.fit_transform(daily_x)

In [1003]:
def create_sequences(data, labels, window=30):
    X, Y=[], []
    for i in range(len(data) - window):
        X.append(data[i: i+window])        # 과거 30일 데이터
        Y.append(labels[i + window])       # 그 다음 날의 라벨
    return np.array(X), np.array(Y)

X, Y=create_sequences(scaled, daily_y.values)
X.shape, Y.shape

((1432, 30, 370), (1432,))

In [1005]:
X_reshape=X.reshape(X.shape[0], -1) 
X_reshape.shape

(1432, 11100)

In [1007]:
from sklearn.decomposition import PCA
pca=PCA(n_components=0.95)               # 분산의 95%를 유지하는 차원 선택
X_pca=pca.fit_transform(X_reshape)
X_pca.shape

(1432, 34)

In [1009]:
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, shuffle=False)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((1145, 34), (1145,), (287, 34), (287,))

## RandomForestClassifier

In [932]:
rf_model=RandomForestClassifier(max_depth=5, random_state=42)
rf_model.fit(X_train, Y_train)

In [934]:
rf_model.score(X_train, Y_train), rf_model.score(X_test, Y_test)

(0.9650655021834061, 0.6515679442508711)

pred=rf_model.predict(X_test)
print(metrics.classification_report(Y_test, pred))

## LSTM

In [939]:
from sklearn.preprocessing import LabelEncoder
label=LabelEncoder()
Y_label=label.fit_transform(Y)  

In [941]:
X_train, X_test, Y_train, Y_test=train_test_split(X, Y_label, test_size=0.2, shuffle=False)
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((1145, 30, 370), (1145,), (287, 30, 370), (287,))

In [943]:
model=keras.Sequential()
model.add(keras.layers.Input(shape=(30, 370)))                                  # LSTM 입력 크기
model.add(keras.layers.LSTM(16, activation='tanh', return_sequences=False))     # 시퀀스 중 마지막 상태만 출력 
model.add(Dropout(0.2)) 
model.add(keras.layers.Dense(2, activation='softmax'))                         # 출력층
model.summary()

In [945]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [947]:
history=model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.6590 - loss: 0.6024 - val_accuracy: 0.6419 - val_loss: 0.6112
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9097 - loss: 0.3067 - val_accuracy: 0.5983 - val_loss: 0.5800
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9011 - loss: 0.2653 - val_accuracy: 0.8515 - val_loss: 0.4119
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8705 - loss: 0.2817 - val_accuracy: 0.7904 - val_loss: 0.4557
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.9193 - loss: 0.2431 - val_accuracy: 0.8472 - val_loss: 0.3909
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9431 - loss: 0.1954 - val_accuracy: 0.8515 - val_loss: 0.3844
Epoch 7/10
[1m29/29[0m [32m━━━━

In [949]:
model.evaluate(X_test, Y_test) 

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8323 - loss: 0.2978


[0.3024427890777588, 0.8257839679718018]

In [951]:
pred_prob=model.predict(X_test)
pred_prob  

pred=np.argmax(pred_prob, axis=1)
pred[0]

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


0

In [959]:
np.unique(Y_test, return_counts=True)

(array([0, 1], dtype=int64), array([259,  28], dtype=int64))

In [955]:
print(metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90       259
           1       0.24      0.36      0.29        28

    accuracy                           0.83       287
   macro avg       0.58      0.62      0.59       287
weighted avg       0.86      0.83      0.84       287

