In [1]:
import os

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# List the GPUs that TensorFlow can see.
tf.config.list_physical_devices('GPU')

2022-08-04 04:37:21.674733: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 04:37:21.675184: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 04:37:21.680212: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 04:37:21.680663: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-04 04:37:21.681087: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'),
 PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]

In [3]:
# TensorFlow version
print(tf.__version__)

2.9.1


In [4]:
# Read every file in a directory as CSV and combine them into one DataFrame.
def read_csvs_in_dir(path):
    """Load all non-hidden files directly under ``path`` into one DataFrame.

    Files are read with ``pd.read_csv`` in sorted file-name order. The
    '날짜' and '시간' columns are parsed as datetimes, preferring day-first
    for ambiguous dates. The per-file frames are concatenated unchanged, so
    each file's own row index is kept.

    Args:
        path: Directory to scan. A trailing path separator is optional.

    Returns:
        pandas.DataFrame with the rows of every file, in file-name order.

    Raises:
        ValueError: raised by ``pd.concat`` when no file was read.
    """
    frames = []
    for file_name in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator;
        # plain string concatenation silently built a wrong path without one.
        full_path = os.path.join(path, file_name)
        # Skip dot-files and anything that is not a regular file
        # (os.listdir also returns sub-directories).
        if file_name.startswith('.') or not os.path.isfile(full_path):
            continue
        # infer_datetime_format is not passed: it was only a speed hint and is
        # deprecated since pandas 2.0, where consistent-format inference is the
        # default behaviour.
        frames.append(
            pd.read_csv(full_path, parse_dates=['날짜', '시간'], dayfirst=True)
        )
    return pd.concat(frames)

In [5]:
# Load the data: one combined DataFrame per directory.
# NOTE(review): df_false is read from ./data/test/ -- presumably the set that is
# scored for anomalies further down; confirm against the data description.
df_train = read_csvs_in_dir('./data/train/')
df_false = read_csvs_in_dir('./data/test/')
df_false.head()

Unnamed: 0,날짜,시간,대여개수,대기개수,기온,강수량
0,2022-02-01,2022-08-04 00:00:00,20819,3883,-3.9,0.0
1,2022-02-01,2022-08-04 01:00:00,285,24417,-3.9,22.8
2,2022-02-01,2022-08-04 02:00:00,194,24508,-14.0,0.0
3,2022-02-01,2022-08-04 03:00:00,14849,9853,-15.0,0.0
4,2022-02-01,2022-08-04 04:00:00,45,24657,-5.9,16.6


In [6]:
# Summary statistics of the frame loaded from the test directory.
df_false.describe()

Unnamed: 0,대여개수,대기개수,기온,강수량
count,2880.0,2880.0,2880.0,2880.0
mean,11641.336111,19682.247222,7.42809,6.761042
std,9292.621349,9370.342962,10.457679,9.741365
min,25.0,16.0,-15.0,0.0
25%,3685.0,11847.0,0.0,0.0
50%,9075.5,21899.5,8.0,0.0
75%,18605.25,27310.75,15.0,13.425
max,34329.0,34305.0,31.0,34.6


In [7]:
# Preprocessing
def preprocess(df):
    """Replace the two datetime columns with integer calendar features.

    '날짜' becomes the day of the year and '시간' becomes the hour of the day.
    The conversion is done on a copy, so the frame passed in keeps its
    datetime columns and the function can be called again on the same input.

    Args:
        df: DataFrame whose '날짜' and '시간' columns have a datetime dtype.

    Returns:
        A new DataFrame with the same index and columns as ``df``, in which
        '날짜' and '시간' hold integers.

    Raises:
        AttributeError: if either column is not datetime-like (no ``.dt``).
    """
    # Work on a copy: writing into `df` would also change the caller's frame
    # and make a second call fail because the columns are no longer datetimes.
    converted = df.copy()
    converted['날짜'] = df['날짜'].dt.dayofyear
    converted['시간'] = df['시간'].dt.hour
    return converted

# df_true is the preprocessed training frame. df_false is rebound to its own
# preprocessed version, so re-running this cell without re-loading the data
# fails: its '날짜' / '시간' columns are no longer datetimes.
df_true = preprocess(df_train)
df_false = preprocess(df_false)
df_false.head()

Unnamed: 0,날짜,시간,대여개수,대기개수,기온,강수량
0,32,0,20819,3883,-3.9,0.0
1,32,1,285,24417,-3.9,22.8
2,32,2,194,24508,-14.0,0.0
3,32,3,14849,9853,-15.0,0.0
4,32,4,45,24657,-5.9,16.6


In [8]:
# Number of consecutive rows per window; used for windowing and for the
# LSTM input shape.
timesteps = 24

# Plain value matrices of both frames (one row per record).
input_true, input_false = df_true.values, df_false.values

# Features per time step = number of columns of the training matrix.
n_features = input_true.shape[-1]

In [9]:
# Turn the 2-D record matrix into a 3-D array of windows.
def temporalize(X, timesteps):
    """Cut a 2-D array into overlapping windows of consecutive rows.

    Window ``i`` holds rows ``i + 2`` .. ``i + timesteps + 1`` of ``X``, and
    ``len(X) - timesteps - 1`` windows are produced. The last window ends on
    the last row, while rows 0 and 1 never appear in any window.
    NOTE(review): that two-row offset is kept exactly as it was; confirm that
    skipping the first two rows is intended.

    The result is always 3-D. The previous blanket ``np.squeeze`` also removed
    the window axis when there was a single window and the feature axis when
    there was a single feature.

    Args:
        X: array-like of shape (n_rows, n_features).
        timesteps: number of rows per window.

    Returns:
        A new array of shape (n_windows, timesteps, n_features) with the dtype
        of ``X``. ``n_windows`` is 0 when ``X`` has too few rows.

    Raises:
        ValueError: if ``X`` has fewer than two dimensions.
    """
    X = np.asarray(X)
    if X.ndim < 2:
        raise ValueError("temporalize expects a 2-D array (rows x features)")
    n_windows = max(len(X) - timesteps - 1, 0)
    # row_idx[i, j] is the source row of step j in window i.
    row_idx = np.arange(n_windows)[:, None] + np.arange(2, timesteps + 2)[None, :]
    # Integer-array indexing returns a fresh array, not a view of X.
    return X[row_idx]

In [10]:
# Build the windowed arrays for both data sets and show the shape of the
# one derived from input_false.
x_true = temporalize(input_true, timesteps)
x_false = temporalize(input_false, timesteps)
print(x_false.shape)

(2855, 24, 6)


In [11]:
def flatten(X):
    """Reduce each window to its final time step.

    Args:
        X: 3-D array indexed as (sample, time step, feature).

    Returns:
        A new float64 array of shape (samples, features) whose row ``i`` is
        ``X[i, -1, :]``, i.e. the last time step of window ``i``.
    """
    last_step = X.shape[1] - 1
    per_sample = np.empty((X.shape[0], X.shape[2]))  # samples x features
    # One slice assignment copies the last step of every window at once.
    per_sample[:] = X[:, last_step, :]
    return per_sample

def scale(X, scaler):
    """Apply a fitted scaler to every time step of every window.

    The input is left untouched and a new array is returned. The previous
    version wrote the scaled values back into ``X``, so calling it twice on
    the same array scaled the data twice, and integer input was truncated
    when the float results were stored back.

    All time steps are passed to ``scaler.transform`` in a single call as a
    (rows, features) matrix.
    NOTE(review): this assumes ``transform`` treats each row independently,
    as a fitted StandardScaler does; confirm before using another scaler.

    Args:
        X: array whose last axis is the feature axis, normally
            (samples, timesteps, features).
        scaler: object with a ``transform(rows)`` method accepting a 2-D
            (rows, features) array.

    Returns:
        A new array with the shape of ``X`` holding the transformed values.
    """
    X = np.asarray(X)
    if X.size == 0:
        # Nothing to transform; still hand back independent storage.
        return X.copy()
    # np.array copies, so a scaler that works in place cannot modify the
    # caller's data through the reshaped rows.
    rows = np.array(X).reshape(-1, X.shape[-1])
    return np.asarray(scaler.transform(rows)).reshape(X.shape)

In [12]:
# Scaling
# The scaler is fitted on flatten(x_true), i.e. on the last time step of every
# training window, and is then applied to both window sets.
# NOTE(review): the fit uses all of x_true, which is only afterwards split into
# train and validation sets, so validation rows contribute to the statistics.
# NOTE(review): x_true / x_false are rebound to their scaled versions, so
# running this cell twice scales already-scaled data.
scaler = StandardScaler().fit(flatten(x_true))

x_true = scale(x_true, scaler)
x_false = scale(x_false, scaler)

In [13]:
# Hold out 20% of the windows for validation. A fixed random_state makes the
# shuffled split itself repeatable across kernel restarts.
# NOTE(review): the windows overlap in time, so a shuffled split puts
# near-identical windows into both sets; consider a chronological split.
x_train, x_valid = train_test_split(x_true, test_size=0.2, random_state=42)

In [14]:
# Training hyper-parameters.
epochs = 4000   # was 10000, a value no cell read; 4000 is what training runs
batch = 2048    # passed to fit() as batch_size
lr = 0.0001     # learning rate handed to the Adam optimizer

In [None]:
# LSTM auto-encoder: the encoder narrows 128 -> 8 units and keeps only the
# final state, RepeatVector copies that state once per time step, and the
# decoder widens 8 -> 128 units before a per-step Dense layer maps back to
# n_features values.
lstm_ae = models.Sequential(
    # Encoder
    [layers.LSTM(128, input_shape=(timesteps, n_features), return_sequences=True)]
    + [layers.LSTM(width, return_sequences=True) for width in (64, 32, 16)]
    + [layers.LSTM(8, return_sequences=False),
       layers.RepeatVector(timesteps)]
    # Decoder
    + [layers.LSTM(width, return_sequences=True) for width in (8, 16, 32, 64, 128)]
    + [layers.TimeDistributed(layers.Dense(n_features))]
)

lstm_ae.summary()

In [None]:
# compile: mean squared reconstruction error, minimised with Adam at the
# configured learning rate.
lstm_ae.compile(loss='mse', optimizer=optimizers.Adam(learning_rate=lr))

# fit: an auto-encoder is trained to reproduce its input, so the windows are
# both input and target. The epoch count comes from the configuration cell
# instead of a second hard-coded number that could drift from it.
history = lstm_ae.fit(x_train, x_train,
                      epochs=epochs, batch_size=batch,
                      validation_data=(x_valid, x_valid))

In [None]:
# Training and validation loss per epoch, drawn on one explicit Axes.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train loss')
ax.plot(history.history['val_loss'], label='valid loss')
ax.legend()
ax.set_xlabel('Epoch')
ax.set_ylabel('loss')
plt.show()

In [None]:
# Reconstruct the x_false windows and score each one by the mean squared
# error between the last time step of the input and of the reconstruction.
predict_false = lstm_ae.predict(x_false)
mse = np.mean(
    np.power(flatten(x_false) - flatten(predict_false), 2),
    axis=1,
)

# x axis is simply the window index 0 .. len(mse) - 1.
plt.plot(np.arange(len(mse), dtype=float), mse)
plt.ylim(0, 2)
plt.show()

In [None]:
# Same scoring for the held-out validation windows. The results get their own
# names: storing them in predict_false / mse overwrote the x_false results,
# leaving those names with values that depended on which cell ran last.
predict_valid = lstm_ae.predict(x_valid)
mse_valid = np.mean(np.power(flatten(x_valid) - flatten(predict_valid), 2), axis=1)

plt.plot(np.linspace(0, len(mse_valid)-1, len(mse_valid)), mse_valid)
plt.ylim([0, 2])
# Label the figure so it can be told apart from the x_false plot.
plt.title('Reconstruction error: validation windows')
plt.xlabel('window index'); plt.ylabel('MSE (last time step)')
plt.show()