In [1]:
# !pip install --upgrade pip
# !pip install python-dotenv
# !pip install numpy
# !pip install pandas
# !pip install tensorflow
# !pip install keras
# !pip install matplotlib
# !pip install scikit-learn
# !pip install PyMySQL

In [33]:
# Debug switch read by do_all_task: when True, each training run is timed and its
# final loss is printed; when False, training runs silently.
debug = True

In [34]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the MYSQL_* settings read via os.getenv
# further down come from — confirm against the deployment setup.
load_dotenv()

True

In [35]:
import os
import time

import pymysql

import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [36]:
import keras
import tensorflow as tf
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, TimeDistributed, RepeatVector

# Cap TensorFlow's thread pools: 2 threads inside a single op, 1 thread across
# independent ops.
# NOTE(review): presumably this keeps the parallel training worker processes
# started later from oversubscribing the CPU — confirm.
tf.config.threading.set_intra_op_parallelism_threads(2)
tf.config.threading.set_inter_op_parallelism_threads(1)

In [37]:
# Database connection settings, each read from its environment variable.
DB_USER, DB_PASSWD, DB_HOST, DB_DB = (
    os.getenv(env_name)
    for env_name in ('MYSQL_USER', 'MYSQL_PASSWORD', 'MYSQL_HOST', 'MYSQL_DATABASE')
)

# Open the connection from a keyword mapping.
conn_settings = {
    'user': DB_USER,
    'passwd': DB_PASSWD,
    'host': DB_HOST,
    'db': DB_DB,
    'charset': 'utf8',
}
db = pymysql.connect(**conn_settings)

# Dict-style cursor: every fetched row is a mapping keyed by column name.
cursor = db.cursor(pymysql.cursors.DictCursor)

In [50]:
# Get the station ids: every table in the configured schema is used as one station.
# ORDER BY makes the subset taken below reproducible between runs; without it the
# row order (and therefore which stations get picked) is up to the server.
MAX_STATIONS = 30  # cap on how many stations are processed in this run
sql = (
    "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES "
    "WHERE TABLE_SCHEMA = %s ORDER BY TABLE_NAME;"
)
# Query arguments are passed as a tuple so the driver binds and escapes the value.
nRows = cursor.execute(sql, (DB_DB,))
stationIds = [row['TABLE_NAME'] for row in cursor.fetchall()][:MAX_STATIONS]

In [51]:
start = time.time()

# Stations with this many rows or fewer are skipped.
# IMPORTANT: This is very important to ensure safe training,
#            because some stations' datasets are very small.
MIN_ROWS = 1000

count = 0      # total number of rows fetched, over all stations
data = {}      # stationId -> scaled series, ndarray of shape (rows, 1)
scalers = {}   # stationId -> fitted MinMaxScaler, needed to map predictions back to bike counts
for stationId in stationIds:
    # Table names cannot be bound as query parameters, so the identifier is
    # backtick-quoted by hand; embedded backticks are doubled to keep the quoting intact.
    sql = "SELECT parkingBikeTotCnt FROM `{}`".format(stationId.replace('`', '``'))
    count += cursor.execute(sql)
    res = cursor.fetchall()

    # Filter BEFORE building the frame and scaling: an empty table yields a frame
    # without the parkingBikeTotCnt column and a scaler that cannot be fitted,
    # either of which used to abort the whole load.
    if len(res) <= MIN_ROWS:
        continue

    # Keep the column as a one-column frame so the scaler receives a 2-D input.
    y = pd.DataFrame(res)[['parkingBikeTotCnt']]

    scaler = MinMaxScaler()
    y = scaler.fit_transform(y)

    data[stationId] = y
    scalers[stationId] = scaler

print("로딩시간 :", time.time() - start)
print("로드된 데이터 수 :", count)

로딩시간 : 0.6876561641693115
로드된 데이터 수 : 61140


In [52]:
def create_dataset(dataset, look_back=10, nPredicted = 6):
    """Slice a single-column series into sliding (input, target) windows.

    Window i uses rows [i, i + look_back) of column 0 as the input and the
    following nPredicted rows as the target.

    Args:
        dataset: 2-D array-like of shape (rows, columns); only column 0 is used.
        look_back: number of consecutive rows in each input window.
        nPredicted: number of rows, right after the input window, in each target.

    Returns:
        Tuple (dataX, dataY) of arrays with shapes
        (windows, look_back, 1) and (windows, nPredicted, 1), where
        windows = rows - look_back - nPredicted + 1. If the series is too short
        to hold a single window, both arrays have zero windows but keep the
        other two dimensions (previously this case raised IndexError).
    """
    series = np.asarray(dataset)[:, 0]
    nWindows = len(series) - look_back - nPredicted + 1

    # Too short for even one window: return empty arrays that are still 3-D,
    # so callers can inspect .shape or concatenate without special-casing.
    if nWindows <= 0:
        return (
            np.empty((0, look_back, 1), dtype=series.dtype),
            np.empty((0, nPredicted, 1), dtype=series.dtype),
        )

    dataX = np.array([series[i:i + look_back] for i in range(nWindows)])
    dataY = np.array([
        series[i + look_back:i + look_back + nPredicted] for i in range(nWindows)
    ])

    # Add the trailing feature axis: (windows, steps) -> (windows, steps, 1)
    dataX = dataX.reshape(nWindows, look_back, 1)
    dataY = dataY.reshape(nWindows, nPredicted, 1)

    return dataX, dataY

In [53]:
def do_all_task(args):
    """Train an LSTM encoder-decoder on one station's windows and save it to disk.

    Args:
        args: dict with the keys
            'key': station id, used in the log line and in the output file name;
            'x':   input windows, 3-D array (samples, input steps, features);
            'y':   target windows, 3-D array (samples, output steps, 1).

    Side effects:
        Writes the trained model to 'models/<key>.h5'. When the module-level
        `debug` flag is true, also prints the training time and final loss.

    Returns:
        None.
    """
    x, y = args['x'], args['y']

    # Take the window sizes from the data instead of hard-coding 10 and 6, so the
    # network always matches the look_back / nPredicted the windows were built with.
    model = Sequential()

    # Encoder: compress the input window into a single state vector
    model.add(LSTM(25, activation='linear', input_shape=x.shape[1:]))
    # Repeat that vector once per step to be predicted
    model.add(RepeatVector(y.shape[1]))
    # Decoder: emit one hidden state per predicted step, mapped to one value each
    model.add(LSTM(25, activation='linear', return_sequences=True))
    model.add(TimeDistributed(Dense(1)))

    model.compile(optimizer='adam', loss='mse')

    # NOTE(review): the debug and non-debug branches train with different
    # hyperparameters (250 epochs / batch 70 vs 200 / 64) — confirm this is intended.
    if debug:
        start = time.time()
        history = model.fit(x, y, epochs=250, batch_size=70, verbose=0)
        elapsed = round(time.time() - start, 2)
        final_loss = round(history.history['loss'][-1], 4)
        print("[{:^9}] {:>5}초, loss: {}".format(args['key'], elapsed, final_loss))
    else:
        model.fit(x, y, epochs=200, batch_size=64, verbose=0)

    file_name = 'models/{}.h5'.format(args['key'])
    model.save(file_name)

In [54]:
# Build the sliding-window datasets for all loaded stations in parallel.
nCores = multiprocessing.cpu_count()
# The pool is created before the try block so that `pool` is always bound when
# the except/finally clauses run.
pool = multiprocessing.Pool(processes=nCores)
try:
    keys = list(data.keys())
    result = pool.map(create_dataset, [data[key] for key in keys])

    # pool.map returns results in input order, so they pair up with keys one-to-one.
    datasets = dict(zip(keys, result))
except BaseException:
    # BaseException on purpose: interrupting the kernel (KeyboardInterrupt) must
    # stop the workers too.
    pool.terminate()
    print('Pool is terminated')
    # Re-raise instead of swallowing: `datasets` is not defined at this point, so
    # continuing would only fail later with a misleading NameError.
    raise
finally:
    print('Joining pool processes')
    pool.close()
    pool.join()
    print('Join complete')

Joining pool processes
Join complete


In [57]:
# Train one model per station, several stations at a time.
start = time.time()
print("== 학습 시작 ==")

nCores = multiprocessing.cpu_count()
# At most 4 training processes, and never more than the machine has cores.
# The pool is created before the try block so that `pool` is always bound when
# the except/finally clauses run.
pool = multiprocessing.Pool(processes=min(4, nCores))
try:
    keys = list(datasets.keys())
    # Each datasets[key] is the (inputs, targets) pair for that station.
    tasks = [{
        'key': key,
        'x': datasets[key][0],
        'y': datasets[key][1]
    } for key in keys]
    res = pool.map(do_all_task, tasks)

    print("\n")
    print("== 학습 완료 == ")
    print("소요 시간 :", round(time.time() - start, 2))
except BaseException:
    # BaseException on purpose: interrupting the kernel (KeyboardInterrupt) must
    # stop the workers too.
    print('에러가 발생 했습니다')
    pool.terminate()
    print('pool is terminated')
    # Re-raise so the actual error and traceback are shown instead of being
    # replaced by the generic message above.
    raise
finally:
    print('joining pool processes')
    pool.close()
    pool.join()
    print('join complete')

== 학습 시작 ==
[ ST-1006 ] 40.41초, loss: 0.0057
[ ST-1002 ] 40.62초, loss: 0.0029
[  ST-10  ] 40.67초, loss: 0.0055
[ ST-1004 ]  40.8초, loss: 0.003
[ ST-1007 ] 40.44초, loss: 0.0034
[ ST-1000 ] 40.35초, loss: 0.0019
[ ST-1003 ] 40.58초, loss: 0.0029
[ ST-1005 ] 40.42초, loss: 0.0021
[ ST-1011 ] 40.69초, loss: 0.0015
[ ST-1008 ] 41.06초, loss: 0.0027
[ ST-1015 ] 40.77초, loss: 0.0035
[ ST-1013 ] 40.85초, loss: 0.0082
[ ST-1012 ] 40.51초, loss: 0.0022
[ ST-1010 ] 40.58초, loss: 0.0019
[ ST-1016 ] 40.77초, loss: 0.0025
[ ST-1014 ] 40.72초, loss: 0.0036
[ ST-1017 ]  41.2초, loss: 0.0056
[ ST-1024 ] 40.92초, loss: 0.0028
[ ST-1019 ] 41.23초, loss: 0.0034
[ ST-1020 ] 41.23초, loss: 0.0024
[ ST-1018 ] 40.92초, loss: 0.0088
[ ST-102  ] 40.67초, loss: 0.0115
[ ST-1025 ] 41.07초, loss: 0.002
[ ST-1023 ] 40.86초, loss: 0.0007
[ ST-1029 ] 39.98초, loss: 0.0048
[ ST-1027 ] 40.09초, loss: 0.0023
[ ST-1031 ]  40.6초, loss: 0.0091
[ ST-103  ] 39.78초, loss: 0.003
[ ST-1028 ] 40.14초, loss: 0.0027
[ ST-1032 ] 39.72초, loss: 0.0046

