In [1]:
# !pip install --upgrade pip
# !pip install python-dotenv
# !pip install numpy
# !pip install pandas
# !pip install tensorflow
# !pip install keras
# !pip install matplotlib
# !pip install scikit-learn
# !pip install PyMySQL

In [1]:
# Debug switch read by do_all_task: when True, each training run is timed and
# its final loss is printed (that branch also uses its own epochs/batch_size).
debug = True

In [2]:
from dotenv import load_dotenv
# Load settings from a .env file into the process environment; the MYSQL_*
# values are read back later with os.getenv.
load_dotenv()

True

In [17]:
import os
import time

import pymysql

from model import create_model

import multiprocessing

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
import joblib

In [14]:
import keras
import tensorflow as tf
from keras import optimizers

# Thread-pool sizes for TensorFlow ops inside one process.
# NOTE(review): presumably kept small because training later runs in one
# worker process per CPU core - confirm.
INTRA_OP_THREADS = 2
INTER_OP_THREADS = 1

tf.config.threading.set_intra_op_parallelism_threads(INTRA_OP_THREADS)
tf.config.threading.set_inter_op_parallelism_threads(INTER_OP_THREADS)

In [18]:
# Database connection settings, read from the environment.
# os.getenv returns None for any variable that is not set.
DB_USER = os.getenv('MYSQL_USER')
DB_PASSWD = os.getenv('MYSQL_PASSWORD')
DB_HOST = os.getenv('MYSQL_HOST')
DB_DB = os.getenv('MYSQL_DATABASE')

# Connect to db.
# `password` / `database` are the current PyMySQL keyword names; the older
# `passwd` / `db` spellings are deprecated aliases for the same settings.
db = pymysql.connect(
    user=DB_USER,
    password=DB_PASSWD,
    host=DB_HOST,
    database=DB_DB,
    charset='utf8'
)

# Set cursor: DictCursor returns every row as a dict keyed by column name.
cursor = db.cursor(pymysql.cursors.DictCursor)

In [19]:
# Collect the name of every table in the configured schema; each table name
# is used as a station id by the cells below.
sql = "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA = %s;"
nRows = cursor.execute(sql, DB_DB)
stationIds = [row['TABLE_NAME'] for row in cursor.fetchall()]
#stationIds = stationIds[:30]

In [21]:
start = time.time()

# Stations need more than this many rows to be kept for training.
MIN_ROWS = 1000

# Make sure the output directory exists before the first joblib.dump call.
os.makedirs('scalers', exist_ok=True)

count = 0
data = {}
for stationId in stationIds:
    # Table names cannot be bound as query parameters, so the identifier is
    # interpolated between backticks instead.
    sql = "SELECT parkingBikeTotCnt FROM `{}`".format(stationId)
    count += cursor.execute(sql)
    res = cursor.fetchall()

    # An empty table would produce a DataFrame without a parkingBikeTotCnt
    # column (AttributeError below), so skip it instead of aborting the load.
    if not res:
        continue

    tempdf = pd.DataFrame(res)
    y = pd.DataFrame(tempdf.parkingBikeTotCnt)

    # Fit one scaler per station and persist it under the station id.
    scaler = MinMaxScaler()
    y = scaler.fit_transform(y)

    file_name = 'scalers/{}.pkl'.format(stationId)
    joblib.dump(scaler, file_name)

    # filter only dataset which have more than 1k data
    # IMPORTANT: This is very important to ensure safe training.
    #            because some stations' dataset has very small data.
    if len(y) > MIN_ROWS:
        data[stationId] = y

print("로딩시간 :", time.time() - start)
print("로드된 데이터 수 :", count)

로딩시간 : 65.95288228988647
로드된 데이터 수 : 5321918


In [8]:
def create_dataset(dataset, look_back=10, nPredicted = 6):
    """Cut a series into sliding (input window, target window) pairs.

    Sample ``i`` uses rows ``i .. i+look_back-1`` of column 0 as input and the
    following ``nPredicted`` rows of column 0 as target.

    Args:
        dataset: 2-D array-like; only column 0 is read.
        look_back: Number of consecutive rows in every input window.
        nPredicted: Number of rows that follow the input window and form the
            target window.

    Returns:
        Tuple ``(dataX, dataY)`` with shapes ``(n, look_back, 1)`` and
        ``(n, nPredicted, 1)``, where
        ``n = max(len(dataset) - look_back - nPredicted + 1, 0)``.
        When the series is too short for a single sample, both arrays are
        empty but keep those shapes.
    """
    dataset = np.asarray(dataset)
    n_samples = len(dataset) - look_back - nPredicted + 1

    # Too few rows for even one window pair: return well-shaped empty arrays
    # rather than failing on the reshape of an empty 1-D array.
    if n_samples <= 0:
        return (
            np.empty((0, look_back, 1), dtype=dataset.dtype),
            np.empty((0, nPredicted, 1), dtype=dataset.dtype),
        )

    series = dataset[:, 0]
    dataX = np.stack([series[i:i + look_back] for i in range(n_samples)])
    dataY = np.stack([
        series[i + look_back:i + look_back + nPredicted]
        for i in range(n_samples)
    ])

    # Append a trailing axis of length 1 to both arrays.
    return dataX[..., np.newaxis], dataY[..., np.newaxis]

In [10]:
def do_all_task(args):
    """Train a fresh model on one station's data and save it as an .h5 file.

    Args:
        args: Mapping with three entries:
            'key' - identifier used in the log line and in the output file
                    name ``test/<key>.h5``;
            'x'   - training inputs passed to ``model.fit``;
            'y'   - training targets passed to ``model.fit``.

    Returns:
        None. The trained model is written to disk.
    """
    import os  # local import: keeps the function self-contained

    # Create the output directory up front so a missing folder cannot make
    # model.save fail after the whole training run has already been paid for.
    os.makedirs('test', exist_ok=True)

    model = create_model()

    model.compile(optimizer='adam', loss='mse')

    if debug:
        # NOTE(review): the debug branch trains with different epochs and
        # batch_size than the non-debug branch - confirm this is intended.
        start = time.time()
        history = model.fit(args['x'], args['y'], epochs=250, batch_size=70, verbose=0)
        elapsed = round(time.time() - start, 2)
        final_loss = round(history.history['loss'][-1], 4)
        print("[{:^9}] {:>5}초, loss: {}".format(args['key'], elapsed, final_loss))
    else:
        model.fit(args['x'], args['y'], epochs=200, batch_size=64, verbose=0)

    file_name = 'test/{}.h5'.format(args['key'])
    model.save(file_name)

In [10]:
# Build the windowed datasets for all stations in parallel.
nCores = multiprocessing.cpu_count()
# The pool is created before the try block so that `pool` is always bound
# when the except/finally handlers run.
pool = multiprocessing.Pool(processes=nCores, maxtasksperchild=8)

try:
    keys = list(data.keys())
    # pool.map keeps input order, so results can be paired with keys by position.
    result = pool.map(create_dataset, [data[key] for key in keys])

    datasets = dict(zip(keys, result))
except BaseException:
    # Stop the workers (also on a kernel interrupt), then re-raise so the
    # failure is visible and no later cell runs with a missing or stale
    # `datasets`.
    pool.terminate()
    print('Pool is terminated')
    raise
finally:
    print('Joining pool processes')
    pool.close()
    pool.join()
    print('Join complete')

Joining pool processes
Join complete


In [11]:
# Train one model per station in parallel worker processes.
start = time.time()
print("== 학습 시작 ==")

nCores = multiprocessing.cpu_count()
# The pool is created before the try block so that `pool` is always bound
# when the except/finally handlers run.
pool = multiprocessing.Pool(processes=nCores)

try:
    keys = datasets.keys()
    res = pool.map(do_all_task, [{
        'key': key,
        'x': datasets[key][0],
        'y': datasets[key][1]
    } for key in keys])

    print("\n")
    print("== 학습 완료 == ")
    print("소요 시간 :", round(time.time() - start, 2))
except BaseException:
    # Report, stop the workers (also on a kernel interrupt) and re-raise so
    # the real traceback is shown instead of being silently discarded.
    print('에러가 발생 했습니다')
    pool.terminate()
    print('pool is terminated')
    raise
finally:
    print('joining pool processes')
    pool.close()
    pool.join()
    print('join complete')

== 학습 시작 ==
[  ST-10  ] 55.48초, loss: 0.0065
[ ST-1206 ] 56.04초, loss: 0.0042
[ ST-1485 ] 56.16초, loss: 0.0065
[ ST-114  ]  56.8초, loss: 0.0086
[ ST-1345 ]  56.7초, loss: 0.0012
[ ST-1069 ] 57.33초, loss: 0.0026
[ ST-1417 ] 57.05초, loss: 0.0031
[ ST-1277 ] 57.98초, loss: 0.0023
[ ST-1486 ] 53.73초, loss: 0.005
[ ST-1140 ] 56.54초, loss: 0.003
[ ST-1346 ] 56.79초, loss: 0.0015
[ ST-1000 ] 59.16초, loss: 0.0019
[ ST-107  ]  57.5초, loss: 0.0035
[ ST-1207 ] 58.78초, loss: 0.0086
[ ST-1418 ] 58.55초, loss: 0.0057
[ ST-1278 ] 61.46초, loss: 0.0074
[ ST-1487 ] 62.26초, loss: 0.0029
[ ST-1002 ] 60.57초, loss: 0.0041
[ ST-1141 ] 61.67초, loss: 0.0018
[ ST-1347 ] 62.04초, loss: 0.0091
[ ST-1419 ] 60.72초, loss: 0.0046[ ST-1070 ]  61.9초, loss: 0.0083

[ ST-1208 ] 61.95초, loss: 0.005
[ ST-1279 ] 61.69초, loss: 0.003
[ ST-1488 ] 56.67초, loss: 0.0021
[ ST-1142 ] 56.53초, loss: 0.0063
[ ST-1003 ] 57.24초, loss: 0.0035
[ ST-1348 ] 56.43초, loss: 0.0051
[ ST-1072 ] 56.11초, loss: 0.0064
[ ST-121  ] 56.36초, loss: 0.0013
[ 