In [1]:
import numpy as np
import pandas as pd 

In [2]:
data = pd.read_csv('./data/training_new.csv', header=None, encoding='utf-8', names=["date", "office", "arrivals", "quantity"])

In [3]:
data.head()

Unnamed: 0,date,office,arrivals,quantity
0,20171201,가평우체국,1,11
1,20171201,가평우체국,24,10
2,20171201,가평우체국,17,8
3,20171201,가평우체국,9,5
4,20171201,가평우체국,14,11


In [4]:
pd_data = data.drop("quantity", axis=1).copy()

In [5]:
type(pd_data)

pandas.core.frame.DataFrame

In [6]:
train_data = data.drop("quantity", axis=1) # 훈련 세트를 위해 레이블 삭제
train_data_labels = data["quantity"].copy()

In [7]:
train_data_cat = train_data[['office']]
train_data_num = train_data.drop(columns=['office'])

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer



num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])


num_attribs = list(train_data_num)
cat_attribs = ["office"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(sparse=False), cat_attribs),
    ])

train_data_prepared = full_pipeline.fit_transform(train_data)
train_data_prepared

array([[2.0171201e+07, 1.0000000e+00, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0171201e+07, 2.4000000e+01, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0171201e+07, 1.7000000e+01, 1.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [2.0181130e+07, 5.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0181130e+07, 1.2000000e+01, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0181130e+07, 1.8000000e+01, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
train_data, test_data = train_test_split(train_data_prepared, test_size=0.2, random_state=42)

In [37]:
train_data

array([[ 0.33176835,  0.85484096,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.41086223, -0.80758057,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.17745774,  0.85484096,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.10107787, -1.08465083,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.26430592,  0.30070045,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.09952701, -0.39197519,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [38]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(train_data_prepared, train_data_labels)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [46]:
# 훈련 샘플 몇 개를 사용해 전체 파이프라인을 적용해 보겠습니다.
some_data = pd_data.iloc[:5]
some_labels = train_data_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("예측:", lin_reg.predict(some_data_prepared))

예측: [-6.06100464 23.65750122 14.61273193  4.27587891 10.73641968]


In [47]:
data.head()

Unnamed: 0,date,office,arrivals,quantity
0,20171201,가평우체국,1,11
1,20171201,가평우체국,24,10
2,20171201,가평우체국,17,8
3,20171201,가평우체국,9,5
4,20171201,가평우체국,14,11


In [14]:
# For deep learning model 
import keras
from keras.layers import Dense, Input, concatenate, Dropout
from keras.models import Sequential, Model
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras import metrics
from keras import backend as K

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
train_data_labels.values

array([11, 10,  8, ..., 11, 13, 17], dtype=int64)

In [12]:
X = train_data_prepared
Y = train_data_labels.values

In [16]:
num_input = Input(shape=(len(X[0]),), name='num_input')
x = Dense((1024), activation='relu')(num_input)
x = Dropout(0.3)(x)
x = Dense((1024), activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense((1024), activation='relu')(x)
x = Dropout(0.3)(x)
num_output = Dense(1, name='num_output')(x)

model = Model(inputs=num_input, outputs=num_output)

model.compile(optimizer='adam',
              loss='mse',
              metrics=['accuracy'])

learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=10, verbose=1, factor=0.5, min_lr=0.00000001)

callbacks = [
    learning_rate_reduction, # learning_rate를 점차 감소시킴으로서 최적값에 접근하는 방식
    EarlyStopping('val_loss', patience=15)# val_loss이 최적값에서 멀어지는 순간 epoch가 남아도 학습 중지
]

history = model.fit(X, Y, epochs=5, batch_size=64, callbacks=callbacks, validation_split=0.2 )

Train on 1070512 samples, validate on 267628 samples
Epoch 1/5
  41920/1070512 [>.............................] - ETA: 37:31 - loss: 52136824171.5786 - acc: 0.0000e+00

KeyboardInterrupt: 

In [None]:
model.predict()