## 케라스 자동차 연비 예측 모델



### modules import

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import get_file, plot_model

### 데이터 로드

In [2]:
# Fetch the Auto MPG data file; get_file returns the local path of the downloaded copy.
dataset_path = get_file("auto-mpg.data", "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data")
dataset_path

Downloading data from http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
  16384/Unknown 0s 9us/step

'/Users/vin_ah/.keras/datasets/auto-mpg.data'

In [3]:
# Column names handed to pd.read_csv through names= when the data file is loaded.
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']

### 데이터 확인

In [4]:
# Parse the space-separated file: "?" entries become NaN, everything after a tab
# on a line is ignored as a comment, and whitespace following a separator is skipped.
raw_dataset = pd.read_csv(dataset_path, names=column_names, na_values = "?", comment='\t', sep=" ", skipinitialspace=True)
# Work on a copy so raw_dataset keeps the data exactly as loaded.
dataset = raw_dataset.copy()
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


### 데이터 전처리


- 해당 데이터는 일부 데이터가 누락되어 있음

In [None]:
# Count the missing (NaN) values in each column.
dataset.isna().sum()

- 누락된 행 삭제

In [None]:
# Discard every row that contains at least one missing value.
dataset = dataset.dropna()

- "Origin" 범주형 데이터
  - 원-핫 인코딩(one-hot encoding) 진행

In [None]:
# List the distinct codes that occur in the Origin column.
dataset['Origin'].unique()

In [None]:
# Remove the Origin column from the frame and keep it as a separate Series.
origin = dataset.pop('Origin')

In [None]:
# One-hot encode the origin code: each region gets its own 1.0/0.0 indicator
# column, added in the order USA, Europe, Japan.
for code, region in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[region] = (origin == code) * 1.0
dataset

#### 학습/테스트 데이터셋 분리

In [None]:
# Randomly sample 80% of the rows for training (random_state=0 keeps the split
# reproducible); the rows that were not sampled form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

#### 데이터 조사

In [None]:
# Pairwise scatter plots of the selected training columns, with kernel density
# estimates on the diagonal.
sns.pairplot(train_dataset[["MPG", "Cylinders", "Displacement","Horsepower", "Weight"]], diag_kind="kde")

In [None]:
# Summary statistics of the training columns. The MPG column is removed from the
# table, which is then transposed so each row is a column of train_dataset and
# each column is one statistic.
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
train_stats

#### 데이터의 특성과 레이블 분리

In [None]:
# Split off MPG as the prediction target; pop also removes it from the frames,
# so train_dataset / test_dataset no longer contain the MPG column.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')

#### 데이터 정규화

In [None]:
def normalization(x, stats=None):
    """Standardize feature columns to zero mean and unit variance (z-score).

    [Exercise 1] Reference solution: each column is shifted by its mean and
    divided by its standard deviation.

    Args:
        x: DataFrame of feature columns to normalize.
        stats: Table indexed by feature name with 'mean' and 'std' columns
            (the layout of ``DataFrame.describe().transpose()``). When omitted,
            the notebook-level ``train_stats`` is used, so the training and
            test sets are both scaled with the training statistics.

    Returns:
        DataFrame with the same rows and columns as ``x``, standardized
        column by column.
    """
    if stats is None:
        stats = train_stats
    # A constant column has std == 0; divide by 1 instead so it maps to 0.0
    # rather than producing NaN/inf values.
    std = stats['std'].replace(0, 1.0)
    return (x - stats['mean']) / std


# Run both splits through the same normalization function.
normed_train_data = normalization(train_dataset)
normed_test_data = normalization(test_dataset)

### 모델 구성

In [None]:
def build_model(input_dim=None):
    """Build and compile a small fully connected regression network.

    [Exercise 2] Reference solution using the Keras functional API: two hidden
    ReLU layers of 64 units followed by a single linear output unit.

    Args:
        input_dim: Number of input features. When omitted, it is taken from
            the number of columns of the notebook-level ``train_dataset``.

    Returns:
        A compiled ``Model`` with MSE loss and the 'mae' and 'mse' metrics.
        The plotting and evaluation cells of this notebook read exactly those
        metric names, in that order.
    """
    if input_dim is None:
        input_dim = len(train_dataset.keys())

    inputs = Input(shape=(input_dim,))
    hidden = Dense(64, activation='relu')(inputs)
    hidden = Dense(64, activation='relu')(hidden)
    # No activation on the output: the target is an unbounded real value.
    outputs = Dense(1)(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='mse',
                  optimizer=RMSprop(learning_rate=0.001),
                  metrics=['mae', 'mse'])
    return model

In [None]:
# Build the model and print its layer-by-layer summary.
model = build_model()
model.summary()

In [None]:
# Draw the model graph as a diagram.
# NOTE(review): plot_model normally needs pydot and Graphviz installed — confirm in this environment.
plot_model(model)

### 샘플 데이터 확인

In [None]:
sample_batch = normed_train_data[:10] # Take the first 10 rows as an arbitrary test sample
# Run a prediction on the sample before any training has been done.
sample_result = model.predict(sample_batch)
sample_batch

### 모델 학습

In [None]:
# Number of training epochs.
epochs = 1000

# Train on the normalized features; validation_split=0.2 holds out 20% of the
# training data to compute validation metrics each epoch.
history = model.fit(normed_train_data, train_labels, epochs=epochs,
                    validation_split=0.2)

### 모델 학습 시각화

In [None]:
# Put the per-epoch values recorded by fit into a table and add the epoch index.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist

In [None]:
def plot_history(history):
    """Plot training and validation error curves side by side.

    The left panel shows mean absolute error and the right panel shows mean
    squared error, each against the epoch number.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` mapping
            must contain the keys 'mae', 'val_mae', 'mse' and 'val_mse', and
            its ``epoch`` attribute supplies the x-axis values.
    """
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    # (metric key, y-axis label, upper y-limit) for each panel, left to right.
    # Squared error is measured in MPG^2, not MPG.
    panels = [
        ('mae', 'Mean Abs Error [MPG]', 5),
        ('mse', 'Mean Squared Error [$MPG^2$]', 20),
    ]

    plt.figure(figsize=(12, 6))

    for position, (metric, ylabel, y_max) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        plt.xlabel('Epoch')
        plt.ylabel(ylabel)
        plt.plot(hist['epoch'], hist[metric], label='Train Error')
        plt.plot(hist['epoch'], hist['val_' + metric], label='Val Error')
        plt.ylim([0, y_max])
        plt.legend()

    plt.show()

# Plot the error curves of the most recent training run.
plot_history(history)

### EarlyStopping을 이용한 규제화

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# [Exercise 3] Apply EarlyStopping to improve the model: register the callback and train.

# Start from a freshly built model so this run does not continue from earlier training.
model = build_model()

# Stop once val_loss has failed to improve for 10 consecutive epochs, and
# restore the weights from the best epoch seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(normed_train_data, train_labels, epochs=epochs,
                    validation_split=0.2, callbacks=[early_stop])

In [None]:
# Plot the error curves of the most recent training run.
plot_history(history)

### 모델 평가

In [None]:
# [Exercise 4] Evaluate the model; the goal is a test MAE below 2.

# evaluate() returns the loss followed by the compiled metrics, so this
# unpacking assumes the model was compiled with metrics=['mae', 'mse'] in that order.
loss, mae, mse = model.evaluate(normed_test_data, test_labels, verbose=2)
print("테스트 세트의 평균 절대 오차: {:5.2f} MPG".format(mae))

### 학습된 모델을 통한 예측

In [None]:
# Predict on the normalized test features and collapse the result to a 1-D array.
test_pred = model.predict(normed_test_data).flatten()

# Scatter of true values against predictions.
plt.scatter(test_labels, test_pred)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
plt.axis('equal')
plt.axis('square')
plt.grid()
# Start both axes at 0 while keeping the current upper limits.
plt.xlim([0,plt.xlim()[1]])
plt.ylim([0,plt.ylim()[1]])
# Diagonal y = x reference line: points on it are exact predictions.
plt.plot([-100,100], [-100,100])

In [None]:
# Signed error per test example (prediction minus true value), shown as a 30-bin histogram.
error = test_pred - test_labels
plt.hist(error, bins=30)
plt.xlabel("Prediction Error [MPG]")
plt.grid()
plt.ylabel("Count")