<a href="https://colab.research.google.com/github/hedgehog-zowie/tf-study/blob/master/fuel_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

第3个 TensorFlow 例子：使用 Keras API 预测20世纪70年代末至80年代初汽车的燃油效率（MPG），这是一个回归问题。

一、导入module。

In [0]:
# import modules
import pathlib

import tensorflow as tf
from tensorflow import keras

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Report the version of each library used in this notebook
for _label, _module in (
    ('tensorflow version: ', tf),
    ('numpy version: ', np),
    ('matplot version: ', mpl),
    ('seaborn version: ', sns),
):
  print(_label, _module.__version__)

tensorflow version:  1.13.0-rc1
numpy version:  1.14.6
matplot version:  3.0.2
seaborn version:  0.7.1


二、导入数据。

In [0]:
# Download the public Auto MPG dataset; get_file returns the local file path
dataset_path = keras.utils.get_file(
    "auto-mpg.data",
    "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)
print('dataset_path: ', dataset_path)
# Column names are passed to read_csv explicitly through `names`
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
# Load with pandas: '?' marks a missing value, anything after a tab is
# treated as a comment, and fields are separated by (runs of) spaces
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)
# Work on a copy so the raw frame stays untouched
dataset = raw_dataset.copy()
# Show the last few rows as a sample
print('data examples:')
dataset.tail()

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data
dataset_path:  /root/.keras/datasets/auto-mpg.data
data examples:


Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


三、数据预处理，对数据进行清洗、标准化等。

In [0]:
# Count the missing (NaN) values in each column
dataset.isna().sum(axis=0)

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [0]:
# Drop every row that contains at least one missing value
dataset = dataset.dropna(axis=0, how='any')

In [0]:
# One-hot encode the nominal 'Origin' column (1 = USA, 2 = Europe, 3 = Japan)
# into three float indicator columns.
# NOTE(review): the raw 'Origin' column is kept in the frame (a later pairplot
# uses it as hue), so it also remains among the numeric features next to the
# one-hot columns -- confirm that this is intended.
origin = dataset['Origin']
for _code, _region in enumerate(('USA', 'Europe', 'Japan'), start=1):
  dataset[_region] = (origin == _code).astype(float)
# Show the first rows to check the new columns
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,USA,Europe,Japan
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,1.0,0.0,0.0
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,1.0,0.0,0.0
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,1.0,0.0,0.0
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,1.0,0.0,0.0
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,1.0,0.0,0.0


In [0]:
# Split the data into a training set and a test set:
# draw 80% of the rows at random for training (random_state=0 fixes the seed) ...
train_dataset = dataset.sample(frac=0.8, random_state=0)
# ... and use all remaining rows as the test set
test_dataset = dataset.drop(labels=train_dataset.index, axis=0)

In [0]:
# Inspect the data: pairwise plots of a few columns, KDE curves on the diagonal
sns.pairplot(
    train_dataset,
    vars=['MPG', 'Cylinders', 'Displacement', 'Weight'],
    diag_kind='kde',
)

In [0]:
# Same pair plot, coloured by the 'Origin' column to tell the regions apart
sns.pairplot(
    train_dataset,
    vars=['MPG', 'Cylinders', 'Displacement', 'Weight'],
    diag_kind='kde',
    hue="Origin",
    palette="husl",
)

In [0]:
# Summary statistics of the training set, used to sanity-check the data.
# MPG is the label rather than a feature, so its column is removed; the table
# is then transposed (one row per feature) for the normalization step later.
train_stats = train_dataset.describe().drop('MPG', axis=1).transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cylinders,314.0,5.477707,1.699788,3.0,4.0,4.0,8.0,8.0
Displacement,314.0,195.318471,104.331589,68.0,105.5,151.0,265.75,455.0
Horsepower,314.0,104.869427,38.096214,46.0,76.25,94.5,128.0,225.0
Weight,314.0,2990.251592,843.898596,1649.0,2256.5,2822.5,3608.0,5140.0
Acceleration,314.0,15.559236,2.78923,8.0,13.8,15.5,17.2,24.8
Model Year,314.0,75.898089,3.675642,70.0,73.0,76.0,79.0,82.0
Origin,314.0,1.573248,0.800988,1.0,1.0,1.0,2.0,3.0
USA,314.0,0.624204,0.485101,0.0,0.0,1.0,1.0,1.0
Europe,314.0,0.178344,0.383413,0.0,0.0,0.0,0.0,1.0
Japan,314.0,0.197452,0.398712,0.0,0.0,0.0,0.0,1.0


In [0]:
# Separate the label (MPG) from the features; pop() removes the column from
# each frame in place and returns it
train_labels, test_labels = train_dataset.pop('MPG'), test_dataset.pop('MPG')

In [0]:
# Standardize the features with the training-set statistics
def norm(x):
  """Return x shifted by the training mean and divided by the training std.

  Reads the 'mean' and 'std' columns of the module-level train_stats table.
  """
  mean, std = train_stats['mean'], train_stats['std']
  return (x - mean) / std
# Both splits are scaled with the statistics of the training set
normed_train_data, normed_test_data = norm(train_dataset), norm(test_dataset)

四、构建模型

In [0]:
# Factory for the regression network
def build_model():
  """Build and compile a small fully-connected regression model.

  The input width is the number of columns of the module-level train_dataset
  at call time (10 feature columns in the run shown in this notebook). Two
  hidden Dense layers of 64 ReLU units feed a single linear output unit.

  Returns:
    A compiled keras.Sequential model (MSE loss, RMSProp with learning rate
    0.001, reporting the 'mae' and 'mse' metrics).
  """
  n_features = len(train_dataset.keys())
  layers = [
      # First hidden layer: input of shape (*, n_features), output (*, 64)
      keras.layers.Dense(64, activation=tf.nn.relu, input_shape=[n_features]),
      keras.layers.Dense(64, activation=tf.nn.relu),
      # Single output unit: the predicted value
      keras.layers.Dense(1),
  ]
  model = keras.Sequential(layers)
  model.compile(
      optimizer=tf.train.RMSPropOptimizer(0.001),
      loss='mse',
      metrics=['mae', 'mse'],
  )
  return model

model = build_model()
# Print an overview of the model (layers, output shapes, parameter counts)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                704       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 4,929
Trainable params: 4,929
Non-trainable params: 0
_________________________________________________________________


五、训练模型

In [0]:
# Sanity-check the normalized training features before fitting.
# Only the last expression of a cell is displayed automatically, so the shape
# is printed explicitly instead of being evaluated and discarded.
print('normed_train_data shape: ', normed_train_data.shape)
normed_train_data.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,USA,Europe,Japan
146,-0.869348,-1.009459,-0.784052,-1.025303,-0.379759,-0.516397,-0.715676,0.774676,-0.465148,-0.495225
282,-0.869348,-0.530218,-0.442811,-0.118796,0.624102,0.84391,-0.715676,0.774676,-0.465148,-0.495225
69,1.483887,1.482595,1.44714,1.736877,-0.738281,-1.060519,-0.715676,0.774676,-0.465148,-0.495225
378,-0.869348,-0.865687,-1.099044,-1.025303,-0.308055,1.660094,-0.715676,0.774676,-0.465148,-0.495225
331,-0.869348,-0.942365,-0.994047,-1.001603,0.875068,1.115971,1.781239,-1.286751,-0.465148,2.012852


In [0]:
EPOCHS = 1000
# With this many epochs, progress is reported compactly: one dot per epoch,
# with a line break every 100 epochs
class PrintDot(keras.callbacks.Callback):
  """Keras callback that prints a single '.' at the end of every epoch."""

  def on_epoch_end(self, epoch, logs=None):
    """Print a progress dot for the finished epoch.

    Args:
      epoch: Zero-based index of the epoch that just ended.
      logs: Metrics passed by Keras; unused. Defaults to None to match the
        signature of the base Callback.on_epoch_end.
    """
    # Start a new line before epochs 0, 100, 200, ...
    if epoch % 100 == 0:
      print('')
    print('.', end='')

# Train a freshly built model for the full number of epochs
model = build_model()
# verbose selects the log mode: 0 = silent, 1 = progress bar, 2 = one line per epoch
history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

Instructions for updating:
Use tf.cast instead.

....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
..........................................

In [0]:
# Inspect the training log: one row per epoch with the recorded metrics
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
# More code follows in this cell, so a bare `hist.tail()` would be evaluated
# and silently discarded; print it to actually show the last epochs
print(hist.tail())

# Plot the training curves
def plot_history(history):
  """Draw two figures from a Keras History object.

  The first figure shows mean absolute error per epoch, the second mean
  squared error; each compares the training curve with the validation curve.

  Args:
    history: Object exposing `.history` (metric name -> per-epoch values) and
      `.epoch` (list of epoch indices), as returned by model.fit.
  """
  frame = pd.DataFrame(history.history)
  frame['epoch'] = history.epoch

  def draw(train_key, val_key, y_label, y_max):
    # One figure: training vs. validation curve, y axis clipped to [0, y_max]
    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.plot(frame['epoch'], frame[train_key], label='Train Error')
    plt.plot(frame['epoch'], frame[val_key], label='Val Error')
    plt.legend()
    plt.ylim([0, y_max])

  draw('mean_absolute_error', 'val_mean_absolute_error',
       'Mean Abs Error [MPG]', 5)
  draw('mean_squared_error', 'val_mean_squared_error',
       'Mean Square Error [$MPG^2$]', 20)

plot_history(history)

In [0]:
# The curves above show little further improvement after roughly 100 epochs,
# and the validation error even starts to rise, i.e. the model is overfitting.
# Training should therefore stop early, which a callback can do.
model = build_model()

# patience: how many epochs without improvement of the monitored value
# (val_loss) are tolerated before training is stopped
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(
    normed_train_data,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)

plot_history(history)

