[辻真吾・矢吹太朗『ゼロからはじめるデータサイエンス入門』（講談社, 2021）](https://github.com/taroyabuki/fromzero)

In [None]:
# Environment setup for Google Colaboratory
# The extra packages are installed only when COLAB_GPU is present in the
# environment; `tail -n 1` keeps just the last line of pip's output.
import os
if 'COLAB_GPU' in os.environ:
  !python -m pip install h2o pandarallel pca pmdarima | tail -n 1

In [None]:
# Environment setup for Google Colaboratory
# NOTE(review): the setup cell before this one already installs h2o, so this
# cell looks redundant -- confirm before removing.
import os
if 'COLAB_GPU' in os.environ:
  !python -m pip install h2o | tail -n 1

## 11.1 Kerasによる回帰

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import activations, callbacks, layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Read the wine CSV from the book's GitHub repository into a DataFrame.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/wine.csv')
tmp = pd.read_csv(my_url)

In [None]:
# Shuffle the rows; no random_state is passed, so the order is not fixed here.
my_data = shuffle(tmp)

In [None]:
# Target: the LPRICE2 column, used as-is.
y = my_data['LPRICE2']

# Features: every remaining column, passed through StandardScaler.
my_scaler = StandardScaler()
X = my_scaler.fit_transform(my_data.drop('LPRICE2', axis=1))

In [None]:
# Plot the ReLU activation over 100 evenly spaced points in [-3, 3].
x = np.linspace(-3, 3, 100)
plt.plot(x, activations.relu(x))
plt.xlabel('x')
plt.ylabel('ReLU(x)')

In [None]:
# Network: 4 inputs -> 3 ReLU units -> 1 output unit (no activation given).
my_model = models.Sequential([
    layers.Dense(units=3, activation='relu', input_shape=[4]),
    layers.Dense(units=1),
])

my_model.summary()  # print an overview of the network

In [None]:
# Mean squared error as the loss, RMSprop as the optimizer.
my_model.compile(optimizer='rmsprop', loss='mse')

In [None]:
# Early stopping with a patience of 20 epochs; the best weights seen are
# restored when training stops (`monitor` is left at its default).
my_cb = callbacks.EarlyStopping(restore_best_weights=True, patience=20)

In [None]:
# Train for at most 500 epochs in mini-batches of 10, holding out 25% of the
# data for validation; the early-stopping callback may end training sooner.
my_history = my_model.fit(
    x=X, y=y,
    epochs=500, batch_size=10,
    validation_split=0.25,
    callbacks=[my_cb],
    verbose=0)

In [None]:
# Turn the per-epoch history dict into a DataFrame (one row per epoch)
# and plot every recorded metric against the epoch number.
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch')

In [None]:
# Metrics recorded for the last epoch that was run.
# NOTE(review): with restore_best_weights=True this is presumably not the
# epoch whose weights the model ends up with -- confirm.
tmp.iloc[-1]

In [None]:
# Predictions come back as a 2-D array, so flatten them before comparing
# with y; the cell's value is the mean squared error on the training data.
y_ = my_model.predict(X)
((y - y_.ravel()) ** 2).mean()

## 11.2 Kerasによる分類

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from keras import callbacks, layers, losses, models
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle

# Fetch the `iris` dataset of R's `datasets` package through statsmodels,
# then shuffle its rows.
tmp = sm.datasets.get_rdataset('iris', 'datasets').data
my_data = shuffle(tmp)

In [None]:
# Labels: the Species column encoded as integers by LabelEncoder.
my_enc = LabelEncoder()
y = my_enc.fit_transform(my_data['Species'])

# Features: every remaining column, passed through StandardScaler.
my_scaler = StandardScaler()
X = my_scaler.fit_transform(my_data.drop('Species', axis=1))

In [None]:
# Network: 4 inputs -> 3 ReLU units -> 3-way softmax output.
my_model = models.Sequential([
    layers.Dense(units=3, activation='relu', input_shape=[4]),
    layers.Dense(units=3, activation='softmax'),
])

In [None]:
# Integer class labels, hence the sparse categorical cross-entropy loss;
# accuracy is tracked as an additional metric.
my_model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

In [None]:
# Early stopping with a patience of 20 epochs, restoring the best weights.
my_cb = callbacks.EarlyStopping(restore_best_weights=True, patience=20)

# Train for at most 500 epochs in mini-batches of 10 with a 25% validation split.
my_history = my_model.fit(
    x=X, y=y,
    epochs=500, batch_size=10,
    validation_split=0.25,
    callbacks=[my_cb],
    verbose=0)

# Per-epoch metrics as a DataFrame, plotted against the epoch number.
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch')

In [None]:
# Metrics recorded for the last epoch that was run.
# NOTE(review): with restore_best_weights=True this is presumably not the
# epoch whose weights the model ends up with -- confirm.
tmp.iloc[-1]

In [None]:
# Predicted class = index of the largest output per row; the cell's value
# is the fraction of training rows predicted correctly.
tmp = my_model.predict(X)
y_ = tmp.argmax(axis=-1)
np.mean(y_ == y)

In [None]:
# Mean negative log of the probabilities in each list (the values that y_1
# and y_2 in the following cell assign to the true classes).
# Both results are collected in one list so that both are shown: only the
# last expression of a notebook cell is rendered, so as two separate
# statements the first value would be computed and silently discarded.
[-np.log([0.8, 0.7, 0.3, 0.8]).mean(),
 -np.log([0.7, 0.6, 0.2, 0.7]).mean()]

In [None]:
# True class index for each of four samples.
y = [2, 1, 0, 1]

# Two sets of predicted class probabilities (one row per sample,
# one column per class).
y_1 = [
    [0.1, 0.1, 0.8],
    [0.1, 0.7, 0.2],
    [0.3, 0.4, 0.3],
    [0.1, 0.8, 0.1],
]
y_2 = [
    [0.1, 0.2, 0.7],
    [0.2, 0.6, 0.2],
    [0.2, 0.5, 0.3],
    [0.2, 0.7, 0.1],
]

In [None]:
# Mean sparse categorical cross-entropy of each prediction set against y.
[losses.sparse_categorical_crossentropy(y_true=y, y_pred=probs).numpy().mean()
 for probs in (y_1, y_2)]

## 11.3 MNIST：手書き数字の分類

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from random import sample
from keras import callbacks, layers, models
from sklearn.metrics import confusion_matrix

# Load MNIST as (images, labels) pairs for the training and the test split.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

In [None]:
# Shape of the training image array.
x_train.shape

In [None]:
# Widen NumPy's print width so each image row fits on one line, then show
# the raw pixel values of the image at index 4.
np.set_printoptions(linewidth=170)
x_train[4]

In [None]:
# Render the image at index 4 as a matrix plot.
plt.matshow(x_train[4])

In [None]:
# Labels of the training images.
y_train

In [None]:
# Smallest and largest pixel value in the training images.
x_train.min(), x_train.max()

In [None]:
# Divide the pixel values by 255.  Note that re-running this cell divides
# the already divided arrays again.
x_train, x_test = x_train / 255, x_test / 255

In [None]:
# Draw 6000 distinct training examples at random (no seed is set here).
# The index range is taken from the array itself instead of a hard-coded
# 60000, so the indices can never fall outside x_train -- for example when
# this cell is run again on the already reduced arrays.
my_index = sample(range(len(x_train)), 6000)
x_train = x_train[my_index, :, :]
y_train = y_train[my_index]

In [None]:
# Network: flatten the 28x28 image -> 256 ReLU units -> 10-way softmax.
my_model = models.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(units=256, activation="relu"),
    layers.Dense(units=10, activation="softmax"),
])

my_model.summary()

my_model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

# Early stopping with a patience of 5 epochs, restoring the best weights.
my_cb = callbacks.EarlyStopping(restore_best_weights=True, patience=5)

In [None]:
# Train for at most 20 epochs in mini-batches of 128 with a 20% validation split.
my_history = my_model.fit(
    x=x_train, y=y_train,
    epochs=20, batch_size=128,
    validation_split=0.2,
    callbacks=[my_cb],
    verbose=0)

# Per-epoch metrics as a DataFrame, plotted with point markers.
tmp = pd.DataFrame(my_history.history)
tmp.plot(style='o-', xlabel='epoch')

In [None]:
# Predicted digit = index of the largest output per test image; the cell's
# value is the confusion matrix of true against predicted digits.
tmp = my_model.predict(x_test)
y_ = tmp.argmax(axis=-1)
confusion_matrix(y_pred=y_, y_true=y_test)

In [None]:
# Fraction of test images whose predicted digit equals the true one.
np.mean(y_ == y_test)

In [None]:
# Evaluate the compiled loss and metrics on the test data.
my_model.evaluate(x=x_test, y=y_test)

In [None]:
# Append a trailing axis of length 1 so each 28x28 image becomes 28x28x1,
# the input shape the convolutional models below declare.
x_train2d = x_train[..., np.newaxis]
x_test2d = x_test[..., np.newaxis]

In [None]:
# CNN: one 3x3 convolution, 2x2 max pooling, then a 128-unit dense layer
# and a 10-way softmax output.
my_model = models.Sequential([
    layers.Conv2D(filters=32, kernel_size=3,  # convolution layer
                  activation='relu',
                  input_shape=[28, 28, 1]),
    layers.MaxPooling2D(pool_size=2),         # pooling layer
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

my_model.summary()

my_model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

# NOTE(review): the other cells reach this class as callbacks.EarlyStopping;
# the direct import is kept here so the name EarlyStopping stays defined.
from keras.callbacks import EarlyStopping
my_cb = EarlyStopping(restore_best_weights=True, patience=5)

In [None]:
# Train the CNN for at most 20 epochs in mini-batches of 128 with a 20%
# validation split.  The callback is passed inside a list, the form used
# by the other fit() calls in this notebook, instead of as a bare object.
my_history = my_model.fit(
    x=x_train2d,
    y=y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=20,
    callbacks=[my_cb],
    verbose=0)

# Per-epoch metrics as a DataFrame, plotted with point markers.
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch', style='o-')

In [None]:
# Evaluate the compiled loss and metrics on the reshaped test data.
my_model.evaluate(x=x_test2d, y=y_test)

In [None]:
# CNN: two 5x5 convolution + 2x2 max-pooling stages, dropout, a 500-unit
# dense layer, dropout again, and a 10-way softmax output.
my_model = models.Sequential([
    layers.Conv2D(filters=20, kernel_size=5, activation='relu',
                  input_shape=(28, 28, 1)),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Conv2D(filters=20, kernel_size=5, activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    layers.Dropout(rate=0.25),
    layers.Flatten(),
    layers.Dense(500, activation='relu'),
    layers.Dropout(rate=0.5),
    layers.Dense(10, activation='softmax'),
])

my_model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'])

# Early stopping with a patience of 5 epochs, restoring the best weights.
my_cb = callbacks.EarlyStopping(restore_best_weights=True, patience=5)

In [None]:
# Train the deeper CNN for at most 20 epochs in mini-batches of 128 with a
# 20% validation split.  The callback is passed inside a list, the form
# used by the other fit() calls in this notebook, instead of as a bare object.
my_history = my_model.fit(
    x=x_train2d,
    y=y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=20,
    callbacks=[my_cb],
    verbose=0)

# Per-epoch metrics as a DataFrame, plotted with point markers.
tmp = pd.DataFrame(my_history.history)
tmp.plot(xlabel='epoch', style='o-')

In [None]:
# Evaluate the compiled loss and metrics on the reshaped test data.
my_model.evaluate(x=x_test2d, y=y_test)

In [None]:
# Class-membership probabilities for every test image
y_prob = my_model.predict(x_test2d)

tmp = pd.DataFrame({
    'y_prob': y_prob.max(axis=1),     # largest predicted probability
    'y_': y_prob.argmax(axis=1),      # predicted class
    'y': y_test,                      # true label
    'id': range(len(y_test))})        # position within the test set

# Keep only the misclassified rows, ordered from the highest probability down.
tmp = tmp[tmp['y_'] != tmp['y']]
my_result = tmp.sort_values(by='y_prob', ascending=False)

In [None]:
# First rows of the sorted table of misclassified test images.
my_result.head()

In [None]:
# Draw up to five of the first rows of my_result side by side, titled
# "true label (test-set position)".
# Iterating over head(5) avoids an IndexError when fewer than five rows
# exist, and the loop variable is named idx so that the builtin id() is not
# shadowed for the rest of the notebook session.
top_misses = my_result.head(5)
for i, (ans, idx) in enumerate(zip(top_misses['y'], top_misses['id'])):
    plt.subplot(1, 5, i + 1)
    plt.title(f'{ans} ({idx})')
    plt.imshow(x_test[idx])
    plt.axis('off')

## 11.4 AutoML

In [None]:
import h2o
import pandas as pd
import tensorflow as tf
from h2o.automl import H2OAutoML
from random import sample

# Initialize H2O and switch off its progress output.
h2o.init()
h2o.no_progress()
# h2o.cluster().shutdown() # stop the cluster

In [None]:
# Two ways of getting the wine data into an H2OFrame are shown; the second
# assignment rebinds my_frame, replacing the frame created by the first.
my_url = ('https://raw.githubusercontent.com/taroyabuki'
          '/fromzero/master/data/wine.csv')
my_data = pd.read_csv(my_url)
my_frame = h2o.H2OFrame(my_data) # Convert an ordinary DataFrame into an H2OFrame.
# Alternatively
my_frame = h2o.import_file(my_url, header=1) # Read the data directly.

In [None]:
# First five rows of the H2OFrame.
# NOTE(review): this is not the last expression of the cell, so a notebook
# presumably does not render its value -- confirm.
my_frame.head(5)

# Convert back to an ordinary DataFrame.
h2o.as_list(my_frame).head()
# Output omitted (it looks the same).

In [None]:
# Run AutoML for at most 60 seconds with LPRICE2 as the response column.
my_model = H2OAutoML(max_runtime_secs=60)
my_model.train(training_frame=my_frame, y='LPRICE2')

In [None]:
# Smallest value in the leaderboard's `rmse` column.
my_model.leaderboard['rmse'].min()

In [None]:
# Predict on the training frame and convert the result to a DataFrame.
tmp = h2o.as_list(my_model.predict(my_frame))

# Scatter plot of the observed LPRICE2 (y) against the prediction (y_).
pd.DataFrame(
    {'y': my_data['LPRICE2'], 'y_': tmp['predict']}
).plot.scatter(x='y', y='y_')

In [None]:
# Reload MNIST (raw pixel values, no rescaling in this cell) and keep 6000
# randomly chosen training examples.  The index range is taken from the
# array itself rather than a hard-coded 60000, so it always matches x_train.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
my_index = sample(range(len(x_train)), 6000)
x_train = x_train[my_index, :, :]
y_train = y_train[my_index]

In [None]:
y = 'y'  # name of the label column

# Training frame: one row per image with its pixels flattened into columns,
# plus the label column, which is converted to a factor
# (presumably so that H2O treats the task as classification -- confirm).
tmp = pd.DataFrame(x_train.reshape(len(x_train), -1))
tmp[y] = y_train
my_train = h2o.H2OFrame(tmp)
my_train[y] = my_train[y].asfactor()

# Test frame: flattened pixels only, without the label column.
tmp = pd.DataFrame(x_test.reshape(len(x_test), -1))
my_test = h2o.H2OFrame(tmp)

In [None]:
# Run AutoML for at most 120 seconds with the label column as the response.
my_model = H2OAutoML(max_runtime_secs=120)
my_model.train(training_frame=my_train, y=y)

In [None]:
# Smallest value in the leaderboard's `mean_per_class_error` column.
my_model.leaderboard[
    'mean_per_class_error'].min()

In [None]:
# Predict on the test frame and take the `predict` column of the result.
tmp = h2o.as_list(my_model.predict(my_test))
y_ = tmp['predict']

# Fraction of test images whose prediction equals the true label.
(y_ == y_test).mean()