##### Copyright 2021 The TensorFlow IO Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 来自 MongoDB 集合的 TensorFlow 数据集 

<table class="tfo-notebook-buttons" align="left">
  <td>     <a target="_blank" href="https://tensorflow.google.cn/io/tutorials/mongodb"><img src="https://tensorflow.google.cn/images/tf_logo_32px.png">在 TensorFlow.org 上查看</a> </td>
  <td>     在 Google Colab 中运行   </td>
  <td>在 GitHub 上查看源代码</td>
      <td>     <a href="https://storage.googleapis.com/tensorflow_docs/docs-l10n/site/zh-cn/io/tutorials/mongodb.ipynb"><img src="https://tensorflow.google.cn/images/download_logo_32px.png">下载笔记本</a>   </td>
</table>

## 概述

本教程着重阐述通过从 mongoDB 集合中读取数据并使用其训练 `tf.keras` 模型来准备 `tf.data.Dataset`。

**注：**对 [mongodb 存储](https://docs.mongodb.com/guides/)的基本了解可以帮助您更轻松地学习本教程。

## 安装软件包

本教程使用 `pymongo` 作为辅助软件包来创建新的 mongodb 数据库和集合以存储数据。


### 安装要求的 tensorflow-io 和 mongodb （辅助）软件包

In [2]:
!pip install -q tensorflow-io
!pip install -q pymongo



### 导入软件包

In [3]:
import os
import time
from pprint import pprint
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import tensorflow_io as tfio
from pymongo import MongoClient

### 验证 tf 和 tfio 导入

In [4]:
print("tensorflow-io version: {}".format(tfio.__version__))
print("tensorflow version: {}".format(tf.__version__))

tensorflow-io version: 0.20.0
tensorflow version: 2.6.0


## 下载并安装 MongoDB 实例

出于演示目的，使用了开源版本的 mongodb。


In [5]:
%%bash

sudo apt install -y mongodb >log
service mongodb start

 * Starting database mongodb
   ...done.




debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 8.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 


In [6]:
# Sleep for few seconds to let the instance start.
time.sleep(5)

启动实例后，在进程列表中使用 grep 搜索 `mongo` 以确认可用性。

In [7]:
%%bash

ps -ef | grep mongo

mongodb      580       1 13 17:38 ?        00:00:00 /usr/bin/mongod --config /etc/mongodb.conf
root         612     610  0 17:38 ?        00:00:00 grep mongo


查询基础端点以检索有关集群的信息。

In [8]:
client = MongoClient()
client.list_database_names() # ['admin', 'local']

['admin', 'local']

### 探索数据集

出于本教程的目的，让我们下载 [PetFinder](https://www.kaggle.com/c/petfinder-adoption-prediction) 数据集并手动将数据馈入 mongodb。此分类问题的目标是预测宠物是否会被收养。


In [9]:
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = 'datasets/petfinder-mini/petfinder-mini.csv'
tf.keras.utils.get_file('petfinder_mini.zip', dataset_url,
                        extract=True, cache_dir='.')
pf_df = pd.read_csv(csv_file)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip


In [10]:
pf_df.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2


出于本教程的目的，对标签列进行了修改。0 表示该宠物未被收养，1 表示被收养。


In [11]:
# In the original dataset "4" indicates the pet was not adopted.
pf_df['target'] = np.where(pf_df['AdoptionSpeed']==4, 0, 1)

# Drop un-used columns.
pf_df = pf_df.drop(columns=['AdoptionSpeed', 'Description'])


In [12]:
# Number of datapoints and columns
len(pf_df), len(pf_df.columns)

(11537, 14)

### 拆分数据集


In [13]:
train_df, test_df = train_test_split(pf_df, test_size=0.3, shuffle=True)
print("Number of training samples: ",len(train_df))
print("Number of testing sample: ",len(test_df))


Number of training samples:  8075
Number of testing sample:  3462


### 在 mongo 集合中存储训练数据和测试数据

In [14]:
URI = "mongodb://localhost:27017"
DATABASE = "tfiodb"
TRAIN_COLLECTION = "train"
TEST_COLLECTION = "test"

In [15]:
db = client[DATABASE]
if "train" not in db.list_collection_names():
  db.create_collection(TRAIN_COLLECTION)
if "test" not in db.list_collection_names():
  db.create_collection(TEST_COLLECTION)

In [16]:
def store_records(collection, records):
  writer = tfio.experimental.mongodb.MongoDBWriter(
      uri=URI, database=DATABASE, collection=collection
  )
  for record in records:
      writer.write(record)

In [17]:
store_records(collection="train", records=train_df.to_dict("records"))
time.sleep(2)
store_records(collection="test", records=test_df.to_dict("records"))

## 准备 tfio 数据集

当数据在集群中可用后，会针对此目的使用 `mongodb.MongoDBIODsataset` 类。该类继承自 `tf.data.Dataset`，因此，它原生具有 `tf.data.Dataset` 的所有有用功能。


### 训练数据集


In [18]:
train_ds = tfio.experimental.mongodb.MongoDBIODataset(
        uri=URI, database=DATABASE, collection=TRAIN_COLLECTION
    )

train_ds

Connection successful: mongodb://localhost:27017
Instructions for updating:
Use `tf.data.Dataset.scan(...) instead
Instructions for updating:
Use `tf.data.Dataset.take_while(...)


<MongoDBIODataset shapes: (), types: tf.string>

`train_ds` 中的每一项都是一个字符串，需要解码为 json。为此，可以通过指定 `TensorSpec` 仅选择一部分列

In [19]:
# Numeric features.
numerical_cols = ['PhotoAmt', 'Fee'] 

SPECS = {
    "target": tf.TensorSpec(tf.TensorShape([]), tf.int64, name="target"),
}
for col in numerical_cols:
  SPECS[col] = tf.TensorSpec(tf.TensorShape([]), tf.int32, name=col)
pprint(SPECS)

{'Fee': TensorSpec(shape=(), dtype=tf.int32, name='Fee'),
 'PhotoAmt': TensorSpec(shape=(), dtype=tf.int32, name='PhotoAmt'),
 'target': TensorSpec(shape=(), dtype=tf.int64, name='target')}


In [20]:
BATCH_SIZE=32
train_ds = train_ds.map(
        lambda x: tfio.experimental.serialization.decode_json(x, specs=SPECS)
    )

# Prepare a tuple of (features, label)
train_ds = train_ds.map(lambda v: (v, v.pop("target")))
train_ds = train_ds.batch(BATCH_SIZE)

train_ds

<BatchDataset shapes: ({PhotoAmt: (None,), Fee: (None,)}, (None,)), types: ({PhotoAmt: tf.int32, Fee: tf.int32}, tf.int64)>

### 测试数据集

In [21]:
test_ds = tfio.experimental.mongodb.MongoDBIODataset(
        uri=URI, database=DATABASE, collection=TEST_COLLECTION
    )
test_ds = test_ds.map(
        lambda x: tfio.experimental.serialization.decode_json(x, specs=SPECS)
    )
# Prepare a tuple of (features, label)
test_ds = test_ds.map(lambda v: (v, v.pop("target")))
test_ds = test_ds.batch(BATCH_SIZE)

test_ds

Connection successful: mongodb://localhost:27017


<BatchDataset shapes: ({PhotoAmt: (None,), Fee: (None,)}, (None,)), types: ({PhotoAmt: tf.int32, Fee: tf.int32}, tf.int64)>

### 定义 keras 预处理层

根据[结构化数据教程](https://tensorflow.google.cn/tutorials/structured_data/preprocessing_layers)，建议使用 [Keras 预处理层](https://tensorflow.google.cn/api_docs/python/tf/keras/layers/experimental/preprocessing)，因为它们更直观，并且可以轻松地与模型集成。但是，也可以使用标准的 [feature_columns](https://tensorflow.google.cn/api_docs/python/tf/feature_column)。

为了对结构化数据分类中的 `preprocessing_layers` 有更好的理解，请参阅[结构化数据教程](https://tensorflow.google.cn/tutorials/structured_data/preprocessing_layers)

In [22]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for our feature.
  normalizer = preprocessing.Normalization(axis=None)

  # Prepare a Dataset that only yields our feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer


In [23]:
all_inputs = []
encoded_features = []

for header in numerical_cols:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_numeric_col = normalization_layer(numeric_col)
  all_inputs.append(numeric_col)
  encoded_features.append(encoded_numeric_col)

## 构建、编译并训练模型


In [24]:
# Set the parameters

OPTIMIZER="adam"
LOSS=tf.keras.losses.BinaryCrossentropy(from_logits=True)
METRICS=['accuracy']
EPOCHS=10


In [25]:
# Convert the feature columns into a tf.keras layer
all_features = tf.keras.layers.concatenate(encoded_features)

# design/build the model
x = tf.keras.layers.Dense(32, activation="relu")(all_features)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)
output = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(all_inputs, output)

In [26]:
# compile the model
model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=METRICS)

In [27]:
# fit the model
model.fit(train_ds, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f743229fe90>

## 在测试数据上进行推断

In [28]:
res = model.evaluate(test_ds)
print("test loss, test acc:", res)

test loss, test acc: [0.569588840007782, 0.7383015751838684]


注：本教程的目标是演示 Tensorflow-IO 从 mongodb 准备 `tf.data.Datasets` 并直接训练 `tf.keras` 模型的能力，因此提高模型的准确率超出了当前范围。但是，用户可以探索数据集并使用特征列和模型架构来获得更好的分类性能。

## 参考文献：

- [MongoDB](https://docs.mongodb.com/guides/)

- [PetFinder 数据集](https://www.kaggle.com/c/petfinder-adoption-prediction)

- [使用 Keras 对结构化数据进行分类](https://tensorflow.google.cn/tutorials/structured_data/preprocessing_layers#create_compile_and_train_the_model)
