In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os, sys, time
import tensorflow as tf
from tensorflow import keras

In [2]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the list (rather than indexing [0]) keeps this cell working on
# CPU-only machines, where the list is empty, and applies the same setting to
# every GPU, which TensorFlow requires when more than one is visible.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

## **1.1 数据集载入**

In [3]:
import tensorflow_datasets as tfds

# Load the IMDB reviews dataset in its 'subwords8k' configuration.
# with_info=True additionally returns the DatasetInfo object (used later to
# obtain the text encoder); as_supervised=True yields (text, label) pairs.
dataset, info = tfds.load(
    'imdb_reviews/subwords8k',
    with_info=True,
    as_supervised=True,
)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [4]:
# Show the dataset metadata returned by tfds.load (features, splits, citation).
print(info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=0.1.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(None,), dtype=tf.int64, encoder=<SubwordTextEncoder vocab_size=8185>),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Pot

In [5]:
# The 'text' feature of the dataset info carries the encoder that was used to
# turn the reviews into integer ids; keep a handle to it and report its
# vocabulary size.
tokenizer = info.features['text'].encoder
print(f'vocabulary size {tokenizer.vocab_size}')

vocabulary size 8185


In [6]:
# Round-trip a sample sentence through the encoder: encode it to a list of
# integer ids, decode the ids back to text, and print both.
sample_string = 'Tensorflow is cool'
tokenized_string = tokenizer.encode(sample_string)

original_string = tokenizer.decode(tokenized_string)
print(tokenized_string, original_string)

[6307, 2327, 2934, 7961, 9, 2724] Tensorflow is cool


In [7]:
# Decode each id on its own to show which piece of text it stands for.
for token in tokenized_string:
    print(f'{token} - > {tokenizer.decode([token])}')

6307 - > Ten
2327 - > sor
2934 - > flow
7961 - >  
9 - > is 
2724 - > cool


In [8]:
buffer_size = 10000
batch_size = 64

# Build both pipelines from the raw splits held in `dataset` instead of
# reassigning `train_dataset` / `test_dataset` from themselves. This keeps the
# cell idempotent: re-running it no longer shuffles and batches a dataset that
# has already been batched.
#
# padded_batch pads after grouping examples into a batch, so padding is applied
# per batch and the padded sequence length may differ from batch to batch.
train_dataset = (
    dataset['train']
    .shuffle(buffer_size)
    .padded_batch(batch_size,
                  tf.compat.v1.data.get_output_shapes(dataset['train']))
)
test_dataset = dataset['test'].padded_batch(
    batch_size, tf.compat.v1.data.get_output_shapes(dataset['test']))
print(tf.compat.v1.data.get_output_shapes(train_dataset))

(TensorShape([None, None]), TensorShape([None]))


## **1.3 模型定义**

In [9]:
embedding_dim = 16
# NOTE(review): this reassignment has no effect on train_dataset/test_dataset,
# which were already batched with batch_size = 64 in the batching cell. It is
# kept only in case later cells read `batch_size` -- confirm and remove if unused.
batch_size = 128
# Read the vocabulary size from the dataset's encoder rather than hard-coding
# 8185, so the embedding table always matches the tokenizer in use.
vocab_size = tokenizer.vocab_size

# Sentiment classifier:
#   - Embedding: maps each subword id to an `embedding_dim`-dimensional vector.
#   - Bidirectional GRU: reads the sequence in both directions; with
#     return_sequences=False only the final output of each direction is kept
#     (2 x 64 = 128 features).
#   - Dense(64, relu) followed by a single sigmoid unit for the binary label.
# NOTE(review): mask_zero is not set on the Embedding layer, so the padding
# added by padded_batch is presumably processed by the GRU like real tokens --
# consider enabling masking and verify its effect on training.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim),
    keras.layers.Bidirectional(keras.layers.GRU(units = 64, return_sequences = False)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          130960    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               31488     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 170,769
Trainable params: 170,769
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Configure training for the single sigmoid output: Adam optimizer, binary
# cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for 10 epochs on the batched training set; the batched test split is
# evaluated after every epoch as validation data.
model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
)

Epoch 1/10
Epoch 2/10

CancelledError:  [_Derived_]RecvAsync is cancelled.
	 [[{{node Adam/Adam/update/AssignSubVariableOp/_41}}]]
	 [[Reshape_11/_38]] [Op:__inference_distributed_function_6224]

Function call stack:
distributed_function
