

## 获取数据

首先引入需要的包。

In [98]:
import csv
import ipyparams
import logging
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import random
import re
import shutil
import string  
import sys
import tensorflow as tf
import tensorflow_hub as hub
import traceback

from collections import Counter
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from scipy import stats
from sklearn import feature_extraction, feature_selection
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import Model, models, layers, regularizers, preprocessing, datasets, metrics, losses, optimizers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorboard.plugins.hparams import api as hp 

base_path = os.path.abspath('/tf/eipi10/jian-xu3/qbz95')
sys.path.append(base_path)

import qbz95
from qbz95 import tf as qtf
from qbz95 import utils as qutils

pd.set_option('display.max_colwidth', None)
# 设置GPU内存自动扩增
qtf.utils.set_gpu_memory_growth()
nltk.download('stopwords')

# auto load the changes of referenced codes
%load_ext autoreload
%autoreload 2

# ebablbe auto-completion
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# 当module有新的方法的时候，需要运行下面方法。
%reload_ext autoreload

初始化参数。

In [3]:
def lr_schedule(epoch, lr):
    """config the strategy of reducing learning rate"""
    lr_times = [(0, 1), (60, 1e-1), (90, 1e-2), (105, 1e-3), (120, 0.5e-3)]
    
    base_lr = 1e-3
    new_lr = base_lr
    for border_epoch, times in lr_times:
        if epoch>=border_epoch: 
            new_lr = base_lr*times
    if abs(lr - new_lr)>1e-7:
        if new_lr > lr > 0.1*new_lr - 1e-7:
            print('Epoch %05d: Still keep learning rate %s instead of %s' % 
                  (epoch + 1, round(lr, 7), round(new_lr, 7))) 
            return lr   
        print('Epoch %05d: LearningRateScheduler reducing learning rate to %s from %s.' % 
              (epoch + 1, round(new_lr, 7), round(lr, 7)))
    return new_lr



output_path = os.path.abspath('./output')
data_name = 'imdb'
data_path = os.path.join(output_path, data_name)
program_path = os.path.join(data_path, qbz95.utils.get_notebook_name().split('.')[0])
word_vectors_path = '/tf/eipi10/xuxiangwen.github.io/_notes/05-ai/54-tensorflow/models/word_vectors'
classes = ['Negative', 'Positive']

params = {
    'data_name': data_name,
    'data_path': data_path,
    'program_name': qbz95.utils.get_notebook_name(),
    'program_path': program_path,
    'classes': classes,
    'word_vectors_path': word_vectors_path,
    'sample_perecent': 1,    
    'text_columns': 'snps_sa_comments',
    'validation_percent': 0.0,  
    'use_stop_words': True,    
    'stop_words':stopwords.words('english'),
    'batch_size': 32,    
    'max_features': 20000,
    'sequence_length': 500,    
    'epochs': 3,    
    'learning_rate':0.001,
    'clip_value':None,
    'dropout':0.1,
    'metrics':['accuracy'], 
    'loss': losses.SparseCategoricalCrossentropy(from_logits=True),
    'restore_best_checkpoint':True,
    'use_savedmodel':True,
    'use_bias_initializer':True,
    'use_class_weight':False,
    'class_weight': [1.0, 1.0],
    'callbacks': {
        'ModelCheckpoint': {
            'enabled': True,
            'monitor': 'val_accuracy',               
        },
        'EarlyStopping': {
            'enabled': True,
            'patience': 40,   
            'monitor': 'val_accuracy',            
        },
        'ReduceLROnPlateau': {
            'enabled': True,
            'monitor': 'val_loss',
            'patience': 15,
            'factor': np.sqrt(0.1),            
        },
        'LearningRateScheduler': {
            'enabled': True,
            'schedule': lr_schedule,            
            
        }             
    },
    'model_params':{
        'mlp':{'dropout':0.6, 'layer_count':1, 'units':256, 'epochs':15},
        'rnn':{'dropout':0.4, 'embedding_dim':200, 'units':200, 'epochs':5},
        'embedding':{'dropout':0.4, 'embedding_dim':200, 'epochs':15},
        'sepcnn':{'dropout':0.4, 'epochs': 10, 'batch_size':128},
        'tl':{'dropout':0.4, 'trainable':True, 'layer_count':1, 'unit':64, 'epochs': 10},
        'pg':{'dropout':0.4, 'embedding_dim':300, 'units':80, 'epochs':20, 'learning_rate':0.0005},  
    },
    'embedding_paths':{
        'cc_en_300':os.path.join(word_vectors_path, 'snps', 'cc.en.300.vec'),
        'fasttext_crawl_300d_2M':os.path.join(word_vectors_path, 'fasttext-crawl-300d-2M.vec'),
        'glove_twitter_27B_200d':os.path.join(word_vectors_path, 'glove.twitter.27B.200d.txt')
    },
    'keras_layper_paths':{
    },
    'model_resutls':{
        'show_top_n':20,
        'show_exclude_columns':qtf.classification.ModelResults.exclude_columns1
    }
}

params = qtf.classification.Params(params)
model_results=qtf.classification.ProgramModelResults(params.program_path)
model_results.clear()

### 下载数据

开始下载数据。

In [4]:
dataset_dir =  os.path.join(os.path.expanduser('~'), '.keras/datasets/aclImdb') 
if not os.path.exists(dataset_dir):
    url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url, untar=True)
    dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
    print(dataset_dir)

![image-20201217095446441](images/image-20201217095446441.png)

下面是压缩文件解开后的目录结构。

![image-20201104115404253](images/image-20201104115404253.png)

其中train和test目录包含了实际的文本数据，详细说明见[IMDB](https://eipi10.cn/others/2020/10/22/dataset/#imdb---large-movie-review-dataset)。

### 查看数据

下面看一看实际的数据是啥样的。

In [5]:
def get_files(base_dir):
    pos_dir = os.path.join(base_dir, 'pos')
    pos_files = os.listdir(pos_dir)
    pos_files = [os.path.join(pos_dir, file_name) for file_name in pos_files]
    
    neg_dir = os.path.join(base_dir, 'neg')
    neg_files = os.listdir(neg_dir)
    neg_files = [os.path.join(neg_dir, file_name) for file_name in neg_files]    

    return pos_files, neg_files

train_pos_files, train_neg_files = get_files(os.path.join(dataset_dir, 'train'))
test_pos_files, test_files = get_files(os.path.join(dataset_dir, 'test'))
    
print('train dataset: {} positive reviews, {} negative reviews'.format(len(train_pos_files), len(train_neg_files)))
print('test dataset: {} positive reviews, {} negative reviews'.format(len(test_pos_files), len(test_files)))

train dataset: 12500 positive reviews, 12500 negative reviews
test dataset: 12500 positive reviews, 12500 negative reviews




### 基本信息

In [6]:
def get_samples(texts, labels, rate):
    indexs = np.arange(len(texts))
    sample_indexs, _, sample_labels, _ = train_test_split(indexs, labels,                                                          
                                                          test_size=1-rate,
                                                          random_state=12)
    sample_texts = [texts[index] for index in sample_indexs]
    return sample_texts, sample_labels    

def load_imdb_sentiment_analysis_dataset(imdb_data_path, seed=123):
    """Loads the IMDb movie reviews sentiment analysis dataset.

    # Arguments
        data_path: string, path to the data directory.
        seed: int, seed for randomizer.

    # Returns
        A tuple of training and validation data.
        Number of training samples: 25000
        Number of test samples: 25000
        Number of categories: 2 (0 - negative, 1 - positive)

    # References
        Mass et al., http://www.aclweb.org/anthology/P11-1015

        Download and uncompress archive from:
        http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    """

    # Load the training data
    train_texts = []
    train_labels = []
    for category in ['pos', 'neg']:
        train_path = os.path.join(imdb_data_path, 'train', category)
        for fname in sorted(os.listdir(train_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(train_path, fname)) as f:
                    train_texts.append(f.read())
                train_labels.append(0 if category == 'neg' else 1)

    # Load the validation data.
    test_texts = []
    test_labels = []
    for category in ['pos', 'neg']:
        test_path = os.path.join(imdb_data_path, 'test', category)
        for fname in sorted(os.listdir(test_path)):
            if fname.endswith('.txt'):
                with open(os.path.join(test_path, fname)) as f:
                    test_texts.append(f.read())
                test_labels.append(0 if category == 'neg' else 1)

    # Shuffle the training data and labels.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return ((train_texts, np.array(train_labels)),
            (test_texts, np.array(test_labels)))

(train_texts, train_labels), (test_texts, test_labels) = load_imdb_sentiment_analysis_dataset(dataset_dir)

if params.sample_perecent<1:
    train_texts, train_labels = get_samples(train_texts, train_labels, rate=params.sample_perecent)
    test_texts, test_labels = get_samples(test_texts, test_labels, rate=params.sample_perecent)

print('train_texts.length={}, train_labels.shape={}'.format(len(train_texts), train_labels.shape))
print('test_texts.length={}, test_labels.shape={}'.format(len(test_texts), test_labels.shape))

train_texts.length=25000, train_labels.shape=(25000,)
test_texts.length=25000, test_labels.shape=(25000,)


## TextVectorization

非常非常非常奇怪， 如果第24行如下内容，将会报错
~~~
                      ngrams=layer._ngrams, output_mode=layer._output_mode,
~~~

In [92]:
def save_layer(layer, layer_path):
    if layer_path.endswith('pkl'):
        obj = {'config': layer.get_config(),
               'weights': layer.get_weights()}
        qutils.pickle.ObjectPickle.save(layer_path, obj)
    else:
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Input(shape=(1,), dtype=tf.string))
        model.add(layer)
        model.save(layer_path, save_format="tf")


def load_layer(layer_path):
    if layer_path.endswith('pkl'):
        obj = qutils.pickle.ObjectPickle.load(layer_path)
        layer = tf.keras.layers.TextVectorization.from_config(obj['config'])
        layer.set_weights(obj['weights'])          
    else:
        model = tf.keras.models.load_model(layer_path)
        layer = model.layers[0]  
        
    if layer._output_mode == 'int':
        layer = get_tv(layer._standardize, layer._max_tokens,
                      ngrams=layer._ngrams, output_mode='int',
                      max_sequence_length=layer._output_sequence_length,
                      vocabulary=layer.get_vocabulary(include_special_tokens=False))    

    return layer


def get_ngram_layer(standardize, texts, max_features):
    ngrams = (1, 2)
    output_mode = 'tf-idf'
    max_sequence_length = None
    layer = get_tv(standardize, max_features,
                  ngrams=ngrams, output_mode=output_mode,
                  max_sequence_length=max_sequence_length,
                  vocabulary=None)
    text_dataset = tf.data.Dataset.from_tensor_slices(texts)
    layer.adapt(text_dataset)  
    return layer


def get_sequence_layer(standardize, texts, max_features, max_sequence_length):
    ngrams = None
    output_mode = 'int'
    layer = get_tv(standardize, max_features,
                  ngrams=ngrams, output_mode=output_mode,
                  max_sequence_length=max_sequence_length,
                  vocabulary=None)
    text_dataset = tf.data.Dataset.from_tensor_slices(texts)
    layer.adapt(text_dataset)    
    return layer

def get_tv(standardize, max_features, ngrams, output_mode, 
           max_sequence_length, vocabulary):
    layer = tf.keras.layers.TextVectorization(
        max_tokens=max_features, standardize=standardize,
        split='whitespace', ngrams=ngrams, output_mode=output_mode,
        output_sequence_length=max_sequence_length, pad_to_max_tokens=False,
        vocabulary=vocabulary
    )
    return layer

def get_dataset(layer, data, labels, use_shuffle=False, batch_size=None, drop_remainder=False, use_cache=True):
    def map_(text):
        text = tf.expand_dims(text, axis=-1)
        return layer(text)[0]
    
    
    if labels is None:
        dataset = tf.data.Dataset.from_tensor_slices((data,))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    if use_shuffle and labels is not None:
        dataset = dataset.shuffle(len(labels), reshuffle_each_iteration=True)

    if labels is None:
        dataset = dataset.map(lambda x: map_(x))
    else:
        dataset = dataset.map(lambda x, y: (map_(x), y))

#     if batch_size is not None:
#         dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    if use_cache:
        dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)  
    return dataset


def get_data_from_layer(layer):
    dataset = get_dataset(layer, train_texts[100:1000], labels=None, batch_size=32)
    data = iter(dataset).next()
    print(dataset)
    print(data.shape)    
    return data


def get_data_from_generator(generator):
    dataset = generator(train_texts[100:1000], labels=None, batch_size=32)
    data = iter(dataset).next()
    print(dataset)
    print(data.shape)    
    return data

def show_tv(tv):
    print('-'*20)
    attrs = ['_standardize', '_max_tokens', '_ngrams', '_output_sequence_length', '_output_mode']
    for attr in attrs:
        print(f'{attr}={getattr(tv, attr)}')         

def test(layer_or_generator):
    def test_layer(layer, layer_save_format):
        print("="*50)
        data1 = get_data_from_layer(layer)

        layer_path= './output/imdb_0721_p100/my' + layer_save_format
        save_layer(layer, layer_path)
        show_tv(layer)
        layer = load_layer(layer_path)
        show_tv(layer)

        data2 = get_data_from_layer(layer)
        np.testing.assert_allclose(data1, data2)
        
    def test_generator(generator, layer_save_format):
        print("="*50)
        generator.layer_save_format=layer_save_format
        data1 = get_data_from_generator(generator)
        generator_path = './output/imdb_0721_p100/my.generator'
        generator.before_pickle(generator_path)
        generator.after_pickle(generator_path)

        data2 = get_data_from_generator(generator)
        np.testing.assert_allclose(data1, data2)        
        
    # 传入qbz95中的Generator，或者本地的TextVectorization
    print('#'*20, f'test {type(layer_or_generator)}', '#'*20)
    if isinstance(layer_or_generator, tf.keras.layers.TextVectorization):
        
        layer = layer_or_generator
        try:
            test_layer(layer, '.pkl')
        except Exception as e :
            print(traceback.format_exc())

        try:            
            test_layer(layer, '.layer') 
        except Exception as e :
            print(traceback.format_exc())
    else:        
        generator = layer_or_generator
        
        try:
            test_generator(generator, '.pkl')
        except Exception as e :
            print(traceback.format_exc())
            
        try:
            test_generator(generator, '.layer')  
        except Exception as e :
            print(traceback.format_exc())      

### ngram

In [93]:
layer = get_ngram_layer(standardize=qtf.text.standardize_tf_text1, 
                        texts=train_texts[0:1000], 
                        max_features=params.max_features)

test(layer)

2021-12-06 05:49:27,819: INFO: save object to ./output/imdb_0721_p100/my.pkl


#################### test <class 'keras.layers.preprocessing.text_vectorization.TextVectorization'> ####################
<PrefetchDataset shapes: (20000,), types: tf.float32>
(20000,)
--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=(1, 2)
_output_sequence_length=None
_output_mode=tf_idf
--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=(1, 2)
_output_sequence_length=None
_output_mode=tf_idf
<PrefetchDataset shapes: (20000,), types: tf.float32>
(20000,)
<PrefetchDataset shapes: (20000,), types: tf.float32>
(20000,)




INFO:tensorflow:Assets written to: ./output/imdb_0721_p100/my.layer/assets


2021-12-06 05:49:28,999: INFO: Assets written to: ./output/imdb_0721_p100/my.layer/assets


--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=(1, 2)
_output_sequence_length=None
_output_mode=tf_idf




--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=(1, 2)
_output_sequence_length=None
_output_mode=tf_idf
<PrefetchDataset shapes: (20000,), types: tf.float32>
(20000,)


### sequence

In [94]:
layer = get_sequence_layer(standardize=qtf.text.standardize_tf_text1, 
                           texts=train_texts[0:1000], 
                           max_features=params.max_features,
                           max_sequence_length=params.sequence_length)
test(layer)

2021-12-06 05:49:31,682: INFO: save object to ./output/imdb_0721_p100/my.pkl


#################### test <class 'keras.layers.preprocessing.text_vectorization.TextVectorization'> ####################
<PrefetchDataset shapes: (500,), types: tf.int64>
(500,)
--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=None
_output_sequence_length=500
_output_mode=int
--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=None
_output_sequence_length=500
_output_mode=int
<PrefetchDataset shapes: (500,), types: tf.int64>
(500,)
<PrefetchDataset shapes: (500,), types: tf.int64>
(500,)




INFO:tensorflow:Assets written to: ./output/imdb_0721_p100/my.layer/assets


2021-12-06 05:49:32,714: INFO: Assets written to: ./output/imdb_0721_p100/my.layer/assets


--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=None
_output_sequence_length=500
_output_mode=int




--------------------
_standardize=<function standardize_tf_text1 at 0x7f8cb07d81f0>
_max_tokens=20000
_ngrams=None
_output_sequence_length=500
_output_mode=int
<PrefetchDataset shapes: (500,), types: tf.int64>
(500,)


## TextVectorization on qbz95

### ngram

In [99]:
generator = qtf.classification.LayerGenerator.get_ngram_layer(name='ngram_map', 
                                                              standardize=qtf.text.standardize_tf_text1, 
                                                              texts=train_texts[0:1000], 
                                                              max_features=params.max_features)
test(generator)

#################### test <class 'qbz95.tf.classification.layer_generator.LayerGenerator'> ####################


2021-12-06 06:01:53,173: INFO: save object to ./output/imdb_0721_p100/ngram_map.pkl
2021-12-06 06:01:53,174: INFO: loading layer from ./output/imdb_0721_p100/ngram_map.pkl


<PrefetchDataset shapes: (None, 20000), types: tf.float32>
(32, 20000)
<PrefetchDataset shapes: (None, 20000), types: tf.float32>
(32, 20000)
<PrefetchDataset shapes: (None, 20000), types: tf.float32>
(32, 20000)




INFO:tensorflow:Assets written to: ./output/imdb_0721_p100/ngram_map.layer/assets


2021-12-06 06:01:54,266: INFO: Assets written to: ./output/imdb_0721_p100/ngram_map.layer/assets
2021-12-06 06:01:54,302: INFO: loading layer from ./output/imdb_0721_p100/ngram_map.layer






<PrefetchDataset shapes: (None, 20000), types: tf.float32>
(32, 20000)


### sequence

In [103]:
generator = qtf.classification.LayerGenerator.get_sequence_layer(name='sequence_map', 
                                                                 standardize=qtf.text.standardize_tf_text1, 
                                                                 texts=train_texts[0:1000], 
                                                                 max_features=params.max_features,
                                                                 max_sequence_length=params.sequence_length)
test(generator)

2021-12-06 06:03:08,153: INFO: save object to ./output/imdb_0721_p100/sequence_map.pkl
2021-12-06 06:03:08,155: INFO: loading layer from ./output/imdb_0721_p100/sequence_map.pkl


#################### test <class 'qbz95.tf.classification.layer_generator.LayerGenerator'> ####################
<PrefetchDataset shapes: (None, 500), types: tf.int64>
(32, 500)
<PrefetchDataset shapes: (None, 500), types: tf.int64>
(32, 500)
<PrefetchDataset shapes: (None, 500), types: tf.int64>
(32, 500)




INFO:tensorflow:Assets written to: ./output/imdb_0721_p100/sequence_map.layer/assets


2021-12-06 06:03:11,760: INFO: Assets written to: ./output/imdb_0721_p100/sequence_map.layer/assets
2021-12-06 06:03:12,010: INFO: loading layer from ./output/imdb_0721_p100/sequence_map.layer






<PrefetchDataset shapes: (None, 500), types: tf.int64>
(32, 500)


## root reason

经过TextVectorization保存后，处理的text, 会失去padding。

In [91]:
max_features=100
vocab_data = ["earth", "wind", "and", "fire", "you", "good"]
max_len = 10  # Sequence length to pad the outputs to.

# Create the layer, passing the vocab directly. You can also pass the
# vocabulary arg a path to a file containing one vocabulary word per
# line.
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len)

text_dataset = tf.data.Dataset.from_tensor_slices(vocab_data)
vectorize_layer.adapt(text_dataset)

print(vectorize_layer.get_vocabulary())

print('-'*50)
dataset = tf.data.Dataset.from_tensor_slices(["how old are you", "good morning"])
data = next(iter(dataset))
print(vectorize_layer(data))

print('-'*50)
layer_path = "./tv.pkl"
save_layer(vectorize_layer, layer_path)
show_tv(vectorize_layer)

vectorize_layer = load_layer(layer_path)

show_tv(vectorize_layer)
print(vectorize_layer.get_vocabulary())
print(vectorize_layer._output_sequence_length) 

dataset = tf.data.Dataset.from_tensor_slices(["how old are you", "good morning"])
data = next(iter(dataset))
print(vectorize_layer.call(data)) 

2021-12-06 05:48:24,945: INFO: save object to ./tv.pkl


['', '[UNK]', 'you', 'wind', 'good', 'fire', 'earth', 'and']
--------------------------------------------------
tf.Tensor([1 1 1 2 0 0 0 0 0 0], shape=(10,), dtype=int64)
--------------------------------------------------
--------------------
_standardize=lower_and_strip_punctuation
_max_tokens=100
_ngrams=None
_output_sequence_length=10
_output_mode=int
--------------------
_standardize=lower_and_strip_punctuation
_max_tokens=100
_ngrams=None
_output_sequence_length=10
_output_mode=int
['', '[UNK]', 'you', 'wind', 'good', 'fire', 'earth', 'and']
10
tf.Tensor([1 1 1 2], shape=(4,), dtype=int64)


分析源码，估计可能TextVectorization恢复后，并没有恢复内部的_index_lookup_layer对象。解决办法是，重新构建TextVectorization对象。

~~~python
  def call(self, inputs):
    if isinstance(inputs, (list, tuple, np.ndarray)):
      inputs = ops.convert_to_tensor_v2_with_dispatch(inputs)

    inputs = self._preprocess(inputs)

    # If we're not doing any output processing, return right away.
    if self._output_mode is None:
      return inputs

    lookup_data = self._index_lookup_layer(inputs)
    if self._output_mode == INT:

      # Maybe trim the output (NOOP if self._output_sequence_length is None).
      output_tensor = lookup_data[..., :self._output_sequence_length]

      output_shape = output_tensor.shape.as_list()
      output_shape[-1] = self._output_sequence_length

      # If it is a ragged tensor, convert it to dense with correct shape.
      if tf_utils.is_ragged(output_tensor):
        return output_tensor.to_tensor(default_value=0, shape=output_shape)

      if self._output_sequence_length is None:
        return output_tensor

      padding, _ = array_ops.required_space_to_batch_paddings(
          output_tensor.shape, output_shape)
      return array_ops.pad(output_tensor, padding)

    return lookup_data
~~~

解决办法是，重新构建TextVectorization

In [51]:

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=max_len, 
    vocabulary=vectorize_layer.get_vocabulary(include_special_tokens=False))

dataset = tf.data.Dataset.from_tensor_slices(["how old are you", "good morning"])
data = next(iter(dataset))
print(vectorize_layer(data)) 

tf.Tensor([1 1 1 2 0 0 0 0 0 0], shape=(10,), dtype=int64)


In [14]:
layer = vectorize_layer

In [15]:
layer = get_tv(layer._standardize, layer._max_tokens,
              ngrams=layer._ngrams, output_mode=layer._output_mode,
              max_sequence_length=layer._output_sequence_length,
              vocabulary=layer.get_vocabulary(include_special_tokens=False))   