In [4]:
import tensorflow as tf
import numpy as np
import jieba
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 80
pd.options.display.precision = 4
pd.options.display.max_rows = 999
pd.options.display.float_format = '{:.4f}'.format  # 防止科学计数法，小数显示4位


physical_devices = tf.config.list_physical_devices('GPU') 
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [6]:
jieba.enable_paddle()

Paddle enabled successfully......


In [7]:
from zh_dataset_inews import title_train, label_train, content_train, title_test, label_test, content_test

In [8]:
for x, y in zip(title_train[:10], label_train[:10]):
    print(x, y)

周六晚到卖场听夜场摇滚 1
北京老教授泄露，持有山河药辅节后下跌公告，速速看看！！！ 1
张滩镇积极开展基干民兵训练活动 0
俩小伙无证骑摩托，未成年还试图闯卡！ 2
不好意思，你不配做深圳人!_搜狐汽车_搜狐网 2
蔡英文元旦升旗遇抗议 民众：枪毙蔡英文 2
巢湖市绞吸机械清淤公司重在回访-照明器材项目合作–光波网 1
出租屋半年被偷8次：整栋楼共用一个锁芯 2
从林芝到拉萨，还可以这样玩! 1
为何说奇瑞是技术达人? 看了“雄狮”你就懂了 1


In [9]:
title_train_cut = [' '.join(jieba.cut(x, cut_all=False)) for x in title_train]
title_test_cut  = [' '.join(jieba.cut(x, cut_all=False)) for x in title_test]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/w2/n3x0x93n5klgjv3j05dp2wqc0000gp/T/jieba.cache
Loading model cost 2.509 seconds.
Prefix dict has been built successfully.


In [10]:
title_train_cut[:10]

['周六 晚到 卖场 听 夜场 摇滚',
 '北京 老 教授 泄露 ， 持有 山河 药辅 节后 下跌 公告 ， 速速 看看 ！ ！ ！',
 '张滩 镇 积极开展 基干民兵 训练 活动',
 '俩 小伙 无证 骑 摩托 ， 未成年 还 试图 闯卡 ！',
 '不好意思 ， 你 不配 做 深圳 人 ! _ 搜狐 汽车 _ 搜狐网',
 '蔡 英文 元旦 升旗 遇 抗议   民众 ： 枪毙 蔡 英文',
 '巢湖市 绞吸 机械 清淤 公司 重在 回访 - 照明 器材 项目 合作 – 光波 网',
 '出租屋 半年 被 偷 8 次 ： 整栋 楼 共用 一个 锁 芯',
 '从 林芝 到 拉萨 ， 还 可以 这样 玩 !',
 '为何 说 奇瑞 是 技术 达 人 ?   看 了 “ 雄狮 ” 你 就 懂 了']

In [11]:
text_vector = tf.keras.layers.experimental.preprocessing.TextVectorization()
# 学习词表
text_vector.adapt(title_train_cut)

In [12]:
title_train_text_vector = text_vector(title_train_cut) # [text_vector(x) for x in title_train_cut]
title_test_text_vector  = text_vector(title_test_cut) # [text_vector(x) for x in title_test_cut]


In [21]:
x_train = tf.convert_to_tensor(title_train_text_vector)
x_test  = tf.convert_to_tensor(title_test_text_vector)

In [50]:
y_train = tf.convert_to_tensor(label_train)
y_test  = tf.convert_to_tensor(label_test)

In [13]:
text_vector('周六 晚到 卖场 听 夜场 摇滚')

<tf.Tensor: shape=(6,), dtype=int64, numpy=array([ 6151, 12058, 15578,  1710, 14533,  5529])>

In [14]:
vocab_size = len(text_vector.get_vocabulary())
embedding_dim = 128

In [44]:
class CustomLSTM(tf.keras.layers.Layer):
    
    """
    LSTM's input: [batch_size, sequence_length, input_size]
    LSTM's output1: [batch_size, sequence_length, input_size]
           output2: [batch_size, input_size]
    """
    
    def __init__(self, output_size, return_sequence=False):
        super(CustomLSTM, self).__init__()
        self.output_size = output_size
        self.return_sequence = return_sequence
    
    def build(self, input_shape):
        super(CustomLSTM, self).build(input_shape)
        input_size = int(input_shape[-1])
        
        self.wf = self.add_weight('wf', shape=(input_size, self.output_size))
        self.wi = self.add_weight('wi', shape=(input_size, self.output_size))
        self.wo = self.add_weight('wo', shape=(input_size, self.output_size))
        self.wc = self.add_weight('wc', shape=(input_size, self.output_size))

        self.uf = self.add_weight('uf', shape=(self.output_size, self.output_size))
        self.ui = self.add_weight('ui', shape=(self.output_size, self.output_size))
        self.uo = self.add_weight('uo', shape=(self.output_size, self.output_size))
        self.uc = self.add_weight('uc', shape=(self.output_size, self.output_size))

        self.bf = self.add_weight('bf', shape=(1, self.output_size))
        self.bi = self.add_weight('bi', shape=(1, self.output_size))
        self.bo = self.add_weight('bo', shape=(1, self.output_size))
        self.bc = self.add_weight('bc', shape=(1, self.output_size))

    def call(self, x):
        sequence_outputs = []
        for i in range(sequence_length):
            if i == 0:
                xt  = x[:, 0, :]
                ft  = tf.sigmoid(tf.matmul(xt, self.wf) + self.bf)
                it  = tf.sigmoid(tf.matmul(xt, self.wi) + self.bi)
                ot  = tf.sigmoid(tf.matmul(xt, self.wo) + self.bo)
                cht = tf.tanh(   tf.matmul(xt, self.wc) + self.bc)
                ct  = it * cht
                ht  = ot * tf.tanh(ct)

            else:
                xt  = x[:, 0, :]
                ft  = tf.sigmoid(tf.matmul(xt, self.wf) + self.bf)
                it  = tf.sigmoid(tf.matmul(xt, self.wi) + self.bi)
                ot  = tf.sigmoid(tf.matmul(xt, self.wo) + self.bo)
                cht = tf.tanh(  tf.matmul(xt, self.wc) + self.bc)
                ct  = ft * ct + it * cht
                ht  = ot * tf.tanh(ct)
                
            sequence_outputs.append(ht)
            
        sequence_outputs = tf.stack(sequence_outputs)
        sequence_outputs = tf.transpose(sequence_outputs, (1, 0, 2))
        if self.return_sequence:
            return sequence_outputs
        return sequence_outputs[:, -1, :]

In [140]:
# 定义一个分词层 

class CustomTextVector(tf.keras.layers.Layer):
    
    """
    输入文本数据，分词，然后转成Embedding层的输入 
    """
    
    def __init__(self, output_size, vocab_size=None, return_sequence=False):
        super(CustomTextVector, self).__init__()
        self.output_size = output_size
        self.vocab_size = vocab_size
        self.text_vector = tf.keras.layers.experimental.preprocessing.TextVectorization()
    
    def build(self, input_shape):
        super(CustomTextVector, self).build(input_shape)

    def call(self, x):
        text_cut = []
        x_array = np.array(x)
        for i in range(len(x)):
            if x[i]:
                text_cut.append(' '.join(jieba.cut(x_array[i].decode(), cut_all=False)))
            else:
                text_cut.append('<unk>') 
        self.text_vector.adapt(text_cut)
        if self.vocab_size:
            pass
        else:
            self.vocab_size = len(text_vector.get_vocabulary())
        
        vector_out = tf.convert_to_tensor(self.text_vector(text_cut))
        return vector_out

In [141]:
model_direct_text = tf.keras.Sequential([
    CustomTextVector(128),
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    CustomLSTM(output_size=32), 
    tf.keras.layers.Dense(3, activation='softmax')
])

model_direct_text.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(), 
    optimizer = tf.keras.optimizers.Adam(),
    metrics=['accuracy']
)

In [142]:
test_embedding_layer = tf.keras.layers.Embedding(input_dim = 1000, output_dim = 32)

In [143]:
test_text_vector_layer = CustomTextVector(128)


In [144]:
test_text_vector_out = test_text_vector_layer(title_train_tensor)

In [111]:
len(title_train_tensor)

5355

In [134]:
for x in title_train_tensor[:10]:
#     print(x)
    print(np.array(x).item().decode())

周六晚到卖场听夜场摇滚
北京老教授泄露，持有山河药辅节后下跌公告，速速看看！！！
张滩镇积极开展基干民兵训练活动
俩小伙无证骑摩托，未成年还试图闯卡！
不好意思，你不配做深圳人!_搜狐汽车_搜狐网
蔡英文元旦升旗遇抗议 民众：枪毙蔡英文
巢湖市绞吸机械清淤公司重在回访-照明器材项目合作–光波网
出租屋半年被偷8次：整栋楼共用一个锁芯
从林芝到拉萨，还可以这样玩!
为何说奇瑞是技术达人? 看了“雄狮”你就懂了


In [119]:
y = np.array(x)

In [128]:
y.item().decode()

'为何说奇瑞是技术达人? 看了“雄狮”你就懂了'

In [73]:
test_text_vector_out.shape

TensorShape([5355, 44])

In [74]:
test_text_vector_out[:10, :10]

<tf.Tensor: shape=(10, 10), dtype=int64, numpy=
array([[ 6151, 12058, 15578,  1710, 14533,  5529,     0,     0,     0,
            0],
       [  133,   247,  1996,   818,     2,  5584, 13748,  8950,  2400,
         2940],
       [13337,   224,  4879, 14692,  4608,    47,     0,     0,     0,
            0],
       [ 2179,   388,   287,   605,   834,     2,  5388,    35,  4597,
         7541],
       [ 6724,     2,    18,  6705,   116,   301,    10,    77,   357,
          115],
       [  892,  1008,  2172,  2827,   519,   838,  1957,     9,  2564,
          892],
       [13705,  9496,  5380,  5154,    64,  7766, 14885,  5087, 14940,
          213],
       [ 6375,  1099,     7,   343,   138,   438,     9,  1995,   822,
        16204],
       [  175,  3512,    48,  2620,     2,    35,   184,   194,   376,
            0],
       [  268,   121, 14326,    21,   498,   663,    10,    72,     8,
            4]])>

In [57]:
title_train_text_vector[:10, :10]

<tf.Tensor: shape=(10, 10), dtype=int64, numpy=
array([[ 6151, 12058, 15578,  1710, 14533,  5529,     0,     0,     0,
            0],
       [  133,   247,  1996,   818,     2,  5584, 13748,  8950,  2400,
         2940],
       [13337,   224,  4879, 14692,  4608,    47,     0,     0,     0,
            0],
       [ 2179,   388,   287,   605,   834,     2,  5388,    35,  4597,
         7541],
       [ 6724,     2,    18,  6705,   116,   301,    10,    77,   357,
          115],
       [  892,  1008,  2172,  2827,   519,   838,  1957,     9,  2564,
          892],
       [13705,  9496,  5380,  5154,    64,  7766, 14885,  5087, 14940,
          213],
       [ 6375,  1099,     7,   343,   138,   438,     9,  1995,   822,
        16204],
       [  175,  3512,    48,  2620,     2,    35,   184,   194,   376,
            0],
       [  268,   121, 14326,    21,   498,   663,    10,    72,     8,
            4]])>

In [58]:
title_train_tensor = tf.convert_to_tensor(title_train)

In [145]:
history_model_direct = model_direct_text.fit(
    title_train_tensor, y_train, 
#     validation_split=0.1,
    epochs=20
)

Epoch 1/20


NotImplementedError: in user code:

    /Users/runrun.wei/anaconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    <ipython-input-140-e6aca4602a09>:20 call  *
        x_array = np.array(x)
    /Users/runrun.wei/anaconda3/envs/tf/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:848 __array__  **
        " a NumPy call, which is not supported".format(self.name))

    NotImplementedError: Cannot convert a symbolic Tensor (ExpandDims:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported


In [23]:
x_train.shape

TensorShape([5355, 44])

In [34]:
max(list(len(x.split(' ')) for x in title_train_cut))
# max_word_cnt = 1
# for x_title in title_train_cut:
#     if len(x.split(' ')) > max_word_cnt):
#         max_word_cnt = len(x.split(' ')

46

In [28]:
len(title_train)

5355

In [24]:
embdding_out.shape

TensorShape([5355, 44, 32])

In [65]:
x1 = tf.convert_to_tensor(['a', 'b'])

In [87]:
np.array(x1)[0].decode()

'a'