In [15]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras import models,layers,preprocessing,optimizers,losses,metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re,string

# Locations of the IMDB train/test files. Despite the .csv extension, each line
# is parsed by split_line as tab-separated fields: label first, review text second.
train_data_path = "./data/imdb/train.csv"
test_data_path =  "./data/imdb/test.csv"

MAX_WORDS = 10000  # Only the 10000 most frequent words are considered
MAX_LEN = 200  # Each sample is kept at a length of 200 words
BATCH_SIZE = 20  # Number of samples per batch in the tf.data pipelines


# Build the input pipeline
def split_line(line):
    """Parse one raw text line into a ``(text, label)`` pair.

    The line is split on tab characters. The first field is converted to a
    number and cast to ``tf.int32`` to form the label; the second field is
    used as the text. Both results get a new leading axis of size 1.

    Args:
        line: a scalar string tensor holding one line of the input file.

    Returns:
        A tuple ``(text, label)`` of tensors, each with a leading axis of 1.
    """
    fields = tf.strings.split(line, "\t")

    # Label: string -> number -> int32, then add the leading axis.
    label_number = tf.strings.to_number(fields[0])
    label = tf.expand_dims(tf.cast(label_number, tf.int32), axis=0)

    # Text: the second tab-separated field, with the same leading axis.
    text = tf.expand_dims(fields[1], axis=0)

    return text, label

# Training pipeline: read lines, parse them with split_line, shuffle through a
# 1000-element buffer, group into batches of BATCH_SIZE and prefetch.
ds_train_raw = (
    tf.data.TextLineDataset(filenames=[train_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(buffer_size=1000)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test pipeline: same parsing, batching and prefetching, without a shuffle step.
ds_test_raw = (
    tf.data.TextLineDataset(filenames=[test_data_path])
    .map(split_line, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)


# Build the vocabulary
def clean_text(text):
    """Standardize review text before tokenization.

    Applies three steps in order: lowercase the text, replace every
    ``<br />`` tag with a single space, and delete every character that
    appears in ``string.punctuation``.

    Args:
        text: a string tensor of raw text.

    Returns:
        A string tensor with the cleaned text.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')

# Text-to-integer layer: clean_text standardization, whitespace tokenization,
# integer output with a fixed sequence length of MAX_LEN.
vectorize_layer = TextVectorization(
    max_tokens=MAX_WORDS - 1,  # one slot is left for the placeholder
    standardize=clean_text,
    split='whitespace',
    output_mode='int',
    output_sequence_length=MAX_LEN,
)

# Learn the vocabulary from the training texts only (labels are dropped).
ds_text = ds_train_raw.map(lambda text, _label: text)
vectorize_layer.adapt(ds_text)

# Show the first 100 vocabulary entries as a sanity check.
print(vectorize_layer.get_vocabulary()[:100])


# Encode the words: run each text batch through vectorize_layer, keep labels as-is.
ds_train = (
    ds_train_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

ds_test = (
    ds_test_raw
    .map(lambda text, label: (vectorize_layer(text), label))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

[b'the', b'and', b'a', b'of', b'to', b'is', b'in', b'it', b'i', b'this', b'that', b'was', b'as', b'for', b'with', b'movie', b'but', b'film', b'on', b'not', b'you', b'his', b'are', b'have', b'be', b'he', b'one', b'its', b'at', b'all', b'by', b'an', b'they', b'from', b'who', b'so', b'like', b'her', b'just', b'or', b'about', b'has', b'if', b'out', b'some', b'there', b'what', b'good', b'more', b'when', b'very', b'she', b'even', b'my', b'no', b'would', b'up', b'time', b'only', b'which', b'story', b'really', b'their', b'were', b'had', b'see', b'can', b'me', b'than', b'we', b'much', b'well', b'get', b'been', b'will', b'into', b'people', b'also', b'other', b'do', b'bad', b'because', b'great', b'first', b'how', b'him', b'most', b'dont', b'made', b'then', b'them', b'films', b'movies', b'way', b'make', b'could', b'too', b'any', b'after', b'characters']


In [16]:
# Sequential 1-D convolutional classifier: embedding -> two Conv1D/MaxPool1D
# stages -> flatten -> single sigmoid output unit.
model = models.Sequential([
    layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN),
    layers.Conv1D(16, kernel_size=5, activation="relu"),
    layers.MaxPool1D(),
    layers.Conv1D(128, kernel_size=2, activation="relu"),
    layers.MaxPool1D(),
    layers.Flatten(),
    layers.Dense(1, activation='sigmoid'),
])

In [17]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 7)            70000     
_________________________________________________________________
conv1d (Conv1D)              (None, 196, 16)           576       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 98, 16)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 97, 128)           4224      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 48, 128)           0         
_________________________________________________________________
flatten (Flatten)            (None, 6144)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 6

In [18]:
# Optimizer and loss later passed to model.compile; both are created with
# their default arguments.
optimizer = optimizers.Nadam()
# Binary cross-entropy, paired with the model's single sigmoid output unit.
loss_func = losses.BinaryCrossentropy()

In [20]:
# Import the deep learning libraries
import tensorflow as tf
from tensorflow.keras import optimizers,losses,metrics,callbacks
from tensorflow.keras.utils import to_categorical

# This is a binary classification problem, so pull in the corresponding
# loss functions and evaluation metrics

from tensorflow.keras.metrics import Accuracy,Precision,Recall,AUC

# Instantiate the evaluation metrics (all with default arguments)

# NOTE(review): Accuracy checks predictions for exact equality with the labels;
# for sigmoid probability outputs BinaryAccuracy is presumably what is wanted —
# confirm before passing `acc` to model.compile.
acc=Accuracy()
auc=AUC()
precision=Precision()
recall=Recall()

from tensorflow.keras.optimizers import Adam,Adagrad,Adadelta,Nadam,RMSprop,SGD

# Instantiate one optimizer of each imported kind, each with its default
# hyperparameters.
adam=Adam()
adagrad=Adagrad()
adadelta=Adadelta()
nadam=Nadam()
rms=RMSprop()
sgd=SGD()


from tensorflow.keras.losses import BinaryCrossentropy,Hinge

# Instantiate the loss functions (default arguments)

bc=BinaryCrossentropy()
hinge=Hinge()


from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

# Instantiate the callbacks

# Halve the learning rate (factor = 0.5) when the monitored "accuracy" metric
# has not improved for 100 epochs.
# NOTE(review): the original comments described both callbacks as watching the
# loss, but the code monitors "accuracy" — confirm which one was intended.
lr_callback = ReduceLROnPlateau(monitor="accuracy",factor = 0.5, patience = 100)
# Stop training early when "accuracy" has not improved for 200 epochs.
stop_callback = EarlyStopping(monitor = "accuracy", patience= 200)
# Both callbacks bundled in one list, ready to pass as model.fit(callbacks=...).
callbacks_list = [lr_callback,stop_callback]


from tensorflow.keras.layers import Input,Dense,Conv2D,MaxPool2D,Dropout,Flatten
from tensorflow.keras.models import Sequential,Model



In [24]:
# Compile with the previously created optimizer and loss, tracking AUC,
# accuracy and recall during training and validation.
model.compile(
    loss=loss_func,
    optimizer=optimizer,
    metrics=['AUC','accuracy','Recall'],
)

# ds_train and ds_test are tf.data datasets that were already batched with
# .batch(BATCH_SIZE) when the pipelines were built. Keras documents that
# `batch_size` must not be passed for dataset inputs: depending on the
# TensorFlow version it is either ignored or raises a ValueError. The former
# `batch_size=64` argument is therefore dropped; change BATCH_SIZE to alter
# the batch size.
model.fit(
    ds_train,
    validation_data=ds_test,
    epochs=4,
    callbacks=[lr_callback, stop_callback],
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x17c52518828>