# 1.Setup

In [3]:
import config
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
from tensorflow.keras import layers

# 2.数据加载、探索

## 2.1 数据加载

In [1]:
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz
!ls aclImdb
!ls aclImdb/test
!ls aclImdb/train
!cat aclImdb/train/pos/6248_7.txt
!rm -r aclImdb/train/unsup

README     imdb.vocab imdbEr.txt test       train
labeledBow.feat neg             pos             urls_neg.txt    urls_pos.txt
labeledBow.feat pos             unsupBow.feat   urls_pos.txt
neg             unsup           urls_neg.txt    urls_unsup.txt
Being an Austrian myself this has been a straight knock in my face. Fortunately I don't live nowhere near the place where this movie takes place but unfortunately it portrays everything that the rest of Austria hates about Viennese people (or people close to that region). And it is very easy to read that this is exactly the directors intention: to let your head sink into your hands and say "Oh my god, how can THAT be possible!". No, not with me, the (in my opinion) totally exaggerated uncensored swinger club scene is not necessary, I watch porn, sure, but in this context I was rather disgusted than put in the right context.<br /><br />T

In [5]:
# Batch size shared by all three splits.
batch_size = 32

# Both the training and the validation subset are read from the "train" folder,
# using the same validation_split and seed; the test split has its own folder.
train_dir = os.path.join(config.data_dir, "train")
test_dir = os.path.join(config.data_dir, "test")
split_options = dict(batch_size = batch_size, validation_split = 0.2, seed = 1337)

load_text_dataset = tf.keras.preprocessing.text_dataset_from_directory
raw_train_ds = load_text_dataset(train_dir, subset = "training", **split_options)
raw_val_ds = load_text_dataset(train_dir, subset = "validation", **split_options)
raw_test_ds = load_text_dataset(test_dir, batch_size = batch_size)

# Report how many batches each split contains.
count_batches = tf.data.experimental.cardinality
print(f"Number of batches in raw_train_ds: {count_batches(raw_train_ds)}")
print(f"Number of batches in raw_val_ds: {count_batches(raw_val_ds)}")
print(f"Number of batches in raw_test_ds: {count_batches(raw_test_ds)}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782


## 2.2 数据探索

In [8]:
# Show the first three reviews of one training batch together with their labels.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor to numpy once instead of on every print.
    texts, labels = text_batch.numpy(), label_batch.numpy()
    for i in range(3):
        print(f"text_batch.numpy()[i]=\n{texts[i]}")
        print(f"label_batch.numpy()[i]=\n{labels[i]}")

text_batch.numpy()[i]=
b'I rented this horrible movie. The worst think I have ever seen. I believe a 1st grade class could have done a better job. The worse film I have ever seen and I have seen some bad ones. Nothing scary except I paid 1.50 to rent it and that was 1.49 too much. The acting is horrible, the characters are worse and the film is just a piece of trash. The slauther house scenes are so low budget that it makes a B movied look like an Oscar candidate. All I can say is if you wnat to waste a good evening and a little money go rent this horrible flick. I would rather watch killer clowns from outer space while sitting in a bucket of razors than sit through this flop again'
label_batch.numpy()[i]=
0
text_batch.numpy()[i]=
b"I spent almost two hours watching a movie that I thought, with all the good actors in it, would be worth watching. I couldn't believe it when the movie ended and I had absolutely no idea what had happened.....I was mad because I could have used that time do

## 2.3 数据准备

In [14]:
def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Three steps are applied in order:
      1. lowercase the text,
      2. replace every "<br />" tag with a single space,
      3. delete every character listed in ``string.punctuation``.

    Args:
        input_data: string tensor to clean.

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_pattern, "")

In [16]:
# Model constants.
max_features = 20000     # vocabulary cap, passed to TextVectorization as max_tokens
embedding_dim = 128      # embedding width
sequence_length = 500    # fixed length of every vectorized sequence

# Turns raw strings into fixed-length sequences of integer token ids,
# using custom_standardization for the text clean-up step.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# The vocabulary is learned from training texts only, so strip the labels first.
text_ds = raw_train_ds.map(lambda text, label: text)

# Build the vocabulary.
vectorize_layer.adapt(text_ds)

## 2.4 文本向量化

In [17]:
def vectorize_text(text, label):
    """Convert a batch of raw strings into token ids, passing the label through.

    Args:
        text: string tensor of reviews.
        label: labels belonging to ``text``; returned unchanged.

    Returns:
        A ``(token_ids, label)`` tuple.
    """
    # Add a trailing axis before handing the strings to vectorize_layer.
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

def _vectorize_and_buffer(dataset):
    """Vectorize a raw dataset, then cache it and prefetch with a buffer of 10."""
    return dataset.map(vectorize_text).cache().prefetch(buffer_size = 10)

# Vectorize every split, with caching and async prefetching / buffering.
train_ds = _vectorize_and_buffer(raw_train_ds)
val_ds = _vectorize_and_buffer(raw_val_ds)
test_ds = _vectorize_and_buffer(raw_test_ds)

# 3.模型构建

## 3.1 模型构建

In [18]:
# Variable-length sequences of int64 token ids.
inputs = tf.keras.Input(shape = (None,), dtype = "int64")

# Hidden stack, listed in the order it is applied:
# embedding -> dropout -> two strided Conv1D layers -> global max pooling -> dense -> dropout.
hidden_stack = [
    layers.Embedding(max_features, embedding_dim),
    layers.Dropout(0.5),
    layers.Conv1D(128, 7, padding = "valid", activation = "relu", strides = 3),
    layers.Conv1D(128, 7, padding = "valid", activation = "relu", strides = 3),
    layers.GlobalMaxPooling1D(),
    layers.Dense(128, activation = "relu"),
    layers.Dropout(0.5),
]
x = inputs
for hidden_layer in hidden_stack:
    x = hidden_layer(x)

# Single sigmoid unit for the binary classifier output.
predictions = layers.Dense(1, activation = "sigmoid", name = "predictions")(x)
model = tf.keras.Model(inputs, predictions)

## 3.2 模型编译

In [19]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as the metric.
model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

## 3.3 模型训练

In [20]:
# Number of passes over the training data.
epochs = 3

# Train on the vectorized training split and validate on the held-out split after each epoch.
model.fit(
    train_ds,
    epochs = epochs,
    validation_data = val_ds,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1550548e0>

## 3.4 模型评估

In [21]:
# Evaluate on the vectorized test split; the displayed result is [loss, accuracy].
model.evaluate(test_ds)



[0.427167147397995, 0.8528800010681152]

# 4.构建端到端模型

In [22]:
# Chain the vectorization layer and the trained classifier so the combined
# model accepts raw strings directly.
inputs = tf.keras.Input(shape = (1,), dtype = "string")
indices = vectorize_layer(inputs)
outputs = model(indices)
end_to_end_model = tf.keras.Model(inputs, outputs)

end_to_end_model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

# Evaluate on the raw (un-vectorized) test split.
end_to_end_model.evaluate(raw_test_ds)



[0.4271673858165741, 0.8528800010681152]