In [8]:
import tensorflow as tf
import numpy as np
import string
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout

# Hyperparameters
MAX_FEATURES = 20000  # enlarged vocabulary size (passed to imdb.load_data as num_words)
MAX_LEN = 300         # increased sequence length (every review is padded/truncated to this)
BATCH_SIZE = 64
EPOCHS = 5

# Load the IMDB reviews as integer sequences, capped at MAX_FEATURES word indices.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_FEATURES)

# Preprocessing: bring both splits to exactly MAX_LEN tokens. Padding and
# truncation both act on the END of a review, so the start of each review is kept.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        split, maxlen=MAX_LEN, padding='post', truncating='post'
    )
    for split in (X_train, X_test)
)

# Build a robust model: embedding -> bidirectional LSTM -> max over time
# -> small dense head with dropout -> single sigmoid output.
model = tf.keras.Sequential()
# Index 0 is the padding value; mask_zero=True marks it as masked for downstream layers.
model.add(tf.keras.layers.Embedding(MAX_FEATURES+1, 128, mask_zero=True))
# return_sequences=True keeps one output per timestep so the pooling layer can reduce over time.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.GlobalMaxPool1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary classification set-up: cross-entropy loss, Adam at 1e-3, accuracy metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train the model. The last 20% of the training arrays is held out for
# validation (validation_split=0.2); the returned History object is kept in
# `history`.
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    # Errors on class 0 are weighted 1.5 and errors on class 1 are weighted 0.7.
    # Original author's note: this assumes negative samples are more numerous.
    # NOTE(review): nothing in this file checks that assumption — confirm with
    # np.bincount(y_train) before keeping these weights.
    class_weight={0: 1.5, 1: 0.7}
)

# Prediction-time text preprocessing
class TextPreprocessor:
    """Encode raw review text the same way ``imdb.load_data`` encodes reviews.

    With its default arguments ``imdb.load_data`` reserves index 0 for
    padding, prepends index 1 (start marker) to every review, replaces
    out-of-vocabulary words with index 2, and shifts every word rank from
    ``imdb.get_word_index()`` by 3. When ``num_words`` is given, only shifted
    indices strictly below ``num_words`` are kept; the rest become index 2.
    This class reproduces that encoding so inference inputs match training.
    """

    START_INDEX = 1    # marker placed at the beginning of every sequence
    OOV_INDEX = 2      # stands in for unknown / out-of-vocabulary words
    INDEX_OFFSET = 3   # shift applied to raw word ranks

    def __init__(self, max_features):
        """Load the IMDB word index and build the punctuation stripper.

        Args:
            max_features: Vocabulary cap; must equal the ``num_words`` value
                used when the training data was loaded.
        """
        self.word_index = imdb.get_word_index()
        self.max_features = max_features
        # NOTE(review): this strips every character in string.punctuation,
        # apostrophes included ("I've" -> "ive"); confirm that matches how
        # the IMDB word index was tokenized.
        self.translator = str.maketrans('', '', string.punctuation)

    def preprocess(self, text):
        """Convert ``text`` into a list of integer word indices.

        Args:
            text: Raw review text.

        Returns:
            A list of ints starting with ``START_INDEX``, followed by one
            index per whitespace-separated word. Words missing from the word
            index, or whose shifted index is ``>= max_features``, are encoded
            as ``OOV_INDEX``.
        """
        # Clean the text: lowercase, then drop punctuation.
        cleaned = text.lower().translate(self.translator)

        sequence = [self.START_INDEX]
        for word in cleaned.split():
            rank = self.word_index.get(word)
            if rank is None:
                # Unknown word: use the OOV index directly. (Adding the offset
                # to a default of 0 would yield index 3, which is not OOV.)
                index = self.OOV_INDEX
            else:
                index = rank + self.INDEX_OFFSET
                # Same cut-off as load_data(num_words=...): only indices
                # strictly below max_features are in the vocabulary.
                if index >= self.max_features:
                    index = self.OOV_INDEX
            sequence.append(index)
        return sequence

# Wrap preprocessing in a class instance: a single module-level preprocessor,
# built with the same MAX_FEATURES cap that was passed to imdb.load_data, and
# used by predict_sentiment below.
preprocessor = TextPreprocessor(MAX_FEATURES)

def predict_sentiment(text, threshold=0.45):
    """Classify a raw review string as "Positive" or "Negative".

    Args:
        text: Raw review text.
        threshold: Probability above which the review is labelled
            "Positive". Defaults to 0.45, the adjusted threshold this
            script has always used.

    Returns:
        "Positive" if the model's sigmoid output exceeds ``threshold``,
        otherwise "Negative".
    """
    sequence = preprocessor.preprocess(text)
    # Pad/truncate with the same settings as the training data. The helper is
    # referenced through `tf` because a bare `pad_sequences` is never imported
    # in this file.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [sequence], maxlen=MAX_LEN, padding='post', truncating='post'
    )
    # predict returns shape (1, 1) for a single padded review; take the scalar.
    proba = model.predict(padded)[0][0]
    return "Positive" if proba > threshold else "Negative"

# Test samples: three hand-written reviews used as a quick smoke test of
# predict_sentiment.
samples = [
    "This is the best movie I've ever seen! The acting was phenomenal.",
    "Terrible waste of time. Worst cinematography in recent years.",
    "A mediocre film with some good moments but overall forgettable."
]

# Print the first 30 characters of each sample (always followed by "...")
# together with its predicted label.
for text in samples:
    print(f"'{text[:30]}...' → {predict_sentiment(text)}")

Epoch 1/5




313/313 ━━━━━━━━━━━━━━━━━━━━ 151s 458ms/step - accuracy: 0.6265 - loss: 0.5610 - val_accuracy: 0.8666 - val_loss: 0.3353
Epoch 2/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 157s 501ms/step - accuracy: 0.9139 - loss: 0.2210 - val_accuracy: 0.8722 - val_loss: 0.3136
Epoch 3/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 151s 483ms/step - accuracy: 0.9573 - loss: 0.1217 - val_accuracy: 0.8832 - val_loss: 0.3123
Epoch 4/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 155s 496ms/step - accuracy: 0.9759 - loss: 0.0739 - val_accuracy: 0.8806 - val_loss: 0.3868
Epoch 5/5
313/313 ━━━━━━━━━━━━━━━━━━━━ 151s 483ms/step - accuracy: 0.9859 - loss: 0.0443 - val_accuracy: 0.8802 - val_loss: 0.4667
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 673ms/step
'This is the best movie I've ev...' → Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64m