In [1]:
from keras.datasets import imdb
import torch.nn as nn
import torch
from torch.nn.utils.rnn import pad_sequence
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_data():
    # 加载 IMDB 数据集
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=200)

    # 查看数据集大小
    print(f'训练集大小: {len(train_data)}')
    print(f'测试集大小: {len(test_data)}')
    print("train labels shape: ", train_labels.shape)
    print("test labels shape: ", test_labels.shape)

    # 填充数据
    max_len = 50
    train_data = pad_sequence([torch.tensor(seq)[:max_len] for seq in train_data], batch_first=True)
    test_data = pad_sequence([torch.tensor(seq)[:max_len] for seq in test_data], batch_first=True)
    print("train data shape: ", train_data.shape)
    print("test data shape: ", test_data.shape)

    # 转换为 pytorch tensor
    train_labels = torch.tensor(train_labels, dtype=torch.float32)
    test_labels = torch.tensor(test_labels, dtype=torch.float32)
    train_data = torch.tensor(train_data, dtype=torch.long)
    test_data = torch.tensor(test_data, dtype=torch.long)
    return train_data, train_labels, test_data, test_labels

In [3]:
train_data, train_labels, test_data, test_labels = load_data()

训练集大小: 25000
测试集大小: 25000
train labels shape:  (25000,)
test labels shape:  (25000,)
train data shape:  torch.Size([25000, 50])
test data shape:  torch.Size([25000, 50])


In [4]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.dropout(lstm_out[:, -1, :])  # 取最后一层输出
        out = self.fc(out)
        return out


In [5]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)
        rnn_out, _ = self.rnn(x)
        out = self.dropout(rnn_out[:, -1, :])  # 取最后一层输出
        out = self.fc(out)
        return out

In [6]:
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, n_heads, num_layers):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(dim_feedforward=embedding_dim, d_model=embedding_dim, nhead=n_heads),
            num_layers=num_layers
        )
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)  # 转换为 (seq_len, batch_size, embedding_dim)
        transformer_out = self.transformer_encoder(x)
        out = transformer_out[-1, :, :]  # 取最后一个时间步的输出
        out = self.fc(out)
        return out

In [7]:
from torch.utils.data import Dataset, DataLoader
from prefetch_generator import BackgroundGenerator

class IMDBDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

class DataLoaderX(DataLoader):
    def __iter__(self):
        return BackgroundGenerator(super().__iter__(), max_prefetch=8)

In [8]:
def train_model(model, train_data, train_labels, criterion, optimizer, num_epochs=5, batch_size=128):
    # 创建数据集和数据加载器
    dataset = IMDBDataset(train_data, train_labels)
    train_loader = DataLoaderX(dataset, batch_size=batch_size, shuffle=True)

    model.train()
    correct = 0
    total = 0
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            correct += (torch.round(torch.sigmoid(outputs)).squeeze() == labels).sum().item()
            total += labels.size(0)
            loss = criterion(outputs, labels.unsqueeze(1))  # 需要调整标签形状
            loss.backward()
            optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {correct/total:.4f}')

def evaluate_model(model, test_data, test_labels, batch_size=32):
    dataset = IMDBDataset(test_data, test_labels)
    test_loader = DataLoaderX(dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            predictions = torch.round(torch.sigmoid(outputs))
            correct += (predictions.squeeze() == labels).sum().item()
            total += labels.size(0)
    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} device')
# 加载数据
train_data, train_labels, test_data, test_labels = load_data()
train_data, train_labels, test_data, test_labels = train_data.to(device), train_labels.to(device), test_data.to(device), test_labels.to(device)

# 设置超参数
vocab_size = 200
embedding_dim = 64
hidden_dim = 128
output_dim = 1
n_layers = 3
dropout = 0.5
num_epochs = 30
learning_rate = 0.001
batch_size = 256

# LSTM 模型训练与评估
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
lstm_model = lstm_model.to(device)
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()
train_model(lstm_model, train_data, train_labels, criterion, optimizer, num_epochs, batch_size)
evaluate_model(lstm_model, test_data, test_labels, batch_size)

Using cuda device
训练集大小: 25000
测试集大小: 25000
train labels shape:  (25000,)
test labels shape:  (25000,)
train data shape:  torch.Size([25000, 50])
test data shape:  torch.Size([25000, 50])
Epoch [1/30], Loss: 0.6620, Accuracy: 0.5381
Epoch [2/30], Loss: 0.6728, Accuracy: 0.5808
Epoch [3/30], Loss: 0.6614, Accuracy: 0.6043
Epoch [4/30], Loss: 0.5909, Accuracy: 0.6202
Epoch [5/30], Loss: 0.5619, Accuracy: 0.6316
Epoch [6/30], Loss: 0.5911, Accuracy: 0.6397
Epoch [7/30], Loss: 0.5427, Accuracy: 0.6467
Epoch [8/30], Loss: 0.5204, Accuracy: 0.6533
Epoch [9/30], Loss: 0.5500, Accuracy: 0.6592
Epoch [10/30], Loss: 0.5961, Accuracy: 0.6643
Epoch [11/30], Loss: 0.4955, Accuracy: 0.6696
Epoch [12/30], Loss: 0.5633, Accuracy: 0.6749
Epoch [13/30], Loss: 0.6396, Accuracy: 0.6801
Epoch [14/30], Loss: 0.4739, Accuracy: 0.6856
Epoch [15/30], Loss: 0.3970, Accuracy: 0.6907
Epoch [16/30], Loss: 0.4682, Accuracy: 0.6962
Epoch [17/30], Loss: 0.4280, Accuracy: 0.7016
Epoch [18/30], Loss: 0.4327, Accuracy: 

In [37]:
# RNN 模型训练与评估
train_data, train_labels, test_data, test_labels = load_data()
train_data, train_labels, test_data, test_labels = train_data.to(device), train_labels.to(device), test_data.to(device), test_labels.to(device)
rnn_model = RNNModel(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
rnn_model = rnn_model.to(device)
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()
train_model(rnn_model, train_data, train_labels, criterion, optimizer, num_epochs, batch_size)
evaluate_model(rnn_model, test_data, test_labels, batch_size)

训练集大小: 25000
测试集大小: 25000
train labels shape:  (25000,)
test labels shape:  (25000,)
train data shape:  torch.Size([25000, 50])
test data shape:  torch.Size([25000, 50])
Epoch [1/30], Loss: 0.6862, Accuracy: 0.5024
Epoch [2/30], Loss: 0.6530, Accuracy: 0.5142
Epoch [3/30], Loss: 0.7320, Accuracy: 0.5278
Epoch [4/30], Loss: 0.6840, Accuracy: 0.5340
Epoch [5/30], Loss: 0.7141, Accuracy: 0.5361
Epoch [6/30], Loss: 0.6767, Accuracy: 0.5381
Epoch [7/30], Loss: 0.6228, Accuracy: 0.5423
Epoch [8/30], Loss: 0.7272, Accuracy: 0.5434
Epoch [9/30], Loss: 0.6129, Accuracy: 0.5472
Epoch [10/30], Loss: 0.7102, Accuracy: 0.5472
Epoch [11/30], Loss: 0.7158, Accuracy: 0.5439
Epoch [12/30], Loss: 0.7000, Accuracy: 0.5424
Epoch [13/30], Loss: 0.6916, Accuracy: 0.5420
Epoch [14/30], Loss: 0.6678, Accuracy: 0.5420
Epoch [15/30], Loss: 0.6760, Accuracy: 0.5420
Epoch [16/30], Loss: 0.7196, Accuracy: 0.5415
Epoch [17/30], Loss: 0.6909, Accuracy: 0.5418
Epoch [18/30], Loss: 0.6839, Accuracy: 0.5419
Epoch [19/3

In [40]:
# Transformer 模型训练与评估
train_data, train_labels, test_data, test_labels = load_data()
train_data, train_labels, test_data, test_labels = train_data.to(device), train_labels.to(device), test_data.to(device), test_labels.to(device)
transformer_model = TransformerModel(vocab_size, embedding_dim, output_dim, n_heads=16, num_layers=n_layers)
transformer_model = transformer_model.to(device)
optimizer = torch.optim.Adam(transformer_model.parameters(), lr=learning_rate)
criterion = nn.BCEWithLogitsLoss()
train_model(transformer_model, train_data, train_labels, criterion, optimizer, num_epochs, 16)
evaluate_model(transformer_model, test_data, test_labels, batch_size)

训练集大小: 25000
测试集大小: 25000
train labels shape:  (25000,)
test labels shape:  (25000,)
train data shape:  torch.Size([25000, 50])
test data shape:  torch.Size([25000, 50])
Epoch [1/30], Loss: 0.6736, Accuracy: 0.5949
Epoch [2/30], Loss: 0.6139, Accuracy: 0.6226
Epoch [3/30], Loss: 0.5651, Accuracy: 0.6357
Epoch [4/30], Loss: 0.6579, Accuracy: 0.6459
Epoch [5/30], Loss: 0.6625, Accuracy: 0.6541
Epoch [6/30], Loss: 0.8026, Accuracy: 0.6608
Epoch [7/30], Loss: 0.7271, Accuracy: 0.6672
Epoch [8/30], Loss: 0.6410, Accuracy: 0.6725
Epoch [9/30], Loss: 0.5638, Accuracy: 0.6779
Epoch [10/30], Loss: 0.4935, Accuracy: 0.6824
Epoch [11/30], Loss: 0.3882, Accuracy: 0.6869
Epoch [12/30], Loss: 0.7272, Accuracy: 0.6909
Epoch [13/30], Loss: 0.2741, Accuracy: 0.6949
Epoch [14/30], Loss: 0.6343, Accuracy: 0.6985
Epoch [15/30], Loss: 0.5324, Accuracy: 0.7019
Epoch [16/30], Loss: 0.5798, Accuracy: 0.7048
Epoch [17/30], Loss: 0.9732, Accuracy: 0.7076
Epoch [18/30], Loss: 0.4634, Accuracy: 0.7104
Epoch [19/3