In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import torch
from torch import nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.functional import to_map_style_dataset
import time

###### 1. 导入数据

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
df['labels'] = df['sentiment'].map({'positive':1, 'negative':0})

In [4]:
texts = df['review'].values
labels = df['labels'].values
text_train, text_test, label_train, label_test = train_test_split(texts, labels, test_size=0.3, random_state=37)

###### 2. 预处理

In [5]:
def nltk_tokenizer(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    tokens = text.split()
    #stop_word = set(nltk.corpus.stopwords.words('english'))
    #tokens = [word for word in tokens if word not in stop_word]
    return tokens

In [6]:
def yield_tokens(iter_data):
    for text in iter_data:
        yield nltk_tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(text_train), specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

In [7]:
max_len = 100
def text_pipeline(text):
    tokens = nltk_tokenizer(text)
    token_ids = vocab(tokens)
    if len(token_ids) < max_len:
        token_ids += [vocab['<pad>']] * (max_len - len(token_ids))
    else:
        token_ids = token_ids[:max_len]
    return token_ids

def label_pipeline(label):
    return int(label)

In [8]:
train_dataset = list(zip(text_train, label_train))
test_dataset = list(zip(text_test, label_test))

train_dataset = to_map_style_dataset(train_dataset)
test_dataset = to_map_style_dataset(test_dataset)

In [9]:
def collate_function(batch):
    text_list, label_list = [], []
    for text, label in batch:
        text_list.append(torch.tensor(text_pipeline(text), dtype = torch.int64))
        label_list.append(torch.tensor(label_pipeline(label), dtype = torch.float32))
    text_batch = torch.stack(text_list)
    label_batch = torch.stack(label_list)
    return text_batch, label_batch

In [10]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_function)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_function)

In [11]:
vocab_size = len(vocab)
epoch = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
class rnn_imdb(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 128, padding_idx=vocab['<pad>'])
        self.rnn_layer1 = nn.RNN(128, 64, batch_first=True, num_layers = 3)
        self.linear = nn.Linear(64, 1)
    def forward(self, x):
        x = self.embedding(x)
        _, x = self.rnn_layer1(x)
        #单层 设置前面的num_layers
        #x = x.squeeze(0)
        #多层
        x = x[-1]
        x = self.linear(x)
        return x

In [13]:
class lstm_imdb_multilayers(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 128, padding_idx=vocab['<pad>'])
        self.lstm = nn.LSTM(
            input_size=128,
            hidden_size=64,
            num_layers=4,              # 多层加深表达能力
            batch_first=True,
            dropout=0.3                # dropout between LSTM layers
        )
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(64, 1)

    def forward(self, x):
        x = self.embedding(x)               # x: (batch, seq_len) → (batch, seq_len, emb_dim)
        _, (h_n, _) = self.lstm(x)          # h_n: (num_layers, batch, hidden_size)
        x = self.dropout(h_n[-1])           # 取最后一层的输出 → (batch, hidden_size)
        x = self.linear(x)                  # → (batch, 1)
        return x

In [14]:
lstm_imdb = lstm_imdb_multilayers().to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(lstm_imdb.parameters(), lr = 0.001)

In [15]:
start_time = time.time()
for i in range(epoch):
    lstm_imdb.train()
    total_loss = 0
    for text, label in train_loader:
        text = text.to(device)
        label = label.unsqueeze(1).to(device)
        
        outputs = lstm_imdb(text)
        loss = criterion(outputs, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {i+1}, Loss: {total_loss / len(train_loader):.4f}")

    # Evaluate on training set
    lstm_imdb.eval()
    correct_train = 0
    total_train = 0
    with torch.no_grad():
        for text, label in train_loader:
            text = text.to(device)
            label = label.unsqueeze(1).to(device)
            outputs = lstm_imdb(text)
            preds = (torch.sigmoid(outputs) > 0.5).float()
            correct_train += (preds == label).sum().item()
            total_train += label.size(0)
    
    train_accuracy = correct_train / total_train * 100
    print(f"Train Accuracy: {train_accuracy:.2f}%")
end_time = time.time()

Epoch 1, Loss: 0.6600
Train Accuracy: 73.11%
Epoch 2, Loss: 0.5216
Train Accuracy: 82.43%
Epoch 3, Loss: 0.4003
Train Accuracy: 87.12%
Epoch 4, Loss: 0.3050
Train Accuracy: 91.13%
Epoch 5, Loss: 0.2258
Train Accuracy: 94.84%
Epoch 6, Loss: 0.1624
Train Accuracy: 96.92%
Epoch 7, Loss: 0.1214
Train Accuracy: 97.56%
Epoch 8, Loss: 0.0828
Train Accuracy: 98.78%
Epoch 9, Loss: 0.0590
Train Accuracy: 99.16%
Epoch 10, Loss: 0.0511
Train Accuracy: 99.12%


In [16]:
print(f"训练总耗时: {end_time - start_time:.2f} 秒")

训练总耗时: 156.03 秒


In [17]:
torch.save(lstm_imdb.state_dict(), 'lstm_model.pth')

In [18]:
lstm_imdb.eval()
total = 0
correct = 0
with torch.no_grad():
    for text, label in test_loader:
        text = text.to(device)
        label = label.unsqueeze(1).to(device)
        outputs = lstm_imdb(text)
        predictions = torch.sigmoid(outputs) > 0.5
        correct += (predictions.float() == label).sum().item()
        total += label.size(0)
print(f"Test Accuracy: {correct / total * 100:.2f}%\n")

Test Accuracy: 82.05%

