In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/douban-comments/vocab.pkl
/kaggle/input/douban-comments/comments_list.pkl


#### 2. 加载处理后文本构建词典、定义模型、训练、评估、测试。

In [2]:
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Load the preprocessed comments and the vocabulary from the Kaggle input dataset.
# Each file is opened in a `with` block so the handle is closed as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that come from a trusted source.
with open('/kaggle/input/douban-comments/comments_list.pkl', 'rb') as comments_file:
    comments_list = pickle.load(comments_file)
with open('/kaggle/input/douban-comments/vocab.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
vocab_size = len(vocab)
print(len(comments_list))
print(vocab_size)

1220249
278670


In [4]:
def collate_fn(batch_data):
    """Turn a list of (comment, label) pairs into padded index and label tensors.

    Each comment is iterated word by word and every word is looked up in the
    module-level ``vocab`` mapping; words that are missing map to ``vocab['UNK']``.
    The per-comment index tensors are then padded with ``vocab['PAD']`` to the
    length of the longest comment in the batch.

    Args:
        batch_data: iterable of ``(comment, label)`` pairs, where ``comment`` is
            an iterable of words. Labels are passed to ``torch.tensor`` as-is
            (presumably integer class ids -- confirm against the dataset).

    Returns:
        tuple ``(comments, labels)``: ``comments`` is an int64 tensor of shape
        ``(batch, max_len)`` and ``labels`` is ``torch.tensor(labels)``.
    """
    # Look the special indices up once instead of once per word.
    unk_index = vocab['UNK']
    pad_index = vocab['PAD']
    comments, labels = [], []
    for comment, label in batch_data:
        # Convert words to vocabulary indices. The dtype is explicit because
        # torch.tensor([]) would default to float32 for an empty comment, and
        # pad_sequence takes the batch dtype from the first sequence.
        indices = [vocab.get(word, unk_index) for word in comment]
        comments.append(torch.tensor(indices, dtype=torch.long))
        labels.append(label)
    comments = nn.utils.rnn.pad_sequence(comments, batch_first=True, padding_value=pad_index)
    labels = torch.tensor(labels)
    return comments, labels
# Split the comments 80/20 with a fixed seed, then wrap both parts in DataLoaders.
# The custom collate function pads every batch to a common sequence length.
split_parts = train_test_split(comments_list, test_size=0.2, random_state=42)
train_list, test_list = split_parts
loader_options = dict(batch_size=128, collate_fn=collate_fn)
# Only the training loader is shuffled.
train_dataLoader = DataLoader(train_list, shuffle=True, **loader_options)
test_dataLoader = DataLoader(test_list, **loader_options)

In [5]:
# Model definition
class Comment_RNN(nn.Module):
    """Comment classifier: embedding -> multi-layer LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        padding_idx: optional index that was used to right-pad the input
            batch. When ``None`` (the default) the classifier reads the LSTM
            output at the final time step, exactly as before. When given, the
            classifier reads the output at each sequence's last non-padding
            token, so the result does not depend on how much padding the batch
            added.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes,
                 padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_index):
        """Return logits of shape ``(batch, num_classes)``.

        Args:
            input_index: integer tensor of token indices laid out as
                ``(batch, seq_len)`` (the LSTM is built with ``batch_first=True``).
        """
        embed = self.embedding(input_index)
        out, _ = self.rnn(embed)
        if self.padding_idx is None:
            # Original behaviour: use the output at the last time step.
            last = out[:, -1, :]
        else:
            # Count real tokens per row; assumes padding only appears at the
            # end of a sequence (as produced by pad_sequence). Rows that are
            # entirely padding fall back to the first time step.
            lengths = (input_index != self.padding_idx).sum(dim=1).clamp(min=1)
            gather_index = (lengths - 1).view(-1, 1, 1).expand(-1, 1, out.size(-1))
            last = out.gather(1, gather_index).squeeze(1)
        return self.fc(last)

In [6]:
# Hyperparameters for the comment classifier.
embedding_dim, hidden_size = 100, 128
num_layers, num_classes = 2, 2

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Build the model and move its parameters to the selected device.
model = Comment_RNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    num_classes=num_classes,
)
model = model.to(device)

# Cross-entropy loss over the class logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [7]:
# Training
writer = SummaryWriter(log_dir='/kaggle/working/runs/jieba')
# Loop-invariant: number of batches per epoch, used to compute the global step.
steps_per_epoch = len(train_dataLoader)
model.train()
try:
    for epoch in range(10):
        for i, (comments, labels) in enumerate(train_dataLoader):
            comments = comments.to(device)
            labels = labels.to(device)
            # Forward pass
            outputs = model(comments)
            loss = loss_fn(outputs, labels)
            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Read the scalar once; it is used for both logging and printing.
            loss_value = loss.item()
            writer.add_scalar('train loss', loss_value, steps_per_epoch * epoch + i)
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}], Step [{i+1}], Loss: {loss_value}')
finally:
    # Close the writer even if training is interrupted or raises, so the
    # events logged so far are flushed to disk.
    writer.close()

Epoch [1], Step [100], Loss: 0.5420422554016113
Epoch [1], Step [200], Loss: 0.5845947861671448
Epoch [1], Step [300], Loss: 0.5253560543060303
Epoch [1], Step [400], Loss: 0.4294927418231964
Epoch [1], Step [500], Loss: 0.48426496982574463
Epoch [1], Step [600], Loss: 0.5418112874031067
Epoch [1], Step [700], Loss: 0.46136951446533203
Epoch [1], Step [800], Loss: 0.44212841987609863
Epoch [1], Step [900], Loss: 0.4939717650413513
Epoch [1], Step [1000], Loss: 0.4840162694454193
Epoch [1], Step [1100], Loss: 0.5154366493225098
Epoch [1], Step [1200], Loss: 0.464128315448761
Epoch [1], Step [1300], Loss: 0.515199601650238
Epoch [1], Step [1400], Loss: 0.47719088196754456
Epoch [1], Step [1500], Loss: 0.5366746783256531
Epoch [1], Step [1600], Loss: 0.49447524547576904
Epoch [1], Step [1700], Loss: 0.677943766117096
Epoch [1], Step [1800], Loss: 0.5544470548629761
Epoch [1], Step [1900], Loss: 0.5833597183227539
Epoch [1], Step [2000], Loss: 0.4743793308734894
Epoch [1], Step [2100], Los

In [8]:
# Evaluation: measure accuracy on the held-out split.
model.eval()
correct = 0
total = 0
# Gradients are not needed while scoring.
with torch.no_grad():
    for batch_comments, batch_labels in test_dataLoader:
        batch_comments = batch_comments.to(device)
        batch_labels = batch_labels.to(device)
        # The predicted class is the index of the largest logit.
        batch_pred = model(batch_comments).argmax(dim=1)
        total += batch_labels.size(0)
        correct += torch.eq(batch_pred, batch_labels).sum().item()
# Accuracy as a percentage.
print('准确率：', 100 * correct / total)

准确率： 89.9020692481049


In [9]:
import jieba
# Stop words to drop after segmentation. The literal token 'PAD' is appended
# to the list read from the file (presumably so raw text can never map to the
# padding index -- confirm).
with open('/kaggle/input/stopwords/stopwords.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file] + ['PAD']
# Set view of the list above for O(1) membership tests; rebuild it if
# `stopwords` is modified later.
_stopword_set = frozenset(stopwords)

def comment_to_index(comment):
    """Convert a raw comment string into a ``(1, seq_len)`` tensor of vocabulary indices.

    The comment is segmented with ``jieba.lcut``, stop words are removed, and
    each remaining word is looked up in the module-level ``vocab`` (unknown
    words map to ``vocab['UNK']``).

    Args:
        comment: the raw comment text.

    Returns:
        int64 tensor of shape ``(1, seq_len)``. If no word survives stop-word
        filtering, a single ``UNK`` index is returned so the result is never
        an empty sequence.
    """
    unk_index = vocab['UNK']
    # Segment, then drop stop words.
    words = [word for word in jieba.lcut(comment) if word not in _stopword_set]
    # Map words to indices.
    indices = [vocab.get(word, unk_index) for word in words]
    if not indices:
        # An empty list would become an empty float32 tensor, which cannot be
        # used as embedding indices.
        indices = [unk_index]
    return torch.tensor(indices, dtype=torch.long).unsqueeze(0)

# Predict the class of two example comments.
# Inference only: switch to eval mode explicitly (do not rely on an earlier
# cell having done so) and skip building the autograd graph.
model.eval()
with torch.no_grad():
    comment1 = comment_to_index('电影很好看，情节引人入胜，全员演技在线，强烈推荐！').to(device)
    comment2 = comment_to_index('这个电影太烂了，不值得一看！').to(device)
    # The predicted class is the index of the largest logit.
    pred1 = torch.argmax(model(comment1), dim=1).item()
    pred2 = torch.argmax(model(comment2), dim=1).item()
print('评论1：', pred1)
print('评论2：', pred2)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.652 seconds.
Prefix dict has been built successfully.


评论1： 1
评论2： 0
