In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/doubanmovieshortcomments/DMSC.csv


# 1 文本预处理

In [2]:
comments = pd.read_csv('/kaggle/input/doubanmovieshortcomments/DMSC.csv')
comments.head()

Unnamed: 0,ID,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
0,0,Avengers Age of Ultron,复仇者联盟2,2017-01-22,1,然潘,2015-05-13,3,连奥创都知道整容要去韩国。,2404
1,1,Avengers Age of Ultron,复仇者联盟2,2017-01-22,2,更深的白色,2015-04-24,2,非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...,1231
2,2,Avengers Age of Ultron,复仇者联盟2,2017-01-22,3,有意识的贱民,2015-04-26,2,2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...,1052
3,3,Avengers Age of Ultron,复仇者联盟2,2017-01-22,4,不老的李大爷耶,2015-04-23,4,《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...,1045
4,4,Avengers Age of Ultron,复仇者联盟2,2017-01-22,5,ZephyrO,2015-04-22,2,虽然从头打到尾，但是真的很无聊啊。,723


In [3]:
comments.describe()

Unnamed: 0,ID,Number,Star,Like
count,2125056.0,2125056.0,2125056.0,2125056.0
mean,1062528.0,46097.75,3.63832,1.078081
std,613451.0,31915.18,1.240807,54.36271
min,0.0,1.0,1.0,0.0
25%,531263.8,19641.0,3.0,0.0
50%,1062528.0,40749.0,4.0,0.0
75%,1593791.0,68242.0,5.0,0.0
max,2125055.0,141200.0,5.0,15499.0


In [4]:
comments['label'] = np.where(comments['Star']<3,0,1)
comments_select = comments.loc[comments['Star']!=3,['label','Comment']].copy()
comments_select.head()

Unnamed: 0,label,Comment
1,0,非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...
2,0,2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...
3,1,《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...
4,0,虽然从头打到尾，但是真的很无聊啊。
6,0,只有一颗彩蛋必须降一星。外加漫威的编剧是有心无力了吧。复仇者联盟只能永远着手与团队的和与不...


# 2 构建词典并保存

In [5]:
import jieba

In [6]:
# 构建词典
vocab = set()
for idx,row in comments_select.iterrows():
    words = jieba.cut(row['Comment'])
    for word in words:
        vocab.add(word)
print(len(vocab))        

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.623 seconds.
Prefix dict has been built successfully.


287578


In [29]:
vocab_add =  ['PAD','UNK'] + list(vocab)  # PAD: padding, UNK: unknown
w2idx = {word: idx for idx, word in enumerate(vocab_add)}

In [31]:
import json
with open("/kaggle/working/w2idx.json", "w", encoding="utf-8") as f:
    json.dump(w2idx, f, ensure_ascii=False)

#  3加载词典

In [32]:
import json
# 从 JSON 文件加载字典
with open("/kaggle/working/w2idx.json", "r", encoding="utf-8") as f:
    w2idx = json.load(f)

# 4 模型训练 评估 测试

In [33]:
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence 
from sklearn.model_selection import train_test_split

In [34]:
def convert_data(batch_data):
    comments, votes = [],[]
    for comment, vote in batch_data:
        tokenized = jieba.lcut(comment) if isinstance(comment, str) else comment
        comments.append(torch.tensor([w2idx.get(word, w2idx['UNK']) for word in tokenized]))
        votes.append(vote)
    
    commt = pad_sequence(comments, batch_first=True, padding_value=w2idx['PAD'])
    labels = torch.tensor(votes)
    return commt, labels

In [40]:
processed_data = [(row['Comment'], row['label'])  # 确保列名对应
    for _, row in comments_select.iterrows()]

train_data, test_data = train_test_split(
    processed_data, 
    test_size=0.2,
    random_state=42  # 固定随机种子保证可复现性
)

train_dataloader = DataLoader(train_data, 
                            batch_size=256, 
                            shuffle=True, 
                            collate_fn=convert_data)

test_dataloader = DataLoader(test_data,
                           batch_size=256,
                           shuffle=False,
                           collate_fn=convert_data)

In [36]:
class Comments_Classifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # padding_idx=0
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch_size, seq_len)
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)
        # output: (batch_size, seq_len, hidden_size)
        output, (hidden, _) = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # 取最后一个时间步的输出
        return output

In [41]:
#模型训练
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Comments_Classifier(len(w2idx), 100, 128, 2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练循环
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    for i, (cmt, lbl) in enumerate(train_dataloader):
        cmt, lbl = cmt.to(device), lbl.to(device)
        
        # 前向传播
        outputs = model(cmt)
        loss = criterion(outputs, lbl)
        
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # 每10个batch打印进度
        if (i+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}')


Epoch [1/5], Step [10/5158], Loss: 0.5490
Epoch [1/5], Step [20/5158], Loss: 0.5215
Epoch [1/5], Step [30/5158], Loss: 0.6436
Epoch [1/5], Step [40/5158], Loss: 0.5982
Epoch [1/5], Step [50/5158], Loss: 0.5510
Epoch [1/5], Step [60/5158], Loss: 0.4371
Epoch [1/5], Step [70/5158], Loss: 0.6484
Epoch [1/5], Step [80/5158], Loss: 0.5547
Epoch [1/5], Step [90/5158], Loss: 0.5368
Epoch [1/5], Step [100/5158], Loss: 0.5455
Epoch [1/5], Step [110/5158], Loss: 0.5091
Epoch [1/5], Step [120/5158], Loss: 0.5340
Epoch [1/5], Step [130/5158], Loss: 0.5022
Epoch [1/5], Step [140/5158], Loss: 0.5577
Epoch [1/5], Step [150/5158], Loss: 0.4969
Epoch [1/5], Step [160/5158], Loss: 0.5160
Epoch [1/5], Step [170/5158], Loss: 0.5666
Epoch [1/5], Step [180/5158], Loss: 0.5545
Epoch [1/5], Step [190/5158], Loss: 0.5317
Epoch [1/5], Step [200/5158], Loss: 0.5556
Epoch [1/5], Step [210/5158], Loss: 0.5413
Epoch [1/5], Step [220/5158], Loss: 0.5181
Epoch [1/5], Step [230/5158], Loss: 0.5642
Epoch [1/5], Step [2

../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [118,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [118,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [118,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [118,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [117,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [117,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1308: indexSelectLargeIndex: block: [117,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.eval()
model = model.to(device)
total, correct = 0, 0
with torch.no_grad():
    for cmt, lbl in test_dataloader:
        cmt, lbl = cmt.to(device), lbl.to(device)  # 恢复这行代码
        outputs = model(cmt)
        _, predicted = torch.max(outputs.data, 1)
        total += lbl.size(0)
        correct += (predicted == lbl).sum().item()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
