In [5]:
# 读取数据集
import pandas as pd
df = pd.read_csv('/kaggle/input/doubanmovieshortcomments/DMSC.csv')
df.head()

Unnamed: 0,ID,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
0,0,Avengers Age of Ultron,复仇者联盟2,2017-01-22,1,然潘,2015-05-13,3,连奥创都知道整容要去韩国。,2404
1,1,Avengers Age of Ultron,复仇者联盟2,2017-01-22,2,更深的白色,2015-04-24,2,非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...,1231
2,2,Avengers Age of Ultron,复仇者联盟2,2017-01-22,3,有意识的贱民,2015-04-26,2,2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...,1052
3,3,Avengers Age of Ultron,复仇者联盟2,2017-01-22,4,不老的李大爷耶,2015-04-23,4,《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...,1045
4,4,Avengers Age of Ultron,复仇者联盟2,2017-01-22,5,ZephyrO,2015-04-22,2,虽然从头打到尾，但是真的很无聊啊。,723


In [6]:
import jieba
# 预过滤
data = df[(df['Star'].astype(int) >= 1) & (df['Star'].astype(int) <= 5) & (df['Star'].astype(int) != 3)]

data = data[:10000]
# 将评分数据映射为标签，1-2为消极取0，4-5为积极取1
data = data['Star'].apply(lambda x: 1 if x >= 4 else 0)
# 处理评论数据
comments = df['Comment'].apply(lambda x: jieba.lcut(x))

ds_comments = list(zip(comments,data))
ds_comments[:2]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.784 seconds.
Prefix dict has been built successfully.


[([' ', '连', '奥创', '都', '知道', '整容', '要', '去', '韩国', '。'], 0),
 ([' ',
   '非常',
   '失望',
   '，',
   '剧本',
   '完全',
   '敷衍了事',
   '，',
   '主线',
   '剧情',
   '没',
   '突破',
   '大家',
   '可以',
   '理解',
   '，',
   '可',
   '所有',
   '的',
   '人物',
   '都',
   '缺乏',
   '动机',
   '，',
   '正邪',
   '之间',
   '、',
   '妇联',
   '内部',
   '都',
   '没什么',
   '火花',
   '。',
   '团结',
   '-',
   '分裂',
   '-',
   '团结',
   '的',
   '三段式',
   '虽然',
   '老套',
   '但',
   '其实',
   '也',
   '可以',
   '利用',
   '积攒',
   '下来',
   '的',
   '形象',
   '魅力',
   '搞',
   '出',
   '意思',
   '，',
   '但',
   '剧本',
   '写得',
   '非常',
   '肤浅',
   '、',
   '平面',
   '。',
   '场面',
   '上',
   '调度',
   '混乱',
   '呆板',
   '，',
   '满屏',
   '的',
   '铁甲',
   '审美疲劳',
   '。',
   '只有',
   '笑',
   '点算',
   '得',
   '上',
   '差强人意',
   '。'],
  0)]

In [7]:
import pickle
# 二进制文件保存数据
with open('/kaggle/working/comments.pkl', 'wb') as f:
    pickle.dump(ds_comments, f)

In [21]:
import torch
import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 加载训练语料
with open('/kaggle/working/comments.pkl','rb') as f:
    comments_data = pickle.load(f)
vocab = set()
for line in comments_data:
    vocab.update(line[0])
vocab = ['PAD','UNK']+list(vocab)
w2idx = {word:idx for idx,word in enumerate(vocab)}
len(w2idx)
vocab = w2idx

In [24]:
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence # 长度不同张量填充为相同长度


def convert_data(batch_data):
    comments,votes = [],[]
    for comment,vote in batch_data:
        comments.append(torch.tensor([vocab.get(word,vocab['UNK']) for word in comment]))
        votes.append(vote)
    commt_tensor = pad_sequence(comments, batch_first=True, padding_value=vocab['PAD'])
    labels = torch.tensor(votes)
    return commt_tensor,labels

In [25]:
from torch.utils.data import DataLoader
# 通过Dataset构建DataLoader
dataloader = DataLoader(comments_data, batch_size=100, shuffle=True, 
                        collate_fn=convert_data)

In [26]:

# 构建模型
# vocab_size: 词汇表大小
# embedding_dim: 词嵌入维度
# hidden_size: LSTM隐藏层大小
# num_classes: 分类数量
vocab_size = len(vocab)
embedding_dim = 100
hidden_size = 128
num_classes = 2

class Comments_classifier(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_size,num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.LSTM(embedding_dim,hidden_size,batch_first=True)
        self.fc = nn.Linear(hidden_size,num_classes)
    def forward(self,input_ids):
        embedded = self.embedding(input_ids)
        output, _ = self.rnn(embedded)
        output = self.fc(output[:,-1,:])
        return output
model = Comments_classifier(vocab_size,embedding_dim,hidden_size,num_classes)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 5
for epoch in range(epochs):
    for i,(cmt,lbl) in enumerate(dataloader):
        cmt = cmt.to(device)
        lbl = lbl.to(device)
        outputs = model(cmt)
        loss = criterion(outputs,lbl)
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i%100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}],Step {i},Loss:{loss}')
# 保存模型
torch.save(model.state_dict(), 'comments_classifier.pth')
# 模型词典
torch.save(vocab, 'comments_vocab.pth')
        

Epoch [1/5],Step 0,Loss:0.6934474110603333
Epoch [2/5],Step 0,Loss:0.6283988356590271
Epoch [3/5],Step 0,Loss:0.5508683919906616
Epoch [4/5],Step 0,Loss:0.4933689832687378
Epoch [5/5],Step 0,Loss:0.6810575127601624


In [31]:
embedding_dim = 100
hidden_size = 128
num_classes = 2

# 加载词典
vocab = torch.load('comments_vocab.pth')
# 测试模型
comment1 = '这部电影真好看！全程无尿点'
comment2 = '非常失望'

# 将评论转换为索引
comment1_idx = torch.tensor([vocab.get(word, vocab['UNK']) for word in jieba.lcut(comment1)])
comment2_idx = torch.tensor([vocab.get(word, vocab['UNK']) for word in jieba.lcut(comment2)])
print(comment2_idx)
# 将评论转换为tensor
comment1_idx = comment1_idx.unsqueeze(0).to(device)  # 添加batch维度    
comment2_idx = comment2_idx.unsqueeze(0).to(device)  # 添加batch维度

# 加载模型
model = Comments_classifier(len(vocab), embedding_dim, hidden_size, num_classes)
model.load_state_dict(torch.load('comments_classifier.pth'))
model.to(device)

# 模型推理
pred1 = model(comment1_idx)
pred2 = model(comment2_idx)
print(pred1)

# 取最大值的索引作为预测结果
pred1 = torch.argmax(pred1, dim=1).item()
pred2 = torch.argmax(pred2, dim=1).item()
print(f'评论1预测结果: {pred1}')
print(f'评论2预测结果: {pred2}')

tensor([13897, 12452])
tensor([[-0.8218,  0.9575]], grad_fn=<AddmmBackward0>)
评论1预测结果: 1
评论2预测结果: 1


  vocab = torch.load('comments_vocab.pth')
  model.load_state_dict(torch.load('comments_classifier.pth'))
