In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/doubanmovieshortcomments/DMSC.csv


In [4]:
import pickle
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import jieba
import pandas as pd
from collections import Counter
import logging

jieba.setLogLevel(logging.INFO)


# # 加载数据
# def load_data(file_path):
#     df = pd.read_csv(file_path)
#     data = []
#     for index, row in df.iterrows():
#         score = row['Star']
#         if 1 <= score <= 2: 
#             label = 1
#         elif 4 <= score <= 5:
#             label = 0
#         else:
#             continue  # 跳过不符合条件的行
#         comment = row['Comment']
#         data.append((comment, label))
#     return data  # 返回包含评论和标签的列表


# 构建词汇表
def build_from_doc(doc):
    vocab = set()
    for line in doc:
        words = jieba.lcut(line[0])
        vocab.update(words)
    vocab = ['PAD', 'UNK'] + list(vocab)  # PAD: padding, UNK: unknown
    w2idx = {word: idx for idx, word in enumerate(vocab)}
    return w2idx


# 自定义数据集类
class DoubanDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        comment, label = self.data[idx]
        words = jieba.lcut(comment)
        comment_idx = [self.vocab.get(word, self.vocab['UNK']) for word in words]
        return torch.tensor(comment_idx), torch.tensor(label)


class Comments_Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # padding_idx=0
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch_size, seq_len)
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)
        # output: (batch_size, seq_len, hidden_size)
        output, (hidden, _) = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # 取最后一个时间步的输出
        return output


# 自定义数据转换方法(callback function)回调函数
def convert_data(batch_data):
    comments, votes = [], []
    # 分别提取评论和标签
    for comment, vote in batch_data:
        comments.append(comment)
        votes.append(vote)

    # 将评论和标签转换为tensor
    commt = pad_sequence(comments, batch_first=True, padding_value=0)  # 填充为相同长度
    labels = torch.tensor(votes)
    # 返回评论和标签
    return commt, labels

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # 加载数据
file_path = '/kaggle/input/doubanmovieshortcomments/DMSC.csv'
df = pd.read_csv(file_path)
df = df.query('(1 <= Star <= 2) or (4 <= Star <= 5)')
def get_label(score):
    if 1 <= score <= 2:
        return 1
    elif 4 <= score <= 5:
        return 0
df['label'] = df['Star'].apply(get_label)
data = list(zip(df['Comment'], df['label']))

    # 构建词汇表
vocab = build_from_doc(data)
    # print('词汇表大小:', len(vocab))

    # 划分训练集和测试集
train_size = int(0.8 * len(data))
train_data = data[:train_size]
test_data = data[train_size:]

    # 创建数据集和数据加载器
train_dataset = DoubanDataset(train_data, vocab)
test_dataset = DoubanDataset(test_data, vocab)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=convert_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=convert_data)

    # 构建模型
vocab_size = len(vocab)
embedding_dim = 100
hidden_size = 128
num_classes = 2

model = Comments_Classifier(vocab_size, embedding_dim, hidden_size, num_classes)
model.to(device)

    # 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # 训练模型
num_epochs = 5
for epoch in range(num_epochs):
    for i, (cmt, lbl) in enumerate(train_dataloader):
        cmt = cmt.to(device)
        lbl = lbl.to(device)

            # 前向传播
        outputs = model(cmt)
        loss = criterion(outputs, lbl)

            # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 1000 == 0:
            print(
             f'Epoch [{epoch + 1}/{num_epochs}],  Loss: {loss.item():.4f}')




Epoch [1/5],  Loss: 0.5673
Epoch [1/5],  Loss: 0.5299
Epoch [1/5],  Loss: 0.5335
Epoch [1/5],  Loss: 0.5959
Epoch [1/5],  Loss: 0.6433
Epoch [1/5],  Loss: 0.5713
Epoch [1/5],  Loss: 0.5955
Epoch [1/5],  Loss: 0.3625
Epoch [1/5],  Loss: 0.5750
Epoch [1/5],  Loss: 0.3826
Epoch [1/5],  Loss: 0.4110
Epoch [1/5],  Loss: 0.2938
Epoch [1/5],  Loss: 0.0807
Epoch [1/5],  Loss: 0.3066
Epoch [1/5],  Loss: 0.2144
Epoch [1/5],  Loss: 0.6388
Epoch [1/5],  Loss: 0.2592
Epoch [1/5],  Loss: 0.1397
Epoch [1/5],  Loss: 0.4355
Epoch [1/5],  Loss: 0.3093
Epoch [1/5],  Loss: 0.1921
Epoch [1/5],  Loss: 0.3916
Epoch [1/5],  Loss: 0.3069
Epoch [1/5],  Loss: 0.2943
Epoch [1/5],  Loss: 0.1633
Epoch [1/5],  Loss: 0.2365
Epoch [1/5],  Loss: 0.1080
Epoch [1/5],  Loss: 0.3632
Epoch [1/5],  Loss: 0.4363
Epoch [1/5],  Loss: 0.2564
Epoch [1/5],  Loss: 0.1185
Epoch [1/5],  Loss: 0.2570
Epoch [1/5],  Loss: 0.2397
Epoch [1/5],  Loss: 0.2001
Epoch [1/5],  Loss: 0.4449
Epoch [1/5],  Loss: 0.3276
Epoch [1/5],  Loss: 0.2500
E

In [6]:
 # 保存模型
torch.save(model.state_dict(), 'douban_comments_classifier.pth')
    # 保存模型词典
torch.save(vocab, 'douban_comments_vocab.pth')