In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        comment_file = os.path.join(dirname, filename)
        print(comment_file)
        df = pd.read_csv(comment_file)
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/doubanmovieshortcomments/DMSC.csv


In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import jieba
import pickle
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn



# 1. 使用豆瓣电影评论数据完成文本分类处理：文本预处理，加载、构建词典。（评论得分1～2 表示negative取值：0，评论得分4～5代表positive取值：1）
# https://www.kaggle.com/datasets/utmhikari/doubanmovieshortcomments
# 2. 加载处理后文本构建词典、定义模型、训练、评估、测试。
# 3. 尝试不同分词工具进行文本分词，观察模型训练结果。

# 在线访问:
# https://www.kaggle.com/code/mitrecx/notebook

comment_file = '/kaggle/input/doubanmovieshortcomments/DMSC.csv'
print(comment_file)

# ID,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
df = pd.read_csv(comment_file)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
print(df[0:2])

df = df[['Comment', 'Star']].dropna()

# 标签处理：3分去掉
df = df[df['Star'].isin([1, 2, 4, 5])]
# 把Comment小于6的过滤掉
df = df[df['Comment'].str.len() > 6]
df = df[df['Comment'].str.len() <= 50]
# 1~2为negative(0), 4~5 为  positive(1)
df['label'] = df['Star'].apply(lambda x: 0 if x <= 2 else 1)


# print(df[0:10])


def tokenize(text):
    # stopwords = set()
    # with open('cn_stopwords.txt', 'r', encoding='utf-8') as f:
    #     stopwords = set([line.strip() for line in f])
    # words = jieba.lcut(text)
    # # 过滤停用词
    # filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    return jieba.lcut(text)


df['tokens'] = df['Comment'].apply(tokenize)
# print(df[0:10])

# 构建词典
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab = {'<PAD>': 0, '<UNK>': 1}
vocab.update({word: i + 2 for i, (word, _) in enumerate(Counter(all_tokens).most_common(100000))})

# Save vocabulary
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)

# 将文本转为索引
def encode(tokens, max_len=50):
    ids = [vocab.get(token, 1) for token in tokens]
    if len(ids) < max_len:
        ids += [0] * (max_len - len(ids))
    return ids[:max_len]


df['input_ids'] = df['tokens'].apply(encode)
print(df[0:10])


class CommentDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = torch.tensor(inputs)
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        return self.inputs[idx], self.labels[idx]

    def __len__(self):
        return len(self.inputs)


X_train, X_test, y_train, y_test = train_test_split(df['input_ids'], df['label'], test_size=0.2)

train_dataset = CommentDataset(X_train.tolist(), y_train.tolist())
test_dataset = CommentDataset(X_test.tolist(), y_test.tolist())

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=64):
        super().__init__()
        # 词嵌入, 可训练
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.rnn(emb)
        return self.fc(out[:, -1, :])


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextClassifier(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

# Save model
torch.save(model.state_dict(), 'text_classifier.pth')

# Evaluate model using test set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        predicted = torch.argmax(outputs, dim=1)
        correct += (predicted == batch_y).sum().item()
        total += batch_y.size(0)
print(f"Test Accuracy: {correct / total:.4f}")


/kaggle/input/doubanmovieshortcomments/DMSC.csv
   ID           Movie_Name_EN Movie_Name_CN  Crawl_Date  Number Username        Date  Star                                            Comment  Like
0   0  Avengers Age of Ultron        复仇者联盟2  2017-01-22       1       然潘  2015-05-13     3                                      连奥创都知道整容要去韩国。  2404
1   1  Avengers Age of Ultron        复仇者联盟2  2017-01-22       2    更深的白色  2015-04-24     2   非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...  1231


Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.600 seconds.
Prefix dict has been built successfully.


                                        Comment  Star  label                                             tokens                                          input_ids
4                             虽然从头打到尾，但是真的很无聊啊。     2      0         [ , 虽然, 从头, 打到, 尾, ，, 但是, 真的, 很, 无聊, 啊, 。]  [2, 84, 573, 3095, 553, 3, 54, 36, 11, 306, 26...
7                                  看腻了这些打来打去的烂片     2      0                        [ , 看腻, 了, 这些, 打来打去, 的, 烂片]  [2, 8026, 6, 712, 3342, 4, 156, 0, 0, 0, 0, 0,...
15   什么破烂反派，毫无戏剧冲突能消耗两个多小时生命，还强加爱情戏。脑残片好圈钱倒是真的。     1      0  [ , 什么, 破烂, 反派, ，, 毫无, 戏剧, 冲突, 能, 消耗, 两个, 多, 小...  [2, 52, 8884, 908, 3, 556, 3967, 2510, 58, 104...
20                           奥创弱爆了弱爆了弱爆了啊！！！！！！     2      0  [ , 奥创, 弱, 爆, 了, 弱, 爆, 了, 弱, 爆, 了, 啊, ！, ！, ！,...  [2, 2122, 627, 536, 6, 627, 536, 6, 627, 536, ...
23    越是一本正经，就是越是无趣。个个死不了，撕了扯了砸了死了的，你也不关心。全程梦游。     2      0  [ , 越是, 一本正经, ，, 就是, 越是, 无趣, 。, 个个, 死不了, ，, 撕,...  [2, 5947, 6139, 3, 35, 5947, 2112, 5, 4634, 11...
25                    

In [5]:
import pickle
import torch
import torch.nn as nn
import jieba
import torch.nn.functional as F


with open('/kaggle/working/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.rnn(emb)
        return self.fc(out[:, -1, :])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TextClassifier(len(vocab)).to(device)
model.load_state_dict(torch.load('/kaggle/working/text_classifier.pth', map_location=device))
model.eval()


def encode(tokens, vocab, max_len=50):
    ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]
    if len(ids) < max_len:
        ids += [vocab['<PAD>']] * (max_len - len(ids))
    return ids[:max_len]

def predict(text):
    tokens = jieba.lcut(text)
    input_ids = encode(tokens, vocab)
    input_tensor = torch.tensor([input_ids]).to(device)  # [1, seq_len]
    with torch.no_grad():
        logits = model(input_tensor)
        probs = F.softmax(logits, dim=1)
        predicted = torch.argmax(probs, dim=1).item()
    label_str = "正面评价" if predicted == 1 else "负面评价"
    print(f"输入：{text}")
    print(f"预测结果：{label_str}")

predict("这部电影真心不错，演技在线，剧情感人")
predict("非常失望，剧本完全敷衍了事")

输入：这部电影真心不错，演技在线，剧情感人
预测结果：正面评价
输入：非常失望，剧本完全敷衍了事
预测结果：正面评价


  model.load_state_dict(torch.load('/kaggle/working/text_classifier.pth', map_location=device))
