In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jieba-pkl/douban_comments_jieba.pkl
/kaggle/input/jieba-vocab/jieba_comments_vocab.pth


In [2]:
import torch
from torch.nn.utils.rnn import pad_sequence
def build_collate_fun(vocab):

    def collate_func(batch):
        comments,labels=[],[]
        for item in batch:
            token_index=torch.tensor([vocab[tk] for tk in item[0] if tk !=' '])
            comments.append(token_index)
            labels.append(item[1])
        ##padding
        comments=pad_sequence(comments,batch_first=True,padding_value=0)
        return comments,torch.tensor(labels,dtype=torch.int64)
    return collate_func

In [3]:
##定义模型
import torch.nn as nn
class Comments_Classifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # padding_idx=0
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch_size, seq_len)
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)
        # output: (batch_size, seq_len, hidden_size)
        output, (hidden, _) = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # 取最后一个时间步的输出
        return output

In [4]:
###拆分数据集分为训练 测试 验证
import random
def train_test_split(x,split_rate=0.2):
    ###测试集和验证集按照1：1拆分 训练集和test集按照split_rate拆分
    split_size=int(len(x)*(1-split_rate))
    split_size2=int((int(len(x))-split_size)/2)
    split_size3=split_size+split_size2
    split_index=list(range(len(x)))
    random.shuffle(split_index)
    x_train=[x[i][0] for i in split_index[:split_size]]
    y_train=[x[i][1] for i in split_index[:split_size]]


    x_test=[x[i][0] for i in split_index[split_size:split_size3]]
    y_test=[x[i][1] for i in split_index[split_size:split_size3]]

    x_valid=[x[i][0] for i in split_index[split_size3:]]
    y_valid=[x[i][1] for i in split_index[split_size3:]]


    return (x_train,y_train),(x_test,y_test),(x_valid,y_valid)

In [5]:
from torch.utils.tensorboard import SummaryWriter
writer=SummaryWriter()

train_loss_cnt=0
val_loss_cnt=0
val_acc_cnt=0

BATCH_SIZE=100
EPOCHS=5
EMBEDING_SIZE=200
RNN_HIDDEN_SIZE=200
LEARN_RATE=1e-3
NUM_LABELS=2



2025-04-22 03:07:23.486319: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745291243.678659      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745291243.735462      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


In [9]:
import pickle
###还原评论数据
with open('/kaggle/input/jieba-pkl/douban_comments_jieba.pkl','rb' ) as f:
    comments_data=pickle.load(f)

In [10]:
def buid_vocab_from_documents(doc):
    no_repeat_tokens=set()
    for cmt in doc:
        no_repeat_tokens.update(cmt[0])
    tokens=['<PAD>','<UNK>']+list(no_repeat_tokens)

    vocab={ tk:i for i,tk in enumerate(tokens)}
    return vocab

In [11]:
vocab=buid_vocab_from_documents(comments_data)

In [12]:
len(vocab)

282378

In [13]:
##数据拆分
(x_train,y_train),(x_test,y_test),(x_valid,y_valid)=train_test_split(comments_data)


In [14]:
print(len(x_train))
print(len(x_test))
print(len(x_valid))
print(x_train[0],y_train[0])
print(x_test[0],y_test[0])
print(x_valid[0],y_valid[0])

1040700
130087
130088
[' ', '我', '只能', '说', ' ', '都', '是', '新', '演员', ' ', '新', '导演', ' ', '新', '编剧', ' ', '这样', '的', '完成度', '已经', '不错', '了'] 0
[' ', '讲真', '，', '我', '是', '吴亦凡', '的', '粉丝', '。', '但是', '唐僧', '真的', '觉得', '文章', '演', '的', '比', '他', '好', '太', '多', '。', '一下子', '用', '那么', '多', '新人', '。', '真的', '失落', '好多', '。', '没有', '舒淇', '的', '串场', '，', '这部', '真的', '没有', '一点', '感动', '点', '。', '第一部', '我', '看', '哭', '了', '。', '这', '一部', '失落', '哭', '了', '。', '星爷', '！', '成龙', '也', '出过', '轨生', '了', '小龙女', '，', '但', '他', '演技', '所有人', '看到', '了', '！', '文章', '虽然', '某', '方面', '做错', '了', '，', '但', '真的', '演', '唐僧', '演', '的', '很', '到位', '！', '凡凡', '真的', '只', '适合', '看'] 1
[' ', '因为', '上映', '的', '时候', '一直', '没', '时间', '去', '影院', '看', '，', '遗憾', '了', '很', '久', '，', '今天', '终于', '有', '时间', '看', '了', '。', '更', '遗憾', '没有', '去', '影院', '看', '了', '。', '哎', '！', '简直', '太棒了', '~'] 0


In [15]:
from torch.utils.data import DataLoader

train_ds=list(zip(x_train,y_train))
# 通过Dataset构建DataLoader
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, 
                        collate_fn=build_collate_fun(vocab))

valid_ds=list(zip(x_valid,y_valid))
# 通过Dataset构建DataLoader
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=True, 
                        collate_fn=build_collate_fun(vocab))
test_ds=list(zip(x_test,y_test))
# 通过Dataset构建DataLoader
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, 
                        collate_fn=build_collate_fun(vocab))


In [16]:
print(len(vocab))
i=0
for key in vocab.keys():
    if key=='PAD':
        print(key,vocab[key])

282378
PAD 119816


In [17]:
###模型构建

model = Comments_Classifier(len(vocab), EMBEDING_SIZE, RNN_HIDDEN_SIZE, NUM_LABELS)
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

In [18]:
from tqdm import tqdm

def train(model,train_dl,criterion,optimizer):
    global train_loss_cnt
    model.to(device)
    model.train()
    tpbar=tqdm(train_dl)
    for tokens,labels in tpbar:
        optimizer.zero_grad()
        tokens,labels=tokens.to(device),labels.to(device)
        loss=train_step(model,tokens,labels,criterion)
        loss.backward()
        optimizer.step()
        tpbar.set_description(f"epoch:{epoch+1} train_loss:{loss.item():.4f}")
        writer.add_scalar('train_loss',loss.item(),train_loss_cnt)
        train_loss_cnt+=1

def train_step(model,tokens,labels,criterion):
    logits=model(tokens)
    loss=criterion(logits,labels)
    return loss
    
def validate(model,val_dl,criterion):
    global val_loss_cnt,val_acc_cnt
    model.to(device)
    model.eval()
    tpbar=tqdm(val_dl)
    total_loss=0
    total_acc=0

    for tokens,labels in tpbar:
        tokens,labels=tokens.to(device),labels.to(device)
        loss,logits=validate_step(model,tokens,labels,criterion)
        tpbar.set_description(f"epoch:{epoch+1} val_loss:{loss.item():.4f}")

        total_loss+=loss.item()
        total_acc+=(logits.argmax(dim=1)==labels).float().mean()
    writer.add_scalar('val_avg_loss',total_loss/len(val_dl),val_loss_cnt)
    val_loss_cnt+=1
    ##计算准确率
    writer.add_scalar('val_acc',total_acc/len(val_dl),val_acc_cnt)
    val_acc_cnt +=1
         


def validate_step(model,tokens,labels,criterion):
    logits=model(tokens)
    loss=criterion(logits,labels)
    return loss,logits

In [19]:
##训练
for epoch in range(EPOCHS):
    train(model,train_dl,criterion,optimizer)
    validate(model,valid_dl,criterion)

epoch:1 train_loss:0.2015: 100%|██████████| 10407/10407 [03:52<00:00, 44.76it/s]
epoch:1 val_loss:0.3379: 100%|██████████| 1301/1301 [00:09<00:00, 140.69it/s]
epoch:2 train_loss:0.2056: 100%|██████████| 10407/10407 [03:50<00:00, 45.17it/s]
epoch:2 val_loss:0.1723: 100%|██████████| 1301/1301 [00:09<00:00, 144.49it/s]
epoch:3 train_loss:0.2181: 100%|██████████| 10407/10407 [03:50<00:00, 45.22it/s]
epoch:3 val_loss:0.1424: 100%|██████████| 1301/1301 [00:09<00:00, 144.33it/s]
epoch:4 train_loss:0.1425: 100%|██████████| 10407/10407 [03:50<00:00, 45.20it/s]
epoch:4 val_loss:0.2334: 100%|██████████| 1301/1301 [00:08<00:00, 145.72it/s]
epoch:5 train_loss:0.2072: 100%|██████████| 10407/10407 [03:51<00:00, 45.05it/s]
epoch:5 val_loss:0.2621: 100%|██████████| 1301/1301 [00:08<00:00, 144.75it/s]


In [33]:
##保存模型
torch.save({'model_state': model.state_dict(),
            'model_vocab':vocab},'model_jieba.bin')

In [35]:
# 创建模型实例
model_test = Comments_Classifier(len(vocab), EMBEDING_SIZE, RNN_HIDDEN_SIZE, NUM_LABELS)

# 加载保存的文件
checkpoint = torch.load('model_jieba.bin',weights_only=False)

# 加载模型的状态字典
model_test.load_state_dict(checkpoint['model_state'])

# 加载词汇表
vocab_test = checkpoint['model_vocab']

In [47]:
print("Model device:", next(model_test.parameters()).device)


Model device: cuda:0


In [56]:
###测试模型
tpbar=tqdm(test_ds)
i=0
for tokens,labels in tpbar:
    model_test.eval()
    model_test.to(device)
    comment1_idx = torch.tensor([vocab_test.get(word, vocab_test['<UNK>']) for word in tokens])
    comment1_idx = comment1_idx.unsqueeze(0).to(device)  # 添加batch维度    
    predict=model_test(comment1_idx)
    if torch.argmax(predict, dim=1).item()==labels:
        i=i+1
print(f"测试集正确率：{i/len(test_ds):.4f}")


100%|██████████| 130087/130087 [02:44<00:00, 790.97it/s]

测试集正确率：0.8987



