In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sp-comment/douban_comments_sp.pkl
/kaggle/input/sp-pth/sp_comments_vocab.pth


In [2]:
import torch
from torch.nn.utils.rnn import pad_sequence
def build_collate_fun(vocab):

    def collate_func(batch):
        comments,labels=[],[]
        for item in batch:
            token_index=torch.tensor([vocab[tk] for tk in item[0] if tk !=' '])
            comments.append(token_index)
            labels.append(item[1])
        ##padding
        comments=pad_sequence(comments,batch_first=True,padding_value=0)
        return comments,torch.tensor(labels,dtype=torch.int64)
    return collate_func

In [3]:
##定义模型
import torch.nn as nn
class Comments_Classifier(nn.Module):

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # padding_idx=0
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch_size, seq_len)
        # embedded: (batch_size, seq_len, embedding_dim)
        embedded = self.embedding(input_ids)
        # output: (batch_size, seq_len, hidden_size)
        output, (hidden, _) = self.rnn(embedded)
        output = self.fc(output[:, -1, :])  # 取最后一个时间步的输出
        return output

In [4]:
###拆分数据集分为训练 测试 验证
import random
def train_test_split(x,split_rate=0.2):
    ###测试集和验证集按照1：1拆分 训练集和test集按照split_rate拆分
    split_size=int(len(x)*(1-split_rate))
    split_size2=int((int(len(x))-split_size)/2)
    split_size3=split_size+split_size2
    split_index=list(range(len(x)))
    random.shuffle(split_index)
    x_train=[x[i][0] for i in split_index[:split_size]]
    y_train=[x[i][1] for i in split_index[:split_size]]


    x_test=[x[i][0] for i in split_index[split_size:split_size3]]
    y_test=[x[i][1] for i in split_index[split_size:split_size3]]

    x_valid=[x[i][0] for i in split_index[split_size3:]]
    y_valid=[x[i][1] for i in split_index[split_size3:]]


    return (x_train,y_train),(x_test,y_test),(x_valid,y_valid)

In [5]:
from torch.utils.tensorboard import SummaryWriter
writer=SummaryWriter()

train_loss_cnt=0
val_loss_cnt=0
val_acc_cnt=0

BATCH_SIZE=100
EPOCHS=5
EMBEDING_SIZE=200
RNN_HIDDEN_SIZE=200
LEARN_RATE=1e-3
NUM_LABELS=2


2025-04-22 04:53:35.334049: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745297615.528475      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745297615.585659      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [7]:
import pickle
###还原评论数据
with open('/kaggle/input/sp-comment/douban_comments_sp.pkl','rb' ) as f:
    comments_data=pickle.load(f)

In [8]:
def buid_vocab_from_documents(doc):
    no_repeat_tokens=set()
    for cmt in doc:
        no_repeat_tokens.update(cmt[0])
    tokens=['<PAD>','<UNK>']+list(no_repeat_tokens)

    vocab={ tk:i for i,tk in enumerate(tokens)}
    return vocab

In [9]:
vocab=buid_vocab_from_documents(comments_data)

In [11]:
len(vocab)

14844

In [12]:
##数据拆分
(x_train,y_train),(x_test,y_test),(x_valid,y_valid)=train_test_split(comments_data)


In [13]:
print(len(x_train))
print(len(x_test))
print(len(x_valid))
print(x_train[0],y_train[0])
print(x_test[0],y_test[0])
print(x_valid[0],y_valid[0])

1040700
130087
130088
['▁', '由于', '对', '武', '隆', '印象', '颇', '深', ',', '后半段', '还以为', '擎天柱', '和', '飞', '船', '从', '香港', '炸', '到', '武', '隆', '去了', '。。。'] 0
['▁', '就觉得', '钢铁侠', '跟', '绿巨人', '的', '打斗', '最', '经典'] 0
['▁', '一步', '非常不错', '的', '国产动画片', '!'] 0


In [14]:
from torch.utils.data import DataLoader

train_ds=list(zip(x_train,y_train))
# 通过Dataset构建DataLoader
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, 
                        collate_fn=build_collate_fun(vocab))

valid_ds=list(zip(x_valid,y_valid))
# 通过Dataset构建DataLoader
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=True, 
                        collate_fn=build_collate_fun(vocab))
test_ds=list(zip(x_test,y_test))
# 通过Dataset构建DataLoader
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, 
                        collate_fn=build_collate_fun(vocab))


In [15]:
###模型构建

model = Comments_Classifier(len(vocab), EMBEDING_SIZE, RNN_HIDDEN_SIZE, NUM_LABELS)
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

In [16]:
from tqdm import tqdm

def train(model,train_dl,criterion,optimizer):
    global train_loss_cnt
    model.to(device)
    model.train()
    tpbar=tqdm(train_dl)
    for tokens,labels in tpbar:
        optimizer.zero_grad()
        tokens,labels=tokens.to(device),labels.to(device)
        loss=train_step(model,tokens,labels,criterion)
        loss.backward()
        optimizer.step()
        tpbar.set_description(f"epoch:{epoch+1} train_loss:{loss.item():.4f}")
        writer.add_scalar('train_loss',loss.item(),train_loss_cnt)
        train_loss_cnt+=1

def train_step(model,tokens,labels,criterion):
    logits=model(tokens)
    loss=criterion(logits,labels)
    return loss
    
def validate(model,val_dl,criterion):
    global val_loss_cnt,val_acc_cnt
    model.to(device)
    model.eval()
    tpbar=tqdm(val_dl)
    total_loss=0
    total_acc=0

    for tokens,labels in tpbar:
        tokens,labels=tokens.to(device),labels.to(device)
        loss,logits=validate_step(model,tokens,labels,criterion)
        tpbar.set_description(f"epoch:{epoch+1} val_loss:{loss.item():.4f}")

        total_loss+=loss.item()
        total_acc+=(logits.argmax(dim=1)==labels).float().mean()
    writer.add_scalar('val_avg_loss',total_loss/len(val_dl),val_loss_cnt)
    val_loss_cnt+=1
    ##计算准确率
    writer.add_scalar('val_acc',total_acc/len(val_dl),val_acc_cnt)
    val_acc_cnt +=1
         


def validate_step(model,tokens,labels,criterion):
    logits=model(tokens)
    loss=criterion(logits,labels)
    return loss,logits

In [17]:
##训练
for epoch in range(EPOCHS):
    train(model,train_dl,criterion,optimizer)
    validate(model,valid_dl,criterion)

epoch:1 train_loss:0.2422: 100%|██████████| 10407/10407 [02:42<00:00, 63.93it/s]
epoch:1 val_loss:0.2104: 100%|██████████| 1301/1301 [00:10<00:00, 129.91it/s]
epoch:2 train_loss:0.2317: 100%|██████████| 10407/10407 [02:43<00:00, 63.84it/s]
epoch:2 val_loss:0.2414: 100%|██████████| 1301/1301 [00:10<00:00, 129.83it/s]
epoch:3 train_loss:0.1629: 100%|██████████| 10407/10407 [02:41<00:00, 64.37it/s]
epoch:3 val_loss:0.2552: 100%|██████████| 1301/1301 [00:09<00:00, 130.88it/s]
epoch:4 train_loss:0.2076: 100%|██████████| 10407/10407 [02:42<00:00, 64.11it/s]
epoch:4 val_loss:0.1441: 100%|██████████| 1301/1301 [00:09<00:00, 133.51it/s]
epoch:5 train_loss:0.1005: 100%|██████████| 10407/10407 [02:42<00:00, 63.98it/s]
epoch:5 val_loss:0.1477: 100%|██████████| 1301/1301 [00:09<00:00, 130.24it/s]


In [18]:
##保存模型
torch.save({'model_state': model.state_dict(),
            'model_vocab':vocab},'model_jieba.bin')

In [19]:
# 创建模型实例
model_test = Comments_Classifier(len(vocab), EMBEDING_SIZE, RNN_HIDDEN_SIZE, NUM_LABELS)

# 加载保存的文件
checkpoint = torch.load('model_jieba.bin',weights_only=False)

# 加载模型的状态字典
model_test.load_state_dict(checkpoint['model_state'])

# 加载词汇表
vocab_test = checkpoint['model_vocab']

In [20]:
###测试模型
tpbar=tqdm(test_ds)
i=0
for tokens,labels in tpbar:
    model_test.eval()
    model_test.to(device)
    comment1_idx = torch.tensor([vocab_test.get(word, vocab_test['<UNK>']) for word in tokens])
    comment1_idx = comment1_idx.unsqueeze(0).to(device)  # 添加batch维度    
    predict=model_test(comment1_idx)
    if torch.argmax(predict, dim=1).item()==labels:
        i=i+1
print(f"测试集正确率：{i/len(test_ds):.4f}")

100%|██████████| 130087/130087 [02:47<00:00, 778.93it/s]

测试集正确率：0.9122



