In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stopwords/stopwords.txt
/kaggle/input/doubanmovieshortcomments/DMSC.csv


#### 3. 尝试不同分词工具进行文本分词，观察模型训练结果。

In [14]:
import csv
import sentencepiece as spm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

In [3]:
# 将comment写入txt文件
with open('/kaggle/input/doubanmovieshortcomments/DMSC.csv') as f, open('/kaggle/working/comments.txt', 'w', encoding='utf-8') as o:
    reader = csv.DictReader(f)
    for row in reader:
        comment = row['Comment'].strip()
        star = row['Star'].strip()
        if (not comment) or (not star):continue
        star = int(star)
        if star <= 2 or star >= 4:
            o.write(f'{comment}\n')

In [4]:
# 使用sentencepiece进行分词
spm.SentencePieceTrainer.Train(input='/kaggle/working/comments.txt', model_prefix='spm_mod', vocab_size=50000)
print('spm训练完成')

spm训练完成


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: /kaggle/working/comments.txt
  input_format: 
  model_prefix: spm_mod
  model_type: UNIGRAM
  vocab_size: 50000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_pr

In [9]:
# 加载spm分词模型
sp = spm.SentencePieceProcessor(model_file='spm_mod.model')
# 去掉分词中的▁
def spm_cut(comment):
    words = sp.EncodeAsPieces(comment)
    return [word.replace('▁', '') for word in words if word != '▁']

In [10]:
# 加载停用词
stopwords = [line.strip() for line in open('/kaggle/input/stopwords/stopwords.txt', 'r', encoding='utf-8')] + ['PAD']
# 进行分词
comments_list = []
with open('/kaggle/input/doubanmovieshortcomments/DMSC.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        comment = row['Comment'].strip()
        star = row['Star'].strip()
        if (not comment) or (not star):continue
        star = int(star)
        if star <= 2:
            # 使用sentencepiece进行分词
            words = spm_cut(comment)
            comments_list.append(([item for item in words if item not in stopwords], 0))
        elif star >= 4:
            words = spm_cut(comment)
            comments_list.append(([item for item in words if item not in stopwords], 1))
        
print(len(comments_list)) # 1650495

1650495


In [12]:
# 取分词长度5-100的评论
comments_list = [c for c in comments_list if len(c[0]) in range(5, 100)]
print(len(comments_list)) # 1174637
# 构建词典
vocab = {}
word_set = set()
for comment, _ in comments_list:
    word_set.update(comment)
word_list = ['PAD', 'UNK'] + list(word_set)
vocab = {word: i for i, word in enumerate(word_list)}
print(len(vocab)) # 52445

1174637
52445


In [28]:
def collate_fn(batch_data):
    comments, labels = [], []
    for comment, label in batch_data:
        # 转为分词的索引
        comments.append(torch.tensor([vocab.get(word, vocab['UNK']) for word in comment]))
        labels.append(label)
    comments = nn.utils.rnn.pad_sequence(comments, batch_first=True, padding_value=vocab['PAD'])
    labels = torch.tensor(labels)
    return comments, labels
# 加载训练数据：使用自定义函数对齐批次的seq长度
train_list, test_list = train_test_split(comments_list, test_size=0.2, random_state=42)
train_dataLoader = DataLoader(train_list, batch_size=128, shuffle=True, collate_fn=collate_fn)
test_dataLoader = DataLoader(test_list, batch_size=128, collate_fn=collate_fn)

# 定义模型
class Comment_RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    def forward(self, input_index):
        embed = self.embedding(input_index)
        out, (h, _) = self.rnn(embed)
        out = self.fc(out[:, -1, :])
        return out

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(vocab)
embedding_dim = 100
hidden_size = 128
num_layers = 2
num_classes = 2
model = Comment_RNN(vocab_size, embedding_dim, hidden_size, num_layers, num_classes)
model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [29]:
# 训练
writer = SummaryWriter(log_dir='/kaggle/working/runs/spm')
model.train()
for epoch in range(10):
    for i, (comments, labels) in enumerate(train_dataLoader):
        comments = comments.to(device)
        labels = labels.to(device)
        # 前向传播
        outputs = model(comments)
        loss = loss_fn(outputs, labels)
        # 反向传播和优化
        model.zero_grad()
        loss.backward()
        optimizer.step()
        writer.add_scalar('train loss', loss.item(), len(train_dataLoader) * epoch + i)
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}], Step [{i+1}], Loss: {loss.item()}')
writer.close()

Epoch [1], Step [100], Loss: 0.5321649312973022
Epoch [1], Step [200], Loss: 0.5257980227470398
Epoch [1], Step [300], Loss: 0.5064579844474792
Epoch [1], Step [400], Loss: 0.47292113304138184
Epoch [1], Step [500], Loss: 0.5127525329589844
Epoch [1], Step [600], Loss: 0.5176987648010254
Epoch [1], Step [700], Loss: 0.43587103486061096
Epoch [1], Step [800], Loss: 0.5066991448402405
Epoch [1], Step [900], Loss: 0.510401725769043
Epoch [1], Step [1000], Loss: 0.5038600564002991
Epoch [1], Step [1100], Loss: 0.5166669487953186
Epoch [1], Step [1200], Loss: 0.4800718426704407
Epoch [1], Step [1300], Loss: 0.5536713004112244
Epoch [1], Step [1400], Loss: 0.5447406768798828
Epoch [1], Step [1500], Loss: 0.5261307954788208
Epoch [1], Step [1600], Loss: 0.5548434853553772
Epoch [1], Step [1700], Loss: 0.5279650688171387
Epoch [1], Step [1800], Loss: 0.47158461809158325
Epoch [1], Step [1900], Loss: 0.545153796672821
Epoch [1], Step [2000], Loss: 0.5749394297599792
Epoch [1], Step [2100], Loss

In [30]:
# 测试
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for comments, labels in test_dataLoader:
        comments = comments.to(device)
        labels = labels.to(device)
        outputs = model(comments)
        pred = torch.argmax(outputs, dim=1)
        total += len(labels)
        correct += (pred == labels).sum().item()
    print('准确率：', correct * 100 / total)

准确率： 90.24211673363754


In [32]:
stopwords = [line.strip() for line in open('/kaggle/input/stopwords/stopwords.txt', 'r', encoding='utf-8')] + ['PAD']
def comment_to_index(comment):
    # 分词
    words = spm_cut(comment)
    words = [item for item in words if item not in stopwords]
    # 转为索引
    indices = [vocab.get(word, vocab['UNK']) for word in words]
    return torch.tensor(indices).unsqueeze(0)

# 预测
comment1 = comment_to_index('电影很好看，情节引人入胜，全员演技在线，强烈推荐！').to(device)
comment2 = comment_to_index('这个电影太烂了，不值得一看！').to(device)
pred1 = model(comment1)
pred2 = model(comment2)
pred1 = torch.argmax(pred1, dim=1).item()
pred2 = torch.argmax(pred2, dim=1).item()
print('评论1：', pred1)
print('评论2：', pred2)

评论1： 1
评论2： 0
