In [None]:
import torch.nn as nn
import torch
import jieba
import csv
from tqdm import tqdm
import matplotlib.pyplot as plt
import sentencepiece as spm
import pickle

### 选取 Star 和 Comment 数据（仅保留 1 星和 5 星、长度在 120 到 130 个字符之间的评论）

In [None]:
# Build a binary sentiment dataset from the Douban review CSV.
# Only clearly polarised reviews are kept (1 star -> label 0, 5 stars -> label 1),
# and only comments whose character count lies in
# [MIN_COMMENT_LEN, MAX_COMMENT_LEN].
NEGATIVE_STAR, POSITIVE_STAR = 1, 5
MIN_COMMENT_LEN, MAX_COMMENT_LEN = 120, 130

comments , labels = [] , []
# newline="" is what the csv module asks for when opening a file for a reader,
# so that line breaks embedded in quoted fields reach the parser unchanged.
with open("./data/DouBan.csv", encoding = "utf-8", newline = "") as f:
    reader = csv.DictReader(f)
    for item in tqdm(reader):
        comment = item['Comment']
        star = int(item['Star'])
        # Skip every rating other than the two extremes.
        if star not in (NEGATIVE_STAR, POSITIVE_STAR):
            continue
        if MIN_COMMENT_LEN <= len(comment) <= MAX_COMMENT_LEN:
            comments.append(comment)
            # Only 1- and 5-star rows reach this point, so the label depends on
            # the 5-star check alone (a 4-star test here could never be true).
            labels.append(1 if star == POSITIVE_STAR else 0)
# Number of comments that survived the filter (shown as the cell output).
len(comments)

### 查看评论长度分布

In [None]:
# Character length of every selected comment.
comt_lens = [len(cmt) for cmt in comments]

# Use one bin per integer length. The comments were filtered to lengths
# 120..130, i.e. 11 distinct integer values; with a fixed bins=10 the two
# largest lengths fall into the same (closed) last bin and that bar is
# misleading. Integer-aligned edges avoid this.
if comt_lens:
    length_bins = list(range(min(comt_lens), max(comt_lens) + 2))
else:
    # Nothing to align the edges to; fall back to a plain bin count.
    length_bins = 10

fig, ax = plt.subplots()
ax.hist(comt_lens , bins = length_bins)
ax.set(
    title = "Comment length distribution" ,
    xlabel = "Comment length (characters)" ,
    ylabel = "Number of comments" ,
)
plt.show()

### 使用结巴分词

In [None]:

# Replace sentence punctuation with spaces before tokenizing.
# Besides the full-width period and the ASCII comma, the full-width comma
# ('，') is mapped too: it is the comma form Chinese text normally uses.
# NOTE: `comments` is rebound here, and the SentencePiece cells further down
# read this cleaned version.
_PUNCT_TO_SPACE = str.maketrans({'。': ' ', ',': ' ', '，': ' '})
comments = [cmt.translate(_PUNCT_TO_SPACE) for cmt in comments]
# jieba.cut yields the tokens of one comment; keep them as a single
# space-joined string per comment.
comments_jieba =  [' '.join(jieba.cut(cmt)) for cmt in comments]

In [None]:
# Display one jieba-tokenized comment (index 5) as a quick sanity check.
comments_jieba[5]

### 使用 sentencepiece 分词

In [None]:
# Dump every comment, one per line, into a plain-text corpus file that is
# then passed to the SentencePiece trainer as its input.
with open("./data/train_corpus.txt" , "w" , encoding = 'utf-8') as corpus_file:
    corpus_file.writelines(cmt + '\n' for cmt in comments)

# Train the SentencePiece tokenizer on that corpus.
trainer_options = dict(
    input = './data/train_corpus.txt' ,
    model_prefix = 'spm_model' ,
    vocab_size = 5000 ,
)
spm.SentencePieceTrainer.Train(**trainer_options)

# Processor instance; the trained model is loaded into it in a later cell.
sp = spm.SentencePieceProcessor()


In [None]:
# Load the trained SentencePiece model into the processor.
sp.Load('spm_model.model')

# EncodeAsPieces splits one comment into a list of pieces; each list is
# joined with spaces so every comment is stored as a single string.
comments_spm = [' '.join(sp.EncodeAsPieces(comt)) for comt in comments]

# Show the first ten encoded comments.
comments_spm[:10]

### 分别存储jieba分词的文件 和 spm分词的文件

In [None]:
# Persist both tokenized variants. Each file is opened in binary mode and
# receives one pickled tuple: (tokenized comments, labels).
for out_path, tokenized in (
    ('./data/comments_jieba.bin' , comments_jieba) ,
    ('./data/comments_spm.bin' , comments_spm) ,
):
    with open(out_path , 'wb') as f:
        pickle.dump((tokenized , labels) , f)