In [1]:
import pandas as pd
import jieba
import re
import time

### 数据预处理

In [2]:
# 加载停用词
def load_stop_words():
    stop_words = set([])
    fname = './stopwords.txt'
    with open(fname, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            stop_words.add(line)
    return stop_words   

In [3]:
stop_words = load_stop_words()
#stop_words

In [4]:
# 分词
def tokenizer(text):
    return [word for word in jieba.cut(text) if word not in stop_words]

In [5]:
tokenizer('我们都是删除停用词接班人')

Building prefix dict from /Users/wangquanjun/miniforge3/envs/py39/lib/python3.9/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/0t/5qc9q0vn381cwx3337k5ybdh0000gn/T/jieba.cache
Loading model cost 0.4650759696960449 seconds.
Prefix dict has been built succesfully.


['删除', '停', '用词', '接班人']

In [6]:
# 加载训练集验证集
traindata = pd.read_csv('./train.tsv', sep='\t')
validata = pd.read_csv('./validation.tsv', sep='\t')

In [7]:
traindata.head()

Unnamed: 0,index,label,text
0,0,0,备胎是硬伤！
1,1,0,要说不满意的话，那就是动力了，1.5自然吸气发动机对这款车有种小马拉大车的感觉。如今天气这么...
2,2,0,油耗显示13升还多一点，希望慢慢下降。没有倒车雷达真可恨
3,3,0,空调不太凉，应该是小问题。
4,4,0,1、后排座椅不能平放；2、科技感不强，还不如百万帝豪，最希望增加车联网的车机。像你好博越一样...


In [8]:
validata.head()

Unnamed: 0,index,label,text
0,0,1,外观确实非常霸气，钻石切工有棱有角，3.0的动力在城市里绰绰有余，内饰考究，空间比较大，bo...
1,1,1,外观漂亮，安全性佳，动力够强，油耗够低
2,2,1,后备箱大！！！
3,3,1,空间大。外观大气，中控台用料讲究简洁
4,4,1,外观漂亮，空间够大，动力家用也ok


In [9]:
# 正负样本的比例
traindata['label'].value_counts()

0    28425
1    28275
Name: label, dtype: int64

### 训练词向量

In [10]:
totaldata = pd.concat([traindata, validata])

In [11]:
text_cut = []
for _, row in totaldata.iterrows():
    sentence = row['text']
    seg = tokenizer(sentence)
    text_cut.append(seg)

In [12]:
# 构造标准格式：空格分割
text_concat = []
for seg in text_cut:
    seg_concat = [' '.join(seg)]
    text_concat.append(seg_concat)

In [13]:
text_concat[0: 5]

[['备胎 硬伤'],
 ['要说 满意 动力 1.5 自然 吸气 发动机 这款 车 有种 小马拉 大车 感觉 天气 热 上路 肯定 得开 空调 开 动力 感觉 不给力 空调 制冷 效果 不错'],
 ['油耗 显示 13 升 多一点 希望 慢慢 下降 倒车 雷达 真 可恨'],
 ['空调 不太凉'],
 ['1 后排 座椅 平放 2 科技 感不强 百万 帝豪 希望 增加 车 联网 车机 你好 博越 3 全景 摄像头 晚上 用处']]

In [14]:
corpus = pd.DataFrame(data=text_concat)

In [15]:
corpus.to_csv('corpus.tsv', header=0, index=0)

In [16]:
from gensim.models import word2vec

In [17]:
sentences = word2vec.LineSentence('corpus.tsv')
model = word2vec.Word2Vec(sentences, min_count=5)

In [18]:
# 保存模型

In [19]:
# model.save("comment.model")
# new_model = word2vec.Word2Vec.load('comment.model')

model.wv.save_word2vec_format('vector.vector', binary=False)
#new_model = gensim.models.KeyedVectors.load_word2vec_format('myvector.vector', binary=False)

In [20]:
pairs = [
    ('备胎', '轮胎'),  
    ('座椅', '后排'),   
    ('凉', '空调'), 
    ('真皮', '内饰'), 
    ('内饰', '轴距'), 
]
for w1, w2 in pairs:
    print('%r\t%r\t%.2f' % (w1, w2,model.wv.similarity(w1, w2)))

'备胎'	'轮胎'	0.56
'座椅'	'后排'	0.58
'凉'	'空调'	0.66
'真皮'	'内饰'	0.52
'内饰'	'轴距'	0.11


### 模型定义

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [22]:
class BiRNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, num_hiddens, num_layers, vectors):
        super(BiRNN, self).__init__()
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings = self.word_embeddings.from_pretrained(vectors, freeze=False)
        
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               batch_first=True,
                               bidirectional=True)
        
        self.decoder = nn.Linear(2*num_hiddens, 2)
        
    def forward(self, inputs):
        
        embeddings = self.word_embeddings(inputs)
        
        outputs, _ = self.encoder(embeddings)
        
        outs = self.decoder(outputs[:,-1,:])
        return outs  

In [23]:
from torchtext import data

In [24]:
# 输入数据构建

# 1. 定义一个序列字段，文本
text = data.Field(sequential=True, lower=True, tokenize=tokenizer, stop_words=stop_words)

# 2.定义一个数值字段，标签
label = data.Field(sequential=False)

train, val = data.TabularDataset.splits(
    path = './',
    skip_header=True,
    train='train.tsv',
    validation='validation.tsv',
    format='tsv',
    fields=[('index', None), ('label', label), ('text', text)],
)

In [25]:
#train[2].text

In [26]:
#train[5].__dict__.keys()

In [27]:
# import gensim
# 读入预训练的Embedding:word2vec
# model = gensim.models.KeyedVectors.load_word2vec_format('vector.vector', binary=False)

In [28]:
from torchtext.vocab import Vectors

## 构建训练文本和标签的词向量
vectors = Vectors(name='vector.vector')

text.build_vocab(train, val, vectors=vectors) # 加入测试集的vertor
label.build_vocab(train, val)

In [29]:
#text.vocab.freqs.most_common(10)

In [30]:
vectors = text.vocab.vectors

In [31]:
vectors.shape

torch.Size([34805, 100])

In [32]:
batch_size=128
train_iter, val_iter = data.Iterator.splits(
            (train, val),
            sort_key=lambda x: len(x.text), # 使用文本长度对数据进行排序
            batch_sizes=(batch_size, len(val)), # 训练集设置batch_size,验证集整个集合用于测试
    )

### 实例化模型

In [35]:
# batch_size=128
vocab_size = len(text.vocab)
embedding_dim = 100
num_hiddens = 64
num_layers = 1
vectors = text.vocab.vectors
net = BiRNN(vocab_size, embedding_dim, num_hiddens, num_layers, vectors)

In [36]:
net

BiRNN(
  (word_embeddings): Embedding(34805, 100)
  (encoder): LSTM(100, 64, batch_first=True, bidirectional=True)
  (decoder): Linear(in_features=128, out_features=2, bias=True)
)

### 训练模型

In [37]:
def evaluate_accuracy(data_iter, net):
    acc_sum = 0.0
    n = 0
    
    with torch.no_grad():
        for _, batch in enumerate(data_iter):
            X, y = batch.text, batch.label
            
            X = X.permute(1, 0)
            y.data.sub_(1)
            
            net.eval() # 评估模式, 这会关闭dropout
            acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
            net.train() # 改回训练模式
            
            n += y.shape[0]
    return acc_sum / n

In [38]:
y_hat = None
y = None
def train(train_iter, test_iter, net, loss, optimizer, num_epochs):
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for batch_idx, batch in enumerate(train_iter):
            X, y = batch.text, batch.label
            
            X = X.permute(1, 0)
            y.data.sub_(1)  #X转置
            
            # sequence_length 不定长，可以提前pad，也也可以在dataloader的coll_fn动态以batch最长进行pad
            
            y_hat = net(X)
            
            # print(y_hat, y_hat.shape)         
            # print(y, y.shape)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print(
            'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
            % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,
               test_acc, time.time() - start))

In [39]:
lr, num_epochs = 0.01, 10
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, val_iter, net, loss, optimizer, num_epochs)

epoch 1, loss 0.3927, train acc 0.841, test acc 0.898, time 17.6 sec
epoch 2, loss 0.0995, train acc 0.929, test acc 0.911, time 18.7 sec
epoch 3, loss 0.0449, train acc 0.953, test acc 0.914, time 18.8 sec
epoch 4, loss 0.0246, train acc 0.965, test acc 0.913, time 18.6 sec
epoch 5, loss 0.0158, train acc 0.972, test acc 0.910, time 18.7 sec
epoch 6, loss 0.0115, train acc 0.975, test acc 0.912, time 19.1 sec
epoch 7, loss 0.0091, train acc 0.977, test acc 0.915, time 18.8 sec
epoch 8, loss 0.0075, train acc 0.978, test acc 0.914, time 19.1 sec
epoch 9, loss 0.0064, train acc 0.979, test acc 0.914, time 19.1 sec
epoch 10, loss 0.0054, train acc 0.980, test acc 0.911, time 18.9 sec


### 保存模型

In [41]:
torch.save(net, './lstm_cls.model')

### 读取模型

In [42]:
# batch_size=128
vocab_size = len(text.vocab)
embedding_dim = 100
num_hiddens = 64
num_layers = 1
vectors = text.vocab.vectors
model_cls = BiRNN(vocab_size, embedding_dim, num_hiddens, num_layers, vectors)

In [43]:
model_cls = torch.load('./lstm_cls.model')

In [44]:
def predict(sentence, model, text):
    with torch.no_grad():
        model.eval()
        words = tokenizer(sentence)
        words_vec = [text.vocab.stoi[word] for word in words]
        x = torch.tensor([words_vec])
        y_hat = model(x)
        
        # print(y_hat)
        # label = y_hat.argmax(dim=1)
        # print(label)
        # print(label.numpy())
        print(y_hat.numpy())
        label = y_hat.argmax(dim=1).numpy()[0]
        
        return label  

In [45]:
sentence = '要说不满意的话，那就是动力了，1.5自然吸气发动机对这款车有种小马拉大车的感觉。如今天气这么热，上路肯定得开空调，开了后动力明显感觉有些不给力不过空调制冷效果还是不错的。'
sentence = '要说不满意的话，那就是动力了，1.5自然吸气发动机对这款车有种小马拉大车的感觉。如今天气这么热，上路肯定得开空调，开了后动力明显感觉有些不给力不过空调制冷效果还是不错的。天安门'
#print(list(jieba.cut(sentence)))

label = predict(sentence, model_cls, text)
print(label)

[[ 3.6799343 -3.8598576]]
0


In [46]:
sentence = '外观确实非常霸气，钻石切工有棱有角，3.0的动力在城市里绰绰有余，内饰考究，空间比较大，bose的音响非常给力，小众品牌不像德系三架马车那样成为街车，为个性代言。'
label = predict(sentence, net, text)
print(label)

[[-3.7052402  3.7136812]]
1
