# TextCNN

* 本案例目标：二分类，判定是保险行业句子为1，非保险的为0

* 基本原理：TextCNN是将卷积神经网络CNN应用到文本分类任务，利用多个不同size的kernel来提取句子中的关键信息（类似于多窗口大小的n-gram), 从而能够更好的捕捉局部相关性

In [1]:
import pandas as pd
import jieba
import numpy as np
from collections import defaultdict

import torch
from torch import nn

### 数据预处理

In [2]:
def tokenize(string):
    res = list(jieba.cut(string, cut_all=False))
    # res = list(string)
    return res

In [3]:
# 划分训练集和测试集
def split_data(df, split=0.7):
    df = df.sample(frac=1)
    length = len(df)
    train_data = df[0:length - 5000]
    eval_data = df[length - 5000:]

    return train_data, eval_data

In [4]:
# 统一长度
def padding_seq(X, max_len=10):
    return np.array([
        np.concatenate([x, [0] * (max_len - len(x))]) if len(x) < max_len else x[:max_len] for x in X
    ])

In [5]:
# 把数据转换成index
def seq2index(seq):
    seg = tokenize(seq)
    seg_index = []
    for s in seg:
        seg_index.append(vocab.get(s, 1))
    return seg_index

#### 构建词典

In [6]:
# 构建词典
def build_vocab(del_word_frequency):
    data = pd.read_csv('./data/classification.csv')
    segment = data['sentence'].apply(tokenize)

    word_frequency = defaultdict(int)
    for row in segment:
        for i in row:
            word_frequency[i] += 1

    word_sort = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True)  # 根据词频降序排序

    f = open('./data/vocab.txt', 'w', encoding='utf-8')
    f.write('[PAD]' + "\n" + '[UNK]' + "\n")
    for d in word_sort:
        if d[1] > del_word_frequency:
            f.write(d[0] + "\n")
    f.close()

In [7]:
build_vocab(del_word_frequency=3)

Building prefix dict from /Users/wangquanjun/miniforge3/envs/py39/lib/python3.9/site-packages/jieba/dict.txt ...
Loading model from cache /var/folders/0t/5qc9q0vn381cwx3337k5ybdh0000gn/T/jieba.cache
Loading model cost 0.4187600612640381 seconds.
Prefix dict has been built succesfully.


#### 读取词典

In [8]:
vocab = {}
import os
if os.path.exists('./data/vocab.txt'):
    with open('./data/vocab.txt', encoding='utf-8')as file:
        for line in file.readlines():
            vocab[line.strip()] = len(vocab)

In [9]:
len(vocab)

638

### 定义模型

In [30]:
class TextCNN(nn.Module):
    
    def __init__(self, vocab_size, embedding_size=100, max_len=10, dropout=0.2):
        super(TextCNN, self).__init__()
        
        # 嵌入层
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        # 卷积层
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(2, embedding_size))
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(3, embedding_size))
        self.conv3 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(4, embedding_size))
        
        # 池化层
        self.max_pool1 = nn.MaxPool1d(kernel_size=max_len-2+1) # 9
        self.max_pool2 = nn.MaxPool1d(kernel_size=max_len-3+1) # 8
        self.max_pool3 = nn.MaxPool1d(kernel_size=max_len-4+1) # 7
        
        # 全连接层
        self.dense = nn.Linear(6, 1)
        
        # 正则化
        self.drop_out = nn.Dropout(dropout)
    
    
    def forward(self, x):
        # [batch_size, seq_len]
        embedding = self.embedding(x)
        # [batch_size, seq_len, embedding_size]
        
        
        # [batch_size, seq_len, embedding_size] ->增加一个channel->  [batch_size, 1, seq_len, embedding_size]
        embedding = embedding.unsqueeze(dim=1)
        
        
        conv1_out = self.conv1(embedding).squeeze(-1)
        conv2_out = self.conv2(embedding).squeeze(-1)
        conv3_out = self.conv3(embedding).squeeze(-1)
        
        #[batch_size, seq_len, seq_len-1]
        out1 = self.max_pool1(conv1_out)
        out2 = self.max_pool2(conv2_out)
        out3 = self.max_pool3(conv3_out)
        
        out = torch.cat([out1, out2, out3], dim=1).squeeze(-1)
        
        out = self.drop_out(out)
        out = self.dense(out)
        
        out = torch.sigmoid(out).squeeze(-1)
        
        return out   

### 模型参数分析

#### 一：嵌入层
* 输入：$ \rm x = [batch\_size, 10] = [batch\_size, seq\_len=10]$

```python
self.embedding = nn.Embedding(vocab_size, embedding_size)
embedding = self.embedding(x)
```

* 输出：$ \rm [batch\_size, seq\_len, embedding\_size]=[batch\_size, 10, 100]$

        

```python
embedding = embedding.unsqueeze(dim=1) # 升维，原因：因为卷积层的输入,需要一个通道维度
```

* 输出：$ \rm [batch\_size, 1, 10, 100] = [batch\_size, channel\_size, seq\_len, embedding\_size]$

#### 二：卷积层

* [文档-卷积层](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d)

* 输入：$ \rm [batch\_size, 1, 10, 100] = [batch\_size, channel\_size,  height_{in}, width_{in}]$

```python
self.conv1 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(2, embedding_size))
self.conv2 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(3, embedding_size))
self.conv3 = nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(4, embedding_size))
conv1_out = self.conv1(embedding)
conv2_out = self.conv2(embedding)
conv3_out = self.conv3(embedding)
```

* 求：$\rm height_{out}$
$$
\begin{aligned}
&\rm
height_{out} = \Big[  \frac{height_{in} + 2*padding[0] -dilation[0]*(kernel\_size[0]-1) -1 }{stride[0]}   +1  \Big]\\
&\rm
height_{out} = \Big[  \frac{height_{in}  -(kernel\_size[0]-1) -1 }{1}   +1  \Big]\\
&\rm
height_{out} = \Big[  \frac{10  - (2-1) -1 }{1}   +1 \Big] = 9 \\
&\rm
height_{out} = \Big[  \frac{10  - (3-1) -1 }{1}   +1 \Big] = 8 \\
&\rm
height_{out} = \Big[  \frac{10  - (4-1) -1 }{1}   +1 \Big] = 7 \\
\end{aligned}
$$

* 求：$\rm width_{out}$
$$
\begin{aligned}
&\rm
width_{out} = \Big[  \frac{width_{in} + 2*padding[1] -dilation[1]*(kernel\_size[1]-1) -1 }{stride[1]}   +1  \Big]\\
&\rm
width_{out} = \Big[  \frac{width_{in} -(kernel\_size[1]-1) -1 }{stride[1]}   +1  \Big]\\
&\rm
width_{out} = \Big[  \frac{100  - (100-1) -1 }{1}   +1 \Big] = 1
\end{aligned}
$$



* 输出：$ \rm [batch\_size, 2, 9, 1]$
* 输出：$ \rm [batch\_size, 2, 8, 1]$
* 输出：$ \rm [batch\_size, 2, 7, 1]$




#### 三：池化层

* [最大池化文档-MaxPool1d api](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool1d.html#torch.nn.MaxPool1d)

```python
conv1_out = conv1_out.squeeze(-1) # 降维，因为池化层MaxPool1d的输入是[N, C, L_out] = [batch_size, channel_size, kernel_zie]
conv2_out = conv2_out.squeeze(-1)
conv3_out = conv3_out.squeeze(-1)
```

* 输入：$ \rm [batch\_size, 2, 9=L_{in}=kernel\_size]$ 
* 输入：$ \rm [batch\_size, 2, 8]$
* 输入：$ \rm [batch\_size, 2, 7]$

```python
self.max_pool1 = nn.MaxPool1d(kernel_size=max_len-2+1) # 9
self.max_pool2 = nn.MaxPool1d(kernel_size=max_len-3+1) # 8
self.max_pool3 = nn.MaxPool1d(kernel_size=max_len-4+1) # 7
out1 = self.max_pool1(conv1_out)
out2 = self.max_pool2(conv2_out)
out3 = self.max_pool3(conv3_out)
```
* 求$\rm L_{out}$

$$
\begin{aligned}
&\rm
L_{out} = \Big[  \frac{L_{in} + 2*padding -dilation*(kernel\_size-1) -1 }{stride}   +1  \Big]\\
&\rm
L_{out} = \Big[  \frac{kernel\_size  -(kernel\_size-1) -1 }{kernel\_size}   +1  \Big]\\
&\rm
L_{out} = \Big[  \frac{0}{kernel\_size}   +1  \Big] =1\\
\end{aligned}
$$

* 输出：$\rm [batch\_size, 2, 1] = [batch\_size, channel, L_{out}]$
* 输出：$\rm [batch\_size, 2, 1]$
* 输出：$\rm [batch\_size, 2, 1]$


* 变成定长

```python
out = torch.cat([out1, out2, out3], dim=1).squeeze(-1)
```
* 输出：$\rm [bach\_size, 6]=[bach\_size, 6, 1]=[bach\_size, 2+2+2, 1]$ 。  并在-1维度降维

池化层功能:
* 不改变输入channel
* 不同长度的句子池化后，变成定长
* max pool: 保留了差异？

#### 四：全链接层

* [文档nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear)

```python    
self.drop_out = nn.Dropout(dropout) # 正则化
out = self.drop_out(out)
```

* 输入：$\rm [batch\_size, 6]$
```python
self.dense = nn.Linear(6, 1) # 全链接:torch.nn.Linear(in_features, out_features)
out = self.dense(out)
```
* 输出：$\rm [batch\_size, 1]$

```python
out = torch.sigmoid(out) # 输出每个句子的类别概率分布 sigmoid不改变输入数据的形状
```
* 输出：$\rm [batch\_size, 1]$  

* 降维，在维度-1进行降维，为啥降维？ 好看，32个句子的类别
```python
out = out.squeeze(-1)
```
* 输出:$\rm [batch\_size]$
* 输出:32个句子的类别


[二元交叉熵损失](https://pytorch.org/docs/stable/generated/torch.nn.BCELoss.html#torch.nn.BCELoss)

### 测试下计算图是否存在错误

In [11]:
TextCNN(2000)

TextCNN(
  (embedding): Embedding(2000, 100)
  (conv1): Conv2d(1, 2, kernel_size=(2, 100), stride=(1, 1))
  (conv2): Conv2d(1, 2, kernel_size=(3, 100), stride=(1, 1))
  (conv3): Conv2d(1, 2, kernel_size=(4, 100), stride=(1, 1))
  (max_pool1): MaxPool1d(kernel_size=9, stride=9, padding=0, dilation=1, ceil_mode=False)
  (max_pool2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (max_pool3): MaxPool1d(kernel_size=7, stride=7, padding=0, dilation=1, ceil_mode=False)
  (dense): Linear(in_features=6, out_features=1, bias=True)
  (drop_out): Dropout(p=0.2, inplace=False)
)

### 加载训练数据

In [12]:
from torch.utils.data import DataLoader, TensorDataset

In [13]:
df_temp = pd.read_csv('./data/classification.csv')

In [14]:
df_temp.head()

Unnamed: 0,sentence,label
0,最近在安邦长青树中看到什么豁免，这个是什么意思？,0
1,HUTS中有没有适合帆船比赛的保险，我男朋友这周就要开始了,0
2,计划端午节和男朋友自驾去九*山，买保险三天要多少钱？,0
3,端午我们准备要举行赛龙舟，说是要份保险，什么好,0
4,老婆买了安*长*树，她在网上投保的，以后缴费怎么办,0


In [17]:
def load_data(batch_size=32):
    df = pd.read_csv('./data/classification.csv')
    train_df, eval_df = split_data(df)
    train_x = df['sentence']
    train_y = df['label']
    
    eval_x = eval_df['sentence']
    eval_y = eval_df['label']
    
    train_x = padding_seq(train_x.apply(seq2index))
    train_y = np.array(train_y)
    
    train_data_set = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    train_data_loader = DataLoader(dataset=train_data_set, batch_size=batch_size, shuffle=True)
    
    eval_x = padding_seq(eval_x.apply(seq2index))
    return train_data_loader, eval_x, eval_y.values

### 训练模型

In [18]:
# 训练
def train():
    
    # 定义模型
    model = TextCNN(vocab_size=638, embedding_size=100, max_len=10)
    
    train_data_loader, eval_x , eval_y = load_data(batch_size=512)
    
    eval_x = torch.from_numpy(eval_x)
    
    if torch.cuda.is_available():
        model = model.cuda()
        eval_x = eval_x.cuda().long()
        
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_func = nn.BCELoss()
    
    best_acc = 0
    
    for epoch in range(10):
        for step, (b_x, b_y) in enumerate(train_data_loader):
            if torch.cuda.is_available():
                b_x = b_x.cuda().long()
                b_y = b_y.cuda().long()
            
            output = model(b_x)
            loss = loss_func(output, b_y.float())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if step % 20 == 0:
                test_output = model(eval_x)
                pred_y = (test_output.cpu().data.numpy() > 0.5).astype(int)
                accuracy = float((pred_y == eval_y).astype(int).sum()) / float(eval_y.size)
                if accuracy > best_acc:
                    best_acc = accuracy
                    torch.save(model, './data/text_cnn.model')
                    print('save model, accuracy: %.3f' % accuracy)
                print('Epoch: ', epoch, '| train loss: %.4f' % loss.cpu().data.numpy(),
                      '| test accuracy: %.3f' % accuracy)
                
    #torch.save(model, './data/text_cnn.model')

In [19]:
train()

save model, accuracy: 0.745
Epoch:  0 | train loss: 0.7248 | test accuracy: 0.745
save model, accuracy: 0.872
Epoch:  1 | train loss: 0.4232 | test accuracy: 0.872
save model, accuracy: 0.911
Epoch:  2 | train loss: 0.2850 | test accuracy: 0.911
save model, accuracy: 0.938
Epoch:  3 | train loss: 0.1895 | test accuracy: 0.938
save model, accuracy: 0.959
Epoch:  4 | train loss: 0.1576 | test accuracy: 0.959
save model, accuracy: 0.967
Epoch:  5 | train loss: 0.1013 | test accuracy: 0.967
save model, accuracy: 0.974
Epoch:  6 | train loss: 0.0840 | test accuracy: 0.974
save model, accuracy: 0.983
Epoch:  7 | train loss: 0.0650 | test accuracy: 0.983
save model, accuracy: 0.985
Epoch:  8 | train loss: 0.0407 | test accuracy: 0.985
save model, accuracy: 0.987
Epoch:  9 | train loss: 0.0382 | test accuracy: 0.987


### 读取模型

In [20]:
import torch

In [21]:
# 定义模型
model = TextCNN(vocab_size=638, embedding_size=100, max_len=10)
model = torch.load('./data/text_cnn.model')
# 这个model就是一堆的权重

In [22]:
model.eval()

TextCNN(
  (embedding): Embedding(638, 100)
  (conv1): Conv2d(1, 2, kernel_size=(2, 100), stride=(1, 1))
  (conv2): Conv2d(1, 2, kernel_size=(3, 100), stride=(1, 1))
  (conv3): Conv2d(1, 2, kernel_size=(4, 100), stride=(1, 1))
  (max_pool1): MaxPool1d(kernel_size=9, stride=9, padding=0, dilation=1, ceil_mode=False)
  (max_pool2): MaxPool1d(kernel_size=8, stride=8, padding=0, dilation=1, ceil_mode=False)
  (max_pool3): MaxPool1d(kernel_size=7, stride=7, padding=0, dilation=1, ceil_mode=False)
  (dense): Linear(in_features=6, out_features=1, bias=True)
  (drop_out): Dropout(p=0.2, inplace=False)
)

In [23]:
def classification_predict(s):
    s = seq2index(s)
    #s = torch.from_numpy(padding_seq([s])).cuda().long()
    s = torch.from_numpy(padding_seq([s]))
    out = model(s)
    return out.cpu().data.numpy()

In [24]:
sen = '我是程序员'
classification_predict(sen)

array([0.99999213], dtype=float32)

In [25]:
sen = '为啥只有电子保单'
classification_predict(sen)

array([0.20954591], dtype=float32)

In [26]:
sen = '为啥只有保险'
classification_predict(sen)

array([0.00257886], dtype=float32)

In [27]:
sen = "最近在安邦长青树中看到什么豁免，这个是什么意思？"
classification_predict(sen)

array([0.00269974], dtype=float32)

In [28]:
sen = "你好"
classification_predict(sen)

array([0.99997616], dtype=float32)

In [29]:
sen = "如何购买保险"
classification_predict(sen)

array([1.9318053e-05], dtype=float32)