# 编码分类器模型

## 数据处理

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from flair.embeddings import TransformerWordEmbeddings,StackedEmbeddings,WordEmbeddings,BytePairEmbeddings
from flair.data import Sentence
from tqdm import tqdm
import torch
from flair.data import Sentence
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import datetime
# Seed PyTorch's global random number generator so runs start from the same random state.
torch.manual_seed(1)

In [3]:
# Load the dataset from the Excel file, using its first column as the index.
data = pd.read_excel('output.xlsx',index_col=0)

In [4]:
# Model input: the values of the '搜索项' column (later wrapped in flair Sentence objects).
X = data['搜索项'].values
# Classification target: the values of the '税收分类编码' column.
y = data['税收分类编码'].values

In [5]:
# Give every distinct code a consecutive integer label, numbered in order
# of first appearance in y.
tag_to_ix = {}
for tag in y:
    # setdefault only inserts when the code is new, so the label of an
    # already-seen code is left untouched.
    tag_to_ix.setdefault(tag, len(tag_to_ix))

In [6]:
# Convert the codes in the data into their corresponding integer labels.
def to_index(data, to_ix):
    """Return one single-element list ``[label]`` for every item of *data*.

    ``to_ix`` maps each item to its integer label. An item that is missing
    from the mapping raises ``KeyError``.
    """
    return [[to_ix[item]] for item in data]
# Replace the raw codes in y with their single-element label lists.
y = to_index(y,tag_to_ix)

In [7]:
# Split the data into training and test sets, holding out 33% for testing.
#from sklearn.model_selection import train_test_split
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

## 模型构建

In [8]:
# Pre-trained embeddings: word embeddings (Word2vec, per the original note),
# byte-pair embeddings and BERT, combined into a single stacked embedding.
#from flair.embeddings import TransformerWordEmbeddings,StackedEmbeddings,WordEmbeddings,BytePairEmbeddings
word_embedding = WordEmbeddings('zh')
byte_embedding = BytePairEmbeddings('zh')
bert_embedding = TransformerWordEmbeddings('bert-base-chinese')
# The stacked embedding is what every sentence is embedded with below.
stacked_embeddings = StackedEmbeddings(embeddings=[word_embedding,byte_embedding,bert_embedding])

In [9]:
# Convert every training sentence into a tensor of its token embeddings.
train_embedding_matrix = []
for text in tqdm(X_train):
    sentence = Sentence(text)
    stacked_embeddings.embed(sentence)
    # Collect one embedding per token and stack them into a single tensor.
    token_vectors = torch.stack([token.embedding for token in sentence])
    # Reshape with a singleton middle dimension and a last dimension of 3472.
    train_embedding_matrix.append(token_vectors.view(-1, 1, 3472))

100%|██████████| 3128/3128 [05:48<00:00,  8.97it/s]


In [10]:
# Convert every test sentence into a tensor of its token embeddings.
test_embedding_matrix = []
for text in tqdm(X_test):
    sentence = Sentence(text)
    stacked_embeddings.embed(sentence)
    # Collect one embedding per token and stack them into a single tensor.
    token_vectors = torch.stack([token.embedding for token in sentence])
    # Reshape with a singleton middle dimension and a last dimension of 3472.
    test_embedding_matrix.append(token_vectors.view(-1, 1, 3472))

100%|██████████| 1541/1541 [02:54<00:00,  8.85it/s]


In [11]:
#Bi-LSTM模型做特征提取和分类器
#from flair.data import Sentence
#import torch
#import torch.autograd as autograd
#import torch.nn as nn
#import torch.optim as optim
#import torch.nn.functional as F
#torch.manual_seed(1)

class BiLSTM(nn.Module):
    """Single-layer bidirectional LSTM followed by a linear classifier.

    Args:
        tag_to_ix: Mapping whose length gives the number of output classes.
        hidden_dim: Combined width of the two LSTM directions; each
            direction gets ``hidden_dim // 2`` units. It is also the input
            width of the linear layer.
        embedding_dim: Width of each input vector. Defaults to 3472, the
            value that used to be hard-coded here.
    """

    def __init__(self, tag_to_ix, hidden_dim, embedding_dim=3472):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, len(tag_to_ix))

    def init_hidden(self):
        """Return a random ``(h_0, c_0)`` pair shaped ``(2, 1, hidden_dim // 2)``.

        The tensors are moved to the device of the model's own parameters,
        so the method does not rely on a module-level ``device`` variable.
        ``forward`` does not use this state; the method is kept so existing
        callers continue to work.
        """
        target_device = next(self.parameters()).device
        shape = (2, 1, self.hidden_dim // 2)
        return (torch.randn(*shape).to(target_device),
                torch.randn(*shape).to(target_device))

    def forward(self, embeddings):
        """Return class scores of shape ``(1, len(tag_to_ix))`` for one sequence.

        ``embeddings`` is reshaped to ``(embeddings.shape[0], 1, -1)`` before
        being fed to the LSTM, i.e. it is treated as one sequence with a
        batch size of 1.
        """
        embeds = embeddings.view(embeddings.shape[0], 1, -1)
        # The LSTM is called without an initial state, so PyTorch starts it
        # from zeros. The random state that used to be drawn here was never
        # passed to the LSTM and only consumed the global RNG.
        lstm_out, self.hidden = self.lstm(embeds)
        # NOTE(review): only the output at index 0 of the first (sequence)
        # dimension feeds the classifier — confirm this is the intended
        # sequence summary.
        lstm_out = lstm_out[0].view(-1, self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier with hidden_dim=400 and move it to the chosen device.
model = BiLSTM(tag_to_ix,400).to(device)

## 模型训练

In [None]:
# Train the model on the training set, taking one optimiser step per sample.
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
# These two names are initialised here but not updated anywhere in this cell.
best_model = None
best_accuracy = 0
for epoch in range(15):
    model.train()
    train_loss = 0
    samples = zip(train_embedding_matrix, y_train)
    # zip has no length, so the progress-bar total is passed explicitly.
    for embeddings, tags_index in tqdm(samples, total=len(X_train)):
        model.zero_grad()
        targets = torch.tensor(tags_index, dtype=torch.long).to(device)
        loss = criterion(model(embeddings), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    # Report the loss summed over all samples of this epoch.
    print(train_loss)

100%|██████████| 3128/3128 [03:04<00:00, 16.92it/s]
  0%|          | 2/3128 [00:00<03:27, 15.10it/s]

9026.201347194612


100%|██████████| 3128/3128 [03:04<00:00, 16.91it/s]
  0%|          | 2/3128 [00:00<03:34, 14.55it/s]

4433.006144646555


100%|██████████| 3128/3128 [03:05<00:00, 16.84it/s]
  0%|          | 2/3128 [00:00<03:35, 14.54it/s]

2457.4429139094427


100%|██████████| 3128/3128 [03:04<00:00, 16.98it/s]
  0%|          | 2/3128 [00:00<03:31, 14.81it/s]

1439.8204600595636


100%|██████████| 3128/3128 [03:05<00:00, 16.89it/s]
  0%|          | 2/3128 [00:00<03:29, 14.95it/s]

892.1794277280569


100%|██████████| 3128/3128 [03:04<00:00, 16.94it/s]
  0%|          | 1/3128 [00:00<05:57,  8.74it/s]

583.8416432402591


100%|██████████| 3128/3128 [03:17<00:00, 15.82it/s]
  0%|          | 1/3128 [00:00<06:14,  8.35it/s]

401.08044403341773


100%|██████████| 3128/3128 [03:17<00:00, 15.85it/s]
  0%|          | 2/3128 [00:00<03:40, 14.18it/s]

288.2998350953967


100%|██████████| 3128/3128 [03:15<00:00, 16.00it/s]
  0%|          | 2/3128 [00:00<03:33, 14.61it/s]

224.25874061568175


 11%|█         | 346/3128 [00:20<02:34, 18.04it/s]

## Test Model

In [25]:
#在测试数据上进行测试
def argmax(vec,k):
    """Return the ``k`` largest entries of *vec* and their indices.

    Despite the name, this is a top-k lookup along the last dimension of
    *vec*, not a plain argmax.

    Args:
        vec: Tensor to search.
        k: Number of entries to return. It is capped at the size of the
            last dimension so that a small number of classes does not
            raise.

    Returns:
        A ``(values, indices)`` tuple of (nested) Python lists, ordered
        from largest to smallest value.
    """
    k = min(k, vec.shape[-1])
    # torch.topk is called directly rather than through `torch.torch`.
    prob, idx = torch.topk(vec, k)
    return prob.tolist(),idx.tolist()

# Count the test samples whose true label is among the model's five
# highest-probability predictions.
correct = 0
# Switch to evaluation mode once, instead of on every iteration.
model.eval()
# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for embedding, target in zip(test_embedding_matrix, y_test):
        result = model(embedding)
        prob = F.softmax(result, dim=1)
        top_prob, top_idx = argmax(prob, 5)
        # argmax returns nested lists (one inner list per row); flatten them.
        output = [item for sublist in top_idx for item in sublist]
        # Each target is a single-element list holding the label index.
        if target[0] in output:
            correct += 1

In [26]:
# Accuracy on the test data: the fraction of test samples counted in
# `correct` (a sample counts when its label is among the top 5 predictions).
print(correct/len(y_test))

0.9318624269954575


In [26]:
# Save the model: the whole model object (not just its state_dict) is
# serialised to 'model.pt'.
torch.save(model,'model.pt')

  "type " + obj.__name__ + ". It won't be checked "


## Online Learning

In [55]:
# Online learning: train the current model further on newly supplied data
# and save the updated model.
def online_learning(X_train,y_train,lr=0.0001,epoch=15):
    """Fine-tune the module-level ``model`` on new samples and save it.

    Uses the module-level ``stacked_embeddings``, ``model`` and ``device``.

    Args:
        X_train: Sequence of input texts; each one is wrapped in a flair
            ``Sentence`` and embedded.
        y_train: Targets aligned with ``X_train``. Each entry is passed to
            ``torch.tensor`` as-is — presumably the single-element
            ``[label]`` lists produced by ``to_index``; confirm with callers.
        lr: Learning rate for the AdamW optimiser.
        epoch: Number of passes over the supplied data.

    Side effects:
        Updates ``model`` in place, prints the summed loss after every
        pass, and overwrites ``'model.pt'`` with the updated model.
    """
    train_embedding_matrix = []
    for text in tqdm(X_train):
        sentence = Sentence(text)
        stacked_embeddings.embed(sentence)
        token_vectors = torch.stack([token.embedding for token in sentence])
        # Infer the embedding width instead of hard-coding it: the previous
        # literal 1936 did not match the 3472-wide input the model expects.
        token_vectors = token_vectors.view(token_vectors.shape[0], 1, -1)
        train_embedding_matrix.append(token_vectors)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    # A throwaway loop variable keeps the `epoch` parameter from being shadowed.
    for _ in range(epoch):
        train_loss = 0
        model.train()
        samples = zip(train_embedding_matrix, y_train)
        # zip has no length, so the progress-bar total is passed explicitly.
        for embeddings, tags_index in tqdm(samples, total=len(X_train)):
            model.zero_grad()
            outputs = model(embeddings)
            targets = torch.tensor(tags_index, dtype=torch.long).to(device)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Report the loss summed over all samples of this pass.
        print(train_loss)
    torch.save(model,'model.pt')