<a href="https://colab.research.google.com/github/zhangxueren9/google_colab_notebooks/blob/master/sentence_classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 某法律网站用户提问自动分类试验

## 数据部分

1.   数据来源：国内律师网站

1.   数据处理：将用户提问数据(title部分)经 切词-->去停用词-->选取高频词(top5000)-->过滤低频分类，得到句子向量和分类。

1.   数据说明：处理后的数据保存在cleared_and_filter_text_v1.1.1.csv中，每行的句子和标签以Tab("\t")分隔。句子为JSON字符串形式的list。


## 模型部分
模型使用pytorch框架实现，模型结构：embedding -->双向LSTM-->全连接层1-->全连接层2  
全连接层使用SoftMax激活函数。
## 试验结果
模型效果很差，损失不能随训练步骤增加而降低。
## 分析
造成模型不收敛的主要原因：句子长度太短，无法获取句子所含的信息，不足以支持分类；脏数据太多。




In [0]:
# Mount Google Drive into the Colab runtime and switch the working directory
# to the Drive root, so the relative dataset/model paths used by the later
# cells are resolved inside Drive.
import os

from google.colab import drive

drive.mount('/content/drive')
os.chdir("/content/drive/My Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# NOTE(review): this wheel is tagged win_amd64 / cp36 and pip rejected it on
# this runtime ("not a supported wheel on this platform", see the cell output),
# so this cell installs nothing. TODO: point it at a wheel that matches the
# runtime's platform and Python version, or drop the cell if torch is
# already available.
!pip3 install https://download.pytorch.org/whl/cu100/torch-1.0.1-cp36-cp36m-win_amd64.whl


[31mtorch-1.0.1-cp36-cp36m-win_amd64.whl is not a supported wheel on this platform.[0m


In [0]:
# Tab-separated input file read by get_data_loader(); relative to the working directory.
DATA_FILE = "dataset/cleared_and_filter_text_v1.1.1.csv"
# Number of classes: length of the one-hot label vector and of the model output.
CATEGORY_SIZE = 10
# Vocabulary size; the model's embedding table is created with WORD_VOC_SIZE + 1 rows.
WORD_VOC_SIZE = 5000
# Every sentence is truncated / zero-padded to this many word ids.
MAX_SENTENCE_LEN = 25
# 2 selects a bidirectional LSTM in SentimentLstm; any other value gives a unidirectional one.
NUM_DIRECTIONS = 2

# Width of each word embedding vector.
EMBEDDING_SIZE = 100
# Hidden state size of the LSTM (per direction).
HIDDEN_SIZE = 512
# LSTM layer count handed to SentimentLstm as ``num_layers``.
NUM_LAYERS = 1

# Mini-batch size of the training DataLoader.
BATCH_SIZE = 64
# Learning rate given to the Adam optimizer in train().
LEARN_RATE = 0.05
# Number of passes over the training data in train().
NUM_EPOCHS = 100


# File the model state dict is saved to and restored from.
MODEL_PATH = "model_file/sentiment_lstm_model_v1.1.1.pkl"

# Probability with which each input line is assigned to the test split.
TEST_SIZE = 0.3

In [0]:
import numpy as np

def decode_one_hot_label(one_hot_label):
    """Return the arg-max position of a score vector as a decimal string.

    Args:
        one_hot_label: any object exposing ``tolist()`` (for example a NumPy
            array); its values are handed to ``np.argmax``.

    Returns:
        ``str`` of the zero-based index of the largest value.
    """
    scores = one_hot_label.tolist()
    best_position = np.argmax(scores)
    return "" + str(best_position)


def encode_one_hot_label(label, category_size=None):
    """Encode a 1-based class label as a one-hot float vector.

    Args:
        label: class label convertible with ``int()``; label ``k`` sets
            position ``k - 1``.
        category_size: length of the returned vector. Defaults to the
            module-level ``CATEGORY_SIZE`` when omitted.

    Returns:
        A ``float64`` NumPy array of length ``category_size`` with a single 1.0.

    Raises:
        ValueError: if ``label`` cannot be converted to an integer.
        IndexError: if ``label`` is larger than ``category_size``.
    """
    if category_size is None:
        category_size = CATEGORY_SIZE
    # ``np.float`` was only an alias of the builtin float and has been removed
    # from recent NumPy releases; ``np.float64`` is the dtype it produced.
    one_hot_label = np.zeros(category_size, dtype=np.float64)
    # NOTE(review): a label of 0 wraps around to the last position because of
    # negative indexing -- confirm the data never contains label 0.
    one_hot_label[int(label) - 1] = 1.0
    return one_hot_label

In [0]:
import json
import random

import numpy as np
from torch.utils.data import DataLoader, Dataset


class SentenceCategoryDataset(Dataset):
    """Dataset over text lines of the form ``<JSON list of word ids>\\t<label>``.

    Each item is a pair ``(sentence, category)``:

    * ``sentence`` -- ``int64`` array of length ``max_sentence_len`` holding the
      word ids, truncated or right-padded with zeros.
    * ``category`` -- one-hot vector produced by ``encode_one_hot_label``.

    Args:
        sentence_lines: sequence of raw (already stripped) input lines.
        max_sentence_len: fixed length of the returned id array.
        word_voc_size: vocabulary size (stored, not used by this class).
        category_size: number of classes (stored, not used by this class).
    """

    def __init__(self, sentence_lines, max_sentence_len=MAX_SENTENCE_LEN, word_voc_size=WORD_VOC_SIZE,
                 category_size=CATEGORY_SIZE):
        self.sentence_lines = sentence_lines
        self.max_sentence_len = max_sentence_len
        self.word_voc_size = word_voc_size
        self.category_size = category_size

    def __len__(self):
        """Return the number of lines in the dataset."""
        return len(self.sentence_lines)

    def __getitem__(self, item):
        """Parse line ``item`` into a padded id array and a one-hot label."""
        splits = self.sentence_lines[item].split("\t")
        # json.loads() dropped its ``encoding`` keyword in Python 3.9; the
        # input is already a str, so no encoding is needed.
        words = json.loads(splits[0])
        # ``np.long`` is not a stable alias across NumPy versions; int64 is the
        # integer type the embedding lookup needs after collation.
        sentence = np.zeros(self.max_sentence_len, dtype=np.int64)
        # Slicing truncates over-long sentences; short ones keep trailing zeros.
        for idx, word in enumerate(words[:self.max_sentence_len]):
            sentence[idx] = int(word)
        # The label is the last tab-separated field of the line.
        category = encode_one_hot_label(splits[-1])
        return sentence, category


def get_data_loader(data_file=DATA_FILE, batch_size=BATCH_SIZE, test_size=TEST_SIZE, split_seed=0):
    """Read ``data_file`` and build the training and test data loaders.

    Every non-empty line is assigned to the test split with probability
    ``test_size`` and to the training split otherwise.

    Args:
        data_file: path of the tab-separated input file.
        batch_size: batch size of the training loader (the test loader always
            uses batches of one).
        test_size: probability of a line going to the test split.
        split_seed: seed of the private RNG that decides the split. A fixed
            seed makes every call produce the same partition, so a test split
            obtained in one call never contains lines that another call put
            into the training split. Pass ``None`` for a different random
            split on each call.

    Returns:
        Tuple ``(train_data_loader, test_data_loader)``.
    """
    # A private generator keeps the split reproducible without touching the
    # global ``random`` state.
    rng = random.Random(split_seed)
    train_lines = []
    test_lines = []
    with open(data_file, encoding="utf-8") as fi:
        for line in fi:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) cannot be parsed as a sample.
                continue
            target = train_lines if rng.random() > test_size else test_lines
            target.append(line)

    train_data_loader = DataLoader(SentenceCategoryDataset(train_lines), batch_size=batch_size, shuffle=True)
    test_data_loader = DataLoader(SentenceCategoryDataset(test_lines), batch_size=1, shuffle=True)
    return train_data_loader, test_data_loader


In [0]:
import torch.nn as nn
import torch.nn.functional as  F



class SentimentLstm(nn.Module):
    """Sentence classifier: embedding -> (bi)LSTM -> linear(128) -> linear(classes).

    Args:
        word_voc_size: number of rows of the embedding table.
        embedding_size: width of each embedding vector.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        num_directions: 2 builds a bidirectional LSTM, anything else a
            unidirectional one.
        batch_size: stored for reference only; the forward pass takes the
            batch size from its input.
        category_size: number of output classes.
    """

    def __init__(self, word_voc_size=WORD_VOC_SIZE + 1, embedding_size=EMBEDDING_SIZE, hidden_size=HIDDEN_SIZE,
                 num_layers=NUM_LAYERS, num_directions=NUM_DIRECTIONS, batch_size=BATCH_SIZE,
                 category_size=CATEGORY_SIZE):
        super(SentimentLstm, self).__init__()
        # Imported here because the surrounding cell only imports ``torch.nn``;
        # this keeps the class usable regardless of cell execution order.
        import torch

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = num_directions
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(word_voc_size, embedding_size)
        # Seeds the global torch RNG, so the layers created below start from
        # the same weights on every construction.
        torch.manual_seed(1)
        self.is_bi_lstm = num_directions == 2
        # ``num_layers`` is forwarded so the constructor argument takes effect.
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, num_layers=self.num_layers,
                            batch_first=True, bidirectional=self.is_bi_lstm)
        lstm_feature_size = hidden_size * 2 if self.is_bi_lstm else hidden_size
        self.liner1 = nn.Linear(lstm_feature_size, 128)
        self.liner2 = nn.Linear(128, category_size)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of word ids to class probabilities.

        Returns:
            Tensor of shape ``(batch, category_size)`` whose rows sum to 1.
        """
        w2v = self.embedding(x)
        w2v = w2v.view(x.shape[0], -1, self.embedding_size)
        # h_n holds the final hidden state of every layer and direction:
        # (num_layers * num_directions, batch, hidden_size).
        _, (h_n, _) = self.lstm(w2v)
        if self.is_bi_lstm:
            # Use the last layer's final forward and backward states. Taking
            # ``out[:, -1, :]`` instead would give a backward state that has
            # seen only the final token.
            last_stat = h_n[-2:].transpose(0, 1).reshape(h_n.size(1), -1)
        else:
            last_stat = h_n[-1]
        # NOTE(review): softmax is kept as the hidden activation to match the
        # notebook's documented design. It squashes the 128 hidden units into
        # one probability vector and probably contributes to the reported
        # non-convergence -- consider ReLU/tanh here.
        out = F.softmax(self.liner1(last_stat), dim=1)
        out = F.softmax(self.liner2(out), dim=1)
        return out

In [0]:
import os
import time

import torch
import torch.nn as nn
from torch.autograd import Variable




def train():
    """Train ``SentimentLstm`` on the training split and checkpoint it to ``MODEL_PATH``.

    An existing checkpoint at ``MODEL_PATH`` is loaded first so training
    resumes from it. The model is saved every 1000 steps and after every
    epoch; ``test()`` is run on epochs 1, 6, 11, ...
    """
    # Fall back to the CPU when no GPU is available instead of failing.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    lstm = SentimentLstm()
    lstm.train()
    lstm.to(device)

    # torch.save() does not create missing directories.
    model_dir = os.path.split(MODEL_PATH)[0]
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    if os.path.exists(MODEL_PATH):
        # map_location lets a checkpoint saved on a GPU be restored on any device.
        lstm.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    print("init net")

    # The model outputs probabilities and the labels are one-hot vectors.
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(lstm.parameters(), lr=LEARN_RATE)
    train_data_loader = get_data_loader()[0]

    for epoch in range(NUM_EPOCHS):
        for i, (sentences, labels) in enumerate(train_data_loader):
            sentences = sentences.to(device)
            labels = labels.float().to(device)
            predict_labels = lstm(sentences)
            loss = criterion(predict_labels, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 1000 == 0:
                print("epoch:", epoch, "step:", i, "loss:", loss.item())
                torch.save(lstm.state_dict(), MODEL_PATH)  # current is model.pkl
                print("save model")
        print("epoch:", epoch, "step:", i, "loss:", loss.item())
        # Save before evaluating so the checkpoint on disk reflects this epoch.
        torch.save(lstm.state_dict(), MODEL_PATH)  # current is model.pkl
        print("save last model")
        if epoch % 5 == 1:
            test()


def test(max_samples=2000):
    """Evaluate the checkpoint stored at ``MODEL_PATH`` on the test split.

    Prints the accuracy over the evaluated sentences.

    Args:
        max_samples: stop after this many test sentences (the default keeps
            the evaluation short); ``None`` or 0 evaluates the whole split.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    lstm = SentimentLstm()
    # Restore the trained weights; otherwise the accuracy would describe a
    # freshly initialised network.
    if os.path.exists(MODEL_PATH):
        lstm.load_state_dict(torch.load(MODEL_PATH, map_location=device))
    else:
        print("no saved model at %s, evaluating untrained weights" % MODEL_PATH)
    lstm.eval()
    lstm.to(device)
    print("load model")
    test_data_loader = get_data_loader()[-1]
    correct = 0
    total = 0
    # Inference only: gradients are not needed.
    with torch.no_grad():
        for sentences, labels in test_data_loader:
            predict_label = lstm(sentences.to(device))

            # The test loader yields batches of one, so element 0 is the sample.
            predict_label = decode_one_hot_label(predict_label[0])
            true_label = decode_one_hot_label(labels.numpy()[0])

            total += labels.size(0)
            if predict_label == true_label:
                correct += 1
            if max_samples and total >= max_samples:
                break
    if total == 0:
        # Avoid dividing by zero when the test split is empty.
        print("no test sentences to evaluate")
        return
    print('Test Accuracy of the model on the %d test sentences: %f %%' % (total, 100 * correct / total))

In [0]:
# Start training with the settings from the configuration cell; checkpoints
# are written to MODEL_PATH.
train()