In [1]:
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy as np
import sys
import math
import numpy as np
from collections import Counter,defaultdict
import pandas as pd
import tqdm
from sklearn.model_selection import train_test_split
from itertools import islice
import os
import pkuseg
CLASS_DIM = 2     #情感分类的类别数
EMB_DIM = 128     #词向量的维度
HID_DIM = 512     #隐藏层的维度
STACKED_NUM = 3   #LSTM双向栈的层数
BATCH_SIZE = 64  #batch的大小

In [2]:
train_file = "data/train.csv"
test_file = "data/test_new.csv"
sample_file= "data/sample.csv"
train_file_split="data/train_split.csv"
valid_file_split="data/valid_split.csv"

## 使用pkuseg 构造字典

In [3]:
word_list=np.load("features.pkl",allow_pickle=True)
word_list=word_list["unigram"]
word_dict={word_list[i]:i  for i in range(len(word_list)) }
word_dict["<UNK>"]=len(word_list)

## 数据预处理：分词，切valid

In [20]:
def process_file(X_train_raw):
    """
    分词
    """
    seg=pkuseg.pkuseg()
    X_train=[" ".join(seg.cut(x)) for x in X_train_raw]
    return X_train
df_train=pd.read_csv(train_file, delimiter="\t")
df_test=pd.read_csv(test_file, delimiter=",")

X_train_raw=np.array(df_train["comment"])
y_train_raw=np.array(df_train["label"])
X_test_raw=np.array(df_test["comment"])
X_test_id=np.array(df_test["id"])

X_train,X_valid,y_train,y_valid = train_test_split(X_train_raw,y_train_raw,test_size=0.2,stratify=y_train_raw,random_state=0)

df_train=pd.DataFrame()
df_train["label"]=y_train
df_train["word"]=process_file(X_train)
df_train.to_csv("data/train_split.csv",index=False)

df_valid=pd.DataFrame()
df_valid["label"]=y_valid
df_valid["word"]=process_file(X_valid)
df_valid.to_csv("data/valid_split.csv",index=False)

### 处理完变成三个csv

test_file = "data/test_new.csv"

train_file_split="data/train_split.csv"

valid_file_split="data/valid_split.csv"

### 文本卷积神经网络
我们构建神经网络`convolution_net`，示例代码如下。
需要注意的是：`fluid.nets.sequence_conv_pool` 包含卷积和池化层两个操作。

In [5]:
#文本卷积神经网络
def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
    emb = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=3,
        act="tanh",
        pool_type="sqrt")
    conv_4 = fluid.nets.sequence_conv_pool(
        input=emb,
        num_filters=hid_dim,
        filter_size=4,
        act="tanh",
        pool_type="sqrt")
    prediction = fluid.layers.fc(
        input=[conv_3, conv_4], size=class_dim, act="softmax")
    return prediction

网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/nets.py) API实现了卷积和池化操作。

<a name="栈值双向LSTM"></a>

### 栈式双向LSTM

栈式双向神经网络`stacked_lstm_net`的代码片段如下：

In [6]:
#栈式双向LSTM
def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):

    #计算词向量
    emb = fluid.layers.embedding(
        input=data, size=[input_dim, emb_dim], is_sparse=True)

    #第一层栈
    #全连接层
    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
    #lstm层
    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)

    inputs = [fc1, lstm1]

    #其余的所有栈结构
    for i in range(2, stacked_num + 1):
        fc = fluid.layers.fc(input=inputs, size=hid_dim)
        lstm, cell = fluid.layers.dynamic_lstm(
            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
        inputs = [fc, lstm]

    #池化层
    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')

    #全连接层，softmax预测
    prediction = fluid.layers.fc(
        input=[fc_last, lstm_last], size=class_dim, act='softmax')
    return prediction

以上的栈式双向LSTM抽象出了高级特征并把其映射到和分类类别数同样大小的向量上。最后一个全连接层的'softmax'激活函数用来计算分类属于某个类别的概率。

重申一下，此处我们可以调用`convolution_net`或`stacked_lstm_net`的任何一个网络结构进行训练学习。我们以`convolution_net`为例。

接下来我们定义预测程序（`inference_program`）。预测程序使用`convolution_net`来对`fluid.layer.data`的输入进行预测。

In [66]:
## 预测程序
def inference_program(word_dict):
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    dict_dim = len(word_dict)
#     net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM, STACKED_NUM)
    return net

我们这里定义了`training_program`。它使用了从`inference_program`返回的结果来计算误差。我们同时定义了优化函数`optimizer_func`。

因为是有监督的学习，训练集的标签也在`fluid.layers.data`中定义了。在训练过程中，交叉熵用来在`fluid.layer.cross_entropy`中作为损失函数。

在测试过程中，分类器会计算各个输出的概率。第一个返回的数值规定为cost。

In [67]:
def train_program(prediction):
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    
# 不带类别平衡的 focal loss，仅仅区分类别难易；猜测此时算出来的梯度有一个 gama 倍，所以学习率可以比以往更小一点
    def focal_loss(pred, label, gama):
        # 使用打印函数查看当前 Tensor，
        # fluid.layers.py_func(func=print_func, x=pred, out=None)
        one_hot = paddle.fluid.layers.one_hot(label, 2)
        prob = one_hot * pred
        cross_entropy = one_hot * fluid.layers.log(pred)
        # cross_entropy = one_hot * pred
        cross_entropy = fluid.layers.reduce_sum(cross_entropy, dim=-1)
        sum = paddle.fluid.layers.sum(cross_entropy)
        weight = -1.0 * one_hot * paddle.fluid.layers.pow((1.0 - pred), gama)
        weight = fluid.layers.reduce_sum(weight, dim=-1)
        return weight * cross_entropy
        
    # cost = fluid.layers.cross_entropy(input=prediction, label=label)
    cost=focal_loss(prediction, label, 2.0)
    
    avg_cost = fluid.layers.mean(cost)
    accuracy = fluid.layers.accuracy(input=prediction, label=label)
#     auc    =   fluid.layers.auc(input=prediction, label=label)
    
    return [avg_cost, accuracy]   #返回平均cost和准确率acc

#优化函数
def optimizer_func():
    return fluid.optimizer.Adagrad(learning_rate=0.002)

In [68]:
use_cuda = True
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

### 定义数据提供器

下一步是为训练和测试定义数据提供器。提供器读入一个大小为 BATCH_SIZE的数据。paddle.dataset.imdb.word_dict 每次会在乱序化后提供一个大小为BATCH_SIZE的数据，乱序化的大小为缓存大小buf_size。

注意：读取IMDB的数据可能会花费几分钟的时间，请耐心等待。

In [69]:
def train_reader_fun(data_dir, word_dict, label_dict,Train=True):
    def reader():
        UNK_ID = word_dict["<UNK>"]
        word_col = 1
        lbl_col = 0
 
        with open(data_dir, "r",encoding="utf-8") as f:
            for line in islice(f, 1, None):  ### 跳过表头的第一行
                line_split = line.strip().split(",")
                word_ids = [
                    word_dict.get(w, UNK_ID)
#                     word_dict.get(w)
                    for w in line_split[word_col].split()
                    ]
                if Train:
                    yield word_ids, label_dict[line_split[lbl_col]]
                else:
                    yield word_ids
    return reader

train_reader=train_reader_fun(train_file_split,word_dict,{"0":0,"1":1},Train=True)

train_batch_reader= paddle.batch(
                paddle.reader.shuffle(
                reader=train_reader,### 这行返回的是reader函数
                buf_size=1000),
                batch_size=BATCH_SIZE)  ## 返回的是个函数

valid_reader=train_reader_fun(valid_file_split,word_dict,{"0":0,"1":1},Train=True)

valid_batch_reader= paddle.batch(
                paddle.reader.shuffle(
                reader=valid_reader,### 这行返回的是reader函数
                buf_size=1000),
                batch_size=BATCH_SIZE)  ## 返回的是个函数


### 构造训练器
训练器需要一个训练程序和一个训练优化函数。

In [70]:
exe = fluid.Executor(place)
prediction = inference_program(word_dict)

[avg_cost, accuracy] = train_program(prediction)#训练程序
sgd_optimizer = optimizer_func()#训练优化函数
sgd_optimizer.minimize(avg_cost)

([inputs {
    parameter: "Grad"
    arguments: "embedding_1.w_0@GRAD"
  }
  inputs {
    parameter: "LearningRate"
    arguments: "learning_rate_1"
  }
  inputs {
    parameter: "Moment"
    arguments: "embedding_1.w_0_moment_0"
  }
  inputs {
    parameter: "Param"
    arguments: "embedding_1.w_0"
  }
  outputs {
    parameter: "MomentOut"
    arguments: "embedding_1.w_0_moment_0"
  }
  outputs {
    parameter: "ParamOut"
    arguments: "embedding_1.w_0"
  }
  type: "adagrad"
  attrs {
    name: "op_role_var"
    type: STRINGS
    strings: "embedding_1.w_0"
    strings: "embedding_1.w_0@GRAD"
  }
  attrs {
    name: "epsilon"
    type: FLOAT
    f: 1e-06
  }
  attrs {
    name: "op_role"
    type: INT
    i: 2
  }
  attrs {
    name: "op_namescope"
    type: STRING
    s: "/optimizer_8/"
  }
  attrs {
    name: "op_callstack"
    type: STRINGS
    strings: "  File \"D:\\Anaconda\\envs\\paddle\\lib\\site-packages\\paddle\\fluid\\framework.py\", line 2488, in append_op\n    attrs=kwarg

该函数用来计算训练中模型在test数据集上的结果

In [71]:
def train_test(program, reader):
    count = 0
    feed_var_list = [
        program.global_block().var(var_name) for var_name in feed_order
    ]
    feeder_test = fluid.DataFeeder(feed_list=feed_var_list, place=place)
    test_exe = fluid.Executor(place)
    accumulated = len([avg_cost, accuracy]) * [0]
    
    for test_data in reader():
        avg_cost_np = test_exe.run(
            program=program,
            feed=feeder_test.feed(test_data),
            fetch_list=[avg_cost, accuracy])
        accumulated = [
            x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
        ]
        count += 1
    return [x / count for x in accumulated]


### 提供数据并构建主训练循环

`feed_order`用来定义每条产生的数据和`fluid.layers.data`之间的映射关系。比如，`imdb.train`产生的第一列的数据对应的是`words`这个特征。

In [72]:
# Specify the directory path to save the parameters
params_dirname = "understand_sentiment_conv.inference.model"

feed_order = ['words', 'label']
pass_num = 1  #训练循环的轮数

#程序主循环部分
def train_loop(main_program):
    #启动上文构建的训练器
    exe.run(fluid.default_startup_program())

    feed_var_list_loop = [
        main_program.global_block().var(var_name) for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(
        feed_list=feed_var_list_loop, place=place)

    test_program = fluid.default_main_program().clone(for_test=True)

    #训练循环
    for epoch_id in range(pass_num):
        for step_id, data in enumerate(train_batch_reader()):
            #运行训练器  
            metrics = exe.run(main_program,
                              feed=feeder.feed(data),
                              fetch_list=[ avg_cost, accuracy])
            
            #测试结果
            avg_cost_test, acc_test = train_test(test_program, valid_batch_reader)
            print('Step {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
                step_id, avg_cost_test, acc_test))

            print("Step {0}, Epoch {1} Metrics {2}".format(
                step_id, epoch_id, list(map(np.array,
                                            metrics))))

            if step_id == 124:
                if params_dirname is not None:
                    fluid.io.save_inference_model(params_dirname, ["words"],
                                                  prediction, exe)#保存模型
       

In [73]:
train_loop(fluid.default_main_program())

Step 0, Test Loss 0.06, Acc 0.91
Step 0, Epoch 0 Metrics [array([0.17751257], dtype=float32), array([0.171875], dtype=float32)]
Step 1, Test Loss 0.024, Acc 0.97
Step 1, Epoch 0 Metrics [array([0.04535939], dtype=float32), array([0.9375], dtype=float32)]
Step 2, Test Loss 0.013, Acc 0.98
Step 2, Epoch 0 Metrics [array([0.02717954], dtype=float32), array([0.953125], dtype=float32)]
Step 3, Test Loss 0.0069, Acc 0.99
Step 3, Epoch 0 Metrics [array([0.05623794], dtype=float32), array([0.9375], dtype=float32)]
Step 4, Test Loss 0.0033, Acc 1.0
Step 4, Epoch 0 Metrics [array([0.00311627], dtype=float32), array([1.], dtype=float32)]
Step 5, Test Loss 0.0025, Acc 1.0
Step 5, Epoch 0 Metrics [array([0.04620567], dtype=float32), array([0.9375], dtype=float32)]
Step 6, Test Loss 0.0021, Acc 1.0
Step 6, Epoch 0 Metrics [array([0.06578264], dtype=float32), array([0.953125], dtype=float32)]
Step 7, Test Loss 0.0014, Acc 1.0
Step 7, Epoch 0 Metrics [array([0.01725191], dtype=float32), array([0.96875

### 预测测试集

In [74]:
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()


### 生成测试用输入数据

为了进行预测，我们任意选取3个评论。请随意选取您看好的3个。我们把评论中的每个词对应到`word_dict`中的id。如果词典中没有这个词，则设为`unknown`。
然后我们用`create_lod_tensor`来创建细节层次的张量，关于该函数的详细解释请参照[API文档](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/basic_concept/lod_tensor.html)。

In [75]:
reviews_str = X_test_raw
reviews =process_file(X_test_raw)

UNK = word_dict['<UNK>']
lod = []
for c in reviews:
    lod.append(np.array([word_dict.get(words, UNK) for words in c], dtype = np.int64))

base_shape = [[len(c) for c in lod]]

tensor_words = fluid.create_lod_tensor(lod, base_shape, place)


## 应用模型并进行预测

现在我们可以对每一条评论进行正面或者负面的预测啦。

In [77]:
with fluid.scope_guard(inference_scope):

    [inferencer, feed_target_names,
     fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)

    assert feed_target_names[0] == "words"
    results = exe.run(inferencer,
                      feed={feed_target_names[0]: tensor_words},
                      fetch_list=fetch_targets,
                      return_numpy=False)
    np_data = np.array(results[0])
    ans=[]
    for i, r in enumerate(np_data):
        print("Predict probability of ", r[0], " to be positive and ", r[1],
              " to be negative for review \'", reviews_str[i], "\'")
        if r[0]>r[1]:
            
            ans.append(0)
        else:
            ans.append(1)
            
print(ans)
print(len(ans))

Predict probability of  0.8613183  to be positive and  0.13868175  to be negative for review ' 糯米团是我小时候的记忆了，吃起还是好吃，只是小时候的油条没有这么硬！油茶也还好！可以试试 '
Predict probability of  0.98103124  to be positive and  0.018968698  to be negative for review ' 满满的五星好评，口味好，服务好，特别喜欢，昨天第一次买，今天就回购了，买的刨奶，店长问我加腰果还是核桃，我说随便，他又问我喜欢吃什么，我说都喜欢，然后，帅帅的店长都给我加了，超赞 '
Predict probability of  0.9503834  to be positive and  0.04961651  to be negative for review ' 好喝！经常会再去买来喝！就是排队的人太多了 '
Predict probability of  0.8741074  to be positive and  0.12589256  to be negative for review ' 三个人订的四人餐，菜量大没吃完，问道不错。 '
Predict probability of  0.9288922  to be positive and  0.07110786  to be negative for review ' 好的一如既往，真真爱上了自助炒饭自助八宝粥自助冰粉！！！喜欢所有菜和肉，两女一男吃两份两人餐没吃完，饮料也不错！有空就要来！ '
Predict probability of  0.7068403  to be positive and  0.2931597  to be negative for review ' 太tmd的难吃了跟狗屎一样草 '
Predict probability of  0.93663657  to be positive and  0.06336345  to be negative for review ' 火锅味的味道比渝李记那个火锅米线的味道好吃，更有一种老火锅的感觉，很划算，但是有韭菜叶的就好了，更喜欢吃韭菜叶的面 '
Pred

In [78]:

df_sample=pd.read_csv(sample_file, delimiter=",")
df_sample["label"]=np.array(ans)
df_sample.to_csv("answer.csv",index=False)

In [79]:
df_look=pd.read_csv("answer.csv", delimiter=",")
df_look

Unnamed: 0,id,label
0,0011f384-9e54-4fb4-a272-330a6cab6804,0
1,00223e4f-47e1-4fc8-9657-06444a7de9a5,0
2,00225350-c169-435c-84cf-970068df5b12,0
3,00a3190c-90c1-44c3-b809-7a9b1314cd27,0
4,00b3f76e-fda3-42cd-8884-25e03a5dba64,0
...,...,...
1995,ffbd3c14-56ba-412f-ac68-9f4ccbafe4f5,0
1996,ffcc4330-2b02-485b-a3bb-e1c7d42baaae,0
1997,ffcc53f6-7cf9-4e5e-9294-a9a33c7568f8,0
1998,ffd3ca4b-dc5f-4dbd-b249-bac065045870,0
