In [1]:
import numpy as np
import tensorflow as tf
import sys
import time
from datetime import timedelta
import tensorflow.contrib.keras as kr
from sklearn import metrics
from sklearn.model_selection import KFold

import moxing as mox
# NOTE(review): presumably reroutes the standard file APIs through MoXing so that the
# builtin open() calls further down can read the s3:// paths — confirm in the MoXing docs.
mox.file.shift('os', 'mox')

INFO:root:Using MoXing-v1.14.1-ddfd6c9a
INFO:root:Using OBS-Python-SDK-3.1.2


In [2]:
# Remote (s3://) locations of the inputs used by this notebook.
# Labelled questions used for training.
trainDataPath = "s3://corpus-text-classification1/data/train_5500.label.txt"
# Labelled questions of the held-out test file.
testDataPath = "s3://corpus-text-classification1/data/TREC_10.label.txt"
# Pre-trained GloVe word vectors (one word followed by its vector per line).
vocabPath = "s3://corpus-text-classification1/data/glove.6B.100d.txt"

In [3]:
def readfile(filePath):
    """Read a labelled question file and return shuffled (tokens, label) pairs.

    Every non-blank line is split on whitespace. The first token is expected
    to look like ``COARSE:fine``; only the part before the first ``:`` is kept
    as the label, and the remaining tokens form the sample text.

    Args:
        filePath: Path of the text file to read. It is decoded as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A NumPy object array of shape ``(n_samples, 2)``. Column 0 holds the
        token list of each sample and column 1 its label string. Rows are
        shuffled using NumPy's global random state.
    """
    samples = []
    with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
        # Stream the file line by line instead of materialising it with readlines().
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank line: there is no label token to parse, so skip it.
                continue
            label = tokens[0].split(":")[0]
            samples.append((tokens[1:], label))

    np.random.shuffle(samples)

    # Fill an explicit object array. Letting np.asarray infer the shape from
    # ragged [token_list, label] rows is deprecated/rejected by recent NumPy
    # releases and is ambiguous when every sample has the same length.
    data = np.empty((len(samples), 2), dtype=object)
    for row, (content, label) in enumerate(samples):
        data[row, 0] = content
        data[row, 1] = label
    return data


def loadGloVe(filename):
    """Load word vectors from a GloVe-style text file.

    Each non-blank line holds a word followed by its vector components,
    separated by single spaces.

    Args:
        filename: Path of the UTF-8 encoded vector file.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` is the list of words in file
        order and ``embd`` the list of their vectors (lists of floats), so
        ``embd[i]`` belongs to ``vocab[i]``. No extra padding/unknown entry
        is inserted: the first word in the file gets position 0.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    vocab = []
    embd = []
    print('Loading GloVe!')
    # The context manager closes the file even if a line fails to parse.
    with open(filename, 'r', encoding='utf-8') as glove_file:
        # Stream the file; readlines() would hold every line in memory at once.
        for line in glove_file:
            row = line.strip().split(' ')
            if not row[0]:
                # Blank line: would otherwise add an empty word with an empty vector.
                continue
            vocab.append(row[0])
            embd.append([float(ei) for ei in row[1:]])
    print('Completed!')
    return vocab, embd


def process_file(contents, labels, word_to_id, cat_to_id, num_classes, pad_max_length):
    """Encode tokenised samples as fixed-length id sequences and one-hot labels.

    Args:
        contents: Indexable collection of token sequences, one per sample.
        labels: Indexable collection of label names, aligned with ``contents``.
        word_to_id: Mapping from word to integer id.
        cat_to_id: Mapping from label name to integer class id.
        num_classes: Width of the one-hot label encoding.
        pad_max_length: Length every id sequence is padded/truncated to.

    Returns:
        ``(x_pad, y_pad)``: the padded id matrix and the one-hot label matrix.
    """
    data_id = []
    label_id = []
    for idx, tokens in enumerate(contents):
        # Words that are missing from word_to_id are silently dropped.
        data_id.append([word_to_id[tok] for tok in tokens if tok in word_to_id])
        label_id.append(cat_to_id[labels[idx]])

    # Bring every sequence to pad_max_length with Keras' pad_sequences
    # (see https://blog.csdn.net/TH_NUM/article/details/80904900). Its signature is
    #   pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
    #                 truncating='pre', value=0.)
    # so, with the defaults used here, padding and truncation both happen at
    # the start of the sequence and the fill value is 0.
    # NOTE: a word whose id is 0 is therefore indistinguishable from padding.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)

    # Turn the integer class ids into one-hot rows
    # (see https://blog.csdn.net/nima1994/article/details/82468965):
    #   to_categorical(y, num_classes=None, dtype='float32')
    # With num_classes given, the result has len(y) rows and num_classes columns.
    y_pad = kr.utils.to_categorical(label_id, num_classes=num_classes)

    return x_pad, y_pad

In [4]:
# Coarse question classes; the position in this list is the class id.
categories = ['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']
num_classes = len(categories)

# Label name -> class id, derived from `categories` so the two cannot drift apart.
cat_to_id = {name: idx for idx, name in enumerate(categories)}

# Pre-trained vectors: row i of `embedding` belongs to vocab[i].
vocab, embd = loadGloVe(vocabPath)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
word_to_id = {word: idx for idx, word in enumerate(vocab)}

print(len(embedding), embedding_dim, vocab_size)

testData = readfile(testDataPath)
trainData = readfile(trainDataPath)

print(len(testData), len(trainData))
# Both files are merged into one pool; the folds are drawn from it later.
trainData = np.concatenate((trainData, testData), axis=0)
np.random.shuffle(trainData)

# Fixed number of token ids per sample after padding/truncation.
seq_length = 37

Loading GloVe!
Completed!
400000 100 400000
500 5452


In [5]:
def train_10_fold(train_data, categories):
    """Run 10-fold cross-validation with the module-level ``classifier``.

    The samples are encoded once with ``process_file`` (using the module-level
    ``word_to_id``, ``cat_to_id``, ``num_classes`` and ``seq_length``), then
    ``classifier.train`` is called once per fold for up to 30 epochs.

    Args:
        train_data: 2-D array whose column 0 holds token lists and column 1
            the label names.
        categories: Class names forwarded to ``classifier.train``.

    Returns:
        List with the value returned by ``classifier.train`` for each fold.
    """
    tx, ty = process_file(
        train_data[:, 0], train_data[:, 1],
        word_to_id, cat_to_id, num_classes, seq_length)
    print(len(tx), len(tx[0]), len(tx[1]))

    test_acc = []
    splitter = KFold(n_splits=10)
    for fold_id, (train_i, test_i) in enumerate(splitter.split(tx), start=1):
        print("Fold: ", fold_id)
        fold_acc = classifier.train(
            X_train=tx[train_i],
            y_train=ty[train_i],
            X_eval=tx[test_i],
            y_eval=ty[test_i],
            categories=categories,
            epochs=30
        )
        test_acc.append(fold_acc)

    print(test_acc)
    # Mean, population std, sample std (ddof=1) and variance of the fold scores.
    stats = (np.mean(test_acc), np.std(test_acc), np.std(test_acc, ddof=1), np.var(test_acc))
    print("%s, %s, %s, %s" % stats)
    return test_acc

In [6]:
class Classifier:
    """Softmax classification head on top of a feature-extractor model.

    The graph is built once by ``compile`` (TensorFlow 1.x graph mode): an
    initializable ``tf.data`` iterator feeds batches to ``model``, whose
    features go through a dense layer that produces the class logits.

    NOTE(review): the same graph is used for training, evaluation and
    prediction. If ``model`` applies dropout unconditionally, dropout is also
    active in ``_eval`` and ``predict`` — confirm this is intended.
    """

    def __init__(self, model, input_length, output_length):
        """Store the configuration; no graph nodes are created here.

        Args:
            model: Callable mapping the input batch tensor to a 2-D feature
                tensor. It must also provide ``summary()``.
            input_length: Number of values per input sample.
            output_length: Number of classes.
        """
        self.model = model
        self.input_length = input_length
        self.output_length = output_length

    def compile(self, batch_size=32):
        """Build the input pipeline, the forward pass and the training ops.

        Args:
            batch_size: Number of samples per batch produced by the iterator.
        """
        self._ds_x = tf.placeholder(tf.float32, [None, self.input_length])
        self._ds_y = tf.placeholder(tf.float32, [None, self.output_length])

        ds = tf.data.Dataset.from_tensor_slices((self._ds_x, self._ds_y))
        ds = ds.batch(batch_size)

        # Initializable iterator: it is re-initialised with different arrays
        # for training, evaluation and prediction.
        self._ds_it = ds.make_initializable_iterator()
        self._input, self._labels = self._ds_it.get_next()

        self._features = self.model(self._input)
        self._output = _create_dense_layer(self._features, self.output_length)
        # Created once here so that predict() does not add a new op per batch.
        self._probabilities = tf.nn.softmax(self._output)

        self._create_acc_computations()
        self._create_backpropagation()

    def _create_acc_computations(self):
        """Create the per-batch prediction and accuracy nodes."""
        self._predictions = tf.argmax(self._output, 1)
        labels = tf.argmax(self._labels, 1)
        self._accuracy = tf.reduce_mean(
            tf.cast(tf.equal(self._predictions, labels), 'float32'))

    def _create_backpropagation(self):
        """Create the mean cross-entropy loss and the Adam training op."""
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=self._output,
            labels=self._labels)
        self._loss = tf.reduce_mean(losses)

        optimizer = tf.train.AdamOptimizer(0.001)
        global_step = tf.Variable(0, name="global_step", trainable=False)
        grads_and_vars = optimizer.compute_gradients(self._loss)

        self._train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)

    def summary(self):
        """Print the input shape, the model's own summary and the output shape."""
        print('input:', self._input.shape)
        self.model.summary()
        print('output:', self._output.shape)

    def train(self, X_train, y_train, X_eval, y_eval, categories, epochs=20, require_improve=3):
        """Train from freshly initialised variables and score the evaluation set.

        Training stops early once the validation accuracy has not improved for
        more than ``require_improve`` epochs. The evaluation report is produced
        after the loop in every case, so a run that uses all ``epochs`` also
        returns a score.

        Args:
            X_train: Training inputs, one row per sample.
            y_train: One-hot training labels.
            X_eval: Evaluation inputs.
            y_eval: One-hot evaluation labels.
            categories: Class names, ordered by class id, used in the report.
            epochs: Maximum number of passes over the training data.
            require_improve: Number of epochs without improvement tolerated
                before stopping.

        Returns:
            Accuracy on the evaluation set, as computed by
            ``sklearn.metrics.accuracy_score`` with the final weights.
        """
        session = tf.Session()
        try:
            # Every call starts from newly initialised variables.
            session.run(tf.global_variables_initializer())
            session.run(tf.local_variables_initializer())

            best_val_acc = 0.0
            last_improved = 0

            for e in range(epochs):
                start_time = time.time()
                loss, acc = self._train(X_train, y_train, session)
                duration = time.time() - start_time

                val_loss, val_acc = self._eval(X_eval, y_eval, session)

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    last_improved = e
                    improved_str = '*'
                else:
                    improved_str = ''

                output = 'Epoch: {:>1}, Train Loss: {:>6.4}, Train Acc: {:>6.2%}, Val Loss: {:>6.4}, Val Acc: {:>6.2%}, Time: {:.2f}s {}'
                print(output.format(e + 1, loss, acc, val_loss, val_acc, duration, improved_str))

                if e - last_improved > require_improve:
                    print("No optimization for a long time, auto-stopping...")
                    break

            # Final evaluation: class ids from the one-hot labels and from the
            # predicted probabilities.
            y_test_cls = np.argmax(y_eval, 1)
            y_test_pred_cls = np.argmax(self.predict(X_eval, session), 1)
            accuracy_score = metrics.accuracy_score(y_test_cls, y_test_pred_cls)

            # Pin the label set so the report and the matrix keep one row per
            # category even if a class is absent from this evaluation split.
            class_ids = list(range(len(categories)))

            # Per-class precision, recall and F1.
            print("Precision, Recall and F1-Score...")
            print(metrics.classification_report(
                y_test_cls, y_test_pred_cls,
                labels=class_ids, target_names=categories))

            # Rows are the true classes, columns the predicted classes.
            print("Confusion Matrix...")
            print(metrics.confusion_matrix(y_test_cls, y_test_pred_cls, labels=class_ids))
        finally:
            # Release the session even if training or evaluation raised.
            session.close()
        return accuracy_score

    def _train(self, X_train, y_train, session):
        """Run one optimisation pass over the training data.

        Returns:
            ``(loss, acc)``: unweighted means of the per-batch loss and accuracy.
        """
        session.run(
            fetches=self._ds_it.initializer,
            feed_dict={
                self._ds_x: X_train,
                self._ds_y: y_train
            })
        loss, acc = [], []
        while True:
            try:
                _, vloss, vacc = session.run(
                    fetches=[self._train_op, self._loss, self._accuracy])
            except tf.errors.OutOfRangeError:
                # The iterator is exhausted: the epoch is over.
                break
            loss.append(vloss)
            acc.append(vacc)

        return np.mean(loss), np.mean(acc)

    def _eval(self, X_val, y_val, session):
        """Compute loss and accuracy over ``X_val`` without updating weights.

        Returns:
            ``(loss, acc)``: per-sample averages; each batch is weighted by its
            size so that a smaller last batch is not over-represented.
        """
        session.run(
            fetches=self._ds_it.initializer,
            feed_dict={
                self._ds_x: X_val,
                self._ds_y: y_val
            })

        loss, acc = 0, 0
        while True:
            try:
                # The labels are fetched only to learn the size of this batch.
                l, vloss, vacc = session.run(
                    fetches=[self._labels, self._loss, self._accuracy])
            except tf.errors.OutOfRangeError:
                break
            loss += vloss * len(l)
            acc += vacc * len(l)

        return loss / len(X_val), acc / len(X_val)

    def predict(self, X, session):
        """Return the class probabilities for every row of ``X``.

        Args:
            X: Inputs, one row per sample.
            session: Open session holding the trained variables.

        Returns:
            List with one list of ``output_length`` probabilities per sample.
        """
        session.run(self._ds_it.initializer,
                    feed_dict={
                        self._ds_x: X,
                        # The pipeline needs labels of matching length; their
                        # values are never read when fetching probabilities.
                        self._ds_y: np.empty((len(X), self.output_length))
                    })

        pred = []
        while True:
            try:
                batch_probs = session.run(self._probabilities)
            except tf.errors.OutOfRangeError:
                break
            pred.extend(batch_probs.tolist())

        return pred

def _create_dense_layer(x, output_length):
    '''Creates a dense (affine) layer without activation.
    # Parameters:
    x: 2-D tensor; its second dimension must be statically known.
    output_length: Number of output units.
    # Returns:
    The tensorflow node produced by `tf.nn.xw_plus_b(x, W, b)`.
    '''
    n_inputs = x.shape[1].value

    # Weights: truncated normal with stddev 0.1.
    weight_init = tf.truncated_normal(
        shape=[n_inputs, output_length],
        stddev=0.1)
    weights = tf.Variable(initial_value=weight_init)

    # Bias: truncated normal with the default stddev.
    bias_init = tf.truncated_normal(shape=[output_length])
    biases = tf.Variable(initial_value=bias_init)

    return tf.nn.xw_plus_b(x, weights, biases)

In [7]:
class KimConvolutionalModel:
    '''
    Convolutional sentence feature extractor.
    Implementation proposal of: https://arxiv.org/pdf/1408.5882.pdf

    Calling an instance on a batch of word ids builds: embedding lookup(s),
    one convolution per configured window size, a sum over the embedding
    channels, max-over-time pooling, concatenation and dropout.

    NOTE: dropout is part of the graph unconditionally; there is no switch to
    disable it for evaluation or prediction.
    '''
    def __init__(self,
        embeddings_configuration,
        conv_configurations = ((3, 100), (4, 100), (5, 100)),
        drop_rate           = 0.5):
        '''Constructor.
        # Parameters:
        embeddings_configuration: List of embeddings configuration. Each
            configuration is a pair of the form (embedding, trainable).
            `embedding` is a numpy array and `trainable` is a boolean that
            indicates whether that embedding is trainable or not.
        conv_configurations: Sequence of pairs. Each pair represents a
            convolution configuration of the form (filter_size, num_filters).
        drop_rate: Fraction of the features dropped by the dropout layer;
            must satisfy 0 <= drop_rate < 1.
        # Raises:
        ValueError: If `drop_rate` is outside [0, 1).
        '''
        if not 0.0 <= drop_rate < 1.0:
            raise ValueError(
                'drop_rate must be in [0, 1), got {!r}'.format(drop_rate))

        self._embeddings_configuration = embeddings_configuration
        self._conv_configurations = conv_configurations
        self._drop_rate = drop_rate

    def __call__(self, input):
        '''Builds the graph for `input` and returns the feature node.
        # Parameters:
        input: Tensor of word ids, one row per sample.
        # Returns:
        The node holding the concatenated, pooled features after dropout.
        '''
        # One embedding lookup per configuration, stacked on a new axis 1
        # (the "channel" axis).
        self._embeddings_tf = tf.stack(
            values = [
                self._create_embedding_layer(e, input)
                for e in self._embeddings_configuration],
            axis = 1
        )

        self._convolutions_tf = self._create_convolutional_layers(
            self._conv_configurations, self._embeddings_tf)

        self._add_tf = self._create_add_layers(self._convolutions_tf)

        self._poolings_tf = self._create_maxpooling_layer(
            self._add_tf)

        self._reshape_tf = self._create_reshape_layer(self._poolings_tf)
        # tf.nn.dropout expects the probability of KEEPING a unit, which is
        # the complement of the configured drop rate.
        self._dropout_tf = tf.nn.dropout(
            self._reshape_tf,
            keep_prob = 1.0 - self._drop_rate)

        return self._dropout_tf

    def summary(self):
        '''Prints the static shape of every intermediate node.'''
        print('embedding:', str(self._embeddings_tf.shape))
        for c in self._convolutions_tf:
            print('conv:', str(c.shape))
        for a in self._add_tf:
            print('add:', str(a.shape))
        for p in self._poolings_tf:
            print('pool:', str(p.shape))
        print('reshape:', str(self._reshape_tf.shape))

    def _create_embedding_layer(self, embedding_configuration, input_x):
        '''Creates the lookup of `input_x` in one embedding matrix.
        # Parameters:
        embedding_configuration: Pair (embedding, trainable).
        input_x: Tensor of word ids; it is cast to int32 before the lookup.
        '''
        return tf.nn.embedding_lookup(
            params = tf.Variable(
                initial_value = embedding_configuration[0],
                trainable     = embedding_configuration[1]),
            ids = tf.cast(input_x, 'int32')
        )

    def _create_convolutional_layers(self, configuration, input_embedding):
        '''Creates the convolutional layers.
        # Parameters:
        configuration: A list. It must be of the form
            [(filter_size, num_filters), ...]
        input_embedding: 4-D node; its last dimension must be statically known.
        # Returns:
        A list of tensorflow nodes. Each node 'i' computes the configuration 'i'.
        '''
        # Size of the last input axis; it is the same for every configuration.
        embedding_size = input_embedding.shape[3].value

        convolutions = []
        for window_size, num_filters in configuration:
            # A filter covers one position on axis 1, `window_size`
            # consecutive positions on axis 2 and the whole last axis.
            filter_shape = [1, window_size, embedding_size, num_filters]

            # Create weights and bias
            W = tf.Variable(
                initial_value=tf.truncated_normal(
                    shape=filter_shape,
                    stddev=0.1))
            b = tf.Variable(
                initial_value=tf.truncated_normal(
                    shape=[num_filters]))

            conv = tf.nn.conv2d(
                input=input_embedding,
                filter=W,
                strides=[1, 1, 1, 1],
                padding="VALID")
            convolutions.append(tf.nn.relu(tf.nn.bias_add(conv, b)))

        return convolutions

    def _create_add_layers(self, convolutions):
        '''Sums every convolution output over axis 1, keeping that axis.'''
        return [
            tf.reduce_sum(
                input_tensor = c,
                axis=1,
                keepdims=True)
            for c in convolutions
        ]

    def _create_maxpooling_layer(self, tensors):
        '''Creates the maxpooling layer. Computes maxpooling on each node
        # Parameters:
        tensors: List of tensorflow nodes.
        # Returns:
        A list of tensorflow nodes. Each node 'i' computes the maxpooling of
        node 'i' over its whole axis 2, reshaped to two dimensions.
        '''
        return [
            tf.reshape(
                tensor = tf.nn.max_pool(
                    value=t,
                    ksize=[1, 1, t.shape[2], 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID'),
                shape = [-1, t.shape[3]]
            )
            for t in tensors
        ]

    def _create_reshape_layer(self, tensors):
        '''Creates a flatten layer by concatenating the nodes along axis 1.
        '''
        return tf.concat(tensors, axis=1)

In [8]:
# One embedding channel: the GloVe matrix as float32, flagged as trainable.
word_vector = embedding.astype('float32')
model = KimConvolutionalModel(embeddings_configuration=[(word_vector, True)])

classifier = Classifier(model=model, input_length=seq_length, output_length=num_classes)

# Build the graph once, then print the shape of every stage.
classifier.compile(batch_size=32)
classifier.summary()

input: (?, 37)
embedding: (?, 1, 37, 100)
conv: (?, 1, 35, 100)
conv: (?, 1, 34, 100)
conv: (?, 1, 33, 100)
add: (?, 1, 35, 100)
add: (?, 1, 34, 100)
add: (?, 1, 33, 100)
pool: (?, 100)
pool: (?, 100)
pool: (?, 100)
reshape: (?, 300)
output: (?, 6)


In [9]:
# Run the cross-validation; the returned list of per-fold accuracies is the cell's output.
train_10_fold(trainData, categories)

5952 37 37
Fold:  1
Epoch: 1, Train Loss:  1.755, Train Acc: 41.67%, Val Loss:  1.207, Val Acc: 56.88%, Time: 21.24s *
Epoch: 2, Train Loss: 0.9805, Train Acc: 64.07%, Val Loss: 0.9755, Val Acc: 64.26%, Time: 22.44s *
Epoch: 3, Train Loss:  0.728, Train Acc: 73.73%, Val Loss:  0.916, Val Acc: 66.28%, Time: 22.59s *
Epoch: 4, Train Loss: 0.5807, Train Acc: 79.53%, Val Loss: 0.7895, Val Acc: 71.81%, Time: 22.25s *
Epoch: 5, Train Loss: 0.4607, Train Acc: 84.44%, Val Loss: 0.7655, Val Acc: 75.00%, Time: 22.74s *
Epoch: 6, Train Loss: 0.3705, Train Acc: 87.25%, Val Loss: 0.7367, Val Acc: 75.84%, Time: 22.36s *
Epoch: 7, Train Loss: 0.3184, Train Acc: 88.90%, Val Loss:  0.725, Val Acc: 75.84%, Time: 22.58s *
Epoch: 8, Train Loss: 0.2623, Train Acc: 91.21%, Val Loss: 0.7406, Val Acc: 77.18%, Time: 22.70s *
Epoch: 9, Train Loss: 0.2234, Train Acc: 92.18%, Val Loss: 0.7047, Val Acc: 77.01%, Time: 22.46s 
Epoch: 10, Train Loss:  0.191, Train Acc: 93.74%, Val Loss: 0.7558, Val Acc: 77.85%, Time:

Epoch: 1, Train Loss:  1.809, Train Acc: 40.29%, Val Loss:  1.152, Val Acc: 57.31%, Time: 25.15s *
Epoch: 2, Train Loss: 0.9967, Train Acc: 63.70%, Val Loss: 0.9202, Val Acc: 66.89%, Time: 24.92s *
Epoch: 3, Train Loss: 0.7233, Train Acc: 74.21%, Val Loss: 0.7761, Val Acc: 69.08%, Time: 24.41s *
Epoch: 4, Train Loss:  0.579, Train Acc: 78.91%, Val Loss: 0.7043, Val Acc: 74.29%, Time: 24.81s *
Epoch: 5, Train Loss: 0.4716, Train Acc: 83.53%, Val Loss: 0.6274, Val Acc: 77.48%, Time: 24.61s *
Epoch: 6, Train Loss: 0.3819, Train Acc: 87.02%, Val Loss: 0.7028, Val Acc: 73.78%, Time: 24.45s 
Epoch: 7, Train Loss: 0.3197, Train Acc: 89.10%, Val Loss: 0.6442, Val Acc: 77.31%, Time: 24.79s 
Epoch: 8, Train Loss: 0.2753, Train Acc: 90.69%, Val Loss: 0.6172, Val Acc: 78.49%, Time: 24.55s *
Epoch: 9, Train Loss:  0.228, Train Acc: 92.44%, Val Loss: 0.5674, Val Acc: 79.33%, Time: 24.62s *
Epoch: 10, Train Loss: 0.1932, Train Acc: 93.42%, Val Loss: 0.6438, Val Acc: 75.46%, Time: 24.27s 
Epoch: 11, T

Epoch: 4, Train Loss: 0.5626, Train Acc: 79.86%, Val Loss: 0.8232, Val Acc: 72.10%, Time: 24.54s *
Epoch: 5, Train Loss: 0.4557, Train Acc: 83.89%, Val Loss: 0.8093, Val Acc: 71.76%, Time: 24.40s 
Epoch: 6, Train Loss: 0.3643, Train Acc: 87.45%, Val Loss: 0.7852, Val Acc: 73.45%, Time: 24.24s *
Epoch: 7, Train Loss: 0.3059, Train Acc: 89.34%, Val Loss:  0.776, Val Acc: 73.61%, Time: 24.69s *
Epoch: 8, Train Loss: 0.2579, Train Acc: 91.15%, Val Loss: 0.8493, Val Acc: 71.76%, Time: 23.56s 
Epoch: 9, Train Loss: 0.2251, Train Acc: 92.48%, Val Loss:  0.743, Val Acc: 75.13%, Time: 22.84s *
Epoch: 10, Train Loss:   0.19, Train Acc: 93.63%, Val Loss: 0.8269, Val Acc: 75.97%, Time: 24.34s *
Epoch: 11, Train Loss: 0.1631, Train Acc: 94.57%, Val Loss: 0.8694, Val Acc: 74.29%, Time: 24.75s 
Epoch: 12, Train Loss: 0.1555, Train Acc: 94.83%, Val Loss: 0.7945, Val Acc: 74.79%, Time: 24.42s 
Epoch: 13, Train Loss: 0.1329, Train Acc: 95.72%, Val Loss: 0.8287, Val Acc: 76.47%, Time: 24.27s *
Epoch: 14,

Epoch: 6, Train Loss: 0.3691, Train Acc: 87.17%, Val Loss: 0.7663, Val Acc: 72.27%, Time: 17.07s *
Epoch: 7, Train Loss: 0.3041, Train Acc: 89.60%, Val Loss: 0.7724, Val Acc: 72.94%, Time: 17.08s *
Epoch: 8, Train Loss: 0.2627, Train Acc: 90.78%, Val Loss: 0.7474, Val Acc: 73.45%, Time: 17.08s *
Epoch: 9, Train Loss:  0.216, Train Acc: 92.86%, Val Loss: 0.7633, Val Acc: 74.12%, Time: 17.06s *
Epoch: 10, Train Loss: 0.1938, Train Acc: 93.50%, Val Loss: 0.7533, Val Acc: 74.45%, Time: 17.07s *
Epoch: 11, Train Loss: 0.1679, Train Acc: 94.40%, Val Loss: 0.7652, Val Acc: 76.81%, Time: 17.14s *
Epoch: 12, Train Loss: 0.1435, Train Acc: 95.04%, Val Loss: 0.7585, Val Acc: 74.29%, Time: 17.14s 
Epoch: 13, Train Loss:  0.137, Train Acc: 95.47%, Val Loss: 0.8449, Val Acc: 76.13%, Time: 17.06s 
Epoch: 14, Train Loss: 0.1217, Train Acc: 95.77%, Val Loss: 0.7731, Val Acc: 76.81%, Time: 17.05s 
Epoch: 15, Train Loss:  0.109, Train Acc: 96.13%, Val Loss: 0.8009, Val Acc: 77.65%, Time: 17.02s *
Epoch: 

[0.7953020134228188,
 0.7734899328859061,
 0.7798319327731092,
 0.7831932773109244,
 0.7815126050420168,
 0.7394957983193278,
 0.7764705882352941,
 0.7495798319327731,
 0.7781512605042017,
 0.7630252100840336]