In [2]:
# -*- coding:utf8 -*-
"""
该模型主要用于将文本转化为id形式，同时提供分词和训练词向量的静态函数
"""

import jieba
import pickle
import os
import re
import time
from zhon.hanzi import punctuation
from gensim.models import Word2Vec
from functools import lru_cache

class Vocab(object):
    
    def __init__(self, filename=None,  initial_tokens=None):
        self.id2token = {}
        self.token2id = {}

        self.embed_dim = None
        self.embeddings = None

        self.pad_token = '<blank>'
        self.unk_token = '<unk>'
        
        self.initial_tokens = initial_tokens.copy() if initial_tokens is not None else []
        self.initial_tokens.insert(0, self.unk_token)
        self.initial_tokens.insert(0, self.pad_token)
        
        for token in self.initial_tokens:
            self.add(token)
            
            
    def add(self, token) -> int:
        """
        Adds the token to vocab
        Args:
            token: a string
        """
        if token in self.token2id:
            idx = self.token2id[token]
        else:
            idx = len(self.id2token)
            self.id2token[idx] = token
            self.token2id[token] = idx

        return idx 
    
    @property
    @lru_cache(1)
    def size(self) -> int:
        """
        Get the size of vocabulary
        Returns:
            an integer indicating the size
        """
        return len(self.id2token)
    
    def get_id(self, token) -> int:
        """
        Gets the id of a token, returns the id of unk token if token is not in vocab
        Args:
            key: a string indicating the word
        Returns:
            an integer
        """
        try:
            return self.token2id[token]
        except KeyError:
            return self.token2id[self.unk_token]
        
    def get_token(self, idx) -> str:
        """
        Gets the token corresponding to idx, returns unk token if idx is not in vocab
        Args:
            idx: an integer
        returns:
            a token string
        """
        try:
            return self.id2token[idx]
        except KeyError:
            return self.unk_token
        
    def convert_to_ids(self, tokens) -> list:
        """
        Convert a list of tokens to ids, use unk_token if the token is not in vocab.
        Args:
            tokens: a list of token
        Returns:
            a list of ids
        """
        vec = [self.get_id(term) for term in tokens]
        return vec
    
    def recover_from_ids(self, ids, stop_id=None) -> list:
        """
        Convert a list of ids to tokens, stop converting if the stop_id is encountered
        Args:
            ids: a list of ids to convert
            stop_id: the stop id, default is None
        Returns:
            a list of tokens
        """
        tokens = []
        for i in ids:
            tokens += [self.get_token(i)]
            if stop_id is not None and i == stop_id:
                break
        return tokens
    
    def recover_id2token(self) -> dict:
        """
        Rebuild the id2token
        Returns:
            a dict about converting id to token
        """
        id2token_temp = {}
        for token_iter, idx_iter in self.token2id:
            id2token_temp[idx_iter] = token_iter
            
        return id2token_temp
                    
    
    def load_pretrained_embeddings(self, trained_embeddings):
        """
        Loads the pretrained embeddings
        Args:
            trained_embeddings: the pretrained embeddings
        """        
        if self.embed_dim is None:
            self.embed_dim = len(trained_embeddings[0])

        # load embeddings
        self.embeddings = np.zeros([self.size, self.embed_dim])
        for idx, trained_vec in enumerate(trained_embeddings):
            self.embeddings[idx+2] = trained_vec 
            

    def randomly_init_embeddings(self, embed_dim):
        """
        Randomly initializes the embeddings for each token
        Args:
            embed_dim: the size of the embedding for each token
        """
        self.embed_dim = embed_dim
        self.embeddings = np.random.rand(self.size(), embed_dim)
        for token in [self.pad_token, self.unk_token]:
            self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim])
            
    
    def save(self, mode='pkl', base_dir = '.'):
        """
        Save the dict and embedding
        Args:
            mode: the way to save
            base_dir: the root path of the save file
        """
        print("保存字典和词向量...........\n")
        model_dir = f'{base_dir}/model'
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
            
        if mode == 'pkl':
            with open(f'{model_dir}/word_idx_dict.pkl', 'wb') as f:
                pickle.dump(self.token2id , f)
            
            with open(f'{model_dir}/word_vectors_arr.pkl', 'wb') as f:
                pickle.dump(self.embeddings, f)
        
        if mode == 'txt':
            with open(f'{model_dir}/word_idx_dict.pkl', 'wb') as f:
                for token_iter, id_iter in self.token2id.items():
                    f.write(token_iter + ' ' + id_iter + '\n')
                    
            with open(f'{model_dir}/word_idx_dict.pkl', 'wb') as f:
                for embedding_vec_iter in self.embeddings:
                    f.write(embedding_vec_iter + '\n')
                
        print("字典和词向量已保存到该目录下！\n")
        
        
    def load(self, mode='pkl', base_dir = '.'):
        """
        Loads the dict and embedding 
        Args:
            mode: the way to save
            base_dir: the root path of the load file
        """
        print("加载字典和词向量...........\n")
                
        model_dir = f'{base_dir}/model'
            
        if mode == 'pkl':
            with open(f'{model_dir}/word_idx_dict.pkl', 'rb') as f:
                self.token2id = pickle.load(f)
            
            self.id2token = self.recover_id2token()
            
                
            with open(f'{model_dir}/word_vectors_arr.pkl', 'rb') as f:
                self.embeddings = pickle.load(f)
        # TO DO        
        if mode == 'txt':
            pass
                
        print("字典和词向量已保存到该目录下！\n")

             
    
    @staticmethod
    def tokenize(text, filter_stop_word = None, lower=False) -> list:
        """
        The function to tokenize
        Args:
            filter_stop_word: the stop words list
            lower: lower or not
        """
        text = re.sub(r'[%s]+' % punctuation, '', text) 
        if lower:
            text = text.lower()
        tokens = jieba.lcut(text)
        if filter_stop_word:
            stop_word_set = set(filter_stop_word)
            tokens = filter(lambda w: w not in stop_word_set, tokens)
        return list(tokens)
    
    @staticmethod
    def train_word2vec(sentences, base_dir = '.'):
        """
        Train a word2vec model
        Args:
            sentences: the train data
            base_dir: the root path of the load file
        """
        time_s = time.time()
        vec_size = 300
        win_size = 1
        print ('begin to train model...')
        w2v_model = Word2Vec(sentences=sentences, 
                             size=50, 
                             window=1, 
                             min_count=1,
                             workers=20, 
                             sg=1, 
                             iter=25, 
                             hs=0)
        print("train model success\n")

        word2vec_dir = f'{base_dir}/word2vec'
        if not os.path.exists(word2vec_dir):
            os.makedirs(word2vec_dir)
        
        w2v_model.save(f'{word2vec_dir}/word2vec.model')
        print ('save model success, model_path=%s, time=%.4f sec.' 
                % (f'{word2vec_dir}/word2vec.model', time.time() - time_s))
        
        return w2v_model



In [3]:
import os
import json
import logging
import numpy as np
import tensorflow as tf
from collections import Counter


class RCDataset(object):
    """
    This module implements the APIs for loading and using  dataset
    """

    def __init__(self, train_file=[], dev_file=[], test_file=[], pad_id=0):

        self.logger = logging.getLogger("RC")

        self.pattern_symbol = re.compile(r'[\(\)\[\]\{\},:;!~@^_$¥`<>]')

        self.pad_id = pad_id
        self.limit = 60

        self.train_contents = []
        self.dev_contents = []
        self.test_contents = []

        self.train_entities1_position = []
        self.dev_entities1_position = []
        self.test_entities1_position = []

        self.train_entities2_position = []
        self.dev_entities2_position = []
        self.test_entities2_position = []

        self.train_labels = []
        self.dev_labels = []
        self.test_labels = []

        if train_file:
            self.train_contents, self.train_entities1_position, self.train_entities2_position, self.train_labels = self._load_dataset(
                train_file)
            self.logger.info('Train set size: {} titles.'.format(
                len(self.train_labels)))

        if dev_file:
            self.dev_contents, self.dev_entities1_position, self.dev_entities2_position, self.dev_labels = self._load_dataset(
                dev_file)
            self.logger.info('Dev set size: {} titles.'.format(
                len(self.dev_labels)))

        if test_file:
            self.test_contents, self.test_entities1_position, self.test_entities2_position, self.test_labels = self._load_dataset(
                test_file)
            self.logger.info('Test set size: {} titles.'.format(
                len(self.test_labels)))
        
        self.max_content_len = len(max(self.all_contents,key=lambda x:len(x))) + 2


        self.raw_test_contents = self.test_contents.copy()

        self.categories = sorted(list(set(self.train_labels + self.dev_labels + self.test_labels)))
        
        labelsMapping = {'Cause-Effect(e1,e2)': 0,
                         'Cause-Effect(e2,e1)': 1,
                         'Component-Whole(e1,e2)': 2,
                         'Component-Whole(e2,e1)': 3,
                         'Content-Container(e1,e2)': 4,
                         'Content-Container(e2,e1)': 5,
                         'Entity-Destination(e1,e2)': 6,
                         'Entity-Destination(e2,e1)': 7,
                         'Entity-Origin(e1,e2)': 8,
                         'Entity-Origin(e2,e1)': 9,
                         'Instrument-Agency(e1,e2)': 10,
                         'Instrument-Agency(e2,e1)': 11,
                         'Member-Collection(e1,e2)': 12,
                         'Member-Collection(e2,e1)': 13,
                         'Message-Topic(e1,e2)': 14,
                         'Message-Topic(e2,e1)': 15,
                         'Product-Producer(e1,e2)': 16,
                         'Product-Producer(e2,e1)': 17,
                         'Other': 18
                        }
        
        self.cat2id = labelsMapping
        self.id2cat = dict(zip(range(len(self.categories)), self.categories))

        self.num_class = len(self.cat2id)

    def _load_dataset(self, data_path):
        """
        Loads the dataset
        Args:
            data_path: the data file to load
        """
        with open(data_path, mode='r', encoding='utf-8',errors='ignore') as fin:
            lines = fin.readlines()
            contents, entities1_pos, entities2_pos, relations = [], [], [], []

            for i in range(0, len(lines), 4):
                relation = lines[i + 1].strip()
                question = lines[i].strip().split('\t')[1][1:-2].lower()
                question, e1_begin, e1_end, e2_begin, e2_end = self._process_question(question)
                contents.append(question)
                entities1_pos.append([e1_begin, e1_end])
                entities2_pos.append([e2_begin, e2_end])
                relations.append(str(relation))

        return contents, entities1_pos, entities2_pos, relations

    def _remove_tag(self, x):
        x = x.replace('<e1>', '')
        x = x.replace('</e1>', '')
        x = x.replace('<e2>', '')
        x = x.replace('</e2>', '')
        return x

    def _process_question(self, question):
        question = question.replace("'", " '")
        question = question.replace(",", " ,")
        question = question.replace(".", " .")
        question = question.split(' ')
        e1_begin = e1_end = e2_begin = e2_end = 0

        for i, item in enumerate(question):
            if item.startswith('<e1>'):
                e1_begin = i
            if item.endswith('</e1>'):
                e1_end = i
            if item.startswith('<e2>'):
                e2_begin = i
            if item.endswith('</e2>'):
                e2_end = i

        question = list(map(self._remove_tag, question))

        return question, e1_begin, e1_end, e2_begin, e2_end

    # 将位置距离转化为正的标签 
    def _map_position(self, x):
        '''
        clip the postion range:
        '''
        if x < -self.limit:
            return 0
        if -self.limit <= x <= self.limit:
            return x + self.limit + 1
        if x > self.limit:
            return self.limit * 2 + 1

    def convert_to_ids(self, vocab):
        """
        Convert the tokens to ids
        Args:
            vocab: the convert vocab
        """

        if self.train_contents:
            self.train_contents = [[vocab.convert_to_ids(contents)]
                                   for contents in self.train_contents]

        if self.dev_contents:
            self.dev_contents = [[vocab.convert_to_ids(contents)]
                                 for contents in self.dev_contents]
            self.logger.info('Dev set size: {} titles.'.format(
                len(self.dev_labels)))

        if self.test_contents:
            self.test_contents = [[vocab.convert_to_ids(contents)]
                                  for contents in self.test_contents]

    @property
    def all_contents(self) -> list:
        """
        Get all data
        Args:
            the list of all data
        """
        
        return self.train_contents + self.dev_contents + self.test_contents

    @property
    def all_labels(self) -> list:
        """
        Get the all labels
        Returns:
            the list of all labels
        """
        return self.train_labels + self.dev_labels + self.test_labels

    @property
    @lru_cache(1)
    def size(self) -> int:
        """
        Get the size of all data
        Returns:
            an integer indicating the size
        """
        return len(self.train_labels + self.dev_labels + self.test_labels)

    @property
    @lru_cache(1)
    def train_size(self) -> int:
        """
        Get the size of train data
        Returns:
            the size of train data
        """
        return len(self.train_labels)

    @property
    @lru_cache(1)
    def dev_size(self) -> int:
        """
        Get the size of dev data
        Returns:
            the size of dev data
        """
        return len(self.dev_labels)

    @property
    @lru_cache(1)
    def test_size(self) -> int:
        """
        Get the size of  test data
        Returns:
            the size of test data
        """
        return len(self.test_labels)

    def _dynamic_padding(self, batch_data):
        """
        Dynamically pads the batch_data with pad_id
        """
        pad_content_len = self.max_content_len
        batch_data['contents'] = [(ids + [self.pad_id] *(pad_content_len - len(ids)))[:pad_content_len] 
                                  for ids in batch_data['contents']]
        return batch_data


    def _one_mini_batch(self, data) -> dict:
        """
        Get one mini batch
        Args:
            data: all data
        Returns:
            one batch of data
        """
        batch_data = {
            'contents': [list(contents)[0] for contents in data[0]],
            'e1_position': [list(contents) for contents in list(data[1])],
            'e2_position': [list(contents) for contents in list(data[2])],
#             'e1_entities': [],
#             'e2_entities': [],
            'e1_distance': [],
            'e2_distance': [],
            'labels': data[3],
            'contents_length': []
        }
        labelId_list = []
        for sent_idx, [content, e1_position, e2_position] in enumerate(zip(batch_data['contents'], 
                                                                         batch_data['e1_position'], 
                                                                         batch_data['e2_position'])):
            
            batch_data['contents_length'].append(len(content))
#             batch_data['e1_entities']append(content[e1_position[1]]) 
#             batch_data['e2_entities']append(content[e2_position[1]]) 
            
            batch_data['e1_distance'].append([self._map_position(idx - e1_position[1]) 
                                              for idx, _ in enumerate(content)])
            batch_data['e1_distance'][sent_idx] = (batch_data['e1_distance'][sent_idx] + 
                                                   [self.limit*2+2] * (self.max_content_len - len(content)))[:self.max_content_len] 
            
            batch_data['e2_distance'].append([self._map_position(idx - e2_position[1]) 
                                              for idx, _ in enumerate(content)])
            batch_data['e2_distance'][sent_idx] = (batch_data['e2_distance'][sent_idx] + 
                                                   [self.limit*2+2] * (self.max_content_len - len(content)))[:self.max_content_len] 
            
            lid = self.cat2id[batch_data['labels'][sent_idx]]
            labelId_list.append(lid)

        batch_data['labels'] = tf.keras.utils.to_categorical(labelId_list, num_classes=len(self.categories))

        batch_data = self._dynamic_padding(batch_data)

        return batch_data

    def gen_mini_batches(self, set_name='train', batch_size=256):
        """
        Generate  batches
        Args:
            set_name: the mode
            batch_size: the size of one batch
        """
        if set_name == 'train':
            x = self.train_contents
            e1_pos = self.train_entities1_position
            e2_pos = self.train_entities1_position
            y = self.train_labels
            shuffle = True
        elif set_name == 'dev':
            x = self.dev_contents
            e1_pos = self.dev_entities1_position
            e2_pos = self.dev_entities1_position
            y = self.dev_labels
            shuffle = False
        elif set_name == 'test':
            x = self.test_contents
            e1_pos = self.test_entities1_position
            e2_pos = self.test_entities1_position
            y = self.test_labels
            shuffle = False
        else:
            raise NotImplementedError(
                'No data set named as {}'.format(setName))

        data_len = len(x)
        num_batch = int((data_len - 1) / batch_size) + 1

        if shuffle:
            indices = np.random.permutation(np.arange(data_len))
            x_shuffle = np.array(x)[indices]
            e1_pos_shuffle = np.array(e1_pos)[indices]
            e2_pos_shuffle = np.array(e2_pos)[indices]
            y_shuffle = np.array(y)[indices]
        else:
            x_shuffle = x
            e1_pos_shuffle = e1_pos
            e2_pos_shuffle = e2_pos
            y_shuffle = y

        for i in range(num_batch):
            start_id = i * batch_size
            end_id = min((i + 1) * batch_size, data_len)
            yield self._one_mini_batch([x_shuffle[start_id:end_id],
                                        e1_pos_shuffle[start_id:end_id],
                                        e2_pos_shuffle[start_id:end_id],
                                        y_shuffle[start_id:end_id]
                                       ])

    def get_category(self, idx):
        """
        Get the category corresponding to idx, returns None if idx is not in vocab
        Args:
            idx: an integer
        returns:
            a token string or None
        """
        try:
            return self.id2cat[idx]
        except KeyError:
            return None

    def save(self, id_categories_dict_path, is_id2categories=True):
        """
        Save the needed data
        Args:
            id_categories_dict_path: save path
            is_id2categories: whether to save the dict of id2categories
        """
        with open(id_categories_dict_path, 'wb') as f:
            pickle.dump(self.id2cat, f)

### 2. 实际过程

In [5]:
# 读取数据
dataSet = RCDataset(train_file='../raw_data/train.txt', 
                    dev_file='../raw_data/test.txt',
                    test_file='../raw_data/test.txt')

### 2.1 词向量和词典

#### 2.1.1 加载本地已有的词向量和词典

In [6]:
load_vocab_words_list = []

with open('../raw_data/pretrained_w2v/vocab.txt', mode='r', encoding='utf-8',errors='ignore') as fin:
    lines = fin.readlines()
    for word in lines:
        load_vocab_words_list.append(word.split('\n')[0])

In [7]:
import numpy
word_2x = numpy.load('../raw_data/pretrained_w2v/w2v_50.npy')

In [8]:
vocab = Vocab(initial_tokens=load_vocab_words_list)

In [9]:
vocab.load_pretrained_embeddings(word_2x)

#### 2.1.2 生成词向量和词典

In [None]:
# word2vecModel = Vocab.train_word2vec(dataSet.train_contents, base_dir='../data')
# vocab = Vocab(initial_tokens=word2vecModel.wv.index2word)
# vocab.load_pretrained_embeddings(word2vecModel.wv.vectors)

### 2.2 将文本转化成ids

In [10]:
dataSet.convert_to_ids(vocab)

## 二、模型的构建

In [55]:
import os
import time
import logging
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from typing import Union
from datetime import datetime, timedelta



class CNN_att(object):
    """
    Implements the main CNN RelationClassification 
    """
    def __init__(self, args, vocab):

        # logging
        self.logger = logging.getLogger("RC")
        self.log_every_n_batch = args.log_every_n_batch

        # basic config
        self.seq_length = args.seq_length
        self.pos_num = args.pos_num
        self.num_classes = args.num_classes
        self.use_dropout = args.use_dropout
        self.pos_embed_dim = args.pos_embed_dim
        self.num_filters = args.num_filters
        self.filter_list = [3, 4, 5]
        self.fc_size = args.fc_size

        self.optim_type = args.optim
        self.learning_rate = args.learning_rate
        self.clip = args.clip
        self.weight_decay = args.weight_decay
        
        # the vocab
        self.vocab = vocab
        
        # needed config
        self.input_all_lenth= self.vocab.embed_dim + self.pos_embed_dim*2
        self.init_value = tf.truncated_normal_initializer(stddev=0.1)

        # session info
        sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        sess_config.gpu_options.allow_growth = True
        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.1
        self.sess = tf.Session(config=sess_config)

        self._build_graph()

        # save info
        self.saver = tf.train.Saver()

        # initialize the model
        self.sess.run(tf.global_variables_initializer())

    def _build_graph(self):
        """
        Builds the computation graph with Tensorflow
        """
        start_t = time.time()
        self._setup_placeholders()
        self._embed()
        self._convolution_layer()
        self._classify()
        self._compute_loss()
        self._compute_acc()
        self._create_train_op()
        print('Time to build graph: {} s'.format(time.time() - start_t))
        param_num = sum([np.prod(self.sess.run(tf.shape(v))) for v in self.all_params])
        print('There are {} parameters in the model'.format(param_num))
        
    def _setup_placeholders(self):
        """
        Placeholders
        """
        # 输入和标签
        self.inputs_x = tf.placeholder(tf.int32, [None, self.seq_length],
                                       name='inputs_x')
        self.distant_1 = tf.placeholder(tf.int32, [None, self.seq_length],
                                        name="dist_e1")
        self.distant_2 = tf.placeholder(tf.int32, [None, self.seq_length],
                                        name="dist_e2")
        self.labels_y = tf.placeholder(tf.int32, [None, self.num_classes],
                                       name='labels_y')
        # 词向量dropout保留的神经元比例
        self.emb_keep_prob = tf.placeholder(tf.float32, name='emb_keep_prob')
        # RNN的dropout保留的神经元比例
        self.rnn_keep_prob = tf.placeholder(tf.float32, name='rnn_keep_prob')
        # 全连接dropout保留的神经元比例
        self.fc_keep_prob = tf.placeholder(tf.float32, name='fc_keep_prob')
        # 输入的实际长度
        self.inputs_length = tf.placeholder(tf.int32, [None], name='input_length')

    # 词向量层
    def _embed(self):
        """
        The embedding layer
        """
        with tf.device('/cpu:0'), tf.variable_scope('embeddings'):
            self.word_embeddings = tf.get_variable(name='word_embeddings',
                                                   shape=[self.vocab.size, self.vocab.embed_dim],
                                                   initializer=tf.constant_initializer(self.vocab.embeddings),
#                                                    initializer=tf.random_uniform([self.vocab.size, self.vocab.embed_dim], -0.25, 0.25),
                                                   trainable=True)
            self.dist1_embeddings = tf.get_variable(name="pos1_embeddings",
                                                    dtype=tf.float32,
                                                    shape=[self.pos_num+1, self.pos_embed_dim])
            self.dist2_embeddings = tf.get_variable(name="pos2_embeddings",
                                                    dtype=tf.float32,
                                                    shape=[self.pos_num+1, self.pos_embed_dim])
            
            self.x_emb = tf.nn.embedding_lookup(self.word_embeddings, self.inputs_x)
            self.dist1_emb  = tf.nn.embedding_lookup(self.dist1_embeddings, self.distant_1)
            self.dist2_emb  = tf.nn.embedding_lookup(self.dist2_embeddings, self.distant_2)
            
            self.input_emb = tf.concat([self.x_emb, self.dist1_emb, self.dist2_emb], axis=-1, 
                                       name="input_emb")
            
            self.input_emb = tf.reshape(self.input_emb, [-1, self.seq_length, self.input_all_lenth, 1])
            
    def _convolution_layer(self):
        """
        The CNN layer
        """
        with tf.name_scope("convolution_layer"):
            # 这里使用的是1D卷积，卷积窗口是整个词向量加上位置向量的size
            w_windows = self.input_all_lenth
            
            pool_outputs = []
            for filter_size in self.filter_list:
                with tf.variable_scope('conv-%s' % filter_size):
                    cnn_w = tf.get_variable(shape=[filter_size, w_windows, 1, self.num_filters],
                                            initializer=self.init_value,
                                            name="cnn_w")
                    cnn_b = tf.get_variable(shape=[self.num_filters],
                                            initializer=tf.constant_initializer(0.1),
                                            name="cnn_b")
                    conv = tf.nn.conv2d(self.input_emb, cnn_w, strides=[1, 1, self.input_all_lenth, 1], padding="SAME")
                    R = tf.nn.relu(tf.nn.bias_add(conv, cnn_b),name="R") # [batch_size, max_len, 1, n_filters]
                    
                    R_pool = tf.nn.max_pool(R, ksize=[1,self.seq_length,1 , 1],
                                            strides=[1,self.seq_length,1, 1], 
                                            padding="SAME")  # [batch_size, 1, 1, n_filters]
                    pool_outputs.append(R_pool)
                    
            self.x_encode = tf.reshape(tf.concat(pool_outputs, 3), [-1, 3 * self.num_filters]) # [batch_size, 3 * num_filters]

    def _classify(self):
        """
        The classify layer
        """
        # 全连接层，后面接dropout以及relu激活
        with tf.name_scope("score"):
            # 一层全连接
            W_fc = tf.get_variable("W_fc", shape=[3*self.num_filters, self.fc_size], 
                                   initializer=self.init_value)
            b_fc = tf.Variable(tf.constant(0.1, shape=[self.fc_size]), name="b_fc")
            self.fc = tf.nn.xw_plus_b(self.x_encode, W_fc, b_fc)
            if self.use_dropout:
                self.fc = tf.nn.dropout(self.fc, self.fc_keep_prob)
            self.fc  = tf.nn.relu(self.fc, name="fc1")

            # 分类输出层
            W_output = tf.get_variable("W_output", shape=[self.fc_size, self.num_classes],
                                       initializer=self.init_value)
            b_output = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="b_output")
            self.logits = tf.nn.xw_plus_b(self.fc, W_output, b_output, name="fc2")
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits, name='softmax'), 1, name="model_pred")

    def _compute_loss(self):
        """
        The loss function
        """
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.labels_y)
        self.loss = tf.reduce_mean(cross_entropy)
        
        self.all_params = tf.trainable_variables()
        
        
        if self.weight_decay > 0:
            self.all_params = tf.trainable_variables()
            with tf.variable_scope('l2_loss'):
                l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params])
            self.loss += self.weight_decay * l2_loss
        
        if self.clip > 0:
            self.globle_step = tf.Variable(0,name="globle_step",trainable=False)
            self.tvars = tf.trainable_variables()
            self.grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.tvars), self.clip)
            
    def _compute_acc(self):
        """
        The acc
        """
        correct_pred = tf.equal(tf.argmax(self.labels_y , 1), self.y_pred_cls)
        self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    def _create_train_op(self):
        """
        Selects the training algorithm and creates a train operation with it
        """
        if self.optim_type == 'adagrad':
            self.optimizer = tf.train.AdagradOptimizer(self.learning_rate)
        elif self.optim_type == 'adam':
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        elif self.optim_type == 'rprop':
            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        elif self.optim_type == 'sgd':
            self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        else:
            raise NotImplementedError('Unsupported optimizer: {}'.format(self.optim_type))
        
        if self.clip>0:
            self.train_op = self.optimizer.apply_gradients(zip(self.grads, self.tvars))
        else:
            self.train_op = self.optimizer.minimize(self.loss)


    def _train_epoch(self, train_batches, emb_keep_prob, rnn_keep_prob, fc_keep_prob):
        """
        Trains the model for a single epoch
        Args:
            train_batches: iterable batch data for training
            emb_keep_prob: float value indicating dropout keep probability of embedding layer
            rnn_keep_prob: float value indicating dropout keep probability of rnn layer
            fc_keep_prob: float value indicating dropout keep probability of fully-connection layer
        """
        total_num, total_loss, total_acc = 0, 0, 0
        log_every_n_batch, n_batch_loss, n_batch_acc = self.log_every_n_batch, 0, 0
        for bitx, batch in enumerate(train_batches, 1):
            feed_dict = {self.inputs_x: batch['contents'],
                         self.distant_1: batch['e1_distance'],
                         self.distant_2: batch['e2_distance'],
                         self.labels_y: batch['labels'],
                         self.emb_keep_prob: emb_keep_prob,
                         self.rnn_keep_prob: rnn_keep_prob,
                         self.fc_keep_prob: fc_keep_prob,
                         self.inputs_length: batch['contents_length']}
            _, loss, acc = self.sess.run([self.train_op, self.loss, self.acc], feed_dict)
            total_loss += loss * len(batch['contents'])
            total_acc += acc * len(batch['contents'])
            total_num += len(batch['contents'])
            n_batch_loss += loss
            n_batch_acc += acc
            if log_every_n_batch > 0 and bitx % log_every_n_batch == 0:
                print('Average loss and acc from batch {} to {} is {:>6.2} and {:>7.2%}'
                      .format(bitx - log_every_n_batch + 1, 
                              bitx, 
                              n_batch_loss / log_every_n_batch, 
                              n_batch_acc / log_every_n_batch))
                n_batch_loss = 0
                n_batch_acc = 0
        return 1.0 * total_loss / total_num, 1.0 * total_acc / total_num

    def train(self, data, epochs, batch_size, save_dir, save_prefix, 
              emb_keep_prob=1.0, rnn_keep_prob=1.0, fc_keep_prob=1.0,
              evaluate=True):
        """
        Train the model with data
        Args:
            data: the RCDataset class 
            epochs: number of training epochs
            batch_size: the size of one mini-batch
            save_dir: the directory to save the model
            save_prefix: the prefix indicating the model type
            emb_keep_prob: float value indicating dropout keep probability of embedding layer
            rnn_keep_prob: float value indicating dropout keep probability of rnn layer
            fc_keep_prob: float value indicating dropout keep probability of fully-connection layer
            evaluate: whether to evaluate the model on dev data after each epoch
        """
        pad_id = self.vocab.get_id(self.vocab.pad_token)
        best_acc_val = 0
        for epoch in range(1, epochs + 1):
            print('\nTraining the model for epoch {}'.format(epoch))
            train_batches = data.gen_mini_batches('train', batch_size)
            train_loss, train_acc = self._train_epoch(train_batches, emb_keep_prob, rnn_keep_prob, fc_keep_prob)
            print('The {} Epoch average train loss and acc is {:>6.2} and {:>7.2%}'.format(epoch, train_loss, train_acc))

            # 验证评估
            if evaluate:
                self.logger.info('Evaluating the model after epoch {}'.format(epoch))
                if data.dev_contents is not None:
                    eval_batches = data.gen_mini_batches('dev', batch_size)
                    # 计算验证集的f1、loss和accuracy
                    f1_val, loss_val, acc_val = self.evaluate(eval_batches, batch_size, data)
                    print('Dev eval loss: {:>6.2}'.format(loss_val))
                    print('Dev eval acc: {:>7.2%}'.format(acc_val))
                    print('Dev eval f1: {:>7.2%}'.format(f1_val))

                    if acc_val > best_acc_val:
                        self.save(save_dir, save_prefix)
                        best_acc_val = acc_val
                else:
                    self.logger.warning('No dev set is loaded for evaluation in the dataset!')
            else:
                self.save(save_dir, save_prefix + '_' + str(epoch))

    def evaluate(self, eval_batches, batch_size, data, test=False):
        """
        evaluate the model on dev data or test data
        Args:
            eval_batches: the eval data
            data: the TCDataset class 
            test: whether to choose test mode
        """
        if test:
            data_len = data.test_size
        else:
            data_len = data.dev_size
            
        num_batch = int((data_len - 1) / batch_size) + 1
        num_batch_list = list(range(num_batch))

        y_test_cls = np.zeros(shape=data_len, dtype=np.int32)
        y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # 保存预测结果

        total_loss = 0.0
        total_acc = 0.0
        
        for i, batch in enumerate(eval_batches):  # 逐批次处理
            start_id = num_batch_list[i] * batch_size
            end_id = min((num_batch_list[i]  + 1) * batch_size, data_len)
            feed_dict = {self.inputs_x: batch['contents'],
                         self.distant_1: batch['e1_distance'],
                         self.distant_2: batch['e2_distance'],
                         self.labels_y: batch['labels'],
                         self.emb_keep_prob: 1.0,
                         self.rnn_keep_prob: 1.0,
                         self.fc_keep_prob: 1.0,
                         self.inputs_length: batch['contents_length']}
            
            y_pred_cls[start_id:end_id], loss, acc = self.sess.run([self.y_pred_cls, self.loss, self.acc],
                                                                   feed_dict=feed_dict)
            y_test_cls[start_id:end_id] = np.argmax(batch['labels'], 1)
            
            batch_len = len(batch['contents'])
            total_loss += loss * batch_len
            total_acc += acc * batch_len
            
        # 评估
        f1 = f1_score(y_test_cls, y_pred_cls, average="macro")
        
        if test:
            self.cr_report = classification_report(y_test_cls, y_pred_cls, target_names=sorted(set(data.test_labels)))
            self.cm_report = confusion_matrix(y_test_cls, y_pred_cls)
            self.y_pred_cls = y_pred_cls
            
            print('Test eval loss: {:>6.2}'.format(total_loss / data_len))
            print('Test eval acc: {:>7.2%}'.format(total_acc / data_len))
            print('Test eval f1: {:>7.2%}'.format(f1))                    
        else:
             return f1, total_loss / data_len, total_acc / data_len


    def save(self, model_dir, model_prefix):
        """
        Saves the model into model_dir with model_prefix as the model indicator
        Args:
            model_dir: the save path
            model_prefix: the prefix indicating the model type
        """
        self.saver.save(self.sess, os.path.join(model_dir, model_prefix))
        self.logger.info('Model saved in {}, with prefix {}.'.format(model_dir, model_prefix))

    def restore(self, model_dir, model_prefix):
        """
        Restores the model into model_dir from model_prefix as the model indicator
        Args:
            model_dir: the load path
            model_prefix: the prefix indicating the model type
        """
        self.saver.restore(self.sess, os.path.join(model_dir, model_prefix))
        self.logger.info('Model restored from {}, with prefix {}'.format(model_dir, model_prefix))
        
        
    def save_report(self, data, result_dir, save_suffix: Union[datetime, str, None] = None):
        """
        Saves the model into model_dir with model_prefix as the model indicator
        Args:
            data: the TCDataset class 
            result_dir: the save path
            save_suffix: the suffix 
        """
        if save_suffix is None:
            save_suffix = datetime.now().strftime('%Y-%m-%d-%H-%M') 
            
        report_data = [] 
        lines = self.cr_report.split('\n') 
        for line in lines[2:-5]:
            row = {}
            row_data = line.split()
            row['class'] = row_data[0]
            row['precision'] = float(row_data[1])
            row['recall'] = float(row_data[2])
            row['f1_score'] = float(row_data[3])
            row['support'] = float(row_data[4])
            report_data.append(row)
        df = pd.DataFrame.from_dict(report_data)
        cr_filename = f'{result_dir}/classification_report_{save_suffix}.csv'
        df.to_csv(cr_filename, index = False)
        
        
        df = pd.DataFrame(self.cm_report)
        df.columns = sorted(set(data.test_labels))
        
        df = df.rename(index=data.id2cat)
        cm_filename = f'{result_dir}/confusion_matrix_{save_suffix}.csv'
        df.to_csv(cm_filename)
        
        # 预测与实际值统计
        df = pd.DataFrame(self.y_pred_cls)
        df.columns = ['predict']

        df['predict'] = df['predict'].apply(lambda x: data.id2cat[x] )
        df['label'] = data.test_labels
        compare_filename = f'{result_dir}/predictAndlabel_{save_suffix}.csv'
        df.to_csv(compare_filename)
        
    def write_rc_results(self, data, result_dir, save_suffix: Union[datetime, str, None] = None):
        df = pd.DataFrame(self.y_pred_cls)
        df.columns = ['predict']
        df['predict'] = df['predict'].apply(lambda x: data.id2cat[x] )
        
        if save_suffix is None:
            save_suffix = datetime.now().strftime('%Y-%m-%d-%H-%M')            
        
        results_file = f'{result_dir}/result_{save_suffix}.txt'
        
        start_no = 8001
        with open(results_file, 'w') as f:
            for idx, rel in enumerate(df['predict'].tolist()):
                f.write('%d\t%s\n' % (start_no+idx, rel))
            
    # Length of the sequence data
    @staticmethod
    def _length(seq):
        relevant = tf.sign(tf.abs(seq))
        length = tf.reduce_sum(relevant, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length
            
        
        

## 三、设置必要的参数并创建图

In [70]:
class args(object):
    """模型配置类"""
    
    # 日志配置
    log_every_n_batch = 100  # 每多少个batch打印一次日志    
    
    # 基础配置
    # 1. 整体参数
    seq_length = dataSet.max_content_len  # 序列长度
    pos_num =123 # 位置的map个数
    num_classes = dataSet.num_class  # 标签数
    use_dropout = True  # 是否开启dropout
    # 2. 词向量
    pos_embed_dim = 5
    dropout_emb_keep_prob = 0.9  # 词向量层dropout保留比例
    # 3. CNN参数
    num_filters = 100
    # 4. 全连接层
    fc_size = 100  # 全连接层的输出维度
    dropout_fc_keep_prob = 0.5  # 全连接层dropout保留比例
        
    # 优化算法配置
    optim = 'adam'  # 所选的优化算法
    learning_rate =1e-4 # 学习率
    clip = 5  # 梯度裁剪的限制，当值为0时，不开启梯度裁剪
    weight_decay =  0.00005  # L2正则，当值为0时，不开启正则

    
    # 训练配置
    batch_size = 64  # 每批训练大小
    num_epochs = 20  # 总迭代轮次
    
    # 保存配置
    save_dir = './checkpoints/'
    save_prefix = 'RC_V1'
    report_dir = './data'
    results_dir = './data'

In [71]:
args = args()

In [72]:
# 消除当前已经建立的静态图
import tensorflow.contrib.keras as kr
kr.backend.clear_session()

In [73]:
model = CNN_att(args, vocab)

Time to build graph: 0.5354444980621338 s
There are 1221409 parameters in the model


## 四、训练模型

In [75]:
model.restore(args.save_dir, args.save_prefix)

INFO:tensorflow:Restoring parameters from ./checkpoints/RC_V1


In [76]:
model.train(dataSet, 
            args.num_epochs, 
            args.batch_size, 
            args.save_dir,
            args.save_prefix, 
            args.dropout_emb_keep_prob, 
            args.dropout_fc_keep_prob)


Training the model for epoch 1
Average loss and acc from batch 1 to 100 is    1.7 and  99.98%
The 1 Epoch average train loss and acc is    1.7 and  99.96%


  'precision', 'predicted', average, warn_for)


Dev eval loss:    2.9
Dev eval acc:  68.97%
Dev eval f1:  64.30%

Training the model for epoch 2
Average loss and acc from batch 1 to 100 is    1.6 and  99.98%
The 2 Epoch average train loss and acc is    1.6 and  99.96%
Dev eval loss:    2.8
Dev eval acc:  68.86%
Dev eval f1:  64.54%

Training the model for epoch 3
Average loss and acc from batch 1 to 100 is    1.6 and  99.97%
The 3 Epoch average train loss and acc is    1.6 and  99.96%
Dev eval loss:    2.8
Dev eval acc:  69.16%
Dev eval f1:  64.73%

Training the model for epoch 4
Average loss and acc from batch 1 to 100 is    1.6 and  99.95%
The 4 Epoch average train loss and acc is    1.6 and  99.95%
Dev eval loss:    2.8
Dev eval acc:  68.79%
Dev eval f1:  64.40%

Training the model for epoch 5
Average loss and acc from batch 1 to 100 is    1.6 and  99.97%
The 5 Epoch average train loss and acc is    1.5 and  99.96%
Dev eval loss:    2.7
Dev eval acc:  69.08%
Dev eval f1:  64.59%

Training the model for epoch 6
Average loss and ac

KeyboardInterrupt: 

## 五、测试和结果保存

In [68]:
test_batches = dataSet.gen_mini_batches('test', args.batch_size)

In [69]:
model.evaluate(test_batches, args.batch_size, dataSet, True)

Test eval loss:    2.5
Test eval acc:  68.27%
Test eval f1:  63.94%


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
model.save_report(dataSet, args.report_dir)

In [None]:
model.write_rc_results(dataSet, args.results_dir)