In [None]:
# Environment setup: select the TensorFlow version and install the third-party
# packages imported further down (`bert` comes from bert-for-tf2, `rouge_score`
# from rouge-score).
# NOTE(review): `%tensorflow_version` looks like the Colab-specific magic, and none
# of the installs are version-pinned — confirm the target environment and pin
# versions for reproducibility.
%tensorflow_version 2.1
!pip install bert-for-tf2
!pip install sentencepiece
!pip install rouge-score

# Data preprocessing

In [None]:
# !wget -O "cnn_stories.tgz" "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ"
# Download the CNN stories archive from Google Drive. The inner wget stores the
# session cookies and prints the download page, sed extracts the `confirm=` token
# from it, and the outer wget reuses cookies + token to fetch the file itself.
# The cookie file is removed once the download succeeds.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ" -O cnn_stories.tgz && rm -rf /tmp/cookies.txt
# Unpack the archive; the preprocessing cell below reads stories from "cnn/stories".
!tar -xvzf "cnn_stories.tgz"
# Fetch NLTK resources.
# NOTE(review): `sent_tokenize` is imported later but never called in the visible
# cells — confirm these downloads are still needed.
import nltk
nltk.download('punkt')
nltk.download('tagsets')

In [None]:
import os
from nltk import sent_tokenize
import pickle
from multiprocessing import Pool, Process
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from rouge_score import rouge_scorer
import bert


def get_sample(doc, sum, scorer, tokenizer):
    """Build token ids and extractive labels for one document.

    Each of the first ``min(len(sum), len(doc))`` summary sentences is greedily
    matched to the document sentence with the highest ROUGE-L F1 that has not
    been labelled yet; matched sentences get label 1.

    Args:
        doc: list of document sentences (empty strings are treated as padding).
        sum: list of reference summary sentences.
        scorer: object whose ``score(target, prediction)`` returns a mapping with
            a ``'rougeL'`` entry indexable at position 2 (the F1 value).
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``(doc_ids, labels)`` where ``doc_ids`` is a list with one id list per
        sentence (``[0]`` for padding) and ``labels`` is an int32 array of
        length ``len(doc)``.
    """
    num_sents = len(doc)
    labels = np.zeros(num_sents, dtype="int32")

    for ref_sent in sum[:min(len(sum), num_sents)]:
        # F1 of ROUGE-L between the summary sentence and every document sentence.
        scores = [scorer.score(ref_sent, cand)['rougeL'][2] for cand in doc]
        best = np.argmax(scores)
        # Walk down the ranking until an unlabelled sentence is found, giving up
        # after `num_sents` attempts (taken sentences are masked with score 0).
        attempts = 0
        while attempts < num_sents and labels[best] == 1:
            scores[best] = 0
            best = np.argmax(scores)
            attempts += 1
        if attempts < num_sents:
            labels[best] = 1

    # NOTE(review): "[CLS] " is prepended as plain text before tokenising; if the
    # tokenizer splits on punctuation it may not yield a single [CLS] token — confirm.
    doc_ids = [
        tokenizer.convert_tokens_to_ids(tokenizer.tokenize("[CLS] " + sent))
        if sent != '' else [0]
        for sent in doc
    ]

    return doc_ids, labels


def pre_processing(data_dir, files, tokenizer, MAX_SEQ_LEN, MAX_DOC_LEN):
    """Turn raw story files into padded token ids and extractive labels.

    Each file is split into the article body and its ``@highlight`` sentences.
    Body paragraphs that are too short, or that look like timestamp lines, are
    dropped; the remainder is cut/padded to ``MAX_DOC_LEN`` paragraphs and
    labelled against the highlights with ``get_sample``.

    Args:
        data_dir: directory containing the story files.
        files: file names (relative to ``data_dir``) to process.
        tokenizer: tokenizer passed through to ``get_sample``.
        MAX_SEQ_LEN: number of token ids kept per paragraph (post-padded/truncated).
        MAX_DOC_LEN: number of paragraphs kept per document.

    Returns:
        ``(docs_ids, labels)``: one padded id array and one label array per
        document that still has content after filtering.
    """
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    docs_ids = []
    labels = []
    for file_idx, file_name in enumerate(files):
        if file_idx % 1000 == 0:
            # Progress marker, tagged with the worker's pid.
            print(os.getpid(), file_idx)
        with open(os.path.join(data_dir, file_name), encoding='utf-8') as f:
            document = f.read().rstrip().split("\n\n@highlight\n\n")
        summary = document[1:]
        # Drop the flagged paragraphs themselves. The previous code collected their
        # indices but then popped positions 0..k-1, i.e. removed the first k
        # paragraphs instead of the flagged ones.
        doc = [par for par in document[0].split("\n\n")
               if not (len(par) < 20 or ("EST" in par and "20" in par))]
        if len(doc) == 0:
            continue
        if len(doc) > MAX_DOC_LEN:
            doc = doc[:MAX_DOC_LEN]
        else:
            # Pad with empty paragraphs so every document has MAX_DOC_LEN rows.
            doc += [''] * (MAX_DOC_LEN - len(doc))
        doc_ids, label = get_sample(doc, summary, scorer, tokenizer)
        docs_ids.append(pad_sequences(doc_ids, maxlen=MAX_SEQ_LEN, dtype='int32',
                                      padding='post', truncating='post'))
        labels.append(label)
    return docs_ids, labels


def write(write_dir, data, name, is_x_train=True):
    """Pickle ``data`` to ``<write_dir>/x_train/<name>`` or ``<write_dir>/y_train/<name>``.

    Args:
        write_dir: base output directory.
        data: any picklable object.
        name: file name of the shard inside the sub-directory.
        is_x_train: ``True`` writes under ``x_train/``, ``False`` under ``y_train/``.
    """
    # Create each sub-directory independently and tolerate it already existing:
    # checking only x_train/ broke when y_train/ was missing, and plain mkdir can
    # raise when several worker processes reach this point at the same time.
    os.makedirs(write_dir + "/x_train/", exist_ok=True)
    os.makedirs(write_dir + "/y_train/", exist_ok=True)
    sub_dir = "/x_train/" if is_x_train else "/y_train/"
    with open(write_dir + sub_dir + name, "wb") as f:
        pickle.dump(data, f)


def preprocess_and_write(inp_list):
    """Preprocess one shard of files and persist the result.

    ``inp_list`` must hold exactly seven items, in this order: data_dir,
    write_dir, files, vocab_file, MAX_SEQ_LEN, MAX_DOC_LEN, name. Taking a single
    list argument lets the function be mapped over a list of tasks.

    Returns:
        ``(docs_ids, labels)`` as produced by ``pre_processing``.
    """
    (data_dir, write_dir, files, vocab_file,
     MAX_SEQ_LEN, MAX_DOC_LEN, name) = inp_list
    # The tokenizer is created here, from the vocab path, rather than passed in.
    shard_tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file)
    docs_ids, labels = pre_processing(
        data_dir, files, shard_tokenizer, MAX_SEQ_LEN, MAX_DOC_LEN)
    for payload, is_x_train in ((docs_ids, True), (labels, False)):
        write(write_dir, payload, name, is_x_train)
    return docs_ids, labels


def read(inp_list):
    """Load one pickled shard.

    Args:
        inp_list: two-item sequence ``(read_dir, name)``.

    Returns:
        ``(x_train, y_train)`` unpickled from ``<read_dir>/x_train/<name>`` and
        ``<read_dir>/y_train/<name>``.
    """
    read_dir, name = inp_list
    loaded = []
    for sub_dir in ("/x_train/", "/y_train/"):
        with open(read_dir + sub_dir + name, "rb") as f:
            # pickle.load can execute arbitrary code: only read trusted files.
            loaded.append(pickle.load(f))
    x_train, y_train = loaded
    return x_train, y_train


def parallel(func, input_list, num_workers=4):
    """Map ``func`` over ``input_list`` with a pool of ``num_workers`` processes.

    Returns the results as a list, in input order.
    """
    pool = Pool(num_workers)
    with pool:
        return list(pool.map(func, input_list))


def sequence(func, input_list):
    """Apply ``func`` to each item of ``input_list`` in order, in this process.

    Returns the list of results.
    """
    return [func(item) for item in input_list]


if __name__ == "__main__":
    # --- Configuration -------------------------------------------------------
    DATA_DIR = "cnn/stories"
    WRITE_DIR = "train_data"
    # Directory of the pretrained BERT checkpoint; it must contain "vocab.txt".
    # Defined here so this cell runs on a fresh kernel (it was previously only
    # assigned in a later cell).
    model_dir = "uncased_L-2_H-128_A-2"
    MAX_SEQ_LEN = 128
    MAX_DOC_LEN = 64
    NUM_WORKERS = 4

    files = os.listdir(DATA_DIR)
    num_files = len(files)
    print(num_files)

    do_lower_case = True
    vocab_file = os.path.join(model_dir, "vocab.txt")
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    # Shuffle once and persist the order so later cells can slice `files`
    # consistently with the shards written below.
    np.random.shuffle(files)
    if not os.path.isdir(WRITE_DIR):
        os.mkdir(WRITE_DIR)
    with open(WRITE_DIR + "/files_0", "wb") as f:
        pickle.dump(files, f)

    # --- Preprocess the whole corpus into NUM_WORKERS shards ("0", "1", ...) --
    # Every task carries the seven fields preprocess_and_write unpacks;
    # MAX_DOC_LEN was missing before, which made the unpacking fail.
    write_task_input = [[DATA_DIR, WRITE_DIR,
                         files[num_files*i//NUM_WORKERS: num_files*(i+1)//NUM_WORKERS],
                         vocab_file, MAX_SEQ_LEN, MAX_DOC_LEN, str(i)]
                        for i in range(0, NUM_WORKERS)]
    # One parallel pass is enough; the former extra sequential pass over the same
    # tasks only recomputed and overwrote the same shards.
    parallel(preprocess_and_write, write_task_input, NUM_WORKERS)

    # --- Training subset -------------------------------------------------------
    # The train/test split cell loads a shard named "data", so it has to be
    # written here for the notebook to run top to bottom.
    data_len = 15000
    sequence(preprocess_and_write, [[DATA_DIR, WRITE_DIR, files[:data_len],
                                     vocab_file, MAX_SEQ_LEN, MAX_DOC_LEN, "data"]])

    # --- Sanity check: reload the shards ---------------------------------------
    # Shards are named str(i) for i in range(NUM_WORKERS); report their sizes
    # instead of dumping every array into the cell output.
    read_task_input = [[WRITE_DIR, str(i)] for i in range(NUM_WORKERS)]
    shards = parallel(read, read_task_input, NUM_WORKERS)
    for (_, shard_name), (shard_x, shard_y) in zip(read_task_input, shards):
        print(shard_name, len(shard_x), len(shard_y))



In [None]:
# !cp -a "./train_data" "drive/My Drive/project 2/Data/Daily_Mail"
from sklearn.model_selection import train_test_split

# Load the preprocessed "data" shard and cap every document at MAX_DOC_LEN rows.
x_train, y_train = read(["./train_data", "data"])
x_train = [doc_ids[:MAX_DOC_LEN] for doc_ids in x_train]
y_train = [doc_labels[:MAX_DOC_LEN] for doc_labels in y_train]
lengths = list(map(len, y_train))

# Hold out 1000 documents for testing; the fixed random_state keeps the split stable.
# x_test, y_test = x_train[:1000], y_train[:1000]
# x_train, y_train = x_train[1000:], y_train[1000:]
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=1000/len(x_train), random_state=2)
print(np.mean(lengths), np.std(lengths))

In [None]:
import bert
import tensorflow as tf
import os
import numpy as np
from nltk.tokenize import sent_tokenize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Bidirectional, Flatten, \
    GRU, Input, GlobalAveragePooling2D, Dense, Conv2D, MaxPool2D, \
    GlobalAveragePooling1D
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.activations import sigmoid, softmax
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model


class Extractor:
    """Sentence extractor that scores every sentence of a document.

    Based on the architecture of "Ranking Sentences for Extractive Summarization".
    A frozen BERT layer (called outside the Keras model, in ``get_inp``) produces
    token embeddings; a CNN over the full embedding volume and a backwards GRU over
    mean-pooled sentence vectors together form the initial state of a second GRU
    that emits one sigmoid score per sentence.
    """

    def __init__(self, MAX_DOC_LEN, MAX_SEQ_LEN, layer_bert, hidden_size=128):
        """
        Args:
            MAX_DOC_LEN: number of sentences per (padded) document.
            MAX_SEQ_LEN: number of token ids per (padded) sentence.
            layer_bert: callable mapping an int tensor of token ids with shape
                (MAX_DOC_LEN, MAX_SEQ_LEN) to embeddings with a trailing
                ``hidden_size`` axis.
            hidden_size: embedding width produced by ``layer_bert``.
        """
        # Keep the dimensions on the instance so the model is built from the
        # constructor arguments rather than from same-named notebook globals.
        self.max_doc_len = MAX_DOC_LEN
        self.max_seq_len = MAX_SEQ_LEN
        self.hidden_size = hidden_size
        self.input_shape = (MAX_DOC_LEN, MAX_SEQ_LEN, hidden_size)
        self.bert = layer_bert
        self.pooled_bert = GlobalAveragePooling1D()
        self.document_encoder_hidden = 96
        self.cnn_output_dim = 32
        # The extractor GRU is initialised with [document encoding ; CNN features],
        # so its width must equal the sum of the two.
        self.sentence_extractor_hidden = self.document_encoder_hidden + self.cnn_output_dim
        self.loss = BinaryCrossentropy()
        self.optimizer = Adam(0.001)
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Keras model taking ``[inp_sum, inp_cnn]``."""
        # Inputs. `inp_sum` holds one mean-pooled vector per sentence, so its last
        # axis is the embedding width (it was previously declared with the sequence
        # length, which only worked because both happened to be 128).
        inp_cnn = Input(self.input_shape)  # (None, doc_len, seq_len, hidden)
        inp_sum = Input((self.max_doc_len, self.hidden_size))  # (None, doc_len, hidden)

        # Document encoder reads the sentences back to front; the extractor reads
        # them front to back and returns one state per sentence.
        document_encoder = GRU(self.document_encoder_hidden, go_backwards=True)
        sentence_extractor = GRU(self.sentence_extractor_hidden, return_sequences=True)

        # CNN feature extractor over the (doc_len, seq_len) grid with the embedding
        # axis as channels. Shapes below are for the defaults (64, 128, 128).
        cnn1 = Conv2D(64, (4, 4), activation="relu", padding="same")
        maxpool1 = MaxPool2D((4, 4))
        cnn2 = Conv2D(16, (4, 4), activation="relu", padding="same")
        maxpool2 = MaxPool2D((2, 2))

        features = cnn1(inp_cnn)  # 64 x 128 x 64
        features = maxpool1(features)  # 16 x 32 x 64
        features = cnn2(features)  # 16 x 32 x 16
        features = maxpool2(features)  # 8 x 16 x 16
        features = Flatten()(features)  # 2048
        features = Dense(self.cnn_output_dim)(features)  # (None, cnn_output_dim)

        # Main flow.
        doc_encoder_out = document_encoder(inp_sum)  # (None, document_encoder_hidden)
        sent_extractor_initial_state = tf.concat([doc_encoder_out, features], axis=1)
        out = sentence_extractor(inp_sum, initial_state=sent_extractor_initial_state)  # (None, doc_len, sentence_extractor_hidden)
        # 'channels_first' makes the pooling average over the last (hidden) axis.
        out = GlobalAveragePooling1D(data_format='channels_first')(out)  # (None, doc_len)
        out = Dense(self.max_doc_len, activation='sigmoid')(out)

        model = Model([inp_sum, inp_cnn], out)
        model.compile(optimizer=self.optimizer,
                      loss=self.loss)

        return model

    def get_inp(self, inputs):
        """Run BERT over each document and build the two model inputs.

        Args:
            inputs: iterable of per-document token-id arrays.

        Returns:
            ``[inp_sums, inp_cnns]``: the mean-pooled sentence vectors and the
            full token embeddings, each stacked over the documents.
        """
        inp_sums = []
        inp_cnns = []
        for inp in inputs:
            # The sentence axis of one document acts as BERT's batch axis.
            inp_cnn = self.bert(tf.convert_to_tensor(inp))
            inp_cnns.append(inp_cnn)
            inp_sums.append(self.pooled_bert(inp_cnn))
        return [tf.convert_to_tensor(inp_sums), tf.convert_to_tensor(inp_cnns)]

    def predict(self, inputs):
        """Return the model's per-sentence scores for ``inputs`` (token-id arrays)."""
        inps = self.get_inp(inputs)
        return self.model.predict(inps)

    def train(self, x_train, y_train, batch_size=4):
        """Run one pass over the data with ``train_on_batch``.

        Prints the running mean loss after every batch.

        Returns:
            list with the loss of each batch.
        """
        losses = []
        mean_loss = 0
        seen = 0
        for i in range(0, len(x_train), batch_size):
            batch_x = x_train[i: i+batch_size]
            batch_y = np.array(y_train[i: i+batch_size])
            inp = self.get_inp(batch_x)
            loss = np.mean(self.model.train_on_batch(inp, batch_y))
            # Weight by the real batch size so a short final batch does not skew
            # the running mean.
            n_batch = len(batch_x)
            mean_loss = (mean_loss*seen + loss*n_batch) / (seen + n_batch)
            seen += n_batch
            print("step: {},  loss: {}".format(i, mean_loss))
            losses.append(loss)

        return losses

    def save(self, dir):
        """Save the model weights to ``<dir>/model_weights.h5``.

        Missing directories are created, including intermediate ones, so nested
        targets such as "save/cnn" work.
        """
        os.makedirs(dir, exist_ok=True)
        self.model.save_weights(os.path.join(dir, "model_weights.h5"))

    def load(self, dir):
        """Load weights from ``<dir>/model_weights.h5``.

        Prints a message and returns ``None`` without loading if ``dir`` does not exist.
        """
        if not os.path.isdir(dir):
            print("Path does not exist!")
            return None
        self.model.load_weights(os.path.join(dir, "model_weights.h5"))


In [None]:
if __name__ == "__main__":

    # Pretrained BERT checkpoint directory (must contain "bert_model.ckpt*" and
    # "vocab.txt").
    model_dir = "uncased_L-2_H-128_A-2"
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.WARN)

    model_ckpt = os.path.join(model_dir, "bert_model.ckpt")
    MAX_SEQ_LEN = 128
    MAX_DOC_LEN = 64

    # do_lower_case = not (model_name.find("cased") == 0 or model_name.find("multi_cased") == 0)
    do_lower_case = True
    bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, model_ckpt)
    vocab_file = os.path.join(model_dir, "vocab.txt")
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    bert_params = bert.params_from_pretrained_ckpt(model_dir)
    bert_layer = bert.BertModelLayer.from_params(bert_params, name="bert")

    # `from_params` only configures the layer: the checkpoint weights were never
    # loaded, so the layer would run with its freshly initialised weights. Build
    # the layer by wiring it into a throw-away model (the loader needs the
    # variables to exist), then load the pretrained weights.
    bert_input = Input(shape=(MAX_SEQ_LEN,), dtype="int32")
    bert_model = Model(inputs=bert_input, outputs=bert_layer(bert_input))
    bert_model.build(input_shape=(None, MAX_SEQ_LEN))
    bert.load_stock_weights(bert_layer, model_ckpt)


In [None]:
# Build the extractor on top of the BERT layer created in the previous cell.
extractor = Extractor(MAX_DOC_LEN, MAX_SEQ_LEN, bert_layer)
# extractor.load("drive/My Drive/project 2/save/cnn")
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that added a stray "None" line to the output).
extractor.model.summary()

In [None]:
# Train for one pass over the training split and keep the per-batch losses,
# which the plotting cell below reads.
# from sklearn.utils import shuffle
# x_train, y_train = shuffle(x_train, y_train)
batch_size = 32
losses = extractor.train(x_train, y_train, batch_size)
print(losses)

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
size = len(losses)
print(len(losses))
step = np.arange(0, len(losses))
plt.plot(step, losses)
plt.ylabel("loss")
plt.xlabel("step (x{})".format(size))
plt.show()

In [None]:
# Persist the trained weights; Extractor.save writes "model_weights.h5" inside
# the given directory.
extractor.save("save/cnn")

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk import sent_tokenize
from rouge_score import rouge_scorer
def predict(data_dir, tokenizer, extractor, top_k=4):
    """Extract the highest-scoring sentences of one story file.

    The file is preprocessed the same way as the training data (paragraph
    filtering, cut/pad to ``MAX_DOC_LEN``, ``[0]`` ids for padding rows) and
    scored by ``extractor``.

    Args:
        data_dir: path of the story file (despite the name, a file path).
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        extractor: object whose ``predict(list_of_docs)`` returns one row of
            per-sentence scores per document.
        top_k: maximum number of sentences to return.

    Returns:
        ``(sentences, summary)``: the selected sentences in document order and
        the list of reference highlights; ``(None, None)`` when no paragraph
        survives filtering.
    """
    with open(data_dir, encoding='utf-8') as f:
        document = f.read().rstrip().split("\n\n@highlight\n\n")
    summary = document[1:]
    # Drop the flagged paragraphs themselves (the previous code popped the first
    # k positions instead of the flagged indices).
    doc = [par for par in document[0].split("\n\n")
           if not (len(par) < 20 or ("EST" in par and "20" in par))]
    if len(doc) == 0:
        return None, None

    doc = doc[:MAX_DOC_LEN]
    num_real = len(doc)  # sentences that are not padding
    doc += [''] * (MAX_DOC_LEN - num_real)

    # Encode padding rows as [0], exactly like get_sample does for training data.
    doc_ids = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize("[CLS] " + sent_j))
               if sent_j != '' else [0]
               for sent_j in doc]
    doc_ids = pad_sequences(doc_ids, maxlen=MAX_SEQ_LEN, dtype='int32',
                            padding='post', truncating='post')

    # Rank only the real sentences so padding can never be selected. The former
    # softmax was dropped: it is monotonic and does not change the ranking.
    scores = np.asarray(extractor.predict([doc_ids]))[0][:num_real]
    ranked = np.argsort(scores, kind="stable")
    top = ranked[max(len(ranked) - top_k, 0):]
    sent_pos = sorted(int(i) for i in top)
    return [doc[i] for i in sent_pos], summary

# def cal_rouge(sum, doc):
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     return scorer.score(sum, doc)


def eval(data_dir, files, tokenizer, extractor):
    """Score the extractor's output on ``files`` with ROUGE.

    For every file the predicted sentences are matched greedily to the reference
    highlights (best ROUGE-L F1, each predicted sentence used at most once); the
    matched sentences, joined by newlines, are then scored against the joined
    reference.

    Prints each file name, its ROUGE-Lsum F1, and finally the column means.

    Returns:
        list of ``[rouge1_f1, rouge2_f1, rougeLsum_f1]``, one entry per file for
        which ``predict`` returned sentences.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    res = []
    for file_name in files:
        print(file_name)
        doc, reference = predict(os.path.join(data_dir, file_name), tokenizer, extractor)
        if doc is None:
            continue

        # NOTE(review): the selection below consults the reference highlights to
        # pick among the predicted sentences — confirm this is the intended protocol.
        pos = []
        for ref_sent in reference:
            sent_scores = [scorer.score(ref_sent, cand)['rougeL'][2] for cand in doc]
            best = np.argmax(sent_scores)
            # Skip predicted sentences already matched to an earlier highlight,
            # giving up after len(doc) attempts.
            attempts = 0
            while attempts < len(doc) and best in pos:
                sent_scores[best] = -999
                best = np.argmax(sent_scores)
                attempts += 1
            if attempts < len(doc):
                pos.append(best)

        candidate_text = '\n'.join([doc[i] for i in pos])
        reference_text = '\n'.join(reference)
        score = scorer.score(reference_text, candidate_text)
        print(score['rougeLsum'][2])
        res.append([score[metric][2] for metric in ('rouge1', 'rouge2', 'rougeLsum')])

    print(np.mean(np.array(res, dtype="float32"), 0))
    return res

# Evaluate on 500 stories taken from the shuffled `files` list (positions
# 70000-70499).
# NOTE(review): presumably outside the first `data_len` files used for the
# training shard — confirm.
res = eval(DATA_DIR, files[70000: 70500], tokenizer, extractor)
    

In [None]:
# Mean F1 scores noted from earlier evaluation runs
# (presumably the means printed by eval(): ROUGE-1, ROUGE-2, ROUGE-Lsum — confirm).
# R1 R2 RL
# [0.30686525 0.12081952 0.27703723 ]
# [0.31640592 0.12772253 0.28809837 ]
# Number of stories that were actually scored.
print(len(res))