# Header

In [None]:
# !pip install transformers
# !pip install sentencepiece
# !pip install rouge-score
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ" -O cnn_stories.tgz && rm -rf /tmp/cookies.txt
# !tar -xzf "cnn_stories.tgz"

In [1]:
import json
import os
import stanza
import numpy as np
import pandas as pd
import re
import torch 

from rouge_score import rouge_scorer
from tqdm import tqdm

from sklearn.utils import shuffle

from transformers import BertModel, BertTokenizerFast

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import LSTM, Conv2d, Linear
from torch.nn.functional import max_pool2d
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

2021-09-27 14:43:51.781056: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64:/usr/local/cuda-10.1/lib64
2021-09-27 14:43:51.781094: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Data pre-processing

In [2]:
# url list from https://github.com/abisee/cnn-dailymail
with open('data/cnndm/filenames/cnn_files.json') as f:
    filenames = json.load(f)

train_files = filenames['train']
valid_files = filenames['valid']
test_files = filenames['test']


# stanza.download(lang='en')
nlp = stanza.Pipeline(lang='en', processors='tokenize')

HBox(children=(FloatProgress(value=0.0, description='Downloading https://raw.githubusercontent.com/stanfordnlp…

2021-09-16 13:46:52 INFO: Downloading default packages for language: en (English)...





HBox(children=(FloatProgress(value=0.0, description='Downloading http://nlp.stanford.edu/software/stanza/1.2.2…




2021-09-16 13:48:23 INFO: Finished downloading models and saved to /home/aimenext/stanza_resources.
2021-09-16 13:48:23 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |

2021-09-16 13:48:23 INFO: Use device: gpu
2021-09-16 13:48:23 INFO: Loading: tokenize
2021-09-16 13:48:34 INFO: Done loading processors!


In [16]:
LOWER = False
LENGTH_THRESHOLD = 10
rouge_factors = {'rouge1': 0.4, 'rouge2': 0.3, 'rougeL': 0.3}  

def sent_tokenize(doc):
    doc = nlp(doc)
    sentences = []
    for sentence in doc.sentences:
        # print(sentence.tokens[0])
        sentence = ' '.join([token.text for token in sentence.tokens])
        if len(sentence) > LENGTH_THRESHOLD:
            sentences.append(sentence)
    
    return sentences

def reconstruct_text(text):
    return re.sub('\s([?.!"](?:\s|$))', '', text)

def parse_file(file):
    with open(file, encoding='utf-8') as f:
        document = f.read().rstrip().split("\n\n@highlight\n\n")
    summary = document[1:]
    doc = sent_tokenize(document[0])
    return doc, summary


def make_label(doc, sum, scorer):
    doc_size = len(doc)
    res = [0] * doc_size
    n = min(len(sum), doc_size)
    # f1 of rouge-L
    for j in range(n):
        # score = [scorer.score(sum[j], sent_i)['rouge2'][2] for sent_i in doc]
        score = [scorer.score(sum[j], sent_i) for sent_i in doc]
        score = [( 
            # x['rouge1'][2] * rouge_factors['rouge1'] + \
            x['rouge2'][2] * rouge_factors['rouge2'] + \
            x['rougeL'][2] * rouge_factors['rougeL']
            ) for x in score]
        sent_pos = np.argmax(score)
        for i in range(doc_size):
            if res[sent_pos] == 1:
                score[sent_pos] = 0
                sent_pos = np.argmax(score)
            else:
                res[sent_pos] = 1
                break
        # print(score[sent_pos])
        # print(doc[sent_pos])
        # print(sum[j], "\n")
    return res

def process(data_dir, files):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    docs = {}
    summaries = {}
    labels = {}
    remove_files = []
    for idx in tqdm(range(len(files))):
        # if idx%1000 == 0:
        #     print('\n', os.getpid(), idx)
        doc, summary = parse_file(os.path.join(data_dir, files[idx]))
        if len(doc) < len(summary) or len(doc) == 0 or len(summary) == 0:
            remove_files.append(files[idx])   
            continue    
        label = make_label(doc, summary, scorer)
        docs[files[idx]] = doc
        labels[files[idx]] = label
        summaries[files[idx]] = summary
        # if idx%5000 == 0:
        #     a = list(zip(label, doc))
        #     for i in a:
        #         print(len(i[1]), i[0], i[1])
        #     print('##########\n','\n'.join(summary))
    return docs, labels, summaries, remove_files

def json_dump(obj, file):
    with open(file, 'w', encoding='utf8') as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)

def process_and_write(data_dir, files, write_dir):
    docs, labels, summaries, remove_files = process(data_dir, files)

    os.makedirs(write_dir, exist_ok=True)
    json_dump(docs, os.path.join(write_dir, 'docs.json'))
    json_dump(labels, os.path.join(write_dir, 'labels.json'))
    json_dump(summaries, os.path.join(write_dir, 'summaries.json'))
    json_dump(remove_files, os.path.join(write_dir, 'remove_files.json'))


In [20]:
base_write_dir = 'data/cnndm/cnn'
process_and_write('cnn/stories', valid_files, os.path.join(base_write_dir, 'valid'))
process_and_write('cnn/stories', train_files, os.path.join(base_write_dir, 'train'))

# Data processing

In [44]:

class Config:
    def __init__(self):
        self.train_data_dir = 'data/cnndm/cnn/train'
        self.val_data_dir = 'data/cnndm/cnn/valid'

        self.MAX_SEQ_LEN = 128
        self.MAX_DOC_LEN = 48

        self.bert_hidden = 512
        self.bert_n_layers = 4

        self.windows_size = [1, 3, 5, 10]
        self.out_channels = 50
        self.lstm_hidden = 512
        self.device = 'cpu'

        self.batch_size = 4
        self.num_epochs = 10
        self.print_freq = 0.05
        self.save_dir = './save'

config = Config()
# config.__dict__

{'train_data_dir': 'data/cnndm/cnn/train',
 'val_data_dir': 'data/cnndm/cnn/valid',
 'MAX_SEQ_LEN': 128,
 'MAX_DOC_LEN': 48,
 'bert_hidden': 512,
 'bert_n_layers': 4,
 'windows_size': [1, 3, 5, 10],
 'out_channels': 50,
 'lstm_hidden': 512,
 'device': 'cpu'}

In [23]:
tokenizer = BertTokenizerFast.from_pretrained('prajjwal1/bert-small')

def load_json(file):
    with open(file) as f:
        return json.load(f)

def load_text(dir):
    docs = load_json(os.path.join(dir, 'docs.json'))
    labels = load_json(os.path.join(dir, 'labels.json'))
    return docs, labels

def get_encodings(docs, labels):
    keys = list(docs.keys())
    encodings = []
    return_labels = []

    for k in tqdm(keys):
        encodings.append(tokenizer(docs[k][:config.MAX_DOC_LEN], truncation=True,
                                   max_length=config.MAX_SEQ_LEN, padding='max_length'))
        return_labels.append(labels[k][:config.MAX_DOC_LEN])
    
    return keys, encodings, return_labels

# train_texts, train_labels = load_text(config.train_data_dir)
val_texts, val_dict_labels = load_text(config.val_data_dir)

# train_encodings = tokenizer(, truncation=True, max_length=config.MAX_SEQ_LEN, padding='max_length')
val_keys, val_encodings, val_labels = get_encodings(val_texts, val_dict_labels)

100%|██████████| 1220/1220 [00:05<00:00, 217.64it/s]


In [30]:
class ESDataset(Dataset):
    def __init__(self, encodings, labels=None, keys=None, config=config):
        self.encodings = encodings
        self.labels = labels
        self.keys = keys
        self.encoding_keys = ['input_ids', 'attention_mask']

        self.config = config

    def __getitem__(self, idx):
        item = {key: torch.tensor(self.encodings[idx][key]) for key in self.encoding_keys}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings)

def collate_fn(data, device=config.device):
    keys = data[0].keys()

    result = {k: [item[k] for item in data] for k in keys}
    input_ids = result['input_ids']
    result['document_mask'] = [torch.tensor([1] * len(input_ids[i])) for i in range(len(input_ids))]
    

    for k in result:
        result[k] = pad_sequence(result[k], batch_first=True).to(device)
    
    return result


In [31]:
dataset = ESDataset(val_encodings, val_labels, val_keys)
data_loader = DataLoader(dataset, batch_size=3, shuffle=True, collate_fn=collate_fn)

# Model

In [8]:
bert = BertModel.from_pretrained('prajjwal1/bert-small')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=286.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=116270890.0, style=ProgressStyle(descri…




In [35]:
class Bert_Embedding(nn.Module):
    def __init__(self, bert, config=config):
        super(Bert_Embedding, self).__init__()
        self.bert = bert
        self.bert_hidden = config.bert_hidden * config.bert_n_layers
        self.get_n_layers = config.bert_n_layers
        self.config = config
        
        self.windows_size = config.windows_size
        self.out_channels = config.out_channels
        self.lstm_embedding_size = len(self.windows_size) * config.MAX_SEQ_LEN  
        self.filters = nn.ModuleList([nn.Conv2d(1, self.out_channels,
                                                (i, self.bert_hidden)) for i in self.windows_size])
        self.relu = nn.ReLU()
        
    def forward(self, x, document_mask, attention_mask):
        lens = [mask_i.sum().item() for mask_i in document_mask]

        batch, doc_len, seq_len = list(x.shape)
        x = x.reshape((batch*doc_len, seq_len))
        attention_mask = attention_mask.reshape((batch*doc_len, seq_len))        

        last_hds, pooler_output, hidden_states = self.bert(x, attention_mask, output_hidden_states=True)
        embeddings = torch.cat(hidden_states[-self.get_n_layers:], axis=-1)  # batch, doc_len, seq_len, self.bert_hidden
        print(embeddings.shape)
        embeddings = embeddings.reshape((batch * doc_len, 1,  seq_len, self.bert_hidden))  # batch * doc_len, 1, MAX_SEQ_LEN, bert_hidden
        lstm_inputs = []

        for i in range(len(self.windows_size)):
            temp_out = self.filters[i](embeddings).squeeze(-1)  # batch * doc_len, self.out_channels, MAX_SEQ_LEN - self.windows_size[i] + 1
            cnn_result = torch.mean(temp_out, dim=1) # average along out_channels axis
            if cnn_result.shape[1] < self.config.MAX_SEQ_LEN: # pad cnn_result to MAX_SEQ_LEN
                pad_tensor = torch.zeros((cnn_result.shape[0], self.config.MAX_SEQ_LEN - cnn_result.shape[1])).to(cnn_result.device)
                cnn_result = torch.cat([cnn_result, pad_tensor], axis=1)
            lstm_inputs.append(cnn_result)
        lstm_inputs = torch.cat(lstm_inputs, dim=-1).reshape((batch, doc_len, self.lstm_embedding_size))  
        lstm_inputs = pack_padded_sequence(lstm_inputs, lens, batch_first=True, enforce_sorted=False)

        return lstm_inputs


class Document_Encoder(nn.Module):
    def __init__(self, embedding_size=350, config=config):
        super(Document_Encoder, self).__init__()

        self.config = config
        self.embedding_size = embedding_size
        self.doc_encoder = nn.LSTM(self.embedding_size, config.lstm_hidden, num_layers=1,
                            bidirectional=True, batch_first=True)

    def forward(self, lstm_inputs):
        _, doc_encoder_out = self.doc_encoder(lstm_inputs)

        return doc_encoder_out

class Sentence_Extractor(nn.Module):
    def __init__(self, embedding_size=350, config=config):
        super(Sentence_Extractor, self).__init__()

        self.config = config
        self.embedding_size = embedding_size
        self.sentence_extractor = nn.LSTM(self.embedding_size, config.lstm_hidden, num_layers=1,
                                  bidirectional=True, batch_first=True)
        self.dropout_layer = nn.Dropout(0.3)

    def forward(self, lstm_inputs, encoder_in):
        out_packed, (_, __) = self.sentence_extractor(lstm_inputs, encoder_in)
        out, out_lens = pad_packed_sequence(out_packed, batch_first=True)
        out = self.dropout_layer(out)
        return out

class Model(nn.Module):
    def __init__(self, bert, config=config):
        super(Model, self).__init__()
        self.config = config
        self.embeddings = Bert_Embedding(bert, config=config)
        self.doc_encoder = Document_Encoder(self.embeddings.lstm_embedding_size, config=config)
        self.sentence_extractor = Sentence_Extractor(self.embeddings.lstm_embedding_size, config=config)

        self.linear = Linear(config.lstm_hidden * 2, 1) 
        self.loss_func = nn.BCELoss()
        self.loss_padding_value = -100
        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, document_mask, attention_mask, y=None):
        lstm_inputs = self.embeddings(x, document_mask, attention_mask)

        doc_encoder_out = self.doc_encoder(lstm_inputs)  
        encoder_in = doc_encoder_out

        out = self.sentence_extractor(lstm_inputs, encoder_in)
        out = self.sigmoid(self.linear(out).squeeze(-1))
        # print(out.shape, mask.shape)
        # out *= mask
        
        if y is not None:
            y = pad_sequence(y, batch_first=True, padding_value=self.loss_padding_value).to(out.device)
            loss = self.loss_func(out, y)
            return out, loss

        return out

In [42]:
## Test model
# model = Model(bert).to(config.device)

# for item in data_loader:
#     ids = item['input_ids']
#     document_mask = item['document_mask']
#     attention_mask = item['attention_mask']
#     print(model(ids, document_mask, attention_mask).shape, item['labels'].shape)
#     break

config.__dict__

{}

# Train

In [45]:
from sklearn.metrics import classification_report

def torch_save(dir, model, config, epoch=0,  optimizer=None, scheduler=None, all_train_loss=[],
               all_dev_loss=[], best_dev_loss=1e9):
    torch.save({
            'model_state_dict': model.state_dict(),
            'config': config,
            'epoch': epoch,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'all_train_loss': all_train_loss,
            'all_dev_loss': all_dev_loss,
            'best_dev_loss': best_dev_loss
            }, dir)
    
def torch_load(dir, model, config, optimizer=None, scheduler=None, return_config=True):
    checkpoint = torch.load(dir)
    model.load_state_dict(checkpoint['model_state_dict'])
    old_config = checkpoint['config']
    model.to(config.device)

    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    all_train_loss = checkpoint['all_train_loss']
    all_dev_loss = checkpoint['all_dev_loss']
    best_dev_loss = checkpoint['best_dev_loss']
    
    if return_config:
        return old_config, all_train_loss, all_dev_loss, best_dev_loss
    else:
        return all_train_loss, all_dev_loss, best_dev_loss



def train(train_loader, val_loader, start_epoch=0, epochs=epochs, batch_size=batch_size, print_freq=0.5, save_dir=SAVE_DIR):
    best_dev_loss = 1e9
    model.train()
    all_train_loss, all_dev_loss, best_dev_loss = torch_load(os.path.join(SAVE_DIR, 'model_{}.pt'.format(max(start_epoch-1, 0))))

    start_time = time.time()
    print_after = int(print_freq * len(x) / batch_size)
    for epoch in range(start_epoch, epochs):
        print_counter = 0
        total_loss = []
        print('epoch:', epoch)
        for i in tqdm(range(0, len(x), batch_size)):
            prob, loss = model.forward(x[i: i+batch_size],
                                       mask[i: i+batch_size],
                                       y[i: i+batch_size],
                                       )
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            total_loss.append(loss.item())
            if i > print_counter:
                print('step: {}, loss: {}, total loss: {}'.format(i, loss.item(), np.mean(total_loss)))
                print_counter += print_after
        scheduler.step()
        
        print('train loss:', np.mean(total_loss))
        dev_loss = eval()
        print('dev_loss:', dev_loss)
        if dev_loss < best_dev_loss:
            torch_save(os.path.join(save_dir, 'best-model.pt'),
                   model, optimizer, scheduler, all_train_loss, all_dev_loss, best_dev_loss)
            best_dev_loss = dev_loss
        all_train_loss.append(total_loss)
        all_dev_loss.append(dev_loss)

        torch_save(os.path.join(save_dir, 'model_{}.pt'.format(str(epoch))),
                   model, optimizer, scheduler, all_train_loss, all_dev_loss, best_dev_loss)
        end_time = time.time()
        print('Finish epoch {} at {}, in {} seconds. \n'.format(epoch, end_time, end_time - start_time))
 

def eval(x=x_valid, y=y_valid, mask=mask_valid, batch_size=batch_size, get_report=True):
    model.eval()
    total_loss = []
    y_pred = []
    y_true = []

    for i in y:
        y_true.extend(i.tolist())

    with torch.no_grad():
        for i in range(0, len(x), batch_size):
            prob, loss = model.forward(x[i: i+batch_size],
                                      mask[i: i+batch_size],
                                      y[i: i+batch_size],
                                      )
            
            temp_y_pred = [0 for _ in range(len(y[i: i+batch_size]))]
            for j, sent in enumerate(y[i: i+batch_size]):
                temp_prob = np.argsort(prob[j][:len(sent)].tolist())
                temp_y_pred[j] = [0] * len(sent)
                # print(temp_prob)
                for k in temp_prob[-4:]:
                    temp_y_pred[j][k] = 1
            for sent in temp_y_pred:
                y_pred.extend(sent)
            total_loss.append(loss.item())

    if get_report:
        print(classification_report(y_true, y_pred))
        
    model.train()    
    
    return np.mean(total_loss)

SyntaxError: invalid syntax (2059861487.py, line 30)