# SummaRuNNer 

## 00. imports

In [1]:
%load_ext lab_black

In [29]:
import platform
import json
import dill  # pip install dill
import numpy as np
import pandas as pd

from functools import partial
from collections import Counter, OrderedDict, defaultdict
from tqdm import tqdm  # pip install tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_lightning as pl

from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TestTubeLogger  # pip install test-tube

if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except:
        print("please install eunjeon module")
else:  # Linux or MacOS일 경우
    from konlpy.tag import Mecab

In [3]:
import warnings

warnings.filterwarnings(action="ignore")

In [4]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## 01. Preprocessing 

### 1) Build Vocab

- 학습 및 테스트에 사용할 vocabulary를 구축합니다.

In [5]:
def build_vocab(dataset, num_words=40000, stopwords=None):
    # 0. tokenizer
    tokenizer = Mecab()

    # 1. tokenization
    all_tokens = []
    for data in tqdm(dataset):
        sents = data["article_original"]
        for sent in sents:
            tokens = tokenizer.morphs(sent)
            if stopwords:
                all_tokens.extend([token for token in tokens if token not in stopwords])
            else:
                all_tokens.extend(tokens)

    # 2. build vocab
    vocab = Counter(all_tokens)
    vocab = vocab.most_common(num_words)

    # 3. add pad & unk tokens
    word_index = defaultdict()
    word_index["<PAD>"] = 0
    word_index["<UNK>"] = 1

    for idx, (word, _) in enumerate(vocab, 2):
        word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    return word_index, index_word

### 2) Feature Class

In [22]:
class Feature:
    def __init__(self, word_index, tokenizer):
        self.word_index = word_index
        self.index_word = {idx: word for word, idx in word_index.items()}
        assert len(self.word_index) == len(self.index_word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.word_index)

    def index_to_word(self, idx):
        return self.index_word[idx]

    def word_to_index(self, w):
        if w in self.word_index:
            return self.word_index[w]
        else:
            return self.UNK_IDX

    ###################
    # Create Features #
    ###################
    def make_features(
        self, docs, ext_idx_list, summaries_list, doc_trunc=50, sent_trunc=128,
    ):

        # trunc document
        # 문서 내 doc_trunc 문장 개수까지 가져옴
        sents_list, targets, doc_lens, ext_sums, abs_sums = [], [], [], [], []
        for doc, ext_indices, abs_sum in zip(docs, ext_idx_list, summaries_list):
            labels = []
            for idx in range(len(doc)):
                if idx in ext_indices:
                    labels.append(1)
                else:
                    labels.append(0)

            max_sent_num = min(doc_trunc, len(doc))
            sents = doc[:max_sent_num]
            labels = labels[:max_sent_num]
            ext_sum = [sent for sent, label in zip(sents, labels) if label == 1]

            sents_list.extend(sents)
            targets.extend(labels)
            doc_lens.append(len(sents))
            ext_sums.append(ext_sum)
            abs_sums.append(abs_sum)

        # trunc or pad sent
        # 문장 내 sent_trunc 단어 개수까지 가져옴
        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = self.tokenizer.morphs(sent)
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)

        features = []
        for sent in batch_sents:
            feature = [self.PAD_IDX for _ in range(max_sent_len - len(sent))] + [
                self.word_to_index(w) for w in sent
            ]
            features.append(feature)

        return features, targets, doc_lens, ext_sums, abs_sums, docs

    def make_predict_features(
        self, docs, doc_trunc=50, sent_trunc=128,
    ):

        sents_list, doc_lens = [], []
        for doc in docs:
            max_sent_num = min(doc_trunc, len(doc))
            sents = doc[:max_sent_num]

            sents_list.extend(sents)
            doc_lens.append(len(sents))

        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = self.tokenizer.morphs(sent)
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)

        features = []
        for sent in batch_sents:
            feature = [self.PAD_IDX for _ in range(max_sent_len - len(sent))] + [
                self.word_to_index(w) for w in sent
            ]
            features.append(feature)

        return features, doc_lens, docs

## 02. Dataset & collate function

### 1) Dataset Class

In [7]:
class SumDataset(Dataset):
    def __init__(self, path, phase="train"):
        self.phase = phase
        with open(path, "r", encoding="utf-8") as f:
            jsonl = list(f)

        self.data = []
        for json_str in jsonl:
            self.data.append(json.loads(json_str))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        doc_id = self.data[idx]["id"]
        doc = self.data[idx]["article_original"]

        if self.phase == "train":
            ext_indices = self.data[idx]["extractive"]
            summaries = self.data[idx]["abstractive"]
            return doc, ext_indices, summaries, doc_id
        else:
            return doc, doc_id

### 2) collate function

- Trainset DataLoader에 필요한 collate function을 정의합니다.

In [8]:
def collate_fn(batch, feature):
    docs = [entry[0] for entry in batch]
    labels_list = [entry[1] for entry in batch]
    summaries_list = [entry[2] for entry in batch]
    doc_ids = [entry[3] for entry in batch]

    (
        features,
        targets,
        doc_lens,
        ext_sums,
        abs_sums,
        origin_docs,
    ) = feature.make_features(docs, labels_list, summaries_list)

    docs = []
    labels = []
    start = 0
    pad_dim = len(features[0])
    max_doc_len = max(doc_lens)
    for doc_len in doc_lens:
        stop = start + doc_len
        doc = features[start:stop]
        target = targets[start:stop]
        start = stop

        doc = torch.LongTensor(doc)
        if len(doc) == max_doc_len:
            docs.append(doc.unsqueeze(0))
        else:
            pad = torch.zeros(max_doc_len - doc_len, pad_dim, dtype=torch.long)
            docs.append(torch.cat([doc, pad]).unsqueeze(0))

        if len(target) == max_doc_len:
            labels.append(torch.FloatTensor(target).unsqueeze(0))
        else:
            pad = torch.zeros(max_doc_len - doc_len)
            target = torch.FloatTensor(target)
            labels.append(torch.cat([target, pad]).unsqueeze(0))

    docs = torch.cat(docs, dim=0)
    labels = torch.cat(labels, dim=0)
    targets = torch.FloatTensor(targets)
    doc_lens = torch.LongTensor(doc_lens)
    return docs, labels, doc_lens, max_doc_len, ext_sums, abs_sums, origin_docs, doc_ids

## 03. Modeling

### 0) global function

In [9]:
def avg_pool1d(sequences, seq_lens):
    out = []
    for idx, tensor in enumerate(sequences):
        tensor = tensor[: seq_lens[idx], :]
        tensor = torch.t(tensor).unsqueeze(0)
        out.append(F.avg_pool1d(tensor, tensor.size(2)))

    out = torch.cat(out).squeeze(2)
    return out

### 1) Sentence Encoder

In [10]:
class SentenceEncoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        pretrained_vectors: np.ndarray = None,
    ):
        super().__init__()

        self.vocab_size = (vocab_size,)
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directs = 1
        if bidirectional:
            self.num_directs = 2

        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if pretrained_vectors is not None:
            self.embed.weight.data.copy_(pretrained_vectors)
        else:
            nn.init.xavier_uniform_(self.embed.weight)

        self.bilstm = nn.LSTM(
            self.embed_dim,
            self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            bidirectional=self.bidirectional,
            # dropout=dropout,
        )

    def pad_doc(self, sents, doc_lens, max_doc_len):
        pad_dim = sents.size(1)
        sent_input = []
        start = 0
        for doc_len in doc_lens:
            stop = start + doc_len
            valid = sents[start:stop]
            start = stop
            if doc_len == max_doc_len:
                sent_input.append(valid.unsqueeze(0))
            else:
                pad = Variable(torch.zeros(max_doc_len - doc_len, pad_dim)).type_as(
                    sents
                )
                sent_input.append(torch.cat([valid, pad]).unsqueeze(0))

        sent_input = torch.cat(sent_input, dim=0)  # .type_as(sents)
        return sent_input

    def forward(self, docs, doc_lens, max_doc_len):
        sent_input = []
        for idx, doc_len in enumerate(doc_lens):
            doc = docs[idx][:doc_len]
            sent_input.append(doc)
        sent_input = torch.cat(sent_input, dim=0)
        sent_lens = torch.sum(torch.sign(sent_input), dim=1).data

        x = self.embed(sent_input)
        output, _ = self.bilstm(x)
        output = avg_pool1d(output, sent_lens)
        output = self.pad_doc(output, doc_lens, max_doc_len)
        output = output.type_as(docs)
        output = output.type(torch.float)

        return output

### 2) Document Encoder

In [11]:
class DocumentEncoder(nn.Module):
    def __init__(
        self,
        input_dim: int = 128,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.num_directs = 1
        if bidirectional:
            self.num_directs = 2

        self.bilstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            # dropout=dropout,
        )
        # self.linear = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, sents, doc_lens):
        output, hidden = self.bilstm(sents)
        return output

### 3) Total Encoder

In [12]:
class Encoder(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        pretrained_vectors: np.ndarray = None,
    ):
        super().__init__()

        self.sent_encoder = SentenceEncoder(
            vocab_size,
            embed_dim,
            hidden_dim,
            num_layers,
            bidirectional=True,
            dropout_p=dropout_p,
            pretrained_vectors=pretrained_vectors,
        )

        self.doc_encoder = DocumentEncoder(
            2 * hidden_dim,
            hidden_dim,
            num_layers,
            bidirectional=True,
            dropout_p=dropout_p,
        )

    def forward(self, docs, doc_lens, max_doc_len):
        encoded_sents = self.sent_encoder(docs, doc_lens, max_doc_len)
        encoded_docs = self.doc_encoder(encoded_sents, doc_lens)
        return encoded_docs

### 4) SummaRuNNer Model

In [13]:
class SummaRunner(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 100,
        hidden_dim: int = 128,
        pos_dim: int = 50,
        pos_num: int = 100,
        seg_num: int = 25,
        num_layers: int = 1,
        bidirectional: bool = True,
        dropout_p: float = 0.3,
        pretrained_vectors: np.ndarray = None,
    ):
        super(SummaRunner, self).__init__()
        self.hidden_dim = hidden_dim
        self.abs_pos_embed = nn.Embedding(pos_num, pos_dim)  # absolute postion
        self.rel_pos_embed = nn.Embedding(seg_num, pos_dim)  # relative position

        self.encoder = Encoder(
            vocab_size, embed_dim, hidden_dim, num_layers, bidirectional, dropout_p
        )

        self.fc = nn.Linear(2 * hidden_dim, 2 * hidden_dim)

        # Parameters of Classification Layer
        # P(y_j = 1|h_j, s_j, d), Eq.6 in SummaRuNNer paper
        self.content = nn.Linear(2 * hidden_dim, 1, bias=False)
        self.salience = nn.Bilinear(2 * hidden_dim, 2 * hidden_dim, 1, bias=False)
        self.novelty = nn.Bilinear(2 * hidden_dim, 2 * hidden_dim, 1, bias=False)
        self.abs_pos = nn.Linear(pos_dim, 1, bias=False)
        self.rel_pos = nn.Linear(pos_dim, 1, bias=False)
        self.bias = nn.Parameter(torch.FloatTensor(1).uniform_(-0.1, 0.1))

    def forward(self, docs, doc_lens, max_doc_len):
        sent_out = self.encoder(docs, doc_lens, max_doc_len)
        docs = avg_pool1d(sent_out, doc_lens)

        probs = []
        for index, doc_len in enumerate(doc_lens):
            valid_hidden = sent_out[index, :doc_len, :]
            doc = torch.tanh(self.fc(docs[index])).unsqueeze(0)
            s = Variable(torch.zeros(1, 2 * self.hidden_dim)).type_as(docs)
            for position, h in enumerate(valid_hidden):
                h = h.view(1, -1)
                # get position embeddings
                abs_index = Variable(torch.LongTensor([[position]])).type_as(docs)
                abs_index = abs_index.type(torch.long)
                abs_features = self.abs_pos_embed(abs_index).squeeze(0)

                rel_index = int(round((position + 1) * 9.0 / doc_len.item()))
                rel_index = Variable(torch.LongTensor([[rel_index]])).type_as(docs)
                rel_index = rel_index.type(torch.long)
                rel_features = self.rel_pos_embed(rel_index).squeeze(0)

                # classification layer
                content = self.content(h)
                salience = self.salience(h, doc)
                novelty = -1 * self.novelty(h, torch.tanh(s))
                abs_p = self.abs_pos(abs_features)
                rel_p = self.rel_pos(rel_features)
                # P(y_j = 1|h_j, s_j, d) Eq.6 in SummaRuNNer paper
                prob = torch.sigmoid(
                    content + salience + novelty + abs_p + rel_p + self.bias
                )
                s = s + torch.mm(prob, h)
                probs.append(prob)

        return torch.cat(probs).squeeze()

## 04. Train the Model using Pytorch Lightning

### 1) Experiment Class

- `pytorch_lightning`을 사용하여 학습을 진행할 `Experiment` 클래스를 정의합니다.

In [18]:
class Experiment(pl.LightningModule):
    def __init__(self, model, lr):
        super(Experiment, self).__init__()

        self.model = model
        self.lr = lr
        self._loss = nn.BCELoss()

    # ---------------------
    # TRAINING
    # ---------------------
    def forward(self, docs, doc_lens, max_doc_len):
        return self.model(docs, doc_lens, max_doc_len)

    def loss_fn(self, preds, labels):
        bce_loss = self._loss(preds, labels)
        return bce_loss

    def accuracy(self, preds, labels):
        preds = torch.round(preds)
        corrects = (preds == labels).float().sum()
        acc = corrects / labels.numel()
        return acc

    def training_step(self, batch, batch_idx):
        docs, targets, doc_lens, max_doc_len, _, _, _, _ = batch

        preds = self.forward(docs, doc_lens, max_doc_len)

        labels = []
        for idx, doc_len in enumerate(doc_lens):
            label = targets[idx][:doc_len]
            labels.append(label)
        labels = torch.cat(labels, dim=0)

        train_loss = self.loss_fn(preds, labels)
        train_acc = self.accuracy(preds, labels)
        log_dict = {"train_acc": train_acc.detach(), "train_loss": train_loss.detach()}

        output = OrderedDict(
            {
                "loss": train_loss,
                "progress_bar": {"train_acc": train_acc},
                "log": log_dict,
            }
        )
        return output

    def validation_step(self, batch, batch_idx):
        docs, targets, doc_lens, max_doc_len, _, _, _, _ = batch

        preds = self.forward(docs, doc_lens, max_doc_len)

        labels = []
        for idx, doc_len in enumerate(doc_lens):
            label = targets[idx][:doc_len]
            labels.append(label)
        labels = torch.cat(labels, dim=0)

        val_loss = self.loss_fn(preds, labels)
        val_acc = self.accuracy(preds, labels)

        tqdm_dict = {"val_acc": val_acc.detach(), "val_loss": val_loss.detach()}
        output = OrderedDict(
            {
                "val_loss": val_loss,
                "val_acc": val_acc,
                "log": tqdm_dict,
                "progress_bar": tqdm_dict,
            }
        )
        return output

    def validation_epoch_end(self, outputs):
        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
        val_acc_mean = torch.stack([x["val_acc"] for x in outputs]).mean()
        return {"val_loss": val_loss_mean.detach(), "val_acc": val_acc_mean.detach()}

    # ---------------------
    # TRAINING SETUP
    # ---------------------
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr, weight_decay=1e-5)

### 2) Train data Load

- 학습 시킬 데이터셋을 불러 옵니다.

In [19]:
train_path = "./data/train.jsonl"

with open(train_path, "r", encoding="utf-8") as f:
    jsonl = list(f)

train_data = []
for json_str in jsonl:
    train_data.append(json.loads(json_str))

In [20]:
# train_data[0]

### 3) Feature & Dataset & DataLoader 정의하기

In [21]:
train_batch_size = 256
valid_batch_size = 256

In [22]:
word_index, index_word = build_vocab(train_data)

with open("./word_index.pkl", "wb") as f:
    dill.dump(word_index, f)

100%|██████████| 42803/42803 [00:54<00:00, 783.18it/s]


In [54]:
# Feature class
mecab = Mecab()
feature = Feature(word_index, mecab)

# Split Train, Valid Dataset
dataset = SumDataset(train_path)
val_len = int(len(dataset) * 0.2)
train_len = len(dataset) - val_len
trainset, validset = torch.utils.data.random_split(dataset, [train_len, val_len])

# DataLoader
train_loader = DataLoader(
    dataset=trainset,
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=partial(collate_fn, feature=feature),
    num_workers=8,
)

valid_loader = DataLoader(
    dataset=validset,
    batch_size=valid_batch_size,
    shuffle=False,
    collate_fn=partial(collate_fn, feature=feature),
    num_workers=8,
)

### 4) SetUp Model

In [23]:
model = SummaRunner(vocab_size=len(word_index))
experiment = Experiment(model, lr=0.001)

In [24]:
# ----------------
# TestTubeLogger
# ----------------
tt_logger = TestTubeLogger(
    save_dir="./logs", name="./summarunner", debug=False, create_git_tag=False,
)

# ----------------
# Checkpoint
# ----------------
checkpoint_callback = ModelCheckpoint(
    filepath="./checkpoints/summarunner{epoch:02d}_{val_loss:.3f}",
    monitor="val_loss",
    verbose=True,
    save_top_k=5,
)

early_stopping = EarlyStopping(monitor="val_loss", patience=5, verbose=True)

### 5) Start Train

In [25]:
# ----------------
# Trainer
# ----------------

trainer = Trainer(
    gpus=2,
    min_epochs=1,
    logger=tt_logger,
    num_sanity_val_steps=5,
    callbacks=[early_stopping],
    checkpoint_callback=checkpoint_callback,
    max_epochs=10,
    gradient_clip_val=2,
    distributed_backend="dp",
    precision=32,
)

In [26]:
# ----------------
# Start Train
# ----------------
trainer.fit(experiment, train_loader, valid_loader)

## 05. Testing the Model

### 1) Test data load

In [14]:
test_path = "./data/test_public.jsonl"

# Dataset
testset = SumDataset(test_path, phase="test")

### 2) Load Vocabulary

- 학습에 사용한 Vocabulary를 로드합니다.

In [15]:
vocab_path = "./word_index.pkl"

with open(vocab_path, "rb") as f:
    word_index = dill.load(f)

### 3) Test collate function

In [16]:
def test_collate_fn(batch, feature):
    docs = [entry[0] for entry in batch]
    doc_ids = [entry[1] for entry in batch]

    features, doc_lens, origin_docs = feature.make_predict_features(docs)

    docs = []
    start = 0
    pad_dim = len(features[0])
    max_doc_len = max(doc_lens)
    for doc_len in doc_lens:
        stop = start + doc_len
        doc = features[start:stop]
        start = stop

        doc = torch.LongTensor(doc)
        if len(doc) == max_doc_len:
            docs.append(doc.unsqueeze(0))
        else:
            pad = torch.zeros(max_doc_len - doc_len, pad_dim, dtype=torch.long)
            docs.append(torch.cat([doc, pad]).unsqueeze(0))

    docs = torch.cat(docs, dim=0)
    doc_lens = torch.LongTensor(doc_lens)
    return docs, doc_lens, max_doc_len, doc_ids, origin_docs

### 3) Feature & DataLoader

In [23]:
mecab = Mecab()

# Feature class
feature = Feature(word_index, mecab)

# DataLoader
test_loader = DataLoader(
    dataset=testset,
    batch_size=32,
    shuffle=False,
    collate_fn=partial(test_collate_fn, feature=feature),
    num_workers=8,
)

### 4) Pre-trained Weights Load

In [18]:
ckpt_path = "./checkpoints/summarunnerepoch=00_val_loss=0.448.ckpt"

checkpoint = torch.load(ckpt_path)
checkpoint["state_dict"] = OrderedDict(
    [(key.replace("model.", ""), val) for key, val in checkpoint["state_dict"].items()]
)

### 5) Setup Model

In [19]:
DEVICE = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")

seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [20]:
# ----------------
# SetUp Model
# ----------------
model = SummaRunner(vocab_size=len(word_index)).to(DEVICE)
model.load_state_dict(checkpoint["state_dict"])
model.eval()

SummaRunner(
  (abs_pos_embed): Embedding(100, 50)
  (rel_pos_embed): Embedding(25, 50)
  (encoder): Encoder(
    (sent_encoder): SentenceEncoder(
      (embed): Embedding(40002, 100, padding_idx=0)
      (bilstm): LSTM(100, 128, batch_first=True, bidirectional=True)
    )
    (doc_encoder): DocumentEncoder(
      (bilstm): LSTM(256, 128, batch_first=True, bidirectional=True)
    )
  )
  (fc): Linear(in_features=256, out_features=256, bias=True)
  (content): Linear(in_features=256, out_features=1, bias=False)
  (salience): Bilinear(in1_features=256, in2_features=256, out_features=1, bias=False)
  (novelty): Bilinear(in1_features=256, in2_features=256, out_features=1, bias=False)
  (abs_pos): Linear(in_features=50, out_features=1, bias=False)
  (rel_pos): Linear(in_features=50, out_features=1, bias=False)
)

### 6) Test the Model

In [27]:
num_topk = 3
test_batch_size = 32

model_summaries = []
for batch in tqdm(test_loader, total=len(testset) // test_batch_size):
    docs, doc_lens, max_doc_len, doc_ids, orgin_docs = batch
    preds = model(docs.to(DEVICE), doc_lens, max_doc_len)

    start = 0
    for d_idx, doc_len in enumerate(doc_lens):
        stop = start + doc_len
        pred = preds[start:stop]

        topk_indices = pred.topk(num_topk)[1].tolist()
        topk_indices.sort()

        doc = orgin_docs[d_idx]
        doc_id = doc_ids[d_idx]
        hyp = [doc[idx] for idx in topk_indices]

        model_summaries.append((doc_id, "\n".join(hyp)))

157it [00:40,  3.88it/s]                         


### 7) Submission csv 파일 만들기

In [30]:
result = pd.DataFrame(model_summaries, columns=["id", "summary"])
result.head()

Unnamed: 0,id,summary
0,311565946,▲ 당진시의회가 지난 10일 제55회 임시회를 열고 본격적인 의정활동을 시작했다.\...
1,343753227,‘어린이날(5월5일)’을 맞아 롯데백화점 광주점과 아울렛에서 아동 및 나들이 고객을...
2,336239968,‘孫 청문회’ 입장차는 여전 여야의 극한 대치로 올해 들어 폐업 상태였던 국회가 4...
3,343538710,태안 복군 30주년을 축하하고 도시 간 우호교류 협력과 민간분야 교류 활성화를 위해...
4,332820578,사진=뉴미디어팀대전 지역에 '빵 브랜드 전성시대'가 활짝 열렸다.\n모종린 연세대 ...


In [31]:
result.to_csv("./submission.csv", index=False)