# 01. Preprocess

## imports

In [1]:
%load_ext lab_black

In [2]:
import sys

sys.path.append("..")

In [3]:
import warnings

warnings.filterwarnings(action="ignore")

In [4]:
import json
import platform
import pickle
import dill
import yaml

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TestTubeLogger  # pip install test-tube

from functools import partial
from collections import defaultdict, OrderedDict, Counter
from tqdm import tqdm

from model import SummaRunner

# from utils.data import SumDataset, Feature
from model import build_vocab
from model.types_ import *

In [5]:
# Pick the Mecab binding that matches the current platform.
if platform.system() == "Windows":
    try:
        from eunjeon import Mecab
    except ImportError:
        # Only a missing package is reported here; any other failure
        # propagates. Mecab stays undefined in this case, so cells that
        # call Mecab() will fail until the package is installed.
        print("please install eunjeon module")
else:  # non-Windows platforms (original note: Ubuntu)
    from konlpy.tag import Mecab

## Data Load

In [11]:
# Locations of the JSONL dataset splits (relative paths).
# NOTE(review): train_path points at the "magazine" folder while dev/test point
# at "total_data" — confirm this mix of sources is intentional.
train_path = "../../../../datasets/kor_data/magazine/train.jsonl"
dev_path = "../../../../datasets/kor_data/total_data/dev.jsonl"
test_path = "../../../../datasets/kor_data/total_data/test.jsonl"

In [12]:
# Read the training split: every line of the file holds one JSON record.
with open(train_path, "r", encoding="utf-8") as f:
    jsonl = f.readlines()

# Decode each line into a Python object, keeping file order.
train_data = [json.loads(json_str) for json_str in jsonl]

In [13]:
train_data[0]

{'media': '이코노미스트',
 'id': '330370',
 'article_original': ['경영위기에 놓인 쌍용자동차를 놓고 최대주주인 인도 마힌드라 그룹과 산업은행 간 줄다리기가 이어지고 있다.',
  '마힌드라가 쌍용차 지배권 포기를 언급한 가운데, 산업은행은 쌍용차에 대한 기간산업안정기금 지원에 선을 그었다.',
  '지난 12일(현지시간) 쌍용차 이사회 의장인 파완고엔카 마힌드라 사장은 마힌드라의 컨퍼런스콜에서 "쌍용차는 새로운 투자자를 필요로 한다"며 "투자자를 확보할 수 있을지 모색 중"이라고 말했다.',
  '이 발언은 쌍용차에 대한 마힌드라의 투자 결정 철회와 맞물려 사업 철수에 대한 불안감을 키우고 있다.',
  '마힌드라는 애초 3년 후 흑자전환 목표를 내걸고 쌍용차에 2300억원 투자 계획을 제시했다가 철회했다.',
  '대신 긴급 자금 400억원만 지원하기로 했다.',
  '이후 산업계에선 정부가 쌍용차에 기간산업안정기금을 지원할 수 있다는 의견이 나왔다.',
  '이에 대해 최대현 산은 기업금융부문 부행장은 6월 17일 "기간산업안정기금은 코로나19 사태 이전부터 경영에 문제가 있는 회사를 지원하는 것은 아니다"라며 선을 그었다.',
  '다만 다음 달 만기가 도래하는 900억원의 대출은 만기연장을 할 예정이다.'],
 'abstractive': '지난 12일 쌍용차 파완고엔카 사장은 컨퍼런스 콜에서 "쌍용차는 새 투자자를 기다린다"고 말했는데 그는 3년의 흑자전환의 목표를 세우고 회사에 2300억원을 투자하려다 이를 철회하게 되므로 산업계에서는 정부가 쌍용차에 안정기금을 지원하기를 기대한다.',
 'extractive': [2, 4, 6]}

## Build Vocab Function

In [14]:
def build_vocab(
    dataset: JSONType,
    stopwords: Optional[List[str]] = None,
    num_words: int = 40000,
    tokenizer=None,
):
    """Build word <-> index vocabularies from the sentences of a dataset.

    Args:
        dataset: Iterable of records; each record's ``"article_original"``
            entry is iterated as a sequence of sentence strings.
        stopwords: Tokens to leave out of the vocabulary. ``None`` or an
            empty collection keeps every token.
        num_words: Maximum number of distinct tokens to keep, most frequent
            first.
        tokenizer: Object exposing ``morphs(sentence)``. When ``None`` a new
            ``Mecab()`` instance is created.

    Returns:
        ``(word_index, index_word)``. ``word_index`` maps ``"<PAD>"`` to 0,
        ``"<UNK>"`` to 1 and the kept tokens to 2, 3, ... in decreasing
        frequency; ``index_word`` is the reverse mapping.
    """
    if tokenizer is None:
        tokenizer = Mecab()

    # Set membership keeps the per-token stopword test constant-time.
    excluded = frozenset(stopwords) if stopwords else None

    # Count tokens sentence by sentence instead of first collecting every
    # token of the corpus in one list.
    counts = Counter()
    for data in tqdm(dataset):
        for sent in data["article_original"]:
            tokens = tokenizer.morphs(sent)
            if excluded:
                tokens = [token for token in tokens if token not in excluded]
            counts.update(tokens)

    # Indices 0 and 1 are reserved for the padding and unknown tokens.
    word_index = {"<PAD>": 0, "<UNK>": 1}
    for idx, (word, _) in enumerate(counts.most_common(num_words), 2):
        word_index[word] = idx

    index_word = {idx: word for word, idx in word_index.items()}

    return word_index, index_word

In [15]:
# Build the vocabulary from the training records, then persist the
# word -> index mapping with dill.
word_index, index_word = build_vocab(train_data)

with open("./word_index_magazine.pkl", "wb") as vocab_file:
    dill.dump(word_index, vocab_file)

100%|██████████| 53265/53265 [01:27<00:00, 608.73it/s]


In [11]:
# with open("../utils//word_index_v02.pkl", "rb") as f:
#     word_index = dill.load(f)

In [12]:
len(word_index)

40002

## DataSet Class

In [13]:
class SumDataset(Dataset):
    """Dataset over a JSONL summarisation corpus held fully in memory.

    Every non-blank line of the file is parsed as one JSON record. Each
    record must provide the keys ``"article_original"``, ``"extractive"``
    and ``"abstractive"``, which ``__getitem__`` reads.
    """

    def __init__(self, path):
        """Parse all records of the UTF-8 JSONL file at ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            # Whitespace-only lines carry no record and would make
            # json.loads raise, so they are skipped.
            self.data = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        """Return the number of parsed records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(doc, ext_indices, summaries)`` for the record at ``idx``."""
        record = self.data[idx]
        doc = record["article_original"]
        ext_indices = record["extractive"]
        summaries = record["abstractive"]

        return doc, ext_indices, summaries

In [14]:
trainset = SumDataset(train_path)

## Feature Class

In [15]:
# class Feature:
#     def __init__(self, word_index, tokenizer):
#         self.word_index = word_index
#         self.index_word = {idx: word for word, idx in word_index.items()}
#         assert len(self.word_index) == len(self.index_word)
#         self.PAD_IDX = 0
#         self.UNK_IDX = 1
#         self.PAD_TOKEN = "<PAD>"
#         self.UNK_TOKEN = "<UNK>"
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.word_index)

#     def index_to_word(self, idx):
#         return self.index_word[idx]

#     def word_to_index(self, w):
#         if w in self.word_index:
#             return self.word_index[w]
#         else:
#             return self.UNK_IDX

#     ###################
#     # Create Features #
#     ###################
#     def make_features(
#         self,
#         docs,
#         ext_idx_list,
#         summaries_list,
#         doc_trunc=100,
#         sent_trunc=128,
#         split_token="\n",
#     ):

#         # trunc document
#         # 문서 내 doc_trunc 문장 개수까지 가져옴
#         sents_list, targets, doc_lens, ext_sums, abs_sums = [], [], [], [], []
#         for doc, ext_indices, abs_sum in zip(docs, ext_idx_list, summaries_list):
#             labels = []
#             for idx in range(len(doc)):
#                 if idx in ext_indices:
#                     labels.append(1)
#                 else:
#                     labels.append(0)

#             max_sent_num = min(doc_trunc, len(doc))
#             sents = doc[:max_sent_num]
#             labels = labels[:max_sent_num]
#             ext_sum = [sent for sent, label in zip(sents, labels) if label == 1]

#             sents_list.append(sents)
#             targets.append(labels)
#             doc_lens.append(len(sents))
#             ext_sums.append(ext_sum)
#             abs_sums.append(abs_sum)

#         # trunc or pad sent
#         # 문장 내 sent_trunc 단어 개수까지 가져옴
#         max_sent_len = 0
#         batch_sents = []
#         features_list = []
#         for sents in sents_list:
#             for sent in sents:
#                 words = self.tokenizer.morphs(sent)
#                 # words = [word for word in words if len(word) > 1]
#                 if len(words) > sent_trunc:
#                     words = words[:sent_trunc]
#                 max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
#                 batch_sents.append(words)

#             features = []
#             for sent in batch_sents:
#                 feature = [self.PAD_IDX for _ in range(max_sent_len - len(sent))] + [
#                     self.word_to_index(w) for w in sent
#                 ]
#                 features.append(feature)

#             features_list.append(features)

#         return features_list, targets, doc_lens, ext_sums, abs_sums, docs

#     def make_predict_features(
#         self, docs, sent_trunc=128, doc_trunc=100, split_token=". ",
#     ):

#         sents_list, doc_lens = [], []
#         for doc in docs:
#             sents = doc.split(split_token)
#             max_sent_num = min(doc_trunc, len(sents))
#             sents = sents[:max_sent_num]
#             sents_list.extend(sents)
#             doc_lens.append(len(sents))

#         # trunc or pad sent
#         max_sent_len = 0
#         batch_sents = []
#         for sent in sents_list:
#             words = self.tokenizer.morphs(sent)
#             # words = [word for word in words if len(word) > 1]
#             if len(words) > sent_trunc:
#                 words = words[:sent_trunc]
#             max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
#             batch_sents.append(words)

#         features = []
#         for sent in batch_sents:
#             feature = [self.PAD_IDX for _ in range(max_sent_len - len(sent))] + [
#                 self.word_to_index(w) for w in sent
#             ]
#             features.append(feature)

#         return features, doc_lens

In [64]:
class Feature:
    """Turn raw sentences into left-padded sequences of word indices.

    Args:
        word_index: Mapping from token string to integer index.
        tokenizer: Object exposing ``morphs(sentence)`` that returns the
            tokens of a sentence.
    """

    def __init__(self, word_index, tokenizer):
        self.word_index = word_index
        self.index_word = {idx: word for word, idx in word_index.items()}
        # Fails when two words share an index (the reverse map would be lossy).
        assert len(self.word_index) == len(self.index_word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = "<PAD>"
        self.UNK_TOKEN = "<UNK>"
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the vocabulary size."""
        return len(self.word_index)

    def index_to_word(self, idx):
        """Return the word stored at ``idx``; raises KeyError if absent."""
        return self.index_word[idx]

    def word_to_index(self, w):
        """Return the index of ``w``, or ``UNK_IDX`` for unknown words."""
        return self.word_index.get(w, self.UNK_IDX)

    def _encode_sentences(self, sents, sent_trunc):
        """Tokenize, truncate and left-pad ``sents`` into index lists.

        Each sentence keeps at most ``sent_trunc`` tokens. All returned lists
        have the length of the longest (truncated) sentence; shorter ones are
        filled with ``PAD_IDX`` at the front.
        """
        tokenized = []
        for sent in sents:
            words = self.tokenizer.morphs(sent)
            # words = [word for word in words if len(word) > 1]
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            tokenized.append(words)

        max_sent_len = max((len(words) for words in tokenized), default=0)

        return [
            [self.PAD_IDX] * (max_sent_len - len(words))
            + [self.word_to_index(w) for w in words]
            for words in tokenized
        ]

    ###################
    # Create Features #
    ###################
    def make_features(
        self,
        docs,
        ext_idx_list,
        summaries_list,
        doc_trunc=50,
        sent_trunc=128,
        split_token="\n",
    ):
        """Build training features for a batch of documents.

        Args:
            docs: Documents, each a sequence of sentence strings.
            ext_idx_list: Per document, the indices of its extractive
                summary sentences.
            summaries_list: Per document, its abstractive summary.
            doc_trunc: Maximum number of sentences kept per document.
            sent_trunc: Maximum number of tokens kept per sentence.
            split_token: Not used by this method; kept in the signature.

        Returns:
            ``(features, targets, doc_lens, ext_sums, abs_sums, docs)``:
            ``features`` and ``targets`` are flat over all kept sentences of
            the batch (index lists and 0/1 labels), ``doc_lens`` holds the
            kept sentence count per document, ``ext_sums`` the selected
            sentences per document, ``abs_sums`` the summaries, and ``docs``
            is the input passed through unchanged.
        """
        # Keep at most doc_trunc sentences of every document.
        sents_list, targets, doc_lens, ext_sums, abs_sums = [], [], [], [], []
        for doc, ext_indices, abs_sum in zip(docs, ext_idx_list, summaries_list):
            sents = doc[: min(doc_trunc, len(doc))]
            # 1 marks a sentence that belongs to the extractive summary.
            labels = [1 if idx in ext_indices else 0 for idx in range(len(sents))]
            ext_sum = [sent for sent, label in zip(sents, labels) if label == 1]

            sents_list.extend(sents)
            targets.extend(labels)
            doc_lens.append(len(sents))
            ext_sums.append(ext_sum)
            abs_sums.append(abs_sum)

        # Keep at most sent_trunc tokens of every sentence, then pad.
        features = self._encode_sentences(sents_list, sent_trunc)

        return features, targets, doc_lens, ext_sums, abs_sums, docs

    def make_predict_features(
        self, docs, sent_trunc=128, doc_trunc=50, split_token=". ",
    ):
        """Build inference features for a batch of raw document strings.

        Args:
            docs: Document strings; each is split into sentences on
                ``split_token``.
            sent_trunc: Maximum number of tokens kept per sentence.
            doc_trunc: Maximum number of sentences kept per document.
            split_token: Separator used to split a document into sentences.

        Returns:
            ``(features, doc_lens)``: the flat list of padded index lists for
            all kept sentences and the kept sentence count per document.
        """
        sents_list, doc_lens = [], []
        for doc in docs:
            sents = doc.split(split_token)
            sents = sents[: min(doc_trunc, len(sents))]
            sents_list.extend(sents)
            doc_lens.append(len(sents))

        # Truncate and pad every sentence to a common length.
        features = self._encode_sentences(sents_list, sent_trunc)

        return features, doc_lens

## DataLoader 

### collate_fn

In [16]:
# def collate_fn(batch, feature):
#     docs = [entry[0] for entry in batch]
#     labels_list = [entry[1] for entry in batch]
#     summaries_list = [entry[2] for entry in batch]

#     features, targets, doc_lens, ext_sums, abs_sums, docs = feature.make_features(
#         docs, labels_list, summaries_list
#     )

#     #     features = torch.LongTensor(features)
#     #     targets = torch.FloatTensor(targets)
#     max_doc_len = max(doc_lens)
#     doc_lens = torch.LongTensor(doc_lens)
#     return features, targets, doc_lens, max_doc_len, ext_sums, abs_sums, docs

In [190]:
def collate_fn(batch, feature):
    """Collate ``(doc, ext_indices, summary)`` samples into padded tensors.

    Args:
        batch: Sequence of 3-item samples ``(doc, ext_indices, summary)``.
        feature: Object whose ``make_features(docs, ext_idx_list, summaries)``
            returns ``(features, targets, doc_lens, ext_sums, abs_sums, docs)``
            with ``features`` and ``targets`` flat over all sentences of the
            batch.

    Returns:
        A 7-tuple ``(padded_docs, labels, doc_lens, max_doc_len, ext_sums,
        abs_sums, raw_docs)``:
        ``padded_docs`` is a long tensor of shape
        ``(batch, max_doc_len, sent_len)`` and ``labels`` a float tensor of
        shape ``(batch, max_doc_len)``, both zero-padded past each document's
        length; ``doc_lens`` is a long tensor of sentence counts;
        ``max_doc_len`` is their maximum; ``ext_sums``, ``abs_sums`` and
        ``raw_docs`` are passed through from ``make_features``.
    """
    raw_docs = [entry[0] for entry in batch]
    labels_list = [entry[1] for entry in batch]
    summaries_list = [entry[2] for entry in batch]

    features, targets, doc_lens, ext_sums, abs_sums, raw_docs = feature.make_features(
        raw_docs, labels_list, summaries_list
    )

    # Assumes every row of `features` has the same length, so the first row
    # gives the sentence width.
    pad_dim = len(features[0]) if features else 0
    max_doc_len = max(doc_lens, default=0)

    # Preallocate zero-filled outputs and copy each document's rows in; the
    # untouched tail of a shorter document is its padding.
    padded_docs = torch.zeros(len(doc_lens), max_doc_len, pad_dim, dtype=torch.long)
    labels = torch.zeros(len(doc_lens), max_doc_len, dtype=torch.float)

    start = 0
    for row, doc_len in enumerate(doc_lens):
        stop = start + doc_len
        if doc_len:  # an empty document has nothing to copy
            padded_docs[row, :doc_len] = torch.tensor(
                features[start:stop], dtype=torch.long
            )
            labels[row, :doc_len] = torch.tensor(
                targets[start:stop], dtype=torch.float
            )
        start = stop

    doc_lens = torch.LongTensor(doc_lens)
    return padded_docs, labels, doc_lens, max_doc_len, ext_sums, abs_sums, raw_docs

In [191]:
# Build the featurizer: a Mecab tokenizer paired with the word -> index mapping.
mecab = Mecab()
feature = Feature(word_index, mecab)

In [192]:
# DataLoader over the training set: shuffled batches of 32 samples, loaded by
# 8 worker processes. `partial` binds the featurizer so the loader can call
# collate_fn with the batch alone.
train_loader = DataLoader(
    dataset=trainset,
    batch_size=32,
    shuffle=True,
    collate_fn=partial(collate_fn, feature=feature),
    num_workers=8,
)

In [193]:
# Pull a single batch to inspect what the loader yields.
batch = next(iter(train_loader))
# Each of the seven collated fields gets its own name: repeating a target in
# a tuple unpacking keeps only the last value assigned to it.
features, targets, doc_lens, max_doc_len, ext_sums, abs_sums, docs = batch

In [196]:
targets.shape

torch.Size([32, 35])

In [197]:
# Drop the padding: keep the first doc_len labels of every row, then join the
# pieces into one flat tensor.
labels = torch.cat(
    [targets[row][:n_sents] for row, n_sents in enumerate(doc_lens)], dim=0
)

In [199]:
labels.shape

torch.Size([447])

In [202]:
# NOTE(review): `a` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All) — define it or
# drop the cell.
a = a.cuda()

In [205]:
b = a.detach()

In [206]:
b

tensor([[0.2594, 0.0480, 0.4768,  ..., 0.6116, 0.2827, 0.9941],
        [0.6936, 0.9198, 0.2577,  ..., 0.8877, 0.9924, 0.9541],
        [0.1422, 0.2006, 0.9687,  ..., 0.3762, 0.4070, 0.9753],
        ...,
        [0.9653, 0.6189, 0.8120,  ..., 0.7876, 0.7225, 0.4006],
        [0.1173, 0.7142, 0.3717,  ..., 0.5497, 0.9576, 0.7127],
        [0.2075, 0.5802, 0.3070,  ..., 0.6827, 0.2326, 0.0602]],
       device='cuda:0')

In [207]:
len(trainset)

42937