In [11]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split
import os
from typing import Callable, List, Tuple
import pytorch_lightning as pl  
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import LearningRateMonitor
import requests
from bs4 import BeautifulSoup, CData
from GPUtil import showUtilization as gpu_usage
from numba import cuda
import logging
import unicodedata
from shutil import copyfile

#kobert
from kobert_transformers.utils import get_tokenizer

#transformers
from transformers import AdamW, BertConfig, BertModel, PreTrainedTokenizer
from seqeval.metrics import f1_score, accuracy_score

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [12]:
device = torch.device("cuda:0")   #gpu 사용
# device = torch.device("cpu")

print('GPUs Available :', torch.cuda.device_count())

if torch.cuda.is_available():
    print('GPU running')
else:
    print('GPU not running')

GPUs Available : 1
GPU running


In [13]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

## 데이터 준비

In [14]:
#모두의말뭉치 파일 내 json 파일 목록
def parse_paths(folder):
    for current, dirs, files in os.walk(folder):
        return [os.path.join(current, file) for file in files if file.endswith(".json")]
    
#json 파일 내 기사의 문장에서 128글자까지만 추출
def parse_contents(files):
    result = []
    for file in tqdm(files, desc="[Contents Parsing]"):    #진행상황 확인용
        with open(file, "r", encoding="utf-8") as f:
            contents = json.load(f)
        for doc in contents["document"]:
            for paragraph in doc["paragraph"]:
                result.append(paragraph["form"][:128])
                
    print("[Tokens Length] {0:,}".format(len(result)))
    return result

data = parse_paths('C:\BootCamp\CP2\Korpus')
contents = parse_contents(data)

[Contents Parsing]: 100%|██████████| 35/35 [00:18<00:00,  1.88it/s]

[Tokens Length] 5,352,250





In [15]:
#train : val = 9 : 1
train, val = train_test_split(contents, test_size=0.1, random_state=111)

len(train), len(val)

(4817025, 535225)

In [16]:
del data
del contents

In [17]:
#test 데이터셋
def testset(test_data_path):
    testset = []
    for (path, dir, files) in os.walk(test_data_path):
        for filename in files :
            if filename[-6:] == '00.xml':   # 페이지 정보
                continue
            elif filename[-4:] == '.xml':
                try : 
                    with open("%s/%s" % (path, filename), "r", encoding="utf-8") as f:
                        soup = BeautifulSoup(f, "html.parser")
                        testset.append(soup.find("headline").text)
                        testset.append(soup.find("subheadline").text)
            
                        t = soup.find("datacontent").text
                        t = t.split('다.')
                        temp = []
                        for tt in t[:-1] :
                            temp.append(tt+'다.')
                        temp.append(t[-1])
                        testset.extend(temp)
                        
                        for i in testset:
                            if i == '전면광고' or i == '전면광고\n':
                                testset.remove(i)

                except:
                    print('not exist ', "%s/%s" % (test_data_path, filename))  # 지면광고, 전면광고
    return testset

testset = testset('C:\BootCamp\CP2\PDF')

print("len: ", len(testset))
testset[0:10]

len:  3304


['차기대통령 첫 덕목은 ‘소통과 통합’',
 '‘청렴·도덕성’ ‘강력 리더십’ 順 반기문·문재인 2강 이재명 1중',
 '조기 대선이 가시화된 가운데 차기대통령이 갖춰야 할 덕목으로 국민 3명 중 1명은 ‘소통 및 사회통합 능력’을꼽았다.',
 ' 차기 대선후보 선호도에서는반기문 전 유엔사무총장(21.7%)과 문재인 전 더불어민주당 대표(18.5%)가오차범위 내 접전인 가운데 이재명 성남시장(11.5%)이 뒤를 쫓는 ‘2강 1중’구도로 나타났다.',
 '1일 서울신문이 새해를 맞아 에이스리서치에 의뢰해 지난달 28~29일 19세이상 남녀 1009명을 대상으로 벌인 여론조사(표본오차 95% 신뢰수준에서±3.1% 포인트)에 따르면 차기 대통령이 갖춰야 할 덕목으로 ‘소통 및 사회통합 능력’(34.3%), ‘청렴성 및 도덕성’(24.8%)이 우선 꼽혔다.',
 ' 이런 덕목은일방통행식 국정 운영과 최순실 국정 농단 등 박근혜 대통령의 탄핵 사유와 밀접한 관련이 있다는 점에서 차기 대선구도와 맞물려 시사하는 바가 크다.',
 '특히 올 경제성장률이 외환위기 이후 20년 만에 2%(정부 2.6%)로 전망되는 등 최악의 위기 상황임에도 ‘강력한 리더십’(13.4%)이나 ‘경제활성화능력’(12.5%)은 후순위였고 ‘정치 경험 및 경륜’(6.4%), ‘외교·안보·통일전문성’(4.5%)에 대한 갈증도 미미했다는 점은 주목할 만하다.',
 '2강 1중을 잇는 여야 차기 대선후보는 안철수 국민의당 전 공동대표(5.7%), 박원순 서울시장(3.0%), 손학규전 민주당 대표(2.1%) 순으로 나타났다.',
 ' 반 전 총장이 범여권 후보로 나서고민주당 문 전 대표와 국민의당 안 전 대표가 ‘가상 3자대결’을 벌인다면 반 전총장과 문 전 대표가 각각 31.1%와 30.4%로 0.7% 포인트 차이로 초박빙 양상으로 조사됐다.',
 ' 안 전 대표는 11.3%에 그쳤다.']

## 데이터 전처리
#### KoBERT모델의 입력 데이터 형태로 만들기
    -CorpusDataset 클래스
    -Preprocessor 클래스

In [18]:
#KoBertTokenizer
logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "tokenizer_78b3253a26.model",
    "vocab_txt": "vocab.txt",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model",
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False},
}

SPIECE_UNDERLINE = "▁"


class KoBertTokenizer(PreTrainedTokenizer):
    """
    SentencePiece based tokenizer. Peculiarities:
        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        vocab_file,
        vocab_txt,
        do_lower_case=False,
        remove_space=True,
        keep_accents=False,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        # Build vocab
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, "r", encoding="utf-8") as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                "pip install sentencepiece"
            )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.idx2token)

    def get_vocab(self):
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                "pip install sentencepiece"
            )
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize("NFKD", outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text):
        """Tokenize a string."""
        text = self.preprocess_text(text)
        pieces = self.sp_model.encode(text, out_type=str)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(
                map(
                    lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
                    token_ids_0,
                )
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """Save the sentencepiece vocabulary (copy original file) and special tokens file
        to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return out_vocab_model, out_vocab_txt

In [19]:
class CorpusDataset(Dataset):
    def __init__(self, sentences, transform: Callable[[List, List], Tuple]):
        self.sentences = []
        self.slot_labels = ["UNK", "PAD", "B", "I"]
        self.transform = transform
        self._load_data(sentences)

    def _load_data(self, sentences):
        """data를 file에서 불러온다.

        Args:
            data_path: file 경로
        """
        self.sentences = [sen.split() for sen in sentences]
        # with open(data_path, mode="r", encoding="utf-8") as f:
        #     lines = f.readlines()
        #     self.sentences = [line.split() for line in lines]

    def _get_tags(self, sentence: List[str]) -> List[str]:
        """문장에 대해 띄어쓰기 tagging을 한다.
        character 단위로 분리하여 BI tagging을 한다.

        Args:
            sentence: 문장

        Retrns:
            문장의 각 토큰에 대해 tagging한 결과 리턴
            ["B", "I"]
        """

        tags = []
        for word in sentence:
            for i in range(len(word)):
                if i == 0:
                    tags.append("B")
                else:
                    tags.append("I")
        return tags

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = "".join(self.sentences[idx])
        sentence = [s for s in sentence]
        tags = self._get_tags(self.sentences[idx])
        tags = [self.slot_labels.index(t) for t in tags]

        (
            input_ids,
            attention_mask,
            token_type_ids,
            slot_label_ids, 
        ) = self.transform(sentence, tags)

        return input_ids, attention_mask, token_type_ids, slot_label_ids

In [20]:
class Preprocessor:
    def __init__(self, max_len: int):
        self.tokenizer = KoBertTokenizer.from_pretrained("monologg/kobert")
        self.max_len = max_len
        self.pad_token_id = 0

    def get_input_features(
        self, sentence: List[str], tags: List[str]
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """문장과 띄어쓰기 tagging에 대해 feature로 변환한다.

        Args:
            sentence: 문장
            tags: 띄어쓰기 tagging

        Returns:
            feature를 리턴한다.
            input_ids, attention_mask, token_type_ids, slot_labels
        """

        input_tokens = []
        slot_label_ids = []
					
        # tokenize
        for word, tag in zip(sentence, tags):
            tokens = self.tokenizer.tokenize(word)

            if len(tokens) == 0:
                tokens = self.tokenizer.unk_token

            input_tokens.extend(tokens)

            for i in range(len(tokens)):
                if i == 0:
                    slot_label_ids.extend([tag])
                else:
                    slot_label_ids.extend([self.pad_token_id])

        # max_len보다 길이가 길면 뒤에 자르기
        if len(input_tokens) > self.max_len - 2:
            input_tokens = input_tokens[: self.max_len - 2]
            slot_label_ids = slot_label_ids[: self.max_len - 2]

        # cls, sep 추가
        input_tokens = (
            [self.tokenizer.cls_token] + input_tokens + [self.tokenizer.sep_token]
        )
        slot_label_ids = [self.pad_token_id] + slot_label_ids + [self.pad_token_id]

        # token을 id로 변환
        input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)

        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)

        # padding
        pad_len = self.max_len - len(input_tokens)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * pad_len)
        slot_label_ids = slot_label_ids + ([self.pad_token_id] * pad_len)
        attention_mask = attention_mask + ([0] * pad_len)
        token_type_ids = token_type_ids + ([0] * pad_len)

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        attention_mask = torch.tensor(attention_mask, dtype=torch.long)
        token_type_ids = torch.tensor(token_type_ids, dtype=torch.long)
        slot_label_ids = torch.tensor(slot_label_ids, dtype=torch.long)

        return input_ids, attention_mask, token_type_ids, slot_label_ids

## 모델

In [21]:
class SpacingBertModel(pl.LightningModule):
    def __init__(self, config, dataset):
        super().__init__()
        self.config = config
        self.dataset = dataset
        self.slot_labels_type = ["UNK", "PAD", "B", "I"]
        self.pad_token_id = 0

        self.bert_config = BertConfig.from_pretrained(
            self.config.model, num_labels=len(self.slot_labels_type)
        )
        self.model = BertModel.from_pretrained(
            self.config.bert_model, config=self.bert_config
        )
        self.dropout = nn.Dropout(self.config.dropout_rate)
        self.linear = nn.Linear(
            self.bert_config.hidden_size, len(self.slot_labels_type)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.linear(self.dropout(outputs[0]))

    def training_step(self, batch, batch_nb):

        input_ids, attention_mask, token_type_ids, slot_label_ids = batch

        outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        loss = self._calculate_loss(outputs, slot_label_ids) # slot_labels : labels

        return {"loss": loss, "log": {"train_loss": loss}}

    def validation_step(self, batch, batch_nb):

        input_ids, attention_mask, token_type_ids, slot_label_ids = batch

        outputs = self(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        val_loss = self._calculate_loss(outputs, slot_label_ids)
        pred_slot_labels, gt_slot_labels = self._convert_ids_to_labels(
            outputs, slot_label_ids
        )

        val_f1 = self._f1_score(gt_slot_labels, pred_slot_labels)

        return {"val_loss": val_loss, "val_f1": val_f1}

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_log = {
            "val_loss": val_loss,
            "val_f1": torch.stack([x["val_f1"] for x in outputs]).mean(),
        }

        return {"val_loss": val_loss, "progress_bar": tensorboard_log}

    def test_step(self, batch, batch_nb):

        input_ids, attention_mask, token_type_ids, slot_label_ids = batch

        pred_slot_labels, gt_slot_labels = self._convert_ids_to_labels(
            self(
              input_ids=input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids,
            ), slot_label_ids
        )

        test_f1 = self._f1_score(gt_slot_labels, pred_slot_labels)

        return {"test_f1": test_f1, }

    def test_epoch_end(self, outputs):
        test_f1 = torch.stack([x["test_f1"] for x in outputs]).mean()

        test_step_outputs = {
            "test_f1": test_f1,
        }

        return test_step_outputs

    def configure_optimizers(self):
        return AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)

    def train_dataloader(self):
        return DataLoader(self.dataset["train"], batch_size=self.config.train_batch_size)

    def val_dataloader(self):
        return DataLoader(self.dataset["val"], batch_size=self.config.eval_batch_size)

    def test_dataloader(self):
        return DataLoader(self.dataset["test"], batch_size=self.config.eval_batch_size)

    def _calculate_loss(self, outputs, labels):
        active_logits = outputs.view(-1, len(self.slot_labels_type))
        active_labels = labels.view(-1)
        loss = F.cross_entropy(active_logits, active_labels)

        return loss

    def _f1_score(self, gt_slot_labels, pred_slot_labels):
        return torch.tensor(
            f1_score(gt_slot_labels, pred_slot_labels), dtype=torch.float32
        )

    def _convert_ids_to_labels(self, outputs, slot_labels):
        _, y_hat = torch.max(outputs, dim=2)
        y_hat = y_hat.detach().cpu().numpy()
        slot_label_ids = slot_labels.detach().cpu().numpy()

        slot_label_map = {i: label for i, label in enumerate(self.slot_labels_type)}
        slot_gt_labels = [[] for _ in range(slot_label_ids.shape[0])]
        slot_pred_labels = [[] for _ in range(slot_label_ids.shape[0])]

        for i in range(slot_label_ids.shape[0]):
            for j in range(slot_label_ids.shape[1]):
                if slot_label_ids[i, j] != self.pad_token_id:
                    slot_gt_labels[i].append(slot_label_map[slot_label_ids[i][j]])
                    slot_pred_labels[i].append(slot_label_map[y_hat[i][j]])

        return slot_pred_labels, slot_gt_labels

In [22]:
class Config():
    def __init__(self) :
        self.task= 'korean_spacing'
        self.log_path= 'C:\BootCamp\CP2\logs'
        self.model =  'monologg/kobert'
        self.train_data_path= 'C:\BootCamp\CP2\train.txt'
        self.val_data_path= 'C:\BootCamp\CP2\val.txt'
        self.test_data_path= 'C:\BootCamp\CP2\test.txt'
        self.max_len= 128
        self.train_batch_size= 16
        self.eval_batch_size= 16
        self.dropout_rate= 0.1
        self.gpus= torch.cuda.device_count()

config = Config()

preprocessor = Preprocessor(config.max_len)

dataset = {}
dataset["train"] = CorpusDataset(train, preprocessor.get_input_features)
dataset["val"] = CorpusDataset(val, preprocessor.get_input_features)
dataset["test"] = CorpusDataset(testset, preprocessor.get_input_features)

model = SpacingBertModel(config, dataset).cuda()

logger = TensorBoardLogger(
    save_dir=os.path.join(config.log_path, config.task), version=1, name=config.task
)

checkpoint = ModelCheckpoint(
    filename="checkpoints/"+ config.task + "/{epoch}_{val_loss:35f}",
    verbose=True,
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    save_last=True
)

lrmonitor = LearningRateMonitor(logging_interval='step')

trainer = pl.Trainer(
    gpus=config.gpus,
    callbacks=[checkpoint, lrmonitor],
    logger=logger,
    max_epochs=10,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.


AttributeError: 'Config' object has no attribute 'bert_model'

In [None]:
free_gpu_cache()   

device = torch.device("cuda:0")   #gpu 사용
print('GPUs Available :', torch.cuda.device_count())

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 27% | 10% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 27% |  4% |
GPUs Available : 1


In [None]:
trainer.fit(model)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | model   | BertModel | 92.2 M
1 | dropout | Dropout   | 0     
2 | linear  | Linear    | 3.1 K 
--------------------------------------
92.2 M    Trainable params
0         Non-trainable params
92.2 M    Total params
368.760   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

RuntimeError: CUDA error: invalid argument

In [None]:
#테스트
trainer.test()

  f"`.{fn}(ckpt_path=None)` was called without a model."


MisconfigurationException: `.test(ckpt_path="best")` is set but `ModelCheckpoint` is not configured to save the best model.

In [None]:
#모델의 파라미터들을 저장 : torch.save(model.state_dict(), path)
def save_checkpoint(epoch, model, optimizer, filename): 
    state = {'Epoch': epoch, 
             'State_dict': model.state_dict(), 
             'optimizer': optimizer.state_dict()} 
    torch.save(state, filename)

In [None]:
#모델 불러오기
model = SpacingBertModel()   #모델 선언
optimizer = optim.???    #옵티마이저 선언

ckpt = torch.load(path)

state_dict = model.load_state_dict(ckpt['State_dict'])
opt = optimizer.load_state_dict(ckpt['optimizer'])

In [None]:


xml_path = 'C:\BootCamp\CP2\PDF\news_PDF'

#xml내 텍스트 가져오기
dic_count = {'head' : 0, 'sub' : 0, 'content':0, 'not exist' : 0, 'all' : 0, 'right' : 0}
dic_article = {}

for (path, dir, files) in os.walk(data_path):
    for filename in files :
        if filename[-6:] == '00.xml':   # 페이지 정보
            with open("%s/%s" % (path, filename), "r", encoding="EUC-KR") as f:
                soup = BeautifulSoup(f, "html.parser")
                file_name = soup.find("file_name").text
                articles = soup.findAll("article")
                article_list = []
                for article in articles :
                    article_list.append(article.file_name.string)
                    dic_article[file_name] = article_list
    
        elif filename[-4:] == '.xml':
            try :
                with open("%s/%s" % (path, filename), "r", encoding="utf-8") as f:
                b = True
                dic_count['all'] += 1
                soup = BeautifulSoup(f, "html.parser")
                headline = soup.find("headline")
                subheadline = soup.find("subheadline")
                content = soup.find("datacontent")
                
#텍스트를 input 형태로
preprocessor = Preprocessor(config.max_len)
dataset = {}
dataset['test'] = CorpusDataset(content, preprocessor.get_input_features)

In [None]:
xml = '''조기 대선이 가시화된 가운데 차기
대통령이 갖춰야할덕목으로 국민 3
명 중 1명은 ‘소통 및사회통합 능력’을
꼽았다. 차기 대선후보 선호도에서는
반기문 전 유엔사무총장(21.7%)과 문
재인 전 더불어민주당 대표(18.5%)가
오차범위 내 접전인 가운데 이재명 성
남시장(11.5%)이 뒤를 쫓는 ‘2강 1중’
구도로나타났다.
1일 서울신문이 새해를 맞아 에이스
리서치에 의뢰해 지난달 28~29일 19세
이상 남녀 1009명을 대상으로 벌인 여
론조사(표본오차 95% 신뢰수준에서
±3.1% 포인트)에 따르면 차기 대통령
이 갖춰야 할 덕목으로 ‘소통 및사회통
합 능력’(34.3%), ‘청렴성 및도덕성’
(24.8%)이 우선 꼽혔다. 이런 덕목은
일방통행식국정운영과최순실국정농
단 등 박근혜 대통령의 탄핵 사유와 밀
접한 관련이 있다는 점에서 차기 대선
구도와맞물려시사하는바가크다.
특히 올 경제성장률이 외환위기 이
후 20년 만에 2%(정부 2.6%)로 전망
되는 등 최악의 위기 상황임에도 ‘강력
한 리더십’(13.4%)이나 ‘경제활성화
능력’(12.5%)은 후순위였고 ‘정치 경
험 및경륜’(6.4%), ‘외교·안보·통일
전문성’(4.5%)에 대한 갈증도 미미했
다는 점은주목할만하다.
2강 1중을 잇는 여야 차기 대선후보
는 안철수 국민의당 전 공동대표(5.7
%), 박원순 서울시장(3.0%), 손학규
전 민주당 대표(2.1%) 순으로 나타났
다. 반 전 총장이 범여권 후보로 나서고
민주당 문 전 대표와 국민의당 안전대
표가 ‘가상 3자대결’을 벌인다면 반 전
총장과 문 전 대표가 각각 31.1%와 30.
4%로 0.7% 포인트 차이로 초박빙 양
상으로 조사됐다. 안 전 대표는 11.3%
에 그쳤다.
국회 개헌특위가 본격 가동되는 등
정치권의 화두로 떠오른 대통령 임기
축소를 중심으로 한 개헌 방안에 대해
서는 찬성(44.5%)이 반대(38.7%)보
다 5.8% 포인트 높았지만, 여전히 ‘모
름·무응답’도 16.8%에 이르는 것으로
나타났다.'''.replace(' ','').replace('\n','')

trainer.predict(???, return_predictions=True)