In [1]:
!git clone https://github.com/e9t/nsmc.git
!pip install tensorflow_addons
!pip install torch>=1.8.1
!pip install mxnet
!pip install gluonnlp==0.8.0
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install transformers

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Counting objects: 100% (14762/14762), done.[K
remote: Compressing objects: 100% (13012/13012), done.[K
remote: Total 14763 (delta 1748), reused 14762 (delta 1748), pack-reused 1[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 16.83 MiB/s, done.
Resolving deltas: 100% (1748/1748), done.
Updating files: 100% (14737/14737), done.
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.3/612.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.22.0 typeguard-2.13.3
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2

In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt
import tensorflow as tf
from kobert_tokenizer import KoBERTTokenizer
import gluonnlp as nlp
from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import AutoModel, BertModel, TFBertModel, TFBertForSequenceClassification
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader



In [41]:
import warnings
# Install a global "ignore" filter so no Python warnings are shown from here on.
warnings.filterwarnings("ignore")

In [6]:
import os
from google.colab import drive
# Mount Google Drive under /content/drive/ (the model checkpoint is read from there later).
drive.mount('/content/drive/')
# All tensors and the model are placed on the CPU in this notebook.
device = torch.device("cpu")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [15]:
# Hyperparameters
device = torch.device("cpu")
# Maximum token sequence length handed to BERTDataset / BERTSentenceTransform.
max_len = 64
# Batch size handed to the DataLoader.
batch_size = 32
# NOTE(review): the values below are not read by any cell shown in this notebook;
# presumably they are left over from the training script — confirm before removing.
epoch = 100
learning_rate =  5e-5
warmup_ratio = 0.1
max_grad_norm = 1
log_interval = 200

class BERTSentenceTransform:
    r"""BERT style data transformation.

    Parameters
    ----------
    tokenizer : BERTTokenizer.
        Tokenizer for the sentences. It must provide ``tokenize(text)``
        (returning a list of tokens) and ``convert_tokens_to_ids(tokens)``.
    max_seq_length : int.
        Maximum sequence length of the sentences.
    vocab : vocabulary object
        Must expose ``cls_token``, ``sep_token`` and ``padding_token``
        attributes and support ``vocab[token]`` lookup (used for the
        padding id).
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', one entry per token
            (length ``max_seq_length`` when padding is enabled)
        np.array: valid length in 'int32' (0-d array: number of non-padding tokens)
        np.array: input token type ids in 'int32', same length as the token ids

        """

        text_a = line[0]
        text_b = None
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            # Use the same token-level API as for text_a. Calling the tokenizer
            # object directly does not return a plain token list for the
            # tokenizers this notebook uses.
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Pad up to the sequence length with the vocabulary's padding id.
            padding_length = self._max_seq_length - valid_length
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncate a token-list pair in place until it fits ``max_length``.

        One token at a time is removed from the end of whichever list is
        currently longer (``tokens_b`` on ties), so the longer sequence
        absorbs most of the truncation.
        """
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

class BERTDataset():
    """Indexable collection of BERT-transformed sentences with their labels.

    Every record of ``dataset`` is converted once, at construction time:
    the text at ``sent_idx`` goes through ``BERTSentenceTransform`` (as a
    one-element list) and the value at ``label_idx`` is cast to ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: transformed sentence tuples.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_features([record[sent_idx]]))

        # Second pass: integer labels.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the i-th transformed sentence tuple with its label appended."""
        label_part = (self.labels[i], )
        return self.sentences[i] + label_part

    def __len__(self):
        """Number of stored examples."""
        return len(self.labels)

class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Parameters
    ----------
    bert : nn.Module
        Encoder called with ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``return_dict=False``; it must return a
        2-tuple whose second element is the pooled sentence vector.
    hidden_size : int, default 768
        Size of the pooled vector fed to the linear layer.
    num_classes : int, default 2
        Number of output logits.
    dr_rate : float or None, default None
        Dropout probability applied to the pooled vector. When falsy
        (``None`` or ``0``) no dropout layer is created or applied.
    params : unused
        Accepted for signature compatibility; not read by this class.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and
        0.0 elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        The pooled encoder output is passed through dropout (only when
        ``dr_rate`` is set) and then through the linear classifier.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        # Fall back to the raw pooled vector when dropout is disabled;
        # otherwise `out` would be unbound and forward() would raise.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [52]:
# Load the model and the data to run predictions on
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
# model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
model = BERTClassifier(bertmodel, hidden_size=768, num_classes=2, dr_rate=0.5).to(device)
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load('/content/drive/MyDrive/small_model2.pt', map_location=torch.device('cpu'))) # model trained on Naver movie reviews
csv_file_path = '/content/NaverNews_2021-01-03-2021.01.31_KK.csv' # example CSV file
df_news = pd.read_csv(csv_file_path)

## Prediction function
## Applies softmax over the class logits (multi-class formulation)
def predict_sentiment(sentence):
    """Classify one sentence and return its class probabilities.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    pd.Series
        Entries '제목' (the input sentence), '부정확률' (probability of
        class 0), '긍정확률' (probability of class 1) and '긍부정'
        (1 when the class-1 probability is >= 0.5, else 0).
    """
    # BERTDataset expects (text, label) records; '0' is a placeholder label.
    dataset = [[sentence, '0']]
    test_data = BERTDataset(dataset, 0, 1, tokenizer, vocab, max_len, True, False)
    # A single sample is loaded in the main process: starting worker
    # processes for every call only adds overhead.
    test_dataloader = DataLoader(test_data, batch_size=batch_size, num_workers=0)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # dim=1 normalizes across the class logits of each sample.
            probabilities = torch.nn.functional.softmax(out, dim=1)

    # Negative-class probability
    negative_prob = probabilities[:, 0].item()
    # Positive-class probability
    positive_prob = probabilities[:, 1].item()
    # Label
    predicted_label = 1 if positive_prob >= 0.5 else 0

    return pd.Series({'제목': sentence, '부정확률': negative_prob,
                      '긍정확률': positive_prob, '긍부정': predicted_label})

# Predict and show the results
# tqdm.pandas() registers `progress_apply` on pandas objects.
tqdm.pandas()
# Only the first 50 titles are scored.
df_news_sentiment_multi = df_news['Title'][:50].progress_apply(predict_sentiment)

print(df_news_sentiment_multi)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
100%|██████████| 50/50 [00:34<00:00,  1.44it/s]

                                                 제목      부정확률      긍정확률  긍부정
0                 양경수 위원장 "민주노총도 고령화…청년 조직화에 명운 걸려"  0.024646  0.975354    1
1                     저출산·고령화 심각… 충청권 인구 1년새 4천명 감소  0.985197  0.014802    0
2                    R]경북 인구절벽 최고..고령화, 저출산, 수도권 유출  0.989214  0.010786    0
3                단양군 공동육묘장 본격 가동…농촌 고령화·노동력 부족 등 해소  0.694741  0.305259    0
4                      3기 인구정책 TF 출범…저출산·고령화 대응 총력전  0.033690  0.966310    1
5                     고용정보원 조직개편…'빅데이터·고령화' 대응 강화한다  0.051503  0.948497    1
6                   안동시, 경북 최초 ‘벼 재배농가 개량 물꼬 지원’ 추진  0.159654  0.840346    1
7                      "인천 초고령화로 복지지출 커져...재정대책 필요"  0.918600  0.081400    0
8                         ‘양식업 면허’ 개인에게도 이전·분할 허용되나  0.966788  0.033212    0
9                            고흥 딸기·오이, 강소농 부농 꿈 이룬다  0.030680  0.969320    1
10                        고령화시대 성장판 '1000조 신탁시장' 집중  0.032354  0.967646    1
11                  롯데건설, 실버주택 평면 3종 개발…"고령화 사회 대비"  0.027718  0.972282    1




In [53]:
# Display the softmax-based predictions as the cell's output.
df_news_sentiment_multi

Unnamed: 0,제목,부정확률,긍정확률,긍부정
0,"양경수 위원장 ""민주노총도 고령화…청년 조직화에 명운 걸려""",0.024646,0.975354,1
1,저출산·고령화 심각… 충청권 인구 1년새 4천명 감소,0.985197,0.014802,0
2,"R]경북 인구절벽 최고..고령화, 저출산, 수도권 유출",0.989214,0.010786,0
3,단양군 공동육묘장 본격 가동…농촌 고령화·노동력 부족 등 해소,0.694741,0.305259,0
4,3기 인구정책 TF 출범…저출산·고령화 대응 총력전,0.03369,0.96631,1
5,고용정보원 조직개편…'빅데이터·고령화' 대응 강화한다,0.051503,0.948497,1
6,"안동시, 경북 최초 ‘벼 재배농가 개량 물꼬 지원’ 추진",0.159654,0.840346,1
7,"""인천 초고령화로 복지지출 커져...재정대책 필요""",0.9186,0.0814,0
8,‘양식업 면허’ 개인에게도 이전·분할 허용되나,0.966788,0.033212,0
9,"고흥 딸기·오이, 강소농 부농 꿈 이룬다",0.03068,0.96932,1


In [54]:
## Prediction function
## Applies a logistic (sigmoid) function to each logit (binary formulation)
def predict_sentiment(sentence):
    """Classify one sentence using an independent sigmoid per class logit.

    NOTE(review): this definition shadows the earlier softmax-based
    ``predict_sentiment`` in the same notebook. Because each logit is
    squashed independently, the two returned values need not sum to 1.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    pd.Series
        Entries '제목' (the input sentence), '부정확률' (sigmoid of the
        class-0 logit), '긍정확률' (sigmoid of the class-1 logit) and
        '긍부정' (1 when the class-1 value is >= 0.5, else 0).
    """
    # BERTDataset expects (text, label) records; '0' is a placeholder label.
    dataset = [[sentence, '0']]
    test_data = BERTDataset(dataset, 0, 1, tokenizer, vocab, max_len, True, False)
    # A single sample is loaded in the main process: starting worker
    # processes for every call only adds overhead.
    test_dataloader = DataLoader(test_data, batch_size=batch_size, num_workers=0)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _ in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # Raw logits
            logits = model(token_ids, valid_length, segment_ids)

    # Extract the logit of each class
    negative_logit = logits[:, 0].item()
    positive_logit = logits[:, 1].item()

    # Convert each logit to a value in (0, 1)
    negative_prob = 1 / (1 + np.exp(-negative_logit))
    positive_prob = 1 / (1 + np.exp(-positive_logit))
    # Label (0 or 1)
    predicted_label = 1 if positive_prob >= 0.5 else 0

    return pd.Series({'제목': sentence, '부정확률': negative_prob,
                      '긍정확률': positive_prob, '긍부정': predicted_label})
# Predict and show the results
# tqdm.pandas() registers `progress_apply` on pandas objects.
tqdm.pandas()
# Only the first 50 titles are scored.
df_news_sentiment = df_news['Title'][:50].progress_apply(predict_sentiment)

print(df_news_sentiment)

100%|██████████| 50/50 [00:36<00:00,  1.37it/s]

                                                 제목      부정확률      긍정확률  긍부정
0                 양경수 위원장 "민주노총도 고령화…청년 조직화에 명운 걸려"  0.141874  0.867425    1
1                     저출산·고령화 심각… 충청권 인구 1년새 4천명 감소  0.896993  0.115700    0
2                    R]경북 인구절벽 최고..고령화, 저출산, 수도권 유출  0.911386  0.100833    0
3                단양군 공동육묘장 본격 가동…농촌 고령화·노동력 부족 등 해소  0.605434  0.402702    0
4                      3기 인구정책 TF 출범…저출산·고령화 대응 총력전  0.159733  0.845019    1
5                     고용정보원 조직개편…'빅데이터·고령화' 대응 강화한다  0.194827  0.816723    1
6                   안동시, 경북 최초 ‘벼 재배농가 개량 물꼬 지원’ 추진  0.305936  0.698806    1
7                      "인천 초고령화로 복지지출 커져...재정대책 필요"  0.780801  0.239917    0
8                         ‘양식업 면허’ 개인에게도 이전·분할 허용되나  0.849371  0.162275    0
9                            고흥 딸기·오이, 강소농 부농 꿈 이룬다  0.153042  0.850949    1
10                        고령화시대 성장판 '1000조 신탁시장' 집중  0.157850  0.848620    1
11                  롯데건설, 실버주택 평면 3종 개발…"고령화 사회 대비"  0.152726  0.863442    1




In [55]:
# Display the sigmoid-based predictions as the cell's output.
df_news_sentiment

Unnamed: 0,제목,부정확률,긍정확률,긍부정
0,"양경수 위원장 ""민주노총도 고령화…청년 조직화에 명운 걸려""",0.141874,0.867425,1
1,저출산·고령화 심각… 충청권 인구 1년새 4천명 감소,0.896993,0.1157,0
2,"R]경북 인구절벽 최고..고령화, 저출산, 수도권 유출",0.911386,0.100833,0
3,단양군 공동육묘장 본격 가동…농촌 고령화·노동력 부족 등 해소,0.605434,0.402702,0
4,3기 인구정책 TF 출범…저출산·고령화 대응 총력전,0.159733,0.845019,1
5,고용정보원 조직개편…'빅데이터·고령화' 대응 강화한다,0.194827,0.816723,1
6,"안동시, 경북 최초 ‘벼 재배농가 개량 물꼬 지원’ 추진",0.305936,0.698806,1
7,"""인천 초고령화로 복지지출 커져...재정대책 필요""",0.780801,0.239917,0
8,‘양식업 면허’ 개인에게도 이전·분할 허용되나,0.849371,0.162275,0
9,"고흥 딸기·오이, 강소농 부농 꿈 이룬다",0.153042,0.850949,1
