In [1]:
# https://dacon.io/en/codeshare/1803

In [2]:
!git clone https://github.com/e9t/nsmc.git
!pip install tensorflow_addons
!pip install torch>=1.8.1
!pip install mxnet
!pip install gluonnlp==0.8.0
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'
!pip install transformers

fatal: destination path 'nsmc' already exists and is not an empty directory.
Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-q3qommtj/kobert-tokenizer_f391321aa37449d7b4b9bb0a511e89d1
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-q3qommtj/kobert-tokenizer_f391321aa37449d7b4b9bb0a511e89d1
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
import os
from google.colab import drive
# Mount Google Drive; the model checkpoint and the prediction data live there.
drive.mount('/content/gdrive/')

# Set TPU to True to connect to and initialize the Colab TPU runtime.
TPU = False
if TPU:
  # TensorFlow is imported here because the notebook-wide import cell runs
  # after this one; without it, enabling TPU raises NameError on a fresh kernel.
  import tensorflow as tf
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import matplotlib.pyplot as plt
import tensorflow as tf
from kobert_tokenizer import KoBERTTokenizer
import gluonnlp as nlp
from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import AutoModel, BertModel, TFBertModel, TFBertForSequenceClassification
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Hyperparameters
# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on a CPU-only runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
max_len = 64            # maximum token sequence length passed to BERTDataset
batch_size = 32         # DataLoader batch size
epoch = 100             # number of training epochs
learning_rate =  5e-5   # optimizer learning rate
warmup_ratio = 0.1      # fraction of total steps used for LR warm-up
max_grad_norm = 1       # gradient-clipping threshold
log_interval = 200      # batches between training log lines
# Both paths are built relative to the current working directory. Drive is
# mounted at '/content/gdrive/', so they resolve correctly only when the
# working directory is '/content'.
modelsave_location = os.path.join(os.getcwd(), 'gdrive', 'MyDrive', 'Research',
                                  'Colab', 'Model', 'modeling_KoBERT_20231018.pt')
predfile_location = os.path.join(os.getcwd(), 'gdrive', 'MyDrive', 'Research',
                                  'Colab', 'Data', 'df_news.csv')

class BERTSentenceTransform:
    r"""BERT style data transformation.

    Converts one sentence (or one sentence pair) into the three arrays a BERT
    model consumes: token ids, valid length and segment (token type) ids.

    Parameters
    ----------
    tokenizer : BERTTokenizer.
        Tokenizer for the sentences. Must provide ``tokenize(text)`` and
        ``convert_tokens_to_ids(tokens)``.
    max_seq_length : int.
        Maximum sequence length of the sentences, special tokens included.
    vocab : vocabulary object.
        Must expose ``cls_token``, ``sep_token`` and ``padding_token``
        attributes and support ``vocab[token]`` lookup (used for the padding id).
    pad : bool, default True
        Whether to pad the sentences to maximum length.
    pair : bool, default True
        Whether to transform sentences or sentence pairs.
    """

    def __init__(self, tokenizer, max_seq_length,vocab, pad=True, pair=True):
        self._tokenizer = tokenizer
        self._max_seq_length = max_seq_length
        self._pad = pad
        self._pair = pair
        self._vocab = vocab

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Trim a token pair in place until their combined length fits.

        One token at a time is removed from the end of whichever list is
        currently longer (``tokens_b`` on ties), so both sequences keep a
        comparable share of the budget.

        Parameters
        ----------
        tokens_a, tokens_b : list of str
            Token lists, modified in place.
        max_length : int
            Maximum allowed ``len(tokens_a) + len(tokens_b)``. Negative values
            are treated as 0.
        """
        limit = max(max_length, 0)
        while len(tokens_a) + len(tokens_b) > limit:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def __call__(self, line):
        """Perform transformation for sequence pairs or single sequences.

        The transformation is processed in the following steps:
        - tokenize the input sequences
        - insert [CLS], [SEP] as necessary
        - generate type ids to indicate whether a token belongs to the first
        sequence or the second sequence.
        - generate valid length

        For sequence pairs, the input is a tuple of 2 strings:
        text_a, text_b.

        Inputs:
            text_a: 'is this jacksonville ?'
            text_b: 'no it is not'
        Tokenization:
            text_a: 'is this jack ##son ##ville ?'
            text_b: 'no it is not .'
        Processed:
            tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
            type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
            valid_length: 14

        For single sequences, the input is a tuple of single string:
        text_a.

        Inputs:
            text_a: 'the dog is hairy .'
        Tokenization:
            text_a: 'the dog is hairy .'
        Processed:
            text_a: '[CLS] the dog is hairy . [SEP]'
            type_ids: 0     0   0   0  0     0 0
            valid_length: 7

        Parameters
        ----------
        line: tuple of str
            Input strings. For sequence pairs, the input is a tuple of 2 strings:
            (text_a, text_b). For single sequences, the input is a tuple of single
            string: (text_a,).

        Returns
        -------
        np.array: input token ids in 'int32', 1-D; length ``max_seq_length``
            when ``pad`` is True, otherwise the valid length
        np.array: valid length in 'int32', 0-D scalar array
        np.array: input token type ids in 'int32', same length as the token ids

        """

        text_a = line[0]
        if self._pair:
            assert len(line) == 2
            text_b = line[1]

        tokens_a = self._tokenizer.tokenize(text_a)
        tokens_b = None

        if self._pair:
            # Tokenize into a list of tokens, exactly like text_a. Calling the
            # tokenizer object directly would return an encoding, not tokens.
            tokens_b = self._tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > self._max_seq_length - 2:
                tokens_a = tokens_a[0:(self._max_seq_length - 2)]

        # The embedding vectors for `type=0` and `type=1` were learned during
        # pre-training and are added to the wordpiece embedding vector
        # (and position vector). This is not *strictly* necessary since
        # the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.

        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        vocab = self._vocab
        tokens = []
        tokens.append(vocab.cls_token)
        tokens.extend(tokens_a)
        tokens.append(vocab.sep_token)
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append(vocab.sep_token)
            # Everything appended after the first [SEP] belongs to segment 1.
            segment_ids.extend([1] * (len(tokens) - len(segment_ids)))

        input_ids = self._tokenizer.convert_tokens_to_ids(tokens)

        # The valid length of sentences. Only real tokens are attended to.
        valid_length = len(input_ids)

        if self._pad:
            # Pad up to the sequence length with the vocabulary's padding id;
            # padded positions get segment id 0.
            padding_length = self._max_seq_length - valid_length
            input_ids.extend([vocab[vocab.padding_token]] * padding_length)
            segment_ids.extend([0] * padding_length)

        return np.array(input_ids, dtype='int32'), np.array(valid_length, dtype='int32'),\
            np.array(segment_ids, dtype='int32')

class BERTDataset():
    """Map-style dataset of pre-tokenized sentences with their labels.

    Every row of ``dataset`` is transformed once, at construction time, by a
    single-sentence ``BERTSentenceTransform`` call; the label column is
    converted to ``np.int32``.

    Parameters
    ----------
    dataset : iterable of indexable rows
        Iterated twice (sentences first, then labels).
    sent_idx : index of the sentence inside each row.
    label_idx : index of the label inside each row.
    bert_tokenizer, vocab, max_len, pad, pair :
        Forwarded to ``BERTSentenceTransform``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: tokenize/encode every sentence.
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: collect the labels as 32-bit integers.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the encoded sentence tuple for item ``i`` with its label appended."""
        features = self.sentences[i]
        target = (self.labels[i], )
        return features + target

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Parameters
    ----------
    bert : nn.Module
        Encoder called as ``bert(input_ids=..., token_type_ids=...,
        attention_mask=..., return_dict=False)``; its second return value is
        used as the pooled sentence representation.
    hidden_size : int, default 768
        Size of the pooled representation (input width of the classifier).
    num_classes : int, default 5
        Number of output logits.
    dr_rate : float or None, default None
        Dropout probability applied to the pooled output. When falsy, no
        dropout layer is created and the pooled output is classified directly.
    params : unused
        Accepted for interface compatibility; not read.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=5,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with 1 for the first ``valid_length[i]`` positions of row i.

        The mask has the same shape and device as ``token_ids``.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return class logits for a batch.

        Parameters
        ----------
        token_ids : tensor of token ids, one row per example.
        valid_length : per-example count of real (non-padding) tokens.
        segment_ids : tensor of token type ids, same shape as ``token_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device),return_dict=False)
        # Without dropout configured, classify the pooled output directly.
        # Previously `out` was left unassigned in that case and forward() raised.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)



In [5]:
# # 데이터처리
# train = pd.read_table("nsmc/"+"ratings_train.txt")
# train = train.dropna().sample(5000).reset_index().iloc[:,1:].iloc[:,1:].values.tolist()
# test = pd.read_table("nsmc/"+"ratings_test.txt")
# test = test.dropna().sample(5000).reset_index().iloc[:,1:].iloc[:,1:].values.tolist()

# tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)

# data_train = BERTDataset(train, 0, 1, tokenizer, vocab, max_len, True, False)
# data_test = BERTDataset(test, 0, 1, tokenizer, vocab, max_len, True, False)
# train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=2)
# test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=2)

# # 모델링세팅
# model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
# loss_fn = nn.CrossEntropyLoss()
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# t_total = len(train_dataloader) * epoch
# warmup_step = int(t_total * warmup_ratio)
# scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# def calc_accuracy(X,Y):
#     max_vals, max_indices = torch.max(X, 1)
#     train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
#     return train_acc

# # 학습
# train_history = []
# test_history = []
# loss_history = []

# for e in range(epoch):
#     train_acc = 0.0
#     test_acc = 0.0
#     model.train()
#     for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
#         optimizer.zero_grad()
#         token_ids = token_ids.long().to(device)
#         segment_ids = segment_ids.long().to(device)
#         valid_length= valid_length
#         label = label.long().to(device)
#         out = model(token_ids, valid_length, segment_ids)
#         # print(label.shape, out.shape)
#         loss = loss_fn(out, label)
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#         optimizer.step()
#         scheduler.step()  # Update learning rate schedule
#         train_acc += calc_accuracy(out, label)
#         if batch_id % log_interval == 0:
#             print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
#             train_history.append(train_acc / (batch_id+1))
#             loss_history.append(loss.data.cpu().numpy())
#     print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
#     # train_history.append(train_acc / (batch_id+1))
#     model.eval() # 모델을 평가 모드로 설정
#     for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
#         token_ids = token_ids.long().to(device)
#         segment_ids = segment_ids.long().to(device)
#         valid_length= valid_length
#         label = label.long().to(device)
#         out = model(token_ids, valid_length, segment_ids) # 모델에 입력 데이터 전달하여 출력 얻기
#         test_acc += calc_accuracy(out, label) # 정확도 계산하여 누적
#     print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
#     test_history.append(test_acc / (batch_id+1)) # 테스트 정확도의 추이를 기록하고 후에 시각화하거나 분석하는 데 사용

# # 모델저장
# torch.save(model.state_dict(), modelsave_location)

# # 시각화
# epochs = range(1, epoch + 1)
# display(train_history, test_history)
# ## training and test accuracy
# plt.figure(figsize=(12, 6))
# plt.subplot(1, 2, 1)
# plt.plot(epochs, train_history, 'bo-', label='Training Accuracy')
# plt.plot(epochs, test_history, 'ro-', label='Test Accuracy')
# plt.title('Training and Test Accuracy')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend()
# ## training loss
# plt.subplot(1, 2, 2)
# plt.plot(epochs, loss_history, 'go-')
# plt.title('Loss-Epochs')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.tight_layout()
# plt.show()


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/157 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.6132630109786987 train acc 0.25
epoch 1 train acc 0.464171974522293


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/157 [00:00<?, ?it/s]

epoch 1 test acc 0.6194267515923567


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/157 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 0.901033878326416 train acc 0.59375


KeyboardInterrupt: ignored

In [None]:
# Load the tokenizer, the fine-tuned model and the data to predict on
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)
# map_location remaps the checkpoint's tensors onto `device`, so weights saved
# from a GPU session can also be restored on a CPU-only runtime.
model.load_state_dict(torch.load(modelsave_location, map_location=device))
df_news = pd.read_csv(predfile_location)
## Prediction function
def predict_sentiment(sentence):
    """Classify a single sentence with the loaded model.

    Parameters
    ----------
    sentence : str
        Text to classify.

    Returns
    -------
    int
        1-based index of the highest-scoring class (``argmax + 1``).
    """
    # BERTDataset expects (text, label) rows; '0' is a placeholder label that
    # is never used for prediction.
    dataset = [[sentence, '0']]
    test_data = BERTDataset(dataset, 0, 1, tokenizer, vocab, max_len, True, False)
    # num_workers=0: this loader holds exactly one item, and starting worker
    # processes for every sentence costs far more than the inference itself.
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, num_workers=0)

    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        token_ids, valid_length, segment_ids, _ = next(iter(test_dataloader))
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        predicted_index = out.argmax() + 1

    return predicted_index.item()
## Run prediction
# Registers `progress_apply` on pandas objects so each chunk shows a progress bar.
tqdm.pandas()
# Earlier run over the first 100000 rows, kept for reference:
# df_news_sentiment = df_news['제목'][:100000].progress_apply(predict_sentiment)
# predsave_location = os.path.join(os.getcwd(), 'gdrive', 'MyDrive', 'Research',
#                                   'Colab', 'Data', 'df_news_sentiment1.csv')
# df_news_sentiment.to_csv(predsave_location)

# Score rows 200000-299999 of the '제목' column in chunks of 10000 rows and
# write each chunk to its own CSV, named after the chunk's starting row.
for i in range(200000, 300000, 10000):
    titles_chunk = df_news['제목'][i:i+10000]
    df_news_sentiment = titles_chunk.progress_apply(predict_sentiment)
    chunk_filename = 'df_news_sentiment' + str(i) + '.csv'
    predsave_location = os.path.join(
        os.getcwd(), 'gdrive', 'MyDrive', 'Research', 'Colab', 'Data',
        chunk_filename,
    )
    df_news_sentiment.to_csv(predsave_location)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.
100%|██████████| 10000/10000 [51:00<00:00,  3.27it/s]
 68%|██████▊   | 6804/10000 [44:57<24:31,  2.17it/s]

In [None]:
# # !git clone https://github.com/e9t/nsmc.git
# train = pd.read_table("nsmc/"+"ratings_train.txt")
# train = train.dropna().sample(5000).reset_index().iloc[:,1:]
# test = pd.read_table("nsmc/"+"ratings_test.txt")
# test = test.dropna().sample(5000).reset_index().iloc[:,1:]

# data_train = BERTDataset(train, 0, 1, tokenizer, vocab, max_len, True, False)
# data_test = BERTDataset(test, 0, 1, tokenizer, vocab, max_len, True, False)


# def preprocessing_sentence_to_BERTinput(df, tokenizer, colname_data, colname_target=None, seq_len=128,
#                                         return_type='tensor'):
#     tokens, masks, segments, targets = [], [], [], []
#     for i in tqdm(range(len(df))):
#         # 변환
#         token = tokenizer.encode_plus(df[colname_data][i], max_length=seq_len,
#                                       pad_to_max_length=True, truncation=True,
#                                       return_attention_mask=True,
#                                       add_special_tokens=True)

#         # 정리
#         tokens.append(token['input_ids'])
#         masks.append(token['attention_mask'])
#         segments.append(token['token_type_ids'])
#         if colname_target != None:
#             targets.append(df[colname_target][i])

#     # array 변환
#     tokens = np.array(tokens)
#     masks = np.array(masks)
#     segments = np.array(segments)
#     if colname_target != None:
#         targets = np.array(targets)

#     # tensor 변환
#     if return_type == 'tensor':
#         tokens = tf.convert_to_tensor(tokens, dtype=tf.int32)
#         masks = tf.convert_to_tensor(masks, dtype=tf.int32)
#         segments = tf.convert_to_tensor(segments, dtype=tf.int32)

#     return [tokens, masks, segments], targets

# import tensorflow_addons as tfa
# from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
# from transformers import AutoModel, AutoModelForTokenClassification, TFBertModel, TFBertForSequenceClassification

# MODEL_NAME = 'monologg/kobert'    # 'bert-base-multilingual-cased', 'klue/roberta-base'
# # OPTIMIZER = tfa.optimizers.RectifiedAdam(lr=1.0e-5, weight_decay=0.0025, warmup_proportion=0.05)
# OPTIMIZER = tf.keras.optimizers.Adam(lr=1.0e-5)
# NUM_LABELS = 2
# SEQ_LEN = 64

# tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# X_train, Y_train = preprocessing_sentence_to_BERTinput(train, tokenizer=tokenizer,
#                                                        colname_target='label', colname_data='document', seq_len=SEQ_LEN)
# X_test, Y_test = preprocessing_sentence_to_BERTinput(test, tokenizer=tokenizer,
#                                                        colname_target='label', colname_data='document', seq_len=SEQ_LEN)

# def modeling_BERTsentiment(model_name, optimizer, num_labels=2, seq_len=128):
#     # 모델 로딩
#     model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
#     loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#     metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
#     model.compile(optimizer=optimizer, loss=loss, metrics=metric)

#     return model

# # def modeling_BERTsentiment(model_name, optimizer, num_labels=2, seq_len=128):
# #     # 모델 로딩
# #     model = TFBertModel.from_pretrained(model_name, num_labels=num_labels, output_hidden_states=True)
# #     outputs = model([tokens, masks, segments])[1]

# #     # 모델 구성
# #     layer = tf.keras.layers.Dense(1, activation='sigmoid',
# #                                   kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(outputs)
# #     model_sentiment = tf.keras.Model([tokens, masks, segments], layer)
# #     model_sentiment.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

# #     return model_sentiment


# model = modeling_BERTsentiment(model_name=MODEL_NAME, optimizer=OPTIMIZER, num_labels=NUM_LABELS, seq_len=SEQ_LEN)
# model.fit(X_train, Y_train, epochs=10, shuffle=True, batch_size=100, validation_data=(X_test, Y_test))
