In [1]:
# Fetch the NSMC dataset only when it is not already present, so that
# re-running this cell on a live runtime does not fail on an existing folder.
!test -d nsmc || git clone https://github.com/e9t/nsmc.git
# Pin the package versions this notebook was last executed with, and use %pip
# so the packages are installed into the environment of the running kernel.
%pip install tensorflow_addons==0.21.0
%pip install transformers==4.34.0

Cloning into 'nsmc'...
remote: Enumerating objects: 14763, done.[K
remote: Counting objects: 100% (14762/14762), done.[K
remote: Compressing objects: 100% (13012/13012), done.[K
remote: Total 14763 (delta 1748), reused 14762 (delta 1748), pack-reused 1[K
Receiving objects: 100% (14763/14763), 56.19 MiB | 17.50 MiB/s, done.
Resolving deltas: 100% (1748/1748), done.
Updating files: 100% (14737/14737), done.
Collecting tensorflow_addons
  Downloading tensorflow_addons-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (612 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m612.1/612.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.21.0 typeguard-2.13.3
Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K  

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import AutoModel, TFBertModel, TFBertForSequenceClassification

In [3]:
import os
from google.colab import drive
drive.mount('/content/gdrive/')

# Switch for attaching this Colab runtime to its TPU worker.
TPU = True
if TPU:
  # Colab publishes the TPU worker address in the COLAB_TPU_ADDR variable;
  # indexing os.environ raises KeyError when the variable is not set.
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      tpu='grpc://' + os.environ['COLAB_TPU_ADDR']
  )
  tf.config.experimental_connect_to_cluster(resolver)
  tf.tpu.experimental.initialize_tpu_system(resolver)
  # NOTE(review): no distribution strategy is built from `resolver` in the
  # visible cells - confirm whether the model is meant to be created under one.

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# The ratings files are read from the cloned https://github.com/e9t/nsmc.git repository.
SAMPLE_SIZE = 5000   # number of rows drawn from each split
SAMPLE_SEED = 42     # fixed seed so every run draws the same subsample

# Drop rows with missing values, draw a reproducible subsample and renumber
# the rows 0..n-1 (the old index is discarded instead of kept as a column).
train = (
    pd.read_table("nsmc/" + "ratings_train.txt")
    .dropna()
    .sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
test = (
    pd.read_table("nsmc/" + "ratings_test.txt")
    .dropna()
    .sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

def preprocessing_sentence_to_BERTinput(df, tokenizer, colname_data, colname_target=None, seq_len=128,
                                        return_type='tensor'):
    """Encode a text column of ``df`` into fixed-length BERT inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sentences and, optionally, the labels. Rows are read
        by position, so the index does not have to be ``0..n-1``.
    tokenizer
        Object exposing ``encode_plus`` whose result provides the keys
        ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    colname_data : str
        Name of the column containing the sentences.
    colname_target : str, optional
        Name of the label column. When ``None`` no labels are collected.
    seq_len : int
        Length every sequence is padded / truncated to.
    return_type : str
        ``'tensor'`` converts the three input arrays to ``tf.int32`` tensors;
        any other value leaves them as numpy arrays.

    Returns
    -------
    tuple
        ``([tokens, masks, segments], targets)``. ``targets`` is a numpy array
        of the label column, or an empty list when ``colname_target`` is None.
    """
    # Pull the values out positionally: label-based ``df[col][i]`` raises
    # KeyError (or reads the wrong row) when the index is not 0..n-1.
    sentences = df[colname_data].tolist()

    tokens, masks, segments = [], [], []
    for sentence in tqdm(sentences):
        # Encode one sentence; ``padding='max_length'`` replaces the deprecated
        # ``pad_to_max_length=True`` and pads every sequence to ``seq_len``.
        encoded = tokenizer.encode_plus(sentence, max_length=seq_len,
                                        padding='max_length', truncation=True,
                                        return_attention_mask=True,
                                        return_token_type_ids=True,
                                        add_special_tokens=True)

        # Collect the three model inputs.
        tokens.append(encoded['input_ids'])
        masks.append(encoded['attention_mask'])
        segments.append(encoded['token_type_ids'])

    # Convert to arrays.
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)

    # Labels: a fresh array when a target column is given, otherwise the
    # empty list callers have always received.
    if colname_target is not None:
        targets = df[colname_target].to_numpy(copy=True)
    else:
        targets = []

    # Convert to tensors.
    if return_type == 'tensor':
        tokens = tf.convert_to_tensor(tokens, dtype=tf.int32)
        masks = tf.convert_to_tensor(masks, dtype=tf.int32)
        segments = tf.convert_to_tensor(segments, dtype=tf.int32)

    return [tokens, masks, segments], targets

import tensorflow_addons as tfa
from transformers import pipeline, AutoTokenizer, BertTokenizer, BertTokenizerFast
from transformers import AutoModel, AutoModelForTokenClassification, TFBertModel, TFBertForSequenceClassification

# Pretrained checkpoint to fine-tune (alternative: 'bert-base-multilingual-cased').
MODEL_NAME = 'monologg/kobert'
# OPTIMIZER = tfa.optimizers.RectifiedAdam(learning_rate=1.0e-5, weight_decay=0.0025, warmup_proportion=0.05)
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated in Keras optimizers and is not guaranteed to be applied, which
# would leave the optimizer at its default learning rate.
OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=1.0e-5)
NUM_LABELS = 2   # number of classes predicted by the classification head
SEQ_LEN = 64     # every review is padded / truncated to this many tokens

# Tokenize both splits into [input_ids, attention_mask, token_type_ids] plus labels.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
X_train, Y_train = preprocessing_sentence_to_BERTinput(train, tokenizer=tokenizer,
                                                       colname_target='label', colname_data='document', seq_len=SEQ_LEN)
X_test, Y_test = preprocessing_sentence_to_BERTinput(test, tokenizer=tokenizer,
                                                       colname_target='label', colname_data='document', seq_len=SEQ_LEN)

def modeling_BERTsentiment(model_name, optimizer, num_labels=2, seq_len=128):
    """Load a pretrained BERT sequence classifier and compile it for training.

    Parameters
    ----------
    model_name : str
        Checkpoint name or path handed to ``from_pretrained``.
    optimizer
        Optimizer passed to ``compile``.
    num_labels : int
        Number of output classes of the classification head.
    seq_len : int
        Accepted for interface compatibility; not used in the function body.

    Returns
    -------
    The compiled ``TFBertForSequenceClassification`` model.
    """
    # Load the pretrained weights with a classification head of the requested size.
    classifier = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # The loss is built with from_logits=True and integer (sparse) class labels.
    classifier.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=tf.keras.metrics.SparseCategoricalAccuracy('accuracy'),
    )
    return classifier

# Disabled alternative: a bare TFBertModel with a single sigmoid output unit.
# NOTE(review): `tokens`, `masks` and `segments` are not defined inside this
# function, so the snippet is incomplete as written.
# def modeling_BERTsentiment(model_name, optimizer, num_labels=2, seq_len=128):
#     # Load the model
#     model = TFBertModel.from_pretrained(model_name, num_labels=num_labels, output_hidden_states=True)
#     outputs = model([tokens, masks, segments])[1]

#     # Build the model
#     layer = tf.keras.layers.Dense(1, activation='sigmoid',
#                                   kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(outputs)
#     model_sentiment = tf.keras.Model([tokens, masks, segments], layer)
#     model_sentiment.compile(optimizer=optimizer, loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

#     return model_sentiment


# Build the compiled classifier from the configuration constants.
model = modeling_BERTsentiment(
    model_name=MODEL_NAME,
    optimizer=OPTIMIZER,
    num_labels=NUM_LABELS,
    seq_len=SEQ_LEN,
)
# Fine-tune on the training sample and evaluate on the test sample after each
# epoch; the returned History object is the cell's displayed result.
model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
    batch_size=100,
    shuffle=True,
)


100%|██████████| 5000/5000 [00:03<00:00, 1615.69it/s]
100%|██████████| 5000/5000 [00:02<00:00, 1761.60it/s]
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7b5ef435f190>