<a href="https://colab.research.google.com/github/wookddang/PythonApplication2/blob/master/Ko_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install SpeechRecognition
!pip install transformers
!pip install sentencepiece
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
!pip install speechrecognition kobert-transformers transformers tensorflow-addons tqdm seaborn matplotlib scikit-learn

In [None]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import warnings
from sklearn.metrics import classification_report
from transformers import TFBertModel
import tensorflow_addons as tfa
from kobert_tokenizer import KoBERTTokenizer

# Silence Python warnings and lower TensorFlow's log verbosity so the training
# output stays readable.
warnings.filterwarnings(action='ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

# KoBERT setup: only the tokenizer is needed at this point. The encoder is
# loaded further down, right before it is wrapped in SentimentClassifier;
# loading it here as well only created a second copy that was bound to
# `model`, never used, and then rebound to the classifier.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

# Training hyper-parameters.
SEQ_LEN = 128  # every tokenized comment is padded/truncated to this length
BATCH_SIZE = 16
EPOCHS = 2
LR = 1e-5  # learning rate handed to the optimizer

# Data Loading Functions
def convert_data(data_df):
    """Tokenize the ``comment`` column and collect the matching labels.

    Args:
        data_df: DataFrame with a ``comment`` column (text to tokenize) and a
            ``label`` column.

    Returns:
        A ``([input_ids, zeros], targets)`` tuple. ``input_ids`` stacks one row
        of token ids per comment, each padded/truncated to ``SEQ_LEN``;
        ``zeros`` is an all-zero array of the same shape and dtype; ``targets``
        is the ``label`` column as a NumPy array.

    Raises:
        KeyError: if either required column is missing.
    """
    indices, targets = [], []
    # Walk the two columns in lockstep instead of doing a positional
    # ``iloc`` row lookup twice per sample; ``total`` keeps the progress bar
    # sized even though ``zip`` has no length.
    rows = zip(data_df["comment"], data_df["label"])
    for comment, label in tqdm(rows, total=len(data_df)):
        tokenized = tokenizer(comment, padding='max_length', max_length=SEQ_LEN, truncation=True, return_tensors="tf")
        # The tokenizer returns a batch of one; keep only its single row.
        indices.append(tokenized['input_ids'][0].numpy())
        targets.append(label)
    indices = np.array(indices)
    return [indices, np.zeros_like(indices)], np.array(targets)

def load_data(pandas_dataframe):
    """Turn a labelled DataFrame into model inputs and targets.

    The ``comment`` column of ``pandas_dataframe`` is coerced to ``str`` in
    place (the caller's frame is modified) before the frame is handed to
    ``convert_data``.

    Returns:
        The ``(data_x, data_y)`` pair produced by ``convert_data``.
    """
    # Non-string cells (e.g. NaN or numbers) would otherwise reach the
    # tokenizer as-is.
    pandas_dataframe["comment"] = pandas_dataframe["comment"].astype(str)
    inputs, labels = convert_data(pandas_dataframe)
    return inputs, labels

# Load the data
#!git clone https://github.com/e9t/nsmc.git || true
# NOTE(review): read_table splits on tabs by default, yet the files carry a
# .csv extension -- confirm they really are tab-separated.
train = pd.read_table("/content/train_set2.csv", encoding='cp949')
test = pd.read_table("/content/test_set2.csv", encoding='cp949')

# Use at most 500 samples per split. Capping at the frame length keeps
# `sample` from raising ValueError when a file holds fewer than 500 rows.
train = train.sample(n=min(500, len(train)), random_state=42)
test = test.sample(n=min(500, len(test)), random_state=42)

train_x, train_y = load_data(train)
test_x, test_y = load_data(test)

# Define the model architecture
class SentimentClassifier(tf.keras.Model):
    """Binary classifier: a BERT encoder followed by a single sigmoid unit.

    Args:
        bert: Encoder that is called as ``bert(input_ids, attention_mask=...)``
            and whose output exposes ``last_hidden_state``.
        pad_token_id: Token id treated as padding when the attention mask is
            built. When ``None`` (the default) it is read from
            ``bert.config.pad_token_id``; if the encoder has no such value,
            0 is used.
    """

    def __init__(self, bert, pad_token_id=None):
        super().__init__()
        self.bert = bert
        # The mask used to assume that padding is id 0.
        # NOTE(review): KoBERT's published vocab appears to map [UNK] to 0 and
        # [PAD] to 1, in which case the old mask hid unknown tokens and let
        # the encoder attend to padding -- confirm against
        # tokenizer.pad_token_id.
        if pad_token_id is None:
            pad_token_id = getattr(getattr(bert, "config", None), "pad_token_id", None)
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id
        self.classifier = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, **kwargs):
        """Return one sigmoid score per input sequence.

        Only ``inputs[0]`` (the token ids) is read; any further elements of
        ``inputs`` are ignored.
        """
        input_ids = inputs[0]
        # 1 for real tokens, 0 for padding positions.
        attention_mask = tf.cast(tf.math.not_equal(input_ids, self.pad_token_id), tf.int32)
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Use the hidden state of the first position as the sequence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        cls_output = self.classifier(cls_output)
        return cls_output

# Load the pretrained KoBERT encoder (converted from its PyTorch checkpoint)
# and wrap it in the classification head.
kobert_model = TFBertModel.from_pretrained('skt/kobert-base-v1', from_pt=True)
model = SentimentClassifier(kobert_model)
optimizer = tfa.optimizers.AdamW(learning_rate=LR, weight_decay=1e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the training split; the test split doubles as validation data.
history = model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_data=(test_x, test_y),
    shuffle=True,
)

# Persist the trained weights, then read them straight back from disk.
model.save_weights("kobert_model.h5")
model.load_weights("kobert_model.h5")

# Predictions
def predict_load_data(x):
    """Build model inputs from a DataFrame, discarding the labels.

    Delegates to ``load_data`` rather than repeating its steps, so ``x`` is
    prepared exactly as the training data is: its ``comment`` column is
    coerced to ``str`` in place, and a ``label`` column must still be present
    because ``convert_data`` reads it.

    Returns:
        Only the input part (``data_x``) of what ``load_data`` returns.
    """
    data_x, _ = load_data(x)
    return data_x

# Encode the test frame for inference.
test_set = predict_load_data(test)

# Score every test sample with the trained classifier.
preds = model.predict(test_set)

# Check the F1 score: scores are rounded to the nearest integer before being
# compared with the true labels.
y_true = test['label']
print(classification_report(y_true, preds.round(0)))

def sentence_convert_data(data):
    """Tokenize one sentence into the input format the classifier expects.

    Returns:
        ``[token_ids, zeros]`` where ``token_ids`` is a one-row array holding
        the first tokenized sequence, padded/truncated to ``SEQ_LEN``, and
        ``zeros`` is an all-zero array of the same shape and dtype.
    """
    encoded = tokenizer(data, padding='max_length', max_length=SEQ_LEN, truncation=True, return_tensors="tf")
    first_row = encoded['input_ids'][0].numpy()
    # Wrap the single sequence in a list to get a leading batch axis.
    token_ids = np.array([first_row])
    return [token_ids, np.zeros_like(token_ids)]

def movie_evaluation_predict(sentence):
    """Classify one sentence with the trained model and print the verdict.

    The model's score is rounded to 0 or 1. A 0 prints the "is voice
    phishing" message and a 1 prints the "is not voice phishing" message;
    any other value prints nothing.
    NOTE(review): this presumes label 0 marks voice phishing in the training
    data -- verify against the dataset.

    Args:
        sentence: Text to classify.
    """
    data_x = sentence_convert_data(sentence)
    predict = model.predict(data_x)
    # Flatten the (1, 1) prediction and take it as a plain Python float.
    predict_answer = np.round(np.ravel(predict), 0).item()
    if predict_answer == 0:
        print("보이스피싱입니다.")
    elif predict_answer == 1:
        # The closing quote and parenthesis were missing here, which made the
        # whole cell a SyntaxError.
        print("보이스피싱이 아닙니다.")
# Try the classifier on a single example sentence.
movie_evaluation_predict("서울중앙지검으로 송금하세요 ")