In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from tensorflow.keras.models import load_model
from transformers import TFRobertaModel
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.metrics import mean_squared_error, mean_absolute_error

tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'


tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

# 데이터 로드
file_path = '/content/drive/MyDrive/val_noun_df.csv'
model_path_pre = '/content/drive/MyDrive/불용어제거이후모델/rm_stopwrd_base_256_'
def remove_nouns(text, nouns):
    for noun in nouns:
        text = text.replace(noun, '')
    return text

for index, row in df.iterrows():
  nouns_list = row['NOUNS'].split('_SEP_')
  df.at[index, 'paragraphs'] = remove_nouns(row['paragraphs'], nouns_list)

In [9]:
test_target = ['대안제시', '글짓기', '찬성반대', '주장', '설명글']

for target in test_target:

  model_path = model_path_pre+f'{target}.h5'  # 모델 경로 지정
  model = load_model(model_path, custom_objects={'TFRobertaModel': TFRobertaModel})
  df = pd.read_csv(file_path)
  df = df[df['class']==target]

  paragraphs = df['paragraphs'].values
  scores = df['score'].values
  max_length = 512
  X_ids = np.zeros((len(paragraphs), max_length))
  X_mask = np.zeros((len(paragraphs), max_length))

  for i, paragraph in enumerate(paragraphs):
      tokens = tokenizer.encode_plus(paragraph, max_length=max_length, truncation=True,
                                    padding='max_length', add_special_tokens=True,
                                    return_tensors='tf')
      X_ids[i, :] = tokens['input_ids']
      X_mask[i, :] = tokens['attention_mask']

  predictions = model.predict([X_ids, X_mask])


  mse = mean_squared_error(scores, predictions)
  mae = mean_absolute_error(scores, predictions)
  rmse = np.sqrt(mse)

  print('불용어 제거 후 모델 성능 평가')
  print(f'{target}의 성능')
  print(f"MSE: {mse}, MAE: {mae}, RMSE: {rmse}")
  print(f'======================================')

불용어 제거 후 모델 성능 평가
대안제시의 성능
MSE: 40.975307834544445, MAE: 4.448691100115203, RMSE: 6.401195812857504
불용어 제거 후 모델 성능 평가
글짓기의 성능
MSE: 45.589908160802786, MAE: 3.9905332569393126, RMSE: 6.752029928903069
불용어 제거 후 모델 성능 평가
찬성반대의 성능
MSE: 47.44617187661635, MAE: 4.468396201491044, RMSE: 6.888118166568889
불용어 제거 후 모델 성능 평가
주장의 성능
MSE: 58.31462836213272, MAE: 5.303009699930387, RMSE: 7.636401532275049
불용어 제거 후 모델 성능 평가
설명글의 성능
MSE: 17.725752766913885, MAE: 3.18753626543037, RMSE: 4.210196286031553
