In [1]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from tensorflow.keras.models import load_model
from transformers import TFRobertaModel
import pandas as pd
import numpy as np
from transformers import BertTokenizer
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Evaluate a saved regression model on the validation split of the '대안제시' class.

# Location of the saved Keras model (.h5) and of the saved tokenizer files
model_path = '/content/drive/MyDrive/base_256_대안제시.h5'
tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'

# custom_objects lets Keras resolve the TFRobertaModel layer referenced by the saved file
model = load_model(model_path, custom_objects={'TFRobertaModel': TFRobertaModel})
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)

# Load the validation data and keep only the rows of the target class
file_path = '/content/drive/MyDrive/val_sorted_paragraphs.csv'
df = pd.read_csv(file_path)
df = df[df['class']=='대안제시']
paragraphs = df['paragraphs'].values
scores = df['score'].values

# Tokenise all paragraphs in one batched call. Every sequence is truncated/padded to
# max_length, so the result is a dense (n_samples, max_length) array.
max_length = 512
encoded = tokenizer(list(paragraphs), max_length=max_length, truncation=True,
                    padding='max_length', add_special_tokens=True,
                    return_tensors='np')

# Token ids and masks are integers; keep them as int32 instead of float64 buffers.
# NOTE(review): assumes the saved model declares int32 inputs like the training cells — confirm.
X_ids = encoded['input_ids'].astype(np.int32)
X_mask = encoded['attention_mask'].astype(np.int32)

# Flatten so predictions are 1-D like `scores`.
# NOTE(review): assumes one predicted score per paragraph — confirm against the saved model.
predictions = model.predict([X_ids, X_mask]).ravel()

mse = mean_squared_error(scores, predictions)
mae = mean_absolute_error(scores, predictions)
rmse = np.sqrt(mse)

print(f'{model_path.split("/")[-1]}의 성능')
print(f"MSE: {mse}, MAE: {mae}, RMSE: {rmse}")

base_256_대안제시.h5의 성능
MSE: 40.12066068142136, MAE: 4.410586042993095, RMSE: 6.334087201911682


In [None]:
from transformers import TFRobertaModel, BertTokenizer

# Hub checkpoint that supplies both the tokenizer and the encoder weights
CHECKPOINT = "klue/roberta-base"

# Drive directories the artifacts are written to
tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'
model_save_path = '/content/drive/MyDrive/model/'

# Load the tokenizer and the RoBERTa encoder from the checkpoint
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
roberta = TFRobertaModel.from_pretrained(CHECKPOINT)

# Save the tokenizer and the model so later cells can load them from Drive
tokenizer.save_pretrained(tokenizer_save_path)
roberta.save_pretrained(model_save_path)



tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'roberta.embeddings.position_ids', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
# from transformers import TFRobertaModel, BertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFRobertaModel, BertTokenizer

# Read the CSV and keep only the rows whose class is '대안제시'
file_path = '/content/drive/MyDrive/df_withPrompt_SEP.csv'
df = pd.read_csv(file_path)
is_target_class = df['class'] == '대안제시'
df = df[is_target_class]

# Drive directories holding the saved tokenizer and the saved RoBERTa weights
tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'
model_save_path = '/content/drive/MyDrive/model/'

# Early-stopping callback: monitors val_loss with a patience of 2
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

def remove_nouns(text, nouns):
    """Return `text` with every occurrence of each string in `nouns` deleted.

    The strings are removed one after another, in the order they appear in
    `nouns`, using plain substring replacement.
    """
    stripped = text
    for target in nouns:
        stripped = stripped.replace(target, '')
    return stripped
# Strip each row's own nouns (stored in NOUNS, joined by '_SEP_') from its paragraph.
# Rows are paired positionally with zip: after the class filter the index labels may
# have gaps, so looking rows up with df.loc[i] for i in range(len(df)) can raise
# KeyError or touch the wrong rows.
df = df.assign(paragraphs=[
    remove_nouns(paragraph, nouns.split('_SEP_'))
    for paragraph, nouns in zip(df['paragraphs'], df['NOUNS'])
])

paragraphs = df['paragraphs'].values
scores = df['score'].values

# Load the tokenizer and the RoBERTa encoder from their saved Drive directories
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)
roberta = TFRobertaModel.from_pretrained(model_save_path)

# Tokenise all paragraphs in one batched call. Every sequence is truncated/padded to
# max_length, so the result is a dense (n_samples, max_length) array.
max_length = 512
encoded = tokenizer(list(paragraphs), max_length=max_length, truncation=True,
                    padding='max_length', add_special_tokens=True,
                    return_tensors='np')
# Keep ids/masks as int32 to match the dtype declared on the Keras inputs below
X_ids = encoded['input_ids'].astype(np.int32)
X_mask = encoded['attention_mask'].astype(np.int32)

# Split ids, masks and scores in ONE call so the three stay row-aligned.
# Two separate unseeded calls shuffle independently and pair each sample's ids
# with another sample's attention mask.
SEED = 42
X_train, X_test, X_mask_train, X_mask_test, y_train, y_test = train_test_split(
    X_ids, X_mask, scores, test_size=0.2, random_state=SEED
)

# Build the model: RoBERTa encoder -> mean pooling over tokens -> Dense(256) -> scalar score
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
input_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

embeddings = roberta(input_ids, attention_mask=input_mask)[0]
out = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
out = Dense(256, activation='relu')(out)
# NOTE(review): a ReLU output cannot predict negative values — confirm scores are non-negative.
out = Dense(1, activation='relu')(out)

model = Model(inputs=[input_ids, input_mask], outputs=out)
model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error')

# Train; the early-stopping callback is the only callback used
model.fit(
    [X_train, X_mask_train], y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=8,
    callbacks=[early_stopping]
)

# Evaluate on the held-out test split; the compiled loss is MSE
test_loss = model.evaluate([X_test, X_mask_test], y_test)
print(f"Test MSE: {test_loss}")

# Save the trained model (model_save_path is repurposed here as the output .h5 path)
model_save_path = '/content/drive/MyDrive/rm_stopwrd_base_256_대안제시.h5'
model.save(model_save_path)

print(f"Model saved to {model_save_path}")

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at /content/drive/MyDrive/model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30


  saving_api.save_model(


Model saved to /content/drive/MyDrive/rm_stopwrd_base_256_대안제시.h5


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
# from transformers import TFRobertaModel, BertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFRobertaModel, BertTokenizer

# Train a RoBERTa regression model with a Dense(512) head on the '대안제시' class.

# Drive directories holding the saved tokenizer and the saved RoBERTa weights
tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'
model_save_path = '/content/drive/MyDrive/model/'

# Early-stopping callback: monitors val_loss with a patience of 2
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Load the data and keep only the rows of the target class
file_path = '/content/drive/MyDrive/sorted_paragraphs.csv'
df = pd.read_csv(file_path)
df = df[df['class']=='대안제시']
paragraphs = df['paragraphs'].values
scores = df['score'].values

# Load the tokenizer and the RoBERTa encoder from their saved Drive directories
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)
roberta = TFRobertaModel.from_pretrained(model_save_path)

# Tokenise all paragraphs in one batched call. Every sequence is truncated/padded to
# max_length, so the result is a dense (n_samples, max_length) array.
max_length = 512
encoded = tokenizer(list(paragraphs), max_length=max_length, truncation=True,
                    padding='max_length', add_special_tokens=True,
                    return_tensors='np')
# Keep ids/masks as int32 to match the dtype declared on the Keras inputs below
X_ids = encoded['input_ids'].astype(np.int32)
X_mask = encoded['attention_mask'].astype(np.int32)

# Split ids, masks and scores in ONE call so the three stay row-aligned.
# Two separate unseeded calls shuffle independently and pair each sample's ids
# with another sample's attention mask.
SEED = 42
X_train, X_test, X_mask_train, X_mask_test, y_train, y_test = train_test_split(
    X_ids, X_mask, scores, test_size=0.2, random_state=SEED
)

# Build the model: RoBERTa encoder -> mean pooling over tokens -> Dense(512) -> scalar score
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
input_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

embeddings = roberta(input_ids, attention_mask=input_mask)[0]
out = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
out = Dense(512, activation='relu')(out)
# NOTE(review): a ReLU output cannot predict negative values — confirm scores are non-negative.
out = Dense(1, activation='relu')(out)

model = Model(inputs=[input_ids, input_mask], outputs=out)
model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error')

# Train; the early-stopping callback is the only callback used
model.fit(
    [X_train, X_mask_train], y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=8,
    callbacks=[early_stopping]
)

# Evaluate on the held-out test split; the compiled loss is MSE
test_loss = model.evaluate([X_test, X_mask_test], y_test)
print(f"Test MSE: {test_loss}")

# Save the trained model (model_save_path is repurposed here as the output .h5 path)
model_save_path = '/content/drive/MyDrive/base_512_대안.h5'
model.save(model_save_path)

print(f"Model saved to {model_save_path}")

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at /content/drive/MyDrive/model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30


  saving_api.save_model(


Model saved to /content/drive/MyDrive/base_512_대안.h5


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
# from transformers import TFRobertaModel, BertTokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from transformers import TFRobertaModel, BertTokenizer

# Train a RoBERTa regression model with a Dense(256) head on the '대안제시' class.

# Drive directories holding the saved tokenizer and the saved RoBERTa weights
tokenizer_save_path = '/content/drive/MyDrive/tokenizer/'
model_save_path = '/content/drive/MyDrive/model/'

# Early-stopping callback: monitors val_loss with a patience of 2
early_stopping = EarlyStopping(monitor='val_loss', patience=2)

# Load the data and keep only the rows of the target class
file_path = '/content/drive/MyDrive/sorted_paragraphs.csv'
df = pd.read_csv(file_path)
df = df[df['class']=='대안제시']
paragraphs = df['paragraphs'].values
scores = df['score'].values

# Load the tokenizer and the RoBERTa encoder from their saved Drive directories
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)
roberta = TFRobertaModel.from_pretrained(model_save_path)

# Tokenise all paragraphs in one batched call. Every sequence is truncated/padded to
# max_length, so the result is a dense (n_samples, max_length) array.
max_length = 512
encoded = tokenizer(list(paragraphs), max_length=max_length, truncation=True,
                    padding='max_length', add_special_tokens=True,
                    return_tensors='np')
# Keep ids/masks as int32 to match the dtype declared on the Keras inputs below
X_ids = encoded['input_ids'].astype(np.int32)
X_mask = encoded['attention_mask'].astype(np.int32)

# Split ids, masks and scores in ONE call so the three stay row-aligned.
# Two separate unseeded calls shuffle independently and pair each sample's ids
# with another sample's attention mask.
SEED = 42
X_train, X_test, X_mask_train, X_mask_test, y_train, y_test = train_test_split(
    X_ids, X_mask, scores, test_size=0.2, random_state=SEED
)

# Build the model: RoBERTa encoder -> mean pooling over tokens -> Dense(256) -> scalar score
input_ids = Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
input_mask = Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

embeddings = roberta(input_ids, attention_mask=input_mask)[0]
out = tf.keras.layers.GlobalAveragePooling1D()(embeddings)
out = Dense(256, activation='relu')(out)
# NOTE(review): a ReLU output cannot predict negative values — confirm scores are non-negative.
out = Dense(1, activation='relu')(out)

model = Model(inputs=[input_ids, input_mask], outputs=out)
model.compile(Adam(learning_rate=1e-5), loss='mean_squared_error')

# Train; the early-stopping callback is the only callback used
model.fit(
    [X_train, X_mask_train], y_train,
    validation_split=0.2,
    epochs=30,
    batch_size=8,
    callbacks=[early_stopping]
)

# Evaluate on the held-out test split; the compiled loss is MSE
test_loss = model.evaluate([X_test, X_mask_test], y_test)
print(f"Test MSE: {test_loss}")

# Save the trained model (model_save_path is repurposed here as the output .h5 path)
model_save_path = '/content/drive/MyDrive/base_256_daean.h5'
model.save(model_save_path)

print(f"Model saved to {model_save_path}")


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at /content/drive/MyDrive/model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/30




Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


  saving_api.save_model(


Model saved to /content/drive/MyDrive/base_256_daean.h5
