In [1]:
import warnings
warnings.filterwarnings('ignore')

## 1. 학습 준비

In [2]:
from sentence_transformers import SentenceTransformer, models

transformer_model = models.Transformer('klue/roberta-base')

pooling_layer = models.Pooling(
    transformer_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

embedding_model = SentenceTransformer(modules=[transformer_model, pooling_layer])

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
%%capture
! pip install datasets

In [4]:
from datasets import load_dataset

klue_sts_train = load_dataset('klue', 'sts', split='train')
klue_sts_test = load_dataset('klue', 'sts', split='validation')

klue_sts_train[0]

{'guid': 'klue-sts-v1_train_00000',
 'source': 'airbnb-rtt',
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'labels': {'label': 3.7, 'real-label': 3.714285714285714, 'binary-label': 1}}

In [5]:
klue_sts_train = klue_sts_train.train_test_split(test_size=0.1, seed=42)
klue_sts_train, klue_sts_eval = klue_sts_train['train'], klue_sts_train['test']

klue_sts_train.shape, klue_sts_eval.shape

((10501, 5), (1167, 5))

In [6]:
from sentence_transformers import InputExample

def prepare_sts_examples(dataset):
  examples = []
  for data in dataset:
    examples.append(
        InputExample(
            texts=[data['sentence1'], data['sentence2']],
            label=data['labels']['label'] / 5.0
        )
    )

  return examples

train_examples = prepare_sts_examples(klue_sts_train)
eval_examples = prepare_sts_examples(klue_sts_eval)
test_examples = prepare_sts_examples(klue_sts_test)

print(train_examples[0])

<InputExample> label: 0.6799999999999999, texts: 시설은 좋으나 잠자리 민감하신 분들은 다른 곳으로 가세요; 시설도 좋지만, 잠자리라면 다른 곳으로 가보세요.


In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [8]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

eval_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(eval_examples)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples)

In [9]:
test_evaluator(embedding_model)

{'pearson_cosine': 0.34770702227285755, 'spearman_cosine': 0.35560473197486514}

## 2. 학습 하기

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [11]:
from datasets import Dataset

In [12]:
from sentence_transformers import losses

num_epochs = 4
model_name = 'klue/roberta-base'
model_save_path = f'output/training_sts_{model_name.replace("/", "-")}'

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

embedding_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=eval_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=len(train_dataloader) * 5,
    output_path=model_save_path
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
657,0.0855,No log,0.943838,0.898069
1000,0.0137,No log,0.951915,0.906743
1314,0.0137,No log,0.956083,0.909438
1971,0.01,No log,0.95763,0.913547
2000,0.0074,No log,0.959272,0.917946
2628,0.0048,No log,0.959204,0.918576


In [13]:
trained_embedding_model = SentenceTransformer(model_save_path)
test_evaluator(trained_embedding_model)

{'pearson_cosine': 0.8805530909573577, 'spearman_cosine': 0.87982425613103}

## 3. Push to HuggingFace

In [14]:
from huggingface_hub import login, HfApi

login(token='hf_...')
repo_id = 'whatwant/klue-roberta-base-klue-sts'

api = HfApi()
api.create_repo(repo_id=repo_id)

api.upload_folder(
    folder_path=model_save_path,
    repo_id=repo_id,
    repo_type='model'
)

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/whatwant/klue-roberta-base-klue-sts/commit/ddba5b6aa13632f0c54a36e55ff250dd635cd195', commit_message='Upload folder using huggingface_hub', commit_description='', oid='ddba5b6aa13632f0c54a36e55ff250dd635cd195', pr_url=None, repo_url=RepoUrl('https://huggingface.co/whatwant/klue-roberta-base-klue-sts', endpoint='https://huggingface.co', repo_type='model', repo_id='whatwant/klue-roberta-base-klue-sts'), pr_revision=None, pr_num=None)