In [1]:
import math
import logging
from datetime import datetime
import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [2]:
torch.manual_seed(1)

<torch._C.Generator at 0x7fe75463b228>

In [3]:
train_batch_size = 32

In [4]:
datasets = load_dataset("klue", "sts")
train_samples = []
dev_samples = []

# KLUE STS 내 훈련, 검증 데이터 예제 변환
for phase in ["train", "validation"]:
    examples = datasets[phase]

    for example in examples:
        score = float(example["labels"]["label"]) / 5.0  # 0.0 ~ 1.0 스케일로 유사도 정규화

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]], 
            label=score,
        )

        if phase == "validation":
            dev_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

train_dataloader = DataLoader(
    train_samples,
    shuffle=True,
    batch_size=train_batch_size,
)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name="sts-dev",
)

Reusing dataset klue (/home/yobi/.cache/huggingface/datasets/klue/sts/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90)


### train 768

In [5]:
embedding_model = models.Transformer("klue/roberta-base")

pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[embedding_model,  pooling_model])
# dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
# model = SentenceTransformer(modules=[embedding_model,  pooling_model, dense_model])

num_epochs = 4
model_save_path = "output/768"
train_loss = losses.CosineSimilarityLoss(model=model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…





### train 256 epochs-8

In [6]:
embedding_model = models.Transformer("klue/roberta-base")

pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
# model = SentenceTransformer(modules=[embedding_model,  pooling_model])
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[embedding_model, pooling_model, dense_model])

num_epochs = 8
model_save_path = "output/256"
train_loss = losses.CosineSimilarityLoss(model=model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=8.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…





In [19]:
## load 768, add pool, add 256 epochs-4
embedding_model = models.Transformer("./output/768/0_Transformer")

pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
# model = SentenceTransformer(modules=[embedding_model,  pooling_model])
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[embedding_model,  pooling_model, dense_model])

num_epochs = 4
model_save_path = "output/768-pool-256"
train_loss = losses.CosineSimilarityLoss(model=model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…





### load 768-pool, add 256 epochs-4

In [7]:
embedding_model = SentenceTransformer('./output/768/')

# pooling_model = models.Pooling(
#     embedding_model.get_word_embedding_dimension(),
#     pooling_mode_mean_tokens=True,
#     pooling_mode_cls_token=False,
#     pooling_mode_max_tokens=False,
# )
# model = SentenceTransformer(modules=[embedding_model,  pooling_model])
dense_model = models.Dense(in_features=embedding_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[embedding_model, dense_model])

num_epochs = 4
model_save_path = "output/768-256"
train_loss = losses.CosineSimilarityLoss(model=model)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of train data for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=365.0, style=ProgressStyle(description_wi…





### load custom

In [34]:
model_256 = SentenceTransformer('./output/768-pool-256/')

In [35]:
len(model_256.encode('안녕'))

256

In [5]:
model_labse = SentenceTransformer('sentence-transformers/LaBSE')

Exception when trying to download https://sbert.net/models/sentence-transformers/LaBSE.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/LaBSE.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/LaBSE with mean pooling


In [7]:
model = SentenceTransformer('https://drive.google.com/file/d/13iNZAp1CR125WxOkO11bPAmk9Y8izs_q/view?usp=sharing')

90.2kB [00:00, 865kB/s]


FileNotFoundError: [Errno 2] No such file or directory: '/home/yobi/.cache/torch/sentence_transformers/drive.google.com_file_d_13iNZAp1CR125WxOkO11bPAmk9Y8izs_q_view?usp=sharing'

In [33]:
len(model.encode('안녕'))

768

In [26]:
embedding_model = models.Transformer("./output/768/0_Transformer")

pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
# model = SentenceTransformer(modules=[embedding_model,  pooling_model])
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())
model = SentenceTransformer(modules=[embedding_model,  pooling_model, dense_model])


In [22]:
pooling_model = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [23]:
model_notpool = SentenceTransformer(modules=[embedding_model,  pooling_model])

In [24]:
model_pool = SentenceTransformer("./output/768/")

In [28]:
model_notpool.encode("hi", convert_to_tensor=True) == model_pool.encode("hi", convert_to_tensor=True)

tensor([True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True, True, True, True, True,
        True, True, True, True, True, Tr

In [27]:
model_pool.encode("hi", convert_to_tensor=True)

tensor([-1.9588e-01, -2.6076e-02, -7.0387e-01, -1.9161e-01,  3.7078e-01,
        -8.0734e-02, -1.2706e+00,  9.3710e-02,  5.2949e-02, -2.9981e-01,
        -6.8515e-01, -8.1802e-01, -3.0679e-01,  1.6045e-01,  2.2344e-01,
         2.4976e-01,  5.0006e-02,  2.0250e-01, -6.1771e-01, -1.4783e+00,
         1.1222e-01, -2.0073e-01,  5.4832e-01,  4.0254e-02,  3.9147e-01,
        -1.5457e-01,  1.7514e-01, -3.7323e-01,  1.4839e-01,  9.8344e-01,
         8.8302e-02,  4.1626e-01, -5.2299e-01, -3.5881e-01,  7.5960e-01,
        -2.4401e-01, -2.6519e-01, -6.7280e-01, -2.1843e-01, -1.8963e-01,
        -4.8162e-01, -6.0493e-02, -2.8714e-01, -8.3946e-02,  2.9732e-02,
         4.3027e-01, -1.0314e-01,  5.0224e-01, -4.5368e-01, -1.9866e-01,
        -1.2079e-01, -3.5560e-01,  1.6925e-01,  2.7737e-01,  1.6428e-01,
         1.6426e-01,  5.9284e-02, -2.3207e-01,  2.4944e-01, -1.7201e-02,
         4.8223e-01, -4.5565e-01, -4.8210e-01, -1.5422e-03,  4.2232e-01,
         2.0771e-01, -5.5926e-01, -3.9452e-01,  2.5