In [1]:
import json
import os
import sys

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses, models
from datasets import Dataset

sys.path.insert(0, os.path.abspath('..'))
from puddle import AttentionPooling

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""
dataset.json format:
{
  "anchor": [
    anchor_text_1,
    anchor_text_2,
    anchor_text_3,
    ...
  ],
  "positive": [
    positive_text_1,
    positive_text_2,
    positive_text_3,
    ...
  ],
  "negative": [
    negative_text_1,
    negative_text_2,
    negative_text_3,
    ...
  ]
}
"""

with open("dataset.json", "r", encoding="utf-8") as f:
    train_dataset = Dataset.from_dict(json.load(f))

In [3]:
transformer_layer = models.Transformer("cl-nagoya/ruri-v3-310m")

hidden_size: int = transformer_layer.get_word_embedding_dimension()
pooling_layer = AttentionPooling(
    hidden_size=hidden_size,
    intermediate_size=hidden_size * 4,
)

model = SentenceTransformer(modules=[transformer_layer, pooling_layer])

In [4]:
triplet_loss = losses.TripletLoss(
    model=model, distance_metric=losses.TripletDistanceMetric.COSINE
)

In [5]:
trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=train_dataset,
    loss=triplet_loss,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
                                                                     

In [6]:
trainer.train()



Step,Training Loss


TrainOutput(global_step=6, training_loss=4.427654266357422, metrics={'train_runtime': 17.8029, 'train_samples_per_second': 1.685, 'train_steps_per_second': 0.337, 'total_flos': 0.0, 'train_loss': 4.427654266357422, 'epoch': 3.0})

In [7]:
model.save_pretrained("models/attention-pooled")