In [None]:
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.readers import STSDataReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

In [None]:
# Reader for the STS-style files stored in the local "./stsvv" folder.
# NOTE(review): the reader is created with its default parsing options
# (delimiter, column indices, score range) -- confirm they match the layout
# of the CSV files loaded through it.
sts_reader = STSDataReader(
    dataset_folder="./stsvv",
)

In [None]:
# Load the two splits through the shared reader: the training pairs first,
# then the held-out pairs used for evaluation.
train_set = sts_reader.get_examples(filename="train.csv")
eval_set = sts_reader.get_examples(filename="eval.csv")

In [None]:
# Serve the training examples in shuffled mini-batches of 8.
train_dataloader = DataLoader(
    dataset=train_set,
    shuffle=True,
    batch_size=8,
)

In [None]:
model = models.Transformer("openai-gpt")
# model = models.Transformer("enoch/llama-65b-hf")
model.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.auto_model.resize_token_embeddings(len(model.tokenizer))

In [None]:
pooling_model = models.Pooling(model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[model, pooling_model])

In [None]:
# Training objective: cosine-similarity loss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the held-out examples; it is passed to fit() and to
# model.evaluate() further down.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(examples=eval_set)

In [None]:
# Fine-tune the model on the training pairs.
#
# The warm-up length is set explicitly to 10% of all optimizer steps. Without
# it fit() falls back to its built-in default warm-up, which can be longer than
# the whole run on a small dataset, so the learning rate would never reach its
# configured value.
num_epochs = 10
warmup_steps = max(1, (len(train_dataloader) * num_epochs) // 10)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    # NOTE(review): both paths appear to be used by fit() as directories, so
    # the ".zip" suffix is only part of the folder name -- confirm this naming
    # is intended.
    checkpoint_path="ckpt.zip",
    output_path="model.zip",
    save_best_model=True,
)

In [None]:
# Score the in-memory model with the same evaluator used during training and
# print the result.
# NOTE(review): this evaluates the model object as left by fit(); presumably
# the best-scoring weights are the ones written to "model.zip" -- confirm
# which of the two should be reported.
final_score = model.evaluate(evaluator=evaluator)
print(final_score)