In [None]:
# Paths used throughout the notebook.
# Directory from which the train/eval/test CSV splits are read.
data_folder = "./stsvv-clean"
# Passed to model.fit() as its output_path (despite the ".zip" suffix in the name).
output_folder = "./model.zip"
# CSV file that receives the misclassified test pairs (a file, not a folder).
wrong_case_folder = "./res.csv"

In [None]:
!pip install -U sentence-transformers
!huggingface-cli login --token=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

In [None]:
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.readers import STSDataReader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator,BinaryClassificationEvaluator
from torch.utils.data import DataLoader

In [None]:
# Reader that turns the STS files under `data_folder` into training examples.
# NOTE(review): delimiter, column indices and score normalisation are left at the
# reader's defaults — confirm they match the layout of the CSV files.
sts_reader = STSDataReader(data_folder)

In [None]:
# Load the training and validation splits (in that order) from the dataset folder.
train_set, eval_set = (
    sts_reader.get_examples(split_file)
    for split_file in ("train.csv", "eval.csv")
)

In [None]:
# Serve the training examples in shuffled mini-batches of 8.
train_dataloader = DataLoader(
    dataset=train_set,
    shuffle=True,
    batch_size=8,
)

In [None]:
# Backbone transformer that produces token-level embeddings.
# Other checkpoints that have been tried here: "vinai/phobert-base-v2" and a
# previously saved run at "/kaggle/working/model.zip".
model = models.Transformer("NlpHUST/gpt2-vietnamese", do_lower_case=True)

# Batched encoding needs a padding token. Register one only when the tokenizer
# lacks it, so checkpoints that already define their own pad token keep it
# (and keep their embedding matrix size) when swapped in above.
if model.tokenizer.pad_token is None:
    model.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # Grow the embedding matrix so the newly added token id has a row.
    model.auto_model.resize_token_embeddings(len(model.tokenizer))

In [None]:
# Collapse token embeddings into one sentence vector (library-default pooling
# settings), then chain transformer + pooling into a SentenceTransformer.
# Note: `model` is rebound here from the bare transformer module to the full
# pipeline, so this cell cannot be re-run without re-running the one before it.
embedding_dim = model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(modules=[model, pooling_model])

In [None]:
# Training objective: regress the cosine similarity of each sentence pair onto its label.
train_loss = losses.CosineSimilarityLoss(model=model)
# Validation evaluator built from the eval split; progress bar silenced.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    eval_set,
    show_progress_bar=False,
)

In [None]:
# Fine-tune for three epochs, scoring against the validation evaluator and
# writing the model to `output_folder` (best-model saving enabled).
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=3,
    output_path=output_folder,
    save_best_model=True,
)

In [None]:
# Report the score of the fine-tuned model under the current `evaluator`.
eval_score = model.evaluate(evaluator=evaluator)
print(eval_score)

In [None]:
# Score the model on the training split as well (useful for spotting over-fitting).
# A dedicated name is used so the validation `evaluator` that model.fit() and the
# dev-score cell read is not overwritten: re-running those cells after this one
# would otherwise silently select/evaluate on training data.
train_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(train_set)
print(model.evaluate(evaluator=train_evaluator))

In [None]:
# Final held-out evaluation. Labels are kept as continuous scores (not rounded to 0/1).
test_set = sts_reader.get_examples("test.csv")
# A dedicated name keeps the validation `evaluator` intact, so re-running
# model.fit() after this cell cannot pick the best checkpoint on test data.
# NOTE(review): write_csv=True appears to only take effect when evaluate() is
# given an output_path — confirm whether a CSV is actually wanted here.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_set,
    show_progress_bar=False,
    write_csv=True,
)
print(model.evaluate(evaluator=test_evaluator))

In [None]:
from sentence_transformers import SentenceTransformer, util
from datasets import load_dataset

# Error analysis: score every test pair with the fine-tuned model and collect the
# pairs whose binarised prediction disagrees with the binarised gold label.
#
# Produces:
#   predictions           -- list of (sentence1, sentence2, label, similarity rounded to 2 dp)
#   incorrect_predictions -- the subset of `predictions` that was misclassified
DECISION_THRESHOLD = 0.5  # score / label at or above this value counts as "similar"

first_sentences = [example.texts[0] for example in test_set]
second_sentences = [example.texts[1] for example in test_set]

# Encode each side in one batched call instead of one model call per sentence.
first_embeddings = model.encode(first_sentences, show_progress_bar=False)
second_embeddings = model.encode(second_sentences, show_progress_bar=False)

predictions = []
incorrect_predictions = []
for example, embedding1, embedding2 in zip(test_set, first_embeddings, second_embeddings):
    cos_similarity = util.cos_sim(embedding1, embedding2).item()

    # Rounding is for reporting only.
    record = (example.texts[0], example.texts[1], example.label, round(cos_similarity, 2))
    predictions.append(record)

    # Threshold the unrounded similarity: rounding first would lift scores just
    # below the threshold (e.g. 0.496 -> 0.5) into the positive class.
    predicted_positive = cos_similarity >= DECISION_THRESHOLD
    labelled_positive = example.label >= DECISION_THRESHOLD
    if predicted_positive != labelled_positive:
        incorrect_predictions.append(record)

# Fraction of misclassified test pairs; NaN instead of ZeroDivisionError when
# there are no test pairs.
error_rate = len(incorrect_predictions) / len(predictions) if predictions else float("nan")
print(error_rate)

In [None]:
import pandas as pd

# Persist the misclassified pairs (one row per pair) for manual inspection.
result_columns = ['Sentence1', 'Sentence2', 'label', 'predictions']
results = pd.DataFrame(data=incorrect_predictions, columns=result_columns)
results.to_csv(path_or_buf=wrong_case_folder)