In [2]:
!pip install datasets sentence-transformers



In [3]:
import math
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
import os
import gzip
import csv
import random

In [4]:
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout

model_name = "distilroberta-base"
train_batch_size = 128  # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1

# Save path of the model
model_save_path = (
    "output/2d_matryoshka_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

In [5]:
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Check if dataset exists. If not, download and extract  it
nli_dataset_path = "data/AllNLI.tsv.gz"

if not os.path.exists(nli_dataset_path):
    util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path)

# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")


2024-03-30 12:51:52 - Use pytorch device_name: mps
2024-03-30 12:51:53 - Read AllNLI train dataset


In [6]:
def add_to_samples(sent1, sent2, label):
    if sent1 not in train_data:
        train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
    train_data[sent1][label].add(sent2)

In [7]:
train_data = {}
with gzip.open(nli_dataset_path, "rt", encoding="utf8") as fIn:
    reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
    for row in reader:
        if row["split"] == "train":
            sent1 = row["sentence1"].strip()
            sent2 = row["sentence2"].strip()

            add_to_samples(sent1, sent2, row["label"])
            add_to_samples(sent2, sent1, row["label"])  # Also add the opposite

In [8]:
train_samples = []
for sent1, others in train_data.items():
    if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
        train_samples.append(
            InputExample(
                texts=[sent1, random.choice(list(others["entailment"])), random.choice(list(others["contradiction"]))]
            )
        )
        train_samples.append(
            InputExample(
                texts=[random.choice(list(others["entailment"])), sent1, random.choice(list(others["contradiction"]))]
            )
        )

logging.info("Train samples: {}".format(len(train_samples)))


2024-03-30 12:52:01 - Train samples: 563648


In [9]:
train_samples1 = train_samples[0:5000]

In [10]:
# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples1, batch_size=train_batch_size)


# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
    stsb_dev["sentence1"],
    stsb_dev["sentence2"],
    [score / 5 for score in stsb_dev["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False,  # Set to True, if your GPU supports FP16 operations
)

2024-03-30 12:52:02 - Warmup-steps: 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

2024-03-30 12:52:12 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 3 steps:
2024-03-30 12:52:21 - Cosine-Similarity :	Pearson: 0.6428	Spearman: 0.6719
2024-03-30 12:52:21 - Manhattan-Distance:	Pearson: 0.7034	Spearman: 0.7066
2024-03-30 12:52:21 - Euclidean-Distance:	Pearson: 0.6630	Spearman: 0.6689
2024-03-30 12:52:21 - Dot-Product-Similarity:	Pearson: 0.1875	Spearman: 0.1982
2024-03-30 12:52:21 - Save model to output/2d_matryoshka_nli_distilroberta-base-2024-03-30_12-51-52
2024-03-30 12:52:33 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 6 steps:
2024-03-30 12:52:38 - Cosine-Similarity :	Pearson: 0.7670	Spearman: 0.7793
2024-03-30 12:52:38 - Manhattan-Distance:	Pearson: 0.7762	Spearman: 0.7790
2024-03-30 12:52:38 - Euclidean-Distance:	Pearson: 0.7679	Spearman: 0.7690
2024-03-30 12:52:38 - Dot-Product-Similarity:	Pearson: 0.1627	Spearman: 0.1401
2024-03-30 12:52:38 - Save model to output/2d_matryoshka

In [23]:
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
    stsb_test["sentence1"],
    stsb_test["sentence2"],
    [score / 5 for score in stsb_test["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)
test_evaluator(model, output_path=model_save_path)


0.7306425785634405

Fine Tune

In [24]:
train_samples2 = train_samples[5000:10000]

In [1]:
# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples2, batch_size=train_batch_size)


# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
    stsb_dev["sentence1"],
    stsb_dev["sentence2"],
    [score / 5 for score in stsb_dev["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False,  # Set to True, if your GPU supports FP16 operations
)

NameError: name 'datasets' is not defined

In [26]:
model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
    stsb_test["sentence1"],
    stsb_test["sentence2"],
    [score / 5 for score in stsb_test["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)
test_evaluator(model, output_path=model_save_path)


0.7434575853533087

In [28]:
train_samples3 = train_samples[10000:15000]

In [29]:
# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples3, batch_size=train_batch_size)


# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
    stsb_dev["sentence1"],
    stsb_dev["sentence2"],
    [score / 5 for score in stsb_dev["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False,  # Set to True, if your GPU supports FP16 operations
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

TODO

- load nomic model
- evaluate via the same method
- finetune nomic model (same data)
- then ask about stanford NLI




In [18]:
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

2024-03-30 13:11:27 - Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1.5


A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-embed-text-v1-unsupervised:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/nomic-ai/nomic-embed-text-v1-unsupervised:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


2024-03-30 13:11:30 - <All keys matched successfully>
2024-03-30 13:11:30 - Use pytorch device_name: mps


In [19]:
#model = SentenceTransformer(model_save_path)




stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
    stsb_test["sentence1"],
    stsb_test["sentence2"],
    [score / 5 for score in stsb_test["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-test",
)
test_evaluator(model, output_path=model_save_path)


2024-03-30 13:11:37 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2024-03-30 13:11:47 - Cosine-Similarity :	Pearson: 0.8432	Spearman: 0.8425
2024-03-30 13:11:47 - Manhattan-Distance:	Pearson: 0.8430	Spearman: 0.8406
2024-03-30 13:11:47 - Euclidean-Distance:	Pearson: 0.8431	Spearman: 0.8407
2024-03-30 13:11:47 - Dot-Product-Similarity:	Pearson: 0.7654	Spearman: 0.7542


0.8424746411143469

In [20]:
train_samples3 = train_samples[10000:15000]

# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples3, batch_size=train_batch_size)


# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.Matryoshka2dLoss(model, train_loss, [768, 512, 256, 128, 64])

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
    stsb_dev["sentence1"],
    stsb_dev["sentence2"],
    [score / 5 for score in stsb_dev["score"]],
    main_similarity=SimilarityFunction.COSINE,
    name="sts-dev",
)

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False,  # Set to True, if your GPU supports FP16 operations
)

2024-03-30 13:11:51 - Warmup-steps: 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/39 [00:00<?, ?it/s]

IndexError: tuple index out of range

In [32]:
from mteb import MTEB

class ClassificationModel(nn.Module):
    def __init__(self, sentence_model_name, num_labels):
        super(ClassificationModel, self).__init__()
        # Initialize the SentenceTransformer model.
        self.sentence_transformer = SentenceTransformer(sentence_model_name, trust_remote_code=True)
        # Initialize the classifier layer.
        # Use the get_sentence_embedding_dimension method to dynamically determine the size of input features.
        embedding_dimension = self.sentence_transformer.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(embedding_dimension, num_labels)

    def forward(self, sentences):
        # Generate sentence embeddings.
        with torch.no_grad():
            embeddings = self.sentence_transformer.encode(sentences, convert_to_tensor=True)
        # Pass the embeddings through the classifier to get logits.
        logits = self.classifier(embeddings)
        return logits

    def encode(self, sentences, batch_size=32, **kwargs):
        # Provide a method to generate embeddings, making the model compatible with MTEB's expectations.
        return self.sentence_transformer.encode(sentences, batch_size=batch_size, **kwargs)

    def get_sentence_embedding_dimension(self):
        # Expose the method to get the embedding dimension.
        return self.sentence_transformer.get_sentence_embedding_dimension()

model_name = "nomic-ai/nomic-embed-text-v1.5"
num_labels = 77  # Banking77 has 77 classes.

# Initialize your model.
model = ClassificationModel(model_name, num_labels)

2024-03-30 14:39:32 - Load pretrained SentenceTransformer: nomic-ai/nomic-embed-text-v1.5
2024-03-30 14:39:35 - <All keys matched successfully>
2024-03-30 14:39:35 - Use pytorch device_name: mps


In [33]:

# Initialize the MTEB evaluation with your desired tasks.
evaluation = MTEB(tasks=["Banking77Classification"])

# Run the evaluation.
results = evaluation.run(model, output_folder=f"results/{model_name}")

2024-03-30 14:40:19 - 

## Evaluating 1 tasks:


2024-03-30 14:40:19 - 

********************** Evaluating Banking77Classification **********************
2024-03-30 14:40:19 - Loading dataset for Banking77Classification
2024-03-30 14:40:20 - 
Task: Banking77Classification, split: test. Running...
2024-03-30 14:40:20 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:27 - Encoding 3080 test sentences...


Batches:   0%|          | 0/97 [00:00<?, ?it/s]

2024-03-30 14:40:39 - Fitting logistic regression classifier...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2024-03-30 14:40:42 - Evaluating...
2024-03-30 14:40:42 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:45 - Encoding 3080 test sentences...
2024-03-30 14:40:45 - Fitting logistic regression classifier...
2024-03-30 14:40:45 - Evaluating...
2024-03-30 14:40:45 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:48 - Encoding 3080 test sentences...
2024-03-30 14:40:48 - Fitting logistic regression classifier...
2024-03-30 14:40:49 - Evaluating...
2024-03-30 14:40:49 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:51 - Encoding 3080 test sentences...
2024-03-30 14:40:51 - Fitting logistic regression classifier...
2024-03-30 14:40:53 - Evaluating...
2024-03-30 14:40:53 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:55 - Encoding 3080 test sentences...
2024-03-30 14:40:55 - Fitting logistic regression classifier...
2024-03-30 14:40:56 - Evaluating...
2024-03-30 14:40:56 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:40:58 - Encoding 3080 test sentences...
2024-03-30 14:40:58 - Fitting logistic regression classifier...
2024-03-30 14:40:59 - Evaluating...
2024-03-30 14:40:59 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:41:02 - Encoding 3080 test sentences...
2024-03-30 14:41:02 - Fitting logistic regression classifier...
2024-03-30 14:41:03 - Evaluating...
2024-03-30 14:41:03 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:41:06 - Encoding 3080 test sentences...
2024-03-30 14:41:06 - Fitting logistic regression classifier...
2024-03-30 14:41:06 - Evaluating...
2024-03-30 14:41:06 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:41:09 - Encoding 3080 test sentences...
2024-03-30 14:41:09 - Fitting logistic regression classifier...
2024-03-30 14:41:10 - Evaluating...
2024-03-30 14:41:10 - Encoding 616 training sentences...


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

2024-03-30 14:41:13 - Encoding 3080 test sentences...
2024-03-30 14:41:13 - Fitting logistic regression classifier...
2024-03-30 14:41:13 - Evaluating...
2024-03-30 14:41:13 - Evaluation for Banking77Classification on test took 53.60 seconds
2024-03-30 14:41:13 - Scores: {'accuracy': 0.8482467532467532, 'f1': 0.84778513285915, 'accuracy_stderr': 0.009668785378381452, 'f1_stderr': 0.009959944555820489, 'main_score': 0.8482467532467532, 'evaluation_time': 53.6}
