In [None]:
import sys
import os

Install the basic Python requirements.

In [None]:
# Install the packages listed in ../requirements.txt (path is relative to the
# current working directory); --quiet suppresses most of pip's output.
%pip install -r ../requirements.txt --quiet

Clone the version of sentence-transformers that includes the energy distance implementation.

In [None]:
# Clone the fork into ./sentence-transformers-energydistance (git's default
# target directory for this URL). Re-running this cell after a successful clone
# makes git report that the destination already exists.
!git clone https://github.com/gnatesan/sentence-transformers-energydistance

Install the custom sentence-transformers package using `pip`.

In [None]:
# Enter the cloned repository so that `pip install .` targets it.
%cd sentence-transformers-energydistance
# Upgrade pip itself first, then install the package from the current directory.
%pip install --upgrade pip --quiet
%pip install . --quiet
# Return to the original working directory so later relative paths still resolve.
%cd ..

In [None]:
# Make the cloned repository importable. The clone directory uses hyphens
# (`sentence-transformers-energydistance`); the previous underscore spelling
# pointed at a directory that does not exist. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
repo_path = os.path.join(os.getcwd(), 'sentence-transformers-energydistance')
if repo_path not in sys.path:
    sys.path.append(repo_path)

Ensure that you're using a GPU with enough available memory.

In [None]:
import torch

def check_available_gpus():
    """Rank the visible CUDA devices by free memory, most free first.

    Free memory is read with ``torch.cuda.mem_get_info`` when available, which
    reports device-wide free memory and therefore accounts for memory held by
    other processes. On torch builds without that function the estimate falls
    back to ``total_memory - memory_allocated``, which only sees this process's
    tensor allocations.

    The current CUDA device is not changed by this function.

    Returns:
        list[tuple[int, int]]: ``(device_index, free_bytes)`` pairs sorted by
        ``free_bytes`` in descending order; empty when no CUDA device is visible.
    """
    query_free = getattr(torch.cuda, 'mem_get_info', None)
    gpu_stats = []
    for i in range(torch.cuda.device_count()):
        if query_free is not None:
            free_memory, _total_memory = query_free(i)
        else:
            # Fallback for older torch: ignores memory used by other processes.
            total_memory = torch.cuda.get_device_properties(i).total_memory
            free_memory = total_memory - torch.cuda.memory_allocated(i)
        gpu_stats.append((i, free_memory))
    # Sort GPUs by the most free memory
    gpu_stats.sort(key=lambda x: x[1], reverse=True)
    return gpu_stats

# Show the (device index, free memory) pairs, ordered from most to least free.
gpu_ranking = check_available_gpus()
print(gpu_ranking)

In [None]:
from sentence_transformers import SentenceTransformer, models
import torch

# Try the GPUs in order of most free memory and keep the first one on which the
# model can be built. `model` stays None if no GPU works, so the failure is
# reported here rather than as a NameError in a later cell.
model = None
for gpu_id, _ in check_available_gpus():
    try:
        ## Step 1: use an existing language model
        word_embedding_model = models.Transformer('distilroberta-base')

        ## Step 2: use a pool function over the token embeddings
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

        ## Join steps 1 and 2 using the modules argument, targeting the GPU being tried
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model],
                                    device=f'cuda:{gpu_id}')
        break  # Success: do not rebuild the model for the remaining GPUs
    except RuntimeError as e:
        if 'out of memory' in str(e):
            print(f"GPU {gpu_id} ran out of memory, trying next available GPU.")
            model = None
            torch.cuda.empty_cache()  # Clear memory cache
            continue
        # Any other runtime error is unexpected; re-raise with the original traceback.
        raise

if model is None:
    raise RuntimeError(
        "Could not build the model on any GPU: no CUDA device is visible or "
        "every device ran out of memory."
    )

In [None]:
from datasets import load_dataset

# Identifier handed to `load_dataset`; also reused in the summary printout below.
dataset_id = "embedding-data/QQP_triplets"
dataset = load_dataset(path=dataset_id)

In [None]:
# Summarise the train split: number of rows, the Python types of one example and
# of its 'set' field, and the first example itself.
print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
print(f"- Examples look like this: {dataset['train'][0]}")

In [None]:
from sentence_transformers import InputExample

# For agility we only use 1/2 of our available data
n_examples = dataset['train'].num_rows // 2
train_data = dataset['train']['set']

# Each InputExample carries three texts: the query, its first positive and its
# first negative.
train_examples = [
    InputExample(texts=[row['query'], row['pos'][0], row['neg'][0]])
    for row in (train_data[idx] for idx in range(n_examples))
]

In [None]:
# Sanity check: report the container type, its length, and the element type.
# Indexing [0] raises IndexError if train_examples is empty.
print(f"We have a {type(train_examples)} of length {len(train_examples)} containing {type(train_examples[0])}'s.")

In [None]:
from torch.utils.data import DataLoader

# Serve the training examples in shuffled mini-batches of 16.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=16,
    shuffle=True,
)

In [None]:
from sentence_transformers import losses

# NOTE(review): each entry of train_examples holds three texts (query, pos, neg).
# Upstream sentence-transformers' ContrastiveLoss expects labelled sentence pairs,
# while TripletLoss is the loss designed for (anchor, positive, negative) inputs.
# Confirm that this fork's ContrastiveLoss accepts triplets, or switch the loss.
train_loss = losses.ContrastiveLoss(model=model)

In [None]:
# Number of epochs passed to model.fit.
num_epochs = 10

# len(train_dataloader) is the number of batches per epoch, so the product is the
# total number of training steps; warmup covers 80% of them.
# NOTE(review): 80% is an unusually long warmup — confirm this is intended.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.8) #80% of total training steps

In [None]:
# Fine-tune the model on the single (dataloader, loss) objective defined above.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    warmup_steps=warmup_steps,
    epochs=num_epochs,
)

Save the trained model to `../models` (relative to the current working directory).

In [None]:
# Persist the trained model under ../models/<model_name>, creating the parent
# directory first if it is missing.
model_name = 'custom-model'
models_dir = f'{os.getcwd()}/../models'
os.makedirs(models_dir, exist_ok=True)
model_path = f'{os.getcwd()}/../models/{model_name}'
model.save(model_path)