In [None]:
# Uncomment to install the packages this notebook imports:
# %uv pip install sentence-transformers datasets zenml rich

In [None]:
from sentence_transformers import SentenceTransformer, models

# Step 1: wrap an existing pretrained language model as the token-embedding module.
word_embedding_model = models.Transformer("distilroberta-base")

# Step 2: add a pooling module over the token embeddings, sized to the
# embedding width reported by the transformer module.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)

# Chain both steps into one SentenceTransformer via the `modules` argument.
sentence_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=sentence_modules)

In [None]:
# Bare expression: the notebook renders the assembled model as this cell's output.
model

## Push a train-test split for our dataset

In [None]:
from datasets import load_dataset

# Repository id of the dataset; reused below as the target of the push.
dataset_name = "zenml/rag_qa_embedding_questions"
# Load only the "train" split.
dataset = load_dataset(dataset_name, split="train")

In [None]:
from datasets import Dataset, DatasetDict
from zenml.client import Client

# Expects `dataset` to be the Hugging Face Dataset loaded in the previous cell.

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Repackage the two splits under explicit "train" / "test" keys.
dataset_dict = DatasetDict(
    {
        "train": train_test_split["train"],
        "test": train_test_split["test"],
    }
)

# Read the Hugging Face token from the ZenML secret store rather than hardcoding it.
client = Client()
hf_token = client.get_secret("huggingface_datasets").secret_values["token"]

branch_name = "train_test_split"

# Push to the same Hub repository, targeting a dedicated branch and opening a
# pull request. `revision` replaces the `branch` keyword, which `datasets`
# deprecated in 2.15.0 and removed in 3.0.0.
dataset_dict.push_to_hub(
    repo_id=dataset_name,
    private=True,
    token=hf_token,
    revision=branch_name,
    create_pr=True,
)

## Finetuning our embeddings

In [None]:
from sentence_transformers import SentenceTransformer, models

# Load a pretrained sentence-transformer checkpoint; `modelB` is the model
# that the cells below fine-tune.
modelB = SentenceTransformer(
    "embedding-data/distilroberta-base-sentence-transformer"
)

In [None]:
from datasets import load_dataset

# Same repository id as the earlier load; the "train" split is bound to a
# separate `datasetB` name for the fine-tuning cells.
dataset_name = "zenml/rag_qa_embedding_questions"
datasetB = load_dataset(dataset_name, split="train")

In [None]:
from rich import inspect

# Inspect the first record to see which fields each row carries.
inspect(datasetB[0])

In [None]:
# Number of rows in the fine-tuning dataset.
datasetB.num_rows

In [None]:
from sentence_transformers import InputExample

train_dataB = datasetB
n_examples = datasetB.num_rows

# Build one (question, passage) pair per row: the first generated question
# paired with that row's page content.
train_examplesB = []
for example in train_dataB:
    question = example["generated_questions"][0]
    passage = example["page_content"]
    train_examplesB.append(InputExample(texts=[question, passage]))

In [None]:
from sentence_transformers import losses
from torch.utils.data import DataLoader

# Shuffled batches of 64 examples, with the ranking loss attached to modelB.
train_dataloaderB = DataLoader(train_examplesB, batch_size=64, shuffle=True)
train_lossB = losses.MultipleNegativesRankingLoss(model=modelB)

num_epochsB = 10

# Warm up over the first 10% of all training steps
# (batches per epoch * number of epochs).
total_stepsB = len(train_dataloaderB) * num_epochsB
warmup_stepsB = int(total_stepsB * 0.1)

In [None]:
# Fine-tune modelB on the single (dataloader, loss) objective, using the
# epoch count and warmup steps configured in the previous cell.
modelB.fit(
    train_objectives=[(train_dataloaderB, train_lossB)],
    epochs=num_epochsB,
    warmup_steps=warmup_stepsB,
)

In [1]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader, Dataset


class InputExampleDataset(Dataset):
    """Map-style dataset exposing each example as a (first text, second text) tuple."""

    def __init__(self, examples):
        """Keep a reference to `examples`, a sequence of objects with a `texts` attribute."""
        self.examples = examples

    def __len__(self):
        """Return how many examples are stored."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the text pair at `idx`, or a list of pairs when `idx` is a list."""
        if not isinstance(idx, list):
            texts = self.examples[idx].texts
            return texts[0], texts[1]
        # A list of indices is resolved element by element.
        return [self[position] for position in idx]


# Create some sample InputExamples
examples = [
    InputExample(texts=["Question 1", "Context 1"]),
    InputExample(texts=["Question 2", "Context 2"]),
    InputExample(texts=["Question 3", "Context 3"]),
    InputExample(texts=["Question 4", "Context 4"]),
]

# Create an instance of InputExampleDataset
dataset = InputExampleDataset(examples)

# Create a DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

# Iterate over the batches
for batch in dataloader:
    question_texts, context_texts = batch
    print("Question texts:", question_texts)
    print("Context texts:", context_texts)
    print("---")

Question texts: ('Question 1', 'Question 2')
Context texts: ('Context 1', 'Context 2')
---
Question texts: ('Question 3', 'Question 4')
Context texts: ('Context 3', 'Context 4')
---
