# core

> This is the core module that will include everything needed for the semantic cleaning

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

## Loading parameters

If you want to run the code you'll need your Hugging Face tokens. You can provide them by logging in or by loading them from a file.

My tokens are stored in a JSON file, which is loaded into a `Parameters` class.

In [3]:
## This is for colab integration - uncomment the lines below
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from reinautils import Parameters

In [4]:
# Load the Hugging Face tokens from a JSON file on the mounted Google Drive.
# NOTE(review): the path is an absolute Colab location; adjust it when running elsewhere.
# NOTE(review): `params` is not referenced again in the notebook as shown here.
params=Parameters().from_json ('/content/drive/MyDrive/tokens.json')

## Let's do some imports

In [6]:
#| export
import os
from tqdm.auto import tqdm
from typing import List, Dict, Set, Union, Callable
import torch
from torch.utils.data import DataLoader
from datasets import Dataset, load_dataset
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch.nn.functional as F
import transformers

In [7]:
# Set the tokenizers parallelism flag explicitly before any tokenizer is created.
# NOTE(review): presumably this avoids the fork warning emitted when DataLoader
# workers are started after a tokenizer has been used - confirm.
# This cell has no `#| export` directive, so it only affects the notebook session.
os.environ["TOKENIZERS_PARALLELISM"]="True"

## Define a function for the data preprocessing

In [9]:
#| export
def preprocess_data(dataset: Dataset, splits: Union[str, List[str]] = None, schema: str = "") -> Dataset:
    """
    Add a "_merged" text column built by formatting selected columns of each example.

    Args:
        dataset: A mapping of split name -> HuggingFace Dataset (for example the
                 object returned by `load_dataset`). The selected splits are
                 replaced in place.
        splits: A split name, or a list/tuple of split names, to preprocess.
                Defaults to all splits.
        schema: A `str.format` template describing the merged string. Column names
                go inside {} and may carry a conversion or format spec
                (e.g. "{user!r}", "{score:.2f}"); literal braces are written as
                "{{" and "}}".
                Example: "<human>:{user} <bot>:{response}",
                where 'user' and 'response' are keys in the dataset.
                Defaults to a template that includes every column of the first
                selected split.

    Returns:
        The same `dataset` object, whose selected splits now have an additional
        "_merged" field containing the formatted strings.
    """
    # Local import so that the exported cell stays self-contained.
    from string import Formatter

    def _field_keys(template):
        """Return the column names referenced by the replacement fields of `template`."""
        found = []
        for _, field_name, format_spec, _ in Formatter().parse(template):
            if field_name is None:
                # Pure literal text (including escaped "{{" / "}}").
                continue
            # "user.attr" and "user[0]" both read the "user" column.
            base = field_name.partition(".")[0].partition("[")[0]
            if base not in found:
                found.append(base)
            # A format spec may itself contain fields, e.g. "{text:{width}}".
            if format_spec:
                for nested in _field_keys(format_spec):
                    if nested not in found:
                        found.append(nested)
        return found

    # If no splits are specified, use all splits
    if not splits:
        splits = list(dataset.keys())

    # Ensure 'splits' is a list
    if not isinstance(splits, (list, tuple)):
        splits = [splits]

    # If no schema is specified, use a default schema that includes all keys
    if not schema:
        schema = "".join([f"<{key}>: {{{key}}} " for key in dataset[splits[0]].features.keys()])

    # Use the standard format-string parser rather than splitting on braces, so
    # escaped braces, conversions and format specs are handled correctly.
    key_names = _field_keys(schema)

    # Merge the referenced columns of one example into a single string
    def merge_columns(example):
        example["_merged"] = schema.format(**{key: example[key] for key in key_names})
        return example

    # Apply the function to the selected splits
    for split in splits:
        dataset[split] = dataset[split].map(merge_columns)

    return dataset


## Define a function to compute the embeddings

In [10]:
#| export
def _masked_mean_pool(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring padded positions."""
    # Assumes the first element of the model output holds the per-token hidden
    # states with shape (batch, seq, dim), as HuggingFace encoders return - TODO confirm.
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp so that a row with no real tokens does not divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def compute_embeddings(
    data: Dataset, 
    embedding_model: torch.nn.Module,
    tokenizer, 
    batch_size: int = 8,
    num_workers: int = 1,
    dataset_feature : str = '_merged'
) -> np.ndarray:
    """
    Compute L2-normalized sentence embeddings using an embedding model.

    Args:
        data: A dataset whose examples contain the raw text under `dataset_feature`.
        embedding_model: A model called with `input_ids` and `attention_mask` that
                         returns per-token embeddings as the first element of its output.
        tokenizer: A callable that tokenizes a list of strings and returns a mapping
                   with 'input_ids' and 'attention_mask' tensors.
        batch_size: The number of samples per batch.
        num_workers: The number of worker processes for data loading.
        dataset_feature : The name of the feature to tokenize in the dataset
    Returns:
        A numpy array with one embedding row per input example
        (an empty array of shape (0, 0) when `data` yields no batches).
    """
    dataloader = DataLoader(data, batch_size=batch_size, num_workers=num_workers)

    # HuggingFace models expose `.device`; fall back to the parameters for a plain nn.Module.
    device = getattr(embedding_model, "device", None)
    if device is None:
        device = next(embedding_model.parameters()).device

    embeddings_list = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Pad only up to the longest text of the batch and truncate texts that
            # exceed the model limit. This replaces padding to max_length followed
            # by a manual trim, which fails on over-long texts and on left padding.
            tokenized_batch = tokenizer(
                batch[dataset_feature],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )

            input_ids = tokenized_batch['input_ids'].to(device)
            attention_mask = tokenized_batch['attention_mask'].to(device)

            # Compute embeddings
            model_output = embedding_model(input_ids=input_ids, attention_mask=attention_mask)

            # Average pooling and L2 normalization
            sentence_embeddings = _masked_mean_pool(model_output, attention_mask)
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

            # Store embeddings
            embeddings_list.append(sentence_embeddings.to('cpu').numpy())

    # Clear CUDA memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # np.concatenate raises on an empty list, so handle empty input explicitly.
    if not embeddings_list:
        return np.empty((0, 0), dtype=np.float32)

    return np.concatenate(embeddings_list, 0)


## This function will do the deduplication

In [12]:
#| export
def deduplicate_embeddings(embedded, epsilon=1e-2, batch_size=20000, device=None):
    """
    Perform deduplication on the provided embeddings.

    Rows are compared block by block. Within a block, the higher-indexed row of
    every near-identical pair is marked. Across blocks, every row of a later
    block that is near-identical to a row of an earlier block is marked.

    Args:
        embedded: A numpy array or PyTorch tensor holding the embeddings, one row per item.
        epsilon: The maximum cosine distance (1 - cosine similarity) for two
                 embeddings to be considered duplicates.
        batch_size: The number of rows per block; bounds the size of each distance matrix.
        device: Device used for the comparison. Defaults to 'cuda' when available,
                otherwise 'cpu'.

    Note: The embeddings must be L2 normalized.

    Returns:
        A sorted 1-D int64 CPU tensor of the unique indices that should be deleted
        due to duplication.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # Half precision keeps GPU memory low; on CPU use float32, since half-precision
    # matmul is not available on every CPU build.
    dtype = torch.float16 if device.type == 'cuda' else torch.float32

    # as_tensor accepts both numpy arrays and tensors without the copy-construct warning.
    embedded_tensor = torch.as_tensor(embedded).detach().to(device=device, dtype=dtype)

    n_rows = embedded_tensor.shape[0]
    if n_rows == 0:
        return torch.empty(0, dtype=torch.int64)

    # Ceiling division: no empty trailing block when n_rows is a multiple of batch_size.
    n_batches = -(-n_rows // batch_size)

    found = [torch.empty(0, dtype=torch.int64)]

    for i in range(n_batches):
        start_i = i * batch_size
        block_i = embedded_tensor[start_i:start_i + batch_size]

        # Cosine distance within the current block
        cosine_dist = 1 - torch.matmul(block_i, torch.transpose(block_i, 0, 1))

        # Keeping only row > col drops the diagonal and keeps the first occurrence
        # of every duplicate pair.
        rows, cols = torch.where(cosine_dist < epsilon)
        found.append(rows[rows > cols].to('cpu') + start_i)

        # Duplicates between the current block and every later block
        for k in range(i + 1, n_batches):
            start_k = k * batch_size
            block_k = embedded_tensor[start_k:start_k + batch_size]
            cosine_dist = 1 - torch.matmul(block_i, torch.transpose(block_k, 0, 1))

            later_cols = torch.where(cosine_dist < epsilon)[1]
            found.append(later_cols.to('cpu') + start_k)

        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # The same index can be reported by several pairs; return each one once.
    return torch.unique(torch.cat(found))

## And in this function we will combine everything

In [13]:
#| export
def deduplicate_dataset(
    dataset: Dataset, 
    model: torch.nn.Module, 
    tokenizer,
    epsilon: float = 1e-2, 
    model_batch_size: int = 64, 
    deduplication_batch_size: int =20000, 
    num_workers: int = 16,
    dataset_feature: str = '_merged'
) -> Dataset:
    """
    Deduplicate data in a dataset based on the embeddings computed by a given model.

    Args:
        dataset: Dataset to be deduplicated.
        model: Model to compute embeddings.
        tokenizer: Tokenizer matching `model`, used to tokenize `dataset_feature`.
        epsilon: Maximum cosine distance for two embeddings to be considered duplicates.
        model_batch_size: Batch size for the model.
        deduplication_batch_size: Batch size for deduplication process.
        num_workers: Number of worker processes for data loading.
        dataset_feature: Feature in the dataset to use for deduplication.

    Returns:
        Deduplicated dataset.
    """
    # Compute embeddings for the dataset
    embeddings = compute_embeddings(dataset,
                                    model,
                                    tokenizer,
                                    batch_size=model_batch_size,
                                    num_workers=num_workers,
                                    dataset_feature=dataset_feature)

    # Find duplicate indices in the embeddings
    duplicate_indices = deduplicate_embeddings(embeddings, epsilon, deduplication_batch_size)

    # Convert to a set of plain ints once: `idx in tensor` scans the whole tensor
    # for every row, which makes the filter quadratic.
    duplicates = set(torch.as_tensor(duplicate_indices).tolist())

    # Filter out duplicate instances from the dataset
    deduplicated_dataset = dataset.filter(lambda example, idx: idx not in duplicates, with_indices=True)

    return deduplicated_dataset
  

## Now let's test it all together

### Load and preprocess the data

We will do the test using a dataset from Huggingface : [0-hero/OIG-small-chip2](https://huggingface.co/datasets/0-hero/OIG-small-chip2)

In [None]:

data = load_dataset("0-hero/OIG-small-chip2")
# preprocess_data adds the "_merged" column to every split in place and returns the same object.
data = preprocess_data(data, schema="<human>:{user} <bot>:{chip2}")
# Index the row first so only one example is read instead of the whole column.
data['train'][0]['_merged']

### Load the tokenizer and model

As a model for the semantic embedding we'll use [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)

In [None]:
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
# Use the GPU when one is present instead of assuming CUDA is always available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# eval() makes inference mode explicit (dropout disabled) for embedding extraction.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

### Run De-duplication

In [None]:
# Embed the "_merged" text of the train split and drop every example whose
# embedding lies within `epsilon` cosine distance of another example.
# The values below repeat the function defaults, spelled out for readability.
deduplicated = deduplicate_dataset(
    dataset = data['train'], 
    model = model, 
    tokenizer = tokenizer,
    epsilon = 1e-2, 
    model_batch_size = 64, 
    deduplication_batch_size = 20000, 
    num_workers = 16,
    dataset_feature = '_merged'
)

What percentage of the dataset did we clean?

1-len(deduplicated)/len(data['train'])

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()