# Mutli-Staged Training Procedure for Synthetic Item Correlations

## Setup and Imports

In [2]:
import re
import math
import yaml
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, util
from sentence_transformers.losses import CosineSimilarityLoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datasets import Dataset, load_dataset
from IPython.display import display, Markdown
from torch.utils.data import DataLoader

config_path = 'config.yaml'

with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
    

## Loading the base model
A list of suitable base models can be found the [SBERT website](https://www.sbert.net/docs/pretrained_models.html). This research utilized the `all-mpnet-base-v2` model. More models can be found on the [huggingface model hub](https://huggingface.co/models). Note that the script requires sentence transformer models (bi-encoder architecture models).

In [2]:
base_model = SentenceTransformer(
    model_name_or_path=config['base_model_path'],
    device=device
)

  return self.fget.__get__(instance, owner)()


## Stage 1 - Polarity Calibration
Loads a natural language inference (nli) dataset from the huggingface dataset hub ([i.e., the `snli`-dataset](https://huggingface.co/datasets/snli)). The dataset is used to create a synthetic (augmented dataset) in an attempt to teach the model to produce sentence embeddings that will exhibit negative cosine similarities for texts that contain contradictive information. A more sophisticated method to achieve this is described in Opitz & Frank (2022).

Opitz, J., & Frank, A. (2022). SBERT studies Meaning Representations: Decomposing Sentence Embeddings into Explainable Semantic Features (arXiv:2206.07023). arXiv. https://doi.org/10.48550/arXiv.2206.07023

### Data Preperation

In [3]:
nli_data = load_dataset('snli')

df = nli_data['train'].to_pandas()
df = df.rename(columns={'premise': 'sentence1', 'hypothesis': 'sentence2'})
df['label'] = df['label'].replace({0: 'entailment', 1: 'neutral', 2: 'contradiction'})
df = df.sample(frac=1, random_state=420)

'''
note: for demonstration purposes the dataset is restricted to 200 cases.
Comment out the following line to use the full dataset in training
'''
df = df.head(200)

display(Markdown('#### Natural Language Inference dataset (Preview)'))
display(df.head(5))

def similarity_from_nli(example):
    
    sentence1 = example['sentence1']
    sentence2 = example['sentence2']
    label = example['label']
    sign = -1 if label == 'contradiction' else 1

    embeddings = base_model.encode([sentence1, sentence2], convert_to_tensor=True, batch_size=2000)
    
    sentence1_embedding = embeddings[0]
    sentence2_embedding = embeddings[1]
    similarity = util.pytorch_cos_sim(sentence1_embedding, sentence2_embedding).item()
    score = similarity * sign

    return {
        'sentence1': sentence1,
        'sentence2': sentence2,
        'score': score
    }

train_dataset = Dataset.from_pandas(df, preserve_index=False)
train_dataset = train_dataset.map(similarity_from_nli)

display(Markdown('#### Training data for Polarity Calibration (Preview)'))
display(train_dataset.to_pandas().head(5))


#### Natural Language Inference dataset (Preview)

Unnamed: 0,sentence1,sentence2,label
398119,A man is cutting wood with a power tool.,A man is using a power tool to cut metal.,contradiction
76379,a man wearing black pants and a red shirt doin...,"A man is at the park, doing tricks on his bike.",entailment
530523,"There are four boys playing soccer, but not al...",Some children are playing a ball game.,entailment
255625,A dog with a stick in its mouth climbs out of ...,A dog is hunting.,contradiction
130058,A farmhand in training points over the barrier...,The farmhand is a man.,neutral


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

#### Training data for Polarity Calibration (Preview)

Unnamed: 0,sentence1,sentence2,label,score
0,A man is cutting wood with a power tool.,A man is using a power tool to cut metal.,contradiction,-0.781675
1,a man wearing black pants and a red shirt doin...,"A man is at the park, doing tricks on his bike.",entailment,0.782243
2,"There are four boys playing soccer, but not al...",Some children are playing a ball game.,entailment,0.464564
3,A dog with a stick in its mouth climbs out of ...,A dog is hunting.,contradiction,-0.356073
4,A farmhand in training points over the barrier...,The farmhand is a man.,neutral,0.554376


### Training (Polarity Calibration)

In [4]:
input_examples = [
    InputExample(
        texts=[sentence1, sentence2],
        label=score
    ) 
    for sentence1, sentence2, score in zip(
        train_dataset['sentence1'],
        train_dataset['sentence2'], 
        train_dataset['score']
    )
]

train_loss = CosineSimilarityLoss(base_model)

train_dataloader = DataLoader(
    input_examples,
    batch_size=config['polarity_calibration']['batch_size'],
    shuffle=True
)

warmup_steps = math.ceil(len(train_dataloader) * config['polarity_calibration']['num_epochs'] * 0.1)

'''
optional: use the EmbeddingSimilarityEvaluator-class to evaluate model performance to 
a) evaluate performance during training (validation_evaluator)
b) evaluate performance after training is completed (test_evaluator)

note that using evaluators relies on cross validation (train-validation-test-splitting)

'''
# from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# validation_evaluator = EmbeddingSimilarityEvaluator(sentence1, sentence2, scores)
# test_evaluator = EmbeddingSimilarityEvaluator(sentence1, sentence2, scores)

base_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=config['polarity_calibration']['num_epochs'],    
    optimizer_params={
        'lr': float(config['polarity_calibration']['learning_rate'])
    },
    weight_decay=float(config['polarity_calibration']['weight_decay']),
    warmup_steps=warmup_steps,
    # evaluator=validation_evaluator,
    # evaluation_steps=config['polarity_calibration']['evaluation_steps'],
    output_path=config['save_model_path'],
    save_best_model=True
)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

## Stage 2 - Domain Adaptation
In this stage, we use `CosineSimilarityLoss` to teach the model that similarity is defined by the observed item correlation, obtained from various empirical sources.

### Load the previously calibrated model
**Important**: To save GPU memory, do not run the entire script but each training stage independently! You may want to restart the notebook at before running this section (do not forget to run setup and imports in that case).

In [3]:
calibrated_model = SentenceTransformer(
    model_name_or_path=config['save_model_path'],
    device=device
)

### Training (Domain Adaptation)

In [4]:
df = pd.read_csv(filepath_or_buffer='train_domain-adaptation.csv').sample(frac=1, random_state=420)

display(Markdown('#### Dataset for Domain Adaptation (Preview)'))
display(df.head(5))

training_data = df.query('partition == "train"')
validation_data = df.query('partition == "dev"')
test_data = df.query('partition == "test"')

validation_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=validation_data['item_a'].tolist(),
    sentences2=validation_data['item_b'].tolist(),
    scores=validation_data['correlation'].tolist()
)

test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_data['item_a'].tolist(),
    sentences2=test_data['item_b'].tolist(),
    scores=test_data['correlation'].tolist()
)

input_examples = [
    InputExample(
        texts=[sentence1, sentence2],
        label=score
    ) 
    for sentence1, sentence2, score in zip(
        training_data['item_a'],
        training_data['item_b'], 
        training_data['correlation']
    )
]

train_dataloader = DataLoader(
    dataset=input_examples,
    shuffle=True,
    batch_size=config['domain_adaptation']['batch_size']
)

warmup_steps = math.ceil(len(train_dataloader) * config['domain_adaptation']['num_epochs'] * 0.1)

n_layers = len(calibrated_model[0].auto_model.encoder.layer)
n_freeze = math.ceil(.15 * n_layers)
regex_pattern = f'0\.auto_model\.encoder\.layer\.[0-{n_freeze}]\..+'

for name, param in calibrated_model.named_parameters():
    param.requires_grad = (re.match(regex_pattern, name) is None)

train_loss = CosineSimilarityLoss(calibrated_model)

calibrated_model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=validation_evaluator,
        epochs=int(config['domain_adaptation']['num_epochs']),
        evaluation_steps=int(config['domain_adaptation']['evaluation_steps']),
        optimizer_params={
            'lr': float(config['domain_adaptation']['learning_rate'])
        },
        weight_decay=float(config['domain_adaptation']['weight_decay']),
        warmup_steps=warmup_steps,
        output_path=config['save_model_path'],
        save_best_model=True
    )

#### Dataset for Domain Adaptation (Preview)

Unnamed: 0,item_a,item_b,correlation,partition
1742,i am good at many things.,i find it hard to forgive others.,-0.010869,train
524,i am often in a bad mood.,i want to be left alone.,0.282357,train
630,i find political discussions interesting.,i am happy with my life.,0.085023,train
382,i always know what i am doing.,i usually enjoy being with people.,0.071936,train
971,i love to be the center of attention.,i am under constant pressure.,0.006074,train


Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

Iteration:   0%|          | 0/16 [00:00<?, ?it/s]

### Evaluation

In [13]:
display(Markdown(f'#### Validation-set Evaluation: {np.round(calibrated_model.evaluate(validation_evaluator), 2)}'))
display(Markdown(f'#### Test-set Evaluation: {np.round(calibrated_model.evaluate(test_evaluator), 2)}'))

#### Validation-set Evaluation: 0.48

#### Test-set Evaluation: 0.36

## Export Model
Save the model locally

In [12]:
# calibrated_model.save('enter_model_save_path_here')