In [3]:
from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import math
import numpy as np
import torch
from hyperopt import fmin, tpe, space_eval, STATUS_OK
from hyperopt import hp
import os
import pandas as pd

In [1]:
# Delete any existing ./best_srubert directory (recursively; -f ignores a missing path)
# so the training cell below, which writes to output_path './best_srubert/', starts clean.
!rm -rf ./best_srubert

In [2]:
# Create the ./best_srubert directory that the training cell uses as its output_path.
!mkdir ./best_srubert

In [6]:
# Seed the NumPy and PyTorch RNGs so repeated runs start from the same random state.
np.random.seed(0)
torch.manual_seed(0)

# Training hyperparameters.
batch_size = 64
num_epochs = 8
lr = 1.85e-6

# Sentence encoder: RuBERT transformer -> pooling -> dense projection with Tanh.
word_embedding_model = models.Transformer('DeepPavlov/rubert-base-cased', max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# Ordinary language models add a pooler layer with a Tanh activation; add one here as well.
embedding_dim = pooling_model.get_sentence_embedding_dimension()
dense_model = models.Dense(in_features=embedding_dim,
                           out_features=embedding_dim,
                           activation_function=nn.Tanh())

net = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

# STS splits; each CSV must provide 'sentence1', 'sentence2' and 'similarity_score' columns.
sts_df_train = pd.read_csv('./STS_train.csv')
sts_df_test = pd.read_csv('./STS_test.csv')
sts_df_val = pd.read_csv('./STS_dev.csv')

# Build the training examples column-wise instead of row-by-row with iterrows().
# Labels are cast to plain Python floats so the loss always receives a float target.
# NOTE(review): CosineSimilarityLoss compares labels with a cosine similarity, so scores are
# presumably already scaled to [0, 1] in the CSV -- confirm.
sts_ds_train = [
    InputExample(texts=[sentence1, sentence2], label=float(score))
    for sentence1, sentence2, score in zip(sts_df_train['sentence1'],
                                           sts_df_train['sentence2'],
                                           sts_df_train['similarity_score'])
]
sts_ds_val = []  # not used by this cell; name kept in case other cells reference it

# The evaluators get plain Python lists rather than pandas Series, so they do not depend on
# the DataFrame index being the default 0..n-1 range.
evaluator_val = evaluation.EmbeddingSimilarityEvaluator(
    sts_df_val['sentence1'].tolist(),
    sts_df_val['sentence2'].tolist(),
    sts_df_val['similarity_score'].tolist())
evaluator_test = evaluation.EmbeddingSimilarityEvaluator(
    sts_df_test['sentence1'].tolist(),
    sts_df_test['sentence2'].tolist(),
    sts_df_test['similarity_score'].tolist())

# NOTE(review): shuffle=False feeds the examples in CSV order in every epoch. It is kept as-is
# so the run matches the recorded results; consider shuffle=True if the CSV is ordered.
train_dataloader = DataLoader(sts_ds_train, shuffle=False, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(net)
# Warm up the learning rate over the first 10% of all optimisation steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# Fine-tune; evaluator_val scores the model on the dev split, and save_best_model=True with
# output_path asks fit() to store the best-scoring model under ./best_srubert/.
net.fit(
    train_objectives=[(train_dataloader, train_loss)],
    steps_per_epoch=len(train_dataloader),
    optimizer_params={'lr': lr},
    warmup_steps=warmup_steps,
    evaluator=evaluator_val,
    scheduler='warmupcosinewithhardrestarts',
    save_best_model=True,
    epochs=num_epochs,
    output_path='./best_srubert/',
)
    


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

Iteration:   0%|          | 0/90 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Report the dev/test score of the model as it stands after the last epoch, then of the best
# checkpoint that training saved to ./best_srubert.
#
# The headings say "Spearman" because EmbeddingSimilarityEvaluator, when built without an
# explicit main_similarity (as in the training cell), returns a Spearman rank correlation as
# its score -- not Pearson. NOTE(review): this holds for sentence-transformers releases where
# evaluate() returns a single float; confirm against the installed version.
#
# NOTE: `net` is rebound to the reloaded checkpoint below, so re-running this cell without
# re-running training would print the best checkpoint's scores under both headings.
for split_name, split_evaluator in (('dev', evaluator_val), ('test', evaluator_test)):
    print(f'Spearman correlation on a {split_name} set in the end of the training:')
    print(net.evaluate(split_evaluator))
print('\n\n')

# Reload the best checkpoint written by fit(save_best_model=True, output_path='./best_srubert/').
net = SentenceTransformer('./best_srubert')
for split_name, split_evaluator in (('dev', evaluator_val), ('test', evaluator_test)):
    print(f'Spearman correlation on a {split_name} set of the best model:')
    print(net.evaluate(split_evaluator))

Pearson correlation on a dev set in the end of the training:
0.5487853411608976
Pearson correlation on a test set in the end of the training:
0.44158279612124124



Pearson correlation on a dev set of the best model:
