# Install necessary libraries

In [1]:
%%capture

!pip install datasets
!pip install sentence-transformers
!pip install transformers

# Import libraries

In [1]:
import torch
from sentence_transformers import SentenceTransformer, models
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset

# Fetch data for training and test, as well as the tokenizer

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset

In [3]:
# Dataset for training
dataset = load_dataset("stsb_multi_mt", name="ru", split="train")
similarity = [i['similarity_score'] for i in dataset]
normalized_similarity = [i/5.0 for i in similarity]

# Dataset for test
test_dataset = load_dataset("stsb_multi_mt", name="ru", split="test")

# Prepare test data
sentence_1_test = [i['sentence1'] for i in test_dataset]
sentence_2_test = [i['sentence2'] for i in test_dataset]
text_cat_test = [[str(x), str(y)] for x,y in zip(sentence_1_test, sentence_2_test)]

# Set the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')

Found cached dataset stsb_multi_mt (/home/jovyan/.cache/huggingface/datasets/stsb_multi_mt/ru/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
Found cached dataset stsb_multi_mt (/home/jovyan/.cache/huggingface/datasets/stsb_multi_mt/ru/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- Thi

In [None]:
class STSRuBertModel(torch.nn.Module):

    def __init__(self):

        super(STSRuBertModel, self).__init__()

        self.bert = model
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_data):

        output = self.bert(input_data['input_ids'], attention_mask=input_data['attention_mask'])
        output = self.dropout(output.last_hidden_state[:, 0, :])
        output = self.linear(output)

        return output

# Define Dataloader for training

In [5]:
class DataSequence(torch.utils.data.Dataset):

    def __init__(self, dataset):

        similarity = [i['similarity_score'] for i in dataset]
        self.label = [i/5.0 for i in similarity]
        self.sentence_1 = [i['sentence1'] for i in dataset]
        self.sentence_2 = [i['sentence2'] for i in dataset]
        self.text_cat = [[str(x), str(y)] for x,y in zip(self.sentence_1, self.sentence_2)]

    def __len__(self):

        return len(self.text_cat)

    def get_batch_labels(self, idx):

        return torch.tensor(self.label[idx])

    def get_batch_texts(self, idx):

        return tokenizer(self.text_cat[idx], padding='max_length', max_length = 128, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

def collate_fn(texts):

  num_texts = len(texts['input_ids'])
  features = list()
  for i in range(num_texts):
      features.append({'input_ids':texts['input_ids'][i], 'attention_mask':texts['attention_mask'][i]})
  
  return features

# Define loss function for training

In [6]:
class CosineSimilarityLoss(torch.nn.Module):

    def __init__(self,  loss_fct = torch.nn.MSELoss(), cos_score_transformation=torch.nn.Identity()):
      
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation
        self.cos = torch.nn.CosineSimilarity(dim=1)

    def forward(self, input, label):

        embedding_1 = torch.stack([inp[0] for inp in input])
        embedding_2 = torch.stack([inp[1] for inp in input])

        output = self.cos_score_transformation(self.cos(embedding_1, embedding_2))

        return self.loss_fct(output, label.squeeze())

# Train the Model

In [7]:
def model_train(dataset, epochs, learning_rate, bs):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = STSRuBertModel()

    criterion = CosineSimilarityLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    train_dataset = DataSequence(dataset)
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=bs, shuffle=True)

    model = model.cuda()
    criterion = criterion.cuda()

    best_acc = 0.0
    best_loss = 1000

    for i in range(epochs):

        total_acc_train = 0
        total_loss_train = 0.0

        for train_data, train_label in tqdm(train_dataloader):

            train_data['input_ids'] = train_data['input_ids'].to(device)
            train_data['attention_mask'] = train_data['attention_mask'].to(device)
            del train_data['token_type_ids']

            train_data = collate_fn(train_data)

            output = [model(feature).squeeze(0) for feature in train_data]

            loss = criterion(output, train_label.to(device))
            total_loss_train += loss.item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print(f'Epochs: {i + 1} | Loss: {total_loss_train / len(dataset): .3f}')
        model.train()

    return model

In [8]:
EPOCHS = 8
LEARNING_RATE = 1e-6
BATCH_SIZE = 16

In [9]:
# Train the model
trained_model = model_train(dataset, EPOCHS, LEARNING_RATE, BATCH_SIZE)

100%|██████████| 360/360 [05:13<00:00,  1.15it/s]


Epochs: 1 | Loss:  0.021


100%|██████████| 360/360 [05:23<00:00,  1.11it/s]


Epochs: 2 | Loss:  0.021


100%|██████████| 360/360 [05:24<00:00,  1.11it/s]


Epochs: 3 | Loss:  0.023


100%|██████████| 360/360 [05:23<00:00,  1.11it/s]


Epochs: 4 | Loss:  0.025


100%|██████████| 360/360 [05:22<00:00,  1.12it/s]


Epochs: 5 | Loss:  0.025


100%|██████████| 360/360 [05:19<00:00,  1.13it/s]


Epochs: 6 | Loss:  0.021


100%|██████████| 360/360 [05:21<00:00,  1.12it/s]


Epochs: 7 | Loss:  0.021


100%|██████████| 360/360 [05:19<00:00,  1.13it/s]

Epochs: 8 | Loss:  0.020





In [10]:
model.save_pretrained('NLI_vers_2')

In [11]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = AutoModel.from_pretrained('NLI_vers_2')

# Move the model to GPU
device = torch.device('cuda')
model.to(device)

# Load the data
data = pd.read_csv("STS_For_Paraphrase.csv", encoding='utf-8-sig', sep=';')

def predict_sts(text1, text2):
    text1_str = str(text1)
    text2_str = str(text2)
    inputs = tokenizer(text1_str, text2_str, padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)
    model_output = model(**inputs)
    embeddings = model_output.last_hidden_state.squeeze(dim=0)
    cosine_similarity_score = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
    similarity_score = 2.5 * (cosine_similarity_score + 1)
    return similarity_score

# Make predictions for the data
preds = []
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    pred = predict_sts(row['sentence1'], row['sentence2'])
    preds.append(pred)

# Add predictions as a new column in the dataframe and save it to a new csv file
data['similarity_score'] = preds
data['similarity_score'] = data['similarity_score'].apply(lambda x: round(x, 2))  # round to two decimal places
data['similarity_score'] = data['similarity_score'].apply(lambda x: min(max(1, x), 5))  # clip to range [1, 5]
data.to_csv('predicted_STS.csv', index=False)

  data = pd.read_csv("STS_For_Testing.csv", encoding='utf-8-sig', sep=';')
100%|██████████| 1009458/1009458 [3:55:48<00:00, 71.35it/s] 
