# 导包

In [1]:
import nlp_basictasks
import os,json
import numpy as np
import torch
import torch.nn as nn
import random
from tqdm.autonotebook import tqdm, trange
from torch.utils.data import DataLoader
from nlp_basictasks.modules import SBERT
from nlp_basictasks.modules.transformers import BertTokenizer,BertModel,BertConfig
from nlp_basictasks.readers.sts import InputExample,convert_examples_to_features,getExamples,convert_sentences_to_features
from nlp_basictasks.modules.utils import get_optimizer,get_scheduler
from nlp_basictasks.Trainer import Trainer
from nlp_basictasks.evaluation import stsEvaluator
from sentence_transformers import SentenceTransformer,models
model_path1='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/distill-simcse/'
# model_path2="/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/distiluse-base-multilingual-cased-v1/"
model_path3='/data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/'
# data_folder='/data/nfs14/nfs/aisearch/asr/xhsun/datasets/lcqmc/'
# train_file=os.path.join(data_folder,'lcqmc_train.tsv')
# dev_file=os.path.join(data_folder,'lcqmc_dev.tsv')
#tokenizer=BertTokenizer.from_pretrained(os.path.join(model_path1,'0_Transformer'))
tokenizer=BertTokenizer.from_pretrained(model_path3)
max_seq_len=64
batch_size=128

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


2021-10-19 08:23:14 - INFO - <module> - 54 : Loading faiss with AVX2 support.
2021-10-19 08:23:14 - INFO - <module> - 58 : Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2021-10-19 08:23:14 - INFO - <module> - 64 : Loading faiss.
2021-10-19 08:23:14 - INFO - <module> - 66 : Successfully loaded faiss.
2021-10-19 08:23:15 - INFO - from_pretrained - 125 : loading vocabulary file /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/vocab.txt


# 获取数据

In [2]:
train_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-train.txt'
dev_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-dev.txt'
test_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-test.txt'
def read_data(file_path):
    sentences=[]
    labels=[]
    with open(file_path) as f:
        lines=f.readlines()
    for line in lines:
        line_split=line.strip().split('||')
        sentences.append([line_split[1],line_split[2]])
        labels.append(line_split[3])
    return sentences,labels

In [3]:
train_sentences,train_labels=read_data(train_file)
dev_sentences,dev_labels=read_data(dev_file)
test_sentences,test_labels=read_data(test_file)

In [4]:
print(train_sentences[:2],train_labels[:2])
print(dev_sentences[:2],dev_labels[:2])
print(test_sentences[:2],test_labels[:2])

[['一架飞机要起飞了。', '一架飞机正在起飞。'], ['一个男人在吹一支大笛子。', '一个人在吹长笛。']] ['5', '3']
[['一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。'], ['一个小孩在骑马。', '孩子在骑马。']] ['5', '4']
[['一个女孩在给她的头发做发型。', '一个女孩在梳头。'], ['一群男人在海滩上踢足球。', '一群男孩在海滩上踢足球。']] ['2', '3']


## create unsupervised train_dataset

In [5]:
train_sentences=[sentence[0] for sentence in train_sentences]#只取一般数据作为训练集
print(len(train_sentences))
print(train_sentences[:3])
train_examples=[InputExample(text_list=[sentence,sentence],label=1) for sentence in train_sentences]
train_dataloader=DataLoader(train_examples,shuffle=True,batch_size=batch_size)
def smart_batching_collate(batch):
    features_of_a,features_of_b,labels=convert_examples_to_features(examples=batch,tokenizer=tokenizer,max_seq_len=max_seq_len)
    return features_of_a,features_of_b,labels
train_dataloader.collate_fn=smart_batching_collate
print(train_examples[0])

5231
['一架飞机要起飞了。', '一个男人在吹一支大笛子。', '一个人正把切碎的奶酪撒在比萨饼上。']
<InputExample> label: 1, text pairs : 一架飞机要起飞了。; 一架飞机要起飞了。


# SimCSE

In [6]:
class SimCSE(nn.Module):
    def __init__(self,
                 bert_model_path,
                 is_sbert_model=True,
                temperature=0.05,
                is_distilbert=False,
                device='cpu'):
        super(SimCSE,self).__init__()
        if is_sbert_model:
            self.encoder=SentenceTransformer(model_name_or_path=bert_model_path,device=device)
        else:
            word_embedding_model = models.Transformer(bert_model_path, max_seq_length=max_seq_len)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
            self.encoder=SentenceTransformer(modules=[word_embedding_model, pooling_model],device=device)
        self.temperature=temperature
        self.is_distilbert=is_distilbert#蒸馏版本的BERT不支持token_type_ids
    def cal_cos_sim(self,embeddings1,embeddings2):
        embeddings1_norm=torch.nn.functional.normalize(embeddings1,p=2,dim=1)
        embeddings2_norm=torch.nn.functional.normalize(embeddings2,p=2,dim=1)
        return torch.mm(embeddings1_norm,embeddings2_norm.transpose(0,1))#(batch_size,batch_size)
        
    def forward(self,batch_inputs):
        '''
        为了实现兼容，所有model的batch_inputs最后一个位置必须是labels，即使为None
        get token_embeddings,cls_token_embeddings,sentence_embeddings
        sentence_embeddings是经过Pooling层后concat的embedding。维度=768*k，其中k取决于pooling的策略
        一般来讲，只会取一种pooling策略，要么直接cls要么mean last or mean last2 or mean first and last layer，所以sentence_embeddings的维度也是768
        '''
        batch1_features,batch2_features,_=batch_inputs
        if self.is_distilbert:
            del batch1_features['token_type_ids']
            del batch2_features['token_type_ids']
        new_input_ids=[]
        batch1_embeddings=self.encoder(batch1_features)['sentence_embedding']
        batch2_embeddings=self.encoder(batch2_features)['sentence_embedding']
        cos_sim=self.cal_cos_sim(batch1_embeddings,batch2_embeddings)/self.temperature#(batch_size,batch_size)
        batch_size=cos_sim.size(0)
        assert cos_sim.size()==(batch_size,batch_size)
        labels=torch.arange(batch_size).to(cos_sim.device)
        return nn.CrossEntropyLoss()(cos_sim,labels)
    
    def encode(self, sentences,
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        '''
        传进来的sentences只能是single_batch
        '''
        return self.encoder.encode(sentences=sentences,
                                         batch_size=batch_size,
                                         show_progress_bar=show_progress_bar,
                                         output_value=output_value,
                                         convert_to_numpy=convert_to_numpy,
                                         convert_to_tensor=convert_to_tensor,
                                         device=device,
                                         normalize_embeddings=normalize_embeddings)
    
    def save(self,output_path):
        os.makedirs(output_path,exist_ok=True)
        with open(os.path.join(output_path, 'model_param_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(output_path), fOut)
        self.encoder.save(output_path)
        
    def get_config_dict(self,output_path):
        '''
        一定要有dict，这样才能初始化Model
        '''
        return {'output_path':output_path,'temperature': self.temperature, 'is_distilbert': self.is_distilbert}
    @staticmethod
    def load(input_path):
        with open(os.path.join(input_path, 'model_param_config.json')) as fIn:
            config = json.load(fIn)
        return SimCSE(**config)

In [7]:
device='cpu'
#simcse=SimCSE(bert_model_path=model_path3,is_distilbert=False,device=device,is_sbert_model=False)
simcse=SimCSE(bert_model_path="/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/",is_distilbert=False,device=device,is_sbert_model=True)

2021-10-19 08:23:30 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/
2021-10-19 08:23:30 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/


In [24]:
e_simcse=SimCSE(bert_model_path="/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE/",is_distilbert=False,is_sbert_model=True)

2021-10-19 08:29:02 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE/
2021-10-19 08:29:02 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE/


# 构造evaluator

In [9]:
#dev_sentences=[example.text_list for example in dev_examples]
#dev_labels=[example.label for example in dev_examples]
print(dev_sentences[0],dev_labels[0])
sentences1_list=[sen[0] for sen in dev_sentences]
sentences2_list=[sen[1] for sen in dev_sentences]
dev_labels=[int(score) for score in dev_labels]
evaluator=stsEvaluator(sentences1=sentences1_list,sentences2=sentences2_list,batch_size=64,write_csv=True,scores=dev_labels)

['一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。'] 5


In [10]:
evaluator(simcse)

2021-10-19 08:24:26 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-19 08:24:39 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7585	Spearman: 0.7632
2021-10-19 08:24:39 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7228	Spearman: 0.7396
2021-10-19 08:24:39 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7195	Spearman: 0.7360
2021-10-19 08:24:39 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7377	Spearman: 0.7477





0.7632353820991463

In [25]:
evaluator(e_simcse)

2021-10-19 08:29:13 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-19 08:29:26 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7900	Spearman: 0.7929
2021-10-19 08:29:26 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7555	Spearman: 0.7709
2021-10-19 08:29:26 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7549	Spearman: 0.7702
2021-10-19 08:29:26 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7507	Spearman: 0.7531





0.7929133672005957

# 训练模型温度的SimCSE

In [None]:
model_temperature=SimCSE(bert_model_path=model_path3,is_distilbert=False,device=device,is_sbert_model=False,temperature=1)
epochs=5
output_path='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/SimCSE_tauis1'
tensorboard_logdir=os.path.join(output_path,'log')
optimizer_type='AdamW'
scheduler='WarmupLinear'
warmup_proportion=0.1
optimizer_params={'lr': 2e-5}
weight_decay=0.01
num_train_steps = int(len(train_dataloader) * epochs)
warmup_steps = num_train_steps*warmup_proportion
optimizer = get_optimizer(model=model_temperature,optimizer_type=optimizer_type,weight_decay=weight_decay,optimizer_params=optimizer_params)
scheduler = get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, t_total=num_train_steps)

trainer=Trainer(epochs=epochs,output_path=output_path,tensorboard_logdir=tensorboard_logdir,early_stop_patience=20)
trainer.train(train_dataloader=train_dataloader,
             model=model_temperature,
             optimizer=optimizer,
             scheduler=scheduler,
             evaluator=evaluator,
             )

Some weights of the model checkpoint at /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2021-10-17 20:00:33 - INFO - train - 56 : 一个epoch 下，每隔8个step会输出一次loss，每隔20个step会评估一次模型


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-17 20:00:43 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:00:57 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6641	Spearman: 0.6808
2021-10-17 20:00:57 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6740	Spearman: 0.6882
2021-10-17 20:00:57 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6680	Spearman: 0.6824
2021-10-17 20:00:57 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.4705	Spearman: 0.4646
2021-10-17 20:00:57 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/SimCSE_tauis1





2021-10-17 20:00:58 - INFO - train - 98 : In epoch 0, training_step 0, the eval score is 0.6807509857943427, previous eval score is -9999999, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/SimCSE_tauis1
2021-10-17 20:02:00 - INFO - train - 75 : Epoch : 0, train_step : 8/205, loss_value : 5.105614900588989 
2021-10-17 20:03:12 - INFO - train - 75 : Epoch : 0, train_step : 16/205, loss_value : 4.387591302394867 
2021-10-17 20:03:53 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:04:07 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6652	Spearman: 0.6653
2021-10-17 20:04:07 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6667	Spearman: 0.6747
2021-10-17 20:04:07 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6616	Spearman: 0.6695
2021-10-17 20:04:07 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6107	Spearman: 0.6048
2021-10-17 20:04:07 - INFO - train - 102 : No improvement over previous best eval score (0.665281 vs 0.680751), patience = 19





2021-10-17 20:04:38 - INFO - train - 75 : Epoch : 0, train_step : 24/205, loss_value : 4.135252892971039 
2021-10-17 20:05:53 - INFO - train - 75 : Epoch : 0, train_step : 32/205, loss_value : 4.0153725147247314 
2021-10-17 20:07:04 - INFO - train - 75 : Epoch : 0, train_step : 40/205, loss_value : 3.9507781267166138 
2021-10-17 20:07:09 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:07:23 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6622	Spearman: 0.6624
2021-10-17 20:07:23 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6658	Spearman: 0.6662
2021-10-17 20:07:23 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6652	Spearman: 0.6654
2021-10-17 20:07:23 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6401	Spearman: 0.6303
2021-10-17 20:07:23 - INFO - train - 102 : No improvement over previous best eval score (0.662450 vs 0.680751), patience = 18






HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-17 20:07:32 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:07:46 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6643	Spearman: 0.6647
2021-10-17 20:07:46 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6678	Spearman: 0.6678
2021-10-17 20:07:46 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6673	Spearman: 0.6670
2021-10-17 20:07:46 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6439	Spearman: 0.6343
2021-10-17 20:07:46 - INFO - train - 102 : No improvement over previous best eval score (0.664672 vs 0.680751), patience = 17





2021-10-17 20:08:57 - INFO - train - 75 : Epoch : 1, train_step : 16/205, loss_value : 4.440356075763702 
2021-10-17 20:10:08 - INFO - train - 75 : Epoch : 1, train_step : 32/205, loss_value : 3.9337053894996643 
2021-10-17 20:10:53 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:11:07 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6755	Spearman: 0.6784
2021-10-17 20:11:07 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6834	Spearman: 0.6821
2021-10-17 20:11:07 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6831	Spearman: 0.6818
2021-10-17 20:11:07 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6553	Spearman: 0.6463
2021-10-17 20:11:07 - INFO - train - 102 : No improvement over previous best eval score (0.678443 vs 0.680751), patience = 16





2021-10-17 20:11:35 - INFO - train - 75 : Epoch : 1, train_step : 48/205, loss_value : 3.9258109629154205 
2021-10-17 20:12:49 - INFO - train - 75 : Epoch : 1, train_step : 64/205, loss_value : 3.925343096256256 
2021-10-17 20:13:59 - INFO - train - 75 : Epoch : 1, train_step : 80/205, loss_value : 3.906319409608841 
2021-10-17 20:14:05 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:14:19 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6762	Spearman: 0.6784
2021-10-17 20:14:19 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6839	Spearman: 0.6822
2021-10-17 20:14:19 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6836	Spearman: 0.6819
2021-10-17 20:14:19 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6568	Spearman: 0.6464
2021-10-17 20:14:19 - INFO - train - 102 : No improvement over previous best eval score (0.678400 vs 0.680751), patience = 15






HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-17 20:14:29 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:14:43 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6762	Spearman: 0.6784
2021-10-17 20:14:43 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6841	Spearman: 0.6822
2021-10-17 20:14:43 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6838	Spearman: 0.6820
2021-10-17 20:14:43 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6569	Spearman: 0.6466
2021-10-17 20:14:43 - INFO - train - 102 : No improvement over previous best eval score (0.678441 vs 0.680751), patience = 14





2021-10-17 20:15:50 - INFO - train - 75 : Epoch : 2, train_step : 24/205, loss_value : 4.410860866308212 
2021-10-17 20:17:05 - INFO - train - 75 : Epoch : 2, train_step : 48/205, loss_value : 3.9177099466323853 
2021-10-17 20:17:46 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:18:00 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6762	Spearman: 0.6775
2021-10-17 20:18:00 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6837	Spearman: 0.6818
2021-10-17 20:18:00 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6834	Spearman: 0.6816
2021-10-17 20:18:00 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6560	Spearman: 0.6456
2021-10-17 20:18:00 - INFO - train - 102 : No improvement over previous best eval score (0.677485 vs 0.680751), patience = 13





2021-10-17 20:18:32 - INFO - train - 75 : Epoch : 2, train_step : 72/205, loss_value : 3.916303902864456 
2021-10-17 20:19:41 - INFO - train - 75 : Epoch : 2, train_step : 96/205, loss_value : 3.9153249859809875 
2021-10-17 20:20:54 - INFO - train - 75 : Epoch : 2, train_step : 120/205, loss_value : 3.8989281952381134 
2021-10-17 20:20:59 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:21:13 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6763	Spearman: 0.6780
2021-10-17 20:21:13 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6845	Spearman: 0.6826
2021-10-17 20:21:13 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6840	Spearman: 0.6826
2021-10-17 20:21:13 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6569	Spearman: 0.6468
2021-10-17 20:21:13 - INFO - train - 102 : No improvement over previous best eval score (0.678027 vs 0.680751), patience = 12






HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-17 20:21:23 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 3 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-17 20:21:37 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6761	Spearman: 0.6779
2021-10-17 20:21:37 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6843	Spearman: 0.6824
2021-10-17 20:21:37 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6839	Spearman: 0.6824
2021-10-17 20:21:37 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.6567	Spearman: 0.6467
2021-10-17 20:21:37 - INFO - train - 102 : No improvement over previous best eval score (0.677852 vs 0.680751), patience = 11





2021-10-17 20:22:43 - INFO - train - 75 : Epoch : 3, train_step : 32/205, loss_value : 4.403318256139755 


In [12]:
def uniform_loss(x, t=2):
    return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()
def align_loss(x, y, alpha=2):
    return (x - y).norm(p=2, dim=1).pow(alpha).mean()

In [13]:
sentences1_embeddings=simcse.encode(sentences1_list,convert_to_tensor=True)
sentences2_embeddings=simcse.encode(sentences2_list,convert_to_tensor=True)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




In [14]:
align_loss(sentences1_embeddings,sentences2_embeddings)

tensor(194.2286)

In [15]:
uniform_loss(sentences1_embeddings)

tensor(-9.8505)

In [16]:
uniform_loss(sentences2_embeddings)

tensor(-9.6273)

In [26]:
s1=e_simcse.encode(sentences1_list,convert_to_tensor=True)
s2=e_simcse.encode(sentences2_list,convert_to_tensor=True)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




In [27]:
align_loss(s1,s2)

tensor(202.1972)

In [28]:
uniform_loss(s1)

tensor(-9.8505)

In [29]:
uniform_loss(s2)

tensor(-9.6273)

In [30]:
s1.size()

torch.Size([1458, 768])

In [31]:
sentence="数据转换方式决定了最终学习的向量表示的不变性"
s1=simcse.encode(sentence,show_progress_bar=False,normalize_embeddings=True)
s2=simcse.encode(sentence[::-1],show_progress_bar=False,normalize_embeddings=True)

In [32]:
sum(s1*s2)

0.7955213659201235