# 导包

In [1]:
import nlp_basictasks
import os,json
import numpy as np
import torch
import torch.nn as nn
import random
from tqdm.autonotebook import tqdm, trange
from torch.utils.data import DataLoader
from nlp_basictasks.modules import SBERT
from nlp_basictasks.modules.transformers import BertTokenizer,BertModel,BertConfig
from nlp_basictasks.readers.sts import InputExample,convert_examples_to_features,getExamples,convert_sentences_to_features
from nlp_basictasks.modules.utils import get_optimizer,get_scheduler
from nlp_basictasks.Trainer import Trainer
from nlp_basictasks.evaluation import stsEvaluator
from sentence_transformers import SentenceTransformer,models
# Checkpoint locations. model_path1 is only referenced by the commented-out tokenizer line in this cell.
model_path1='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/distill-simcse/'
# model_path2="/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/distiluse-base-multilingual-cased-v1/"
# model_path3 is the checkpoint the tokenizer is loaded from and the one passed to the raw (non-SBERT) model constructors.
model_path3='/data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/'
# data_folder='/data/nfs14/nfs/aisearch/asr/xhsun/datasets/lcqmc/'
# train_file=os.path.join(data_folder,'lcqmc_train.tsv')
# dev_file=os.path.join(data_folder,'lcqmc_dev.tsv')
#tokenizer=BertTokenizer.from_pretrained(os.path.join(model_path1,'0_Transformer'))
tokenizer=BertTokenizer.from_pretrained(model_path3)
# Notebook-wide globals: max_seq_len is read by the collate functions and by the model
# constructors (models.Transformer(max_seq_length=max_seq_len)); batch_size is used for the DataLoader.
max_seq_len=64
batch_size=128

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


2021-10-19 15:15:19 - INFO - <module> - 54 : Loading faiss with AVX2 support.
2021-10-19 15:15:19 - INFO - <module> - 58 : Could not load library with AVX2 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx2'")
2021-10-19 15:15:19 - INFO - <module> - 64 : Loading faiss.
2021-10-19 15:15:19 - INFO - <module> - 66 : Successfully loaded faiss.
2021-10-19 15:15:20 - INFO - from_pretrained - 125 : loading vocabulary file /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/vocab.txt


# 获取数据

In [2]:
# Paths of the train/dev/test splits (cnsd-sts files) that are parsed with read_data().
train_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-train.txt'
dev_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-dev.txt'
test_file='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/STS-B/cnsd-sts-test.txt'
def read_data(file_path,encoding='utf-8'):
    """Parse a '||'-separated sentence-pair file.

    Each non-empty line is split on '||'; field 1 and field 2 are taken as the
    two sentences and field 3 as the label (field 0 is ignored).

    Args:
        file_path: path of the text file to read.
        encoding: text encoding used to open the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        (sentences, labels): ``sentences`` is a list of ``[sentence_a, sentence_b]``
        lists and ``labels`` is the list of label strings (not converted to numbers).

    Raises:
        IndexError: if a non-empty line has fewer than four '||'-separated fields.
    """
    sentences=[]
    labels=[]
    with open(file_path,encoding=encoding) as f:
        for line in f:
            line=line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file) carry no example.
                continue
            line_split=line.split('||')
            sentences.append([line_split[1],line_split[2]])
            labels.append(line_split[3])
    return sentences,labels

In [3]:
# Load the sentence pairs and their (string) labels for each split.
train_sentences,train_labels=read_data(train_file)
dev_sentences,dev_labels=read_data(dev_file)
test_sentences,test_labels=read_data(test_file)

In [4]:
# Preview the first two pairs and labels of every split; labels are still strings here.
print(train_sentences[:2],train_labels[:2])
print(dev_sentences[:2],dev_labels[:2])
print(test_sentences[:2],test_labels[:2])

[['一架飞机要起飞了。', '一架飞机正在起飞。'], ['一个男人在吹一支大笛子。', '一个人在吹长笛。']] ['5', '3']
[['一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。'], ['一个小孩在骑马。', '孩子在骑马。']] ['5', '4']
[['一个女孩在给她的头发做发型。', '一个女孩在梳头。'], ['一群男人在海滩上踢足球。', '一群男孩在海滩上踢足球。']] ['2', '3']


In [5]:
# Show the first test pair (a two-element list of sentences).
test_sentences[0]

['一个女孩在给她的头发做发型。', '一个女孩在梳头。']

## create unsupervised train_dataset

In [5]:
# Keep only the first sentence of every training pair (i.e. half of the data) as the
# unsupervised training set. The pairs are re-read from disk instead of being derived
# from `train_sentences` itself, so re-running this cell gives the same result
# (deriving it in place would keep only the first *character* on a second run).
train_sentences=[pair[0] for pair in read_data(train_file)[0]]
print(len(train_sentences))
print(train_sentences[:3])
# Every example holds the same sentence twice; the label is a placeholder.
train_examples=[InputExample(text_list=[sentence,sentence],label=1) for sentence in train_sentences]
def smart_batching_collate(batch):
    """Turn a list of InputExample objects into (features_of_a, features_of_b, labels).

    Tokenisation uses the notebook-level `tokenizer` and `max_seq_len`.
    """
    features_of_a,features_of_b,labels=convert_examples_to_features(examples=batch,tokenizer=tokenizer,max_seq_len=max_seq_len)
    return features_of_a,features_of_b,labels
# Pass the collate function at construction time instead of patching the attribute afterwards.
train_dataloader=DataLoader(train_examples,shuffle=True,batch_size=batch_size,collate_fn=smart_batching_collate)
print(train_examples[0])

5231
['一架飞机要起飞了。', '一个男人在吹一支大笛子。', '一个人正把切碎的奶酪撒在比萨饼上。']
<InputExample> label: 1, text pairs : 一架飞机要起飞了。; 一架飞机要起飞了。


## create supervised train_dataset

In [None]:
# NOTE(review): this cell has no execution count and its example construction is commented
# out, so `train_examples` here is whatever an earlier cell left in the kernel. It also
# re-defines smart_batching_collate with the same body as the unsupervised cell.
# TODO confirm how supervised examples (pairs + labels) are meant to be built before using it.
print(len(train_sentences))
print(train_sentences[:3])
#train_examples=[InputExample(text_list=sentence,label=1) for sentence in train_sentences]
train_dataloader=DataLoader(train_examples,shuffle=True,batch_size=batch_size)
def smart_batching_collate(batch):
    """Convert a batch of InputExample objects into (features_of_a, features_of_b, labels)."""
    features_of_a,features_of_b,labels=convert_examples_to_features(examples=batch,tokenizer=tokenizer,max_seq_len=max_seq_len)
    return features_of_a,features_of_b,labels
# The collate function is attached after the DataLoader has been constructed.
train_dataloader.collate_fn=smart_batching_collate
print(train_examples[0])

# SimCSE模型

In [6]:
class SimCSE(nn.Module):
    """Sentence encoder trained with an in-batch contrastive (cross-entropy) loss.

    The encoder is a SentenceTransformer. ``forward`` encodes two feature batches,
    builds the (batch, batch) cosine-similarity matrix scaled by ``1/temperature``
    and treats the diagonal as the correct class.

    Args:
        bert_model_path: checkpoint path. A SentenceTransformer folder when
            ``is_sbert_model`` is True, otherwise a plain transformer checkpoint
            that is wrapped in ``models.Transformer`` + ``models.Pooling``.
        is_sbert_model: see ``bert_model_path``.
        temperature: divisor applied to the cosine similarities.
        is_distilbert: when True, ``token_type_ids`` is removed from the inputs
            (distilled BERT does not support token_type_ids).
        device: device handed to SentenceTransformer.

    Note:
        The non-SBERT branch reads the notebook-level global ``max_seq_len``.
    """
    def __init__(self,
                 bert_model_path,
                 is_sbert_model=True,
                temperature=0.05,
                is_distilbert=False,
                device='cpu'):
        super(SimCSE,self).__init__()
        if is_sbert_model:
            self.encoder=SentenceTransformer(model_name_or_path=bert_model_path,device=device)
        else:
            word_embedding_model = models.Transformer(bert_model_path, max_seq_length=max_seq_len)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
            self.encoder=SentenceTransformer(modules=[word_embedding_model, pooling_model],device=device)
        self.temperature=temperature
        self.is_distilbert=is_distilbert# distilled BERT does not support token_type_ids

    def cal_cos_sim(self,embeddings1,embeddings2):
        """Return the pairwise cosine-similarity matrix of two 2-D embedding batches.

        Rows are L2-normalised along dim 1, so the result has shape
        (len(embeddings1), len(embeddings2)).
        """
        embeddings1_norm=torch.nn.functional.normalize(embeddings1,p=2,dim=1)
        embeddings2_norm=torch.nn.functional.normalize(embeddings2,p=2,dim=1)
        return torch.mm(embeddings1_norm,embeddings2_norm.transpose(0,1))#(batch_size,batch_size)

    def forward(self,batch_inputs):
        '''
        Compute the contrastive loss for one batch.

        batch_inputs is a 3-tuple (features_a, features_b, labels). For compatibility
        with the other models the last element must always be present, even if it is
        None; it is ignored here.

        The encoder output dict is expected to contain 'sentence_embedding' (the
        pooled sentence vector); only that entry is used.

        Returns the scalar cross-entropy loss whose targets are the diagonal of the
        temperature-scaled cosine-similarity matrix.
        '''
        batch1_features,batch2_features,_=batch_inputs
        if self.is_distilbert:
            # pop() instead of del: the key may already be missing (e.g. the tokenizer
            # did not emit it, or both names refer to the same dict).
            batch1_features.pop('token_type_ids',None)
            batch2_features.pop('token_type_ids',None)
        batch1_embeddings=self.encoder(batch1_features)['sentence_embedding']
        batch2_embeddings=self.encoder(batch2_features)['sentence_embedding']
        cos_sim=self.cal_cos_sim(batch1_embeddings,batch2_embeddings)/self.temperature#(batch_size,batch_size)
        batch_size=cos_sim.size(0)
        assert cos_sim.size()==(batch_size,batch_size)
        # Row i should be most similar to column i.
        labels=torch.arange(batch_size,device=cos_sim.device)
        return torch.nn.functional.cross_entropy(cos_sim,labels)

    def encode(self, sentences,
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        '''
        Encode sentences by delegating to the wrapped SentenceTransformer.encode.

        All arguments are forwarded unchanged. The original note says the sentences
        passed in can only be a single batch (i.e. not sentence pairs).
        '''
        return self.encoder.encode(sentences=sentences,
                                         batch_size=batch_size,
                                         show_progress_bar=show_progress_bar,
                                         output_value=output_value,
                                         convert_to_numpy=convert_to_numpy,
                                         convert_to_tensor=convert_to_tensor,
                                         device=device,
                                         normalize_embeddings=normalize_embeddings)

    def save(self,output_path):
        """Write model_param_config.json and the SentenceTransformer files to output_path."""
        os.makedirs(output_path,exist_ok=True)
        with open(os.path.join(output_path, 'model_param_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(output_path), fOut)
        self.encoder.save(output_path)

    def get_config_dict(self,output_path):
        '''
        Return the constructor keyword arguments needed to re-create this model.

        The keys must match __init__ parameter names because load() calls
        SimCSE(**config). The saved folder is stored as 'bert_model_path';
        is_sbert_model is left at its default (True) because save() writes a
        SentenceTransformer folder.
        '''
        return {'bert_model_path':output_path,'temperature': self.temperature, 'is_distilbert': self.is_distilbert}

    @staticmethod
    def load(input_path):
        """Re-create a SimCSE model from a folder written by save().

        Configs written by older versions stored the folder under 'output_path',
        which is not a constructor argument; that key is mapped to 'bert_model_path'.
        """
        with open(os.path.join(input_path, 'model_param_config.json')) as fIn:
            config = json.load(fIn)
        if 'bert_model_path' not in config and 'output_path' in config:
            config['bert_model_path']=config.pop('output_path')
        return SimCSE(**config)

In [7]:
device='cpu'
# The commented-out line builds SimCSE from the raw checkpoint (Transformer + Pooling branch);
# the active line loads an existing SentenceTransformer folder instead.
#simcse=SimCSE(bert_model_path=model_path3,is_distilbert=False,device=device,is_sbert_model=False)
simcse=SimCSE(bert_model_path="/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/",is_distilbert=False,device=device,is_sbert_model=True)

2021-10-19 08:23:00 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/
2021-10-19 08:23:00 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unSimCSE_STS-B/


# 构造evaluator

In [23]:
#dev_sentences=[example.text_list for example in dev_examples]
#dev_labels=[example.label for example in dev_examples]
print(dev_sentences[0],dev_labels[0])
# Split the dev pairs into two parallel sentence lists.
sentences1_list=[sen[0] for sen in dev_sentences]
sentences2_list=[sen[1] for sen in dev_sentences]
# read_data() returns labels as strings; the evaluator is given integer scores.
dev_labels=[int(score) for score in dev_labels]
evaluator=stsEvaluator(sentences1=sentences1_list,sentences2=sentences2_list,batch_size=64,write_csv=True,scores=dev_labels)

['一个戴着安全帽的男人在跳舞。', '一个戴着安全帽的男人在跳舞。'] 5


In [59]:
# Evaluate the loaded SimCSE model on the dev pairs; in the recorded run the returned value
# equals the logged cosine-similarity Spearman score.
evaluator(simcse)

2021-10-18 19:53:18 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-18 19:53:32 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7585	Spearman: 0.7632
2021-10-18 19:53:32 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7228	Spearman: 0.7396
2021-10-18 19:53:32 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7195	Spearman: 0.7360
2021-10-18 19:53:32 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7377	Spearman: 0.7477





0.7632353820991463

In [61]:
# First sentence of the first evaluation pair.
sentences1_list[0]

'一个戴着安全帽的男人在跳舞。'

In [62]:
# Second sentence of the first evaluation pair.
sentences2_list[0]

'一个戴着安全帽的男人在跳舞。'

In [63]:
# Label of the first dev pair (an int after the conversion in the evaluator cell).
dev_labels[0]

5

In [65]:
# Encode both sides of the pairs as tensors (one row per sentence).
sentences1_embeddings=simcse.encode(sentences1_list,convert_to_tensor=True)
sentences2_embeddings=simcse.encode(sentences2_list,convert_to_tensor=True)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




In [68]:
sentences1_embeddings=simcse.encode(sentences1_list,convert_to_tensor=True)
sentences2_embeddings=simcse.encode(sentences2_list,convert_to_tensor=True)
# Per-pair L2 distance between the two embeddings; one value per pair.
# (The stray `,or` argument was a syntax error and has been removed.)
(sentences1_embeddings-sentences2_embeddings).norm(dim=1).size()

torch.Size([1458])

In [70]:
sentences1_embeddings=simcse.encode(sentences1_list,convert_to_tensor=True)
sentences2_embeddings=simcse.encode(sentences2_list,convert_to_tensor=True)
# Per-pair L2 distance (the stray `,or` argument was a syntax error and has been removed).
# The value of this expression is not displayed because it is not the last line of the cell.
(sentences1_embeddings-sentences2_embeddings).norm(dim=1).size()
# log of the mean of exp(-2 * squared pairwise distance) within each side, then the average of both sides.
s1=torch.pdist(sentences1_embeddings,p=2).pow(2).mul(-2).exp().mean().log()
s2=torch.pdist(sentences2_embeddings,p=2).pow(2).mul(-2).exp().mean().log()
(s1+s2)/2

tensor(-9.7389)

In [56]:
# Encode a sentence and its character-reversed copy with L2-normalised embeddings.
sentence="数据转换方式决定了最终学习的向量表示的不变性"
v1=simcse.encode(sentence,normalize_embeddings=True,show_progress_bar=False)
v2=simcse.encode(sentence[::-1],normalize_embeddings=True,show_progress_bar=False)

In [58]:
# Dot product of the two normalised vectors, i.e. their cosine similarity.
(v1*v2).sum()

0.7955214

In [54]:
# The character-reversed sentence that was encoded as v2.
sentence[::-1]

'性变不的示表量向的习学终最了定决式方换转据数'

In [25]:
# Wrap the raw model_path3 checkpoint via the Transformer + Pooling branch of SimCSE.__init__
# (is_sbert_model=False); nothing is loaded from a SentenceTransformer folder here.
bert_encoder=SimCSE(model_path3,is_sbert_model=False,is_distilbert=False)

Some weights of the model checkpoint at /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 展示contrastive loss的计算过程

## 首先将每一个句子编码为向量

In [29]:
# Two sentences and their "prime" views; here each prime view is the very same string.
# NOTE(review): the saved output of this cell is a ValueError whose cause is not visible in
# this notebook; the later cells' recorded outputs come from some other run — confirm.
s1="这几行代码为了展示对比损失函数的计算"
s1_pie=s1
s2="今天组会的内容是对比学习在文本相似度计算任务的应用"
s2_pie=s2
z1=bert_encoder.encode(s1)
z1_pie=bert_encoder.encode(s1_pie)
z2=bert_encoder.encode(s2)
z2_pie=bert_encoder.encode(s2_pie)

ValueError: too many values to unpack (expected 3)

## normalize每一个向量

In [27]:
# Divide each embedding by its L2 norm so that dot products become cosine similarities.
v1=z1/np.linalg.norm(z1,ord=2)
v1_pie=z1_pie/np.linalg.norm(z1_pie,ord=2)
v2=z2/np.linalg.norm(z2,ord=2)
v2_pie=z2_pie/np.linalg.norm(z2_pie,ord=2)

## 得到logits和labels

In [28]:
# 2x2 similarity matrix: row i is an original sentence, column j a prime view.
# Note that no temperature scaling is applied in this demo.
logits=torch.tensor([[np.sum(v1*v1_pie),np.sum(v1*v2_pie)],
                     [np.sum(v2*v1_pie),np.sum(v2*v2_pie)]])
# One-hot targets: each sentence's positive is its own prime view (the diagonal).
labels=torch.LongTensor([[1,0],[0,1]])
print(logits)
print(labels)

tensor([[1.0000, 0.8564],
        [0.8564, 1.0000]])
tensor([[1, 0],
        [0, 1]])


## 对logits进行softmax

In [55]:
# Row-wise log-softmax of the similarity matrix.
log_probs=torch.nn.functional.log_softmax(logits,dim=1)
log_probs

tensor([[-0.6537, -0.7342],
        [-0.7410, -0.6475]])

In [56]:
# Mean negative log-probability of the two diagonal (positive) entries.
contrastive_loss=-(log_probs[0][0] + log_probs[1][1])/2
contrastive_loss

tensor(0.6506)

In [38]:
# Same quantity as the manual version, written as a one-hot-masked sum over the log-softmax.
# NOTE(review): the recorded output differs from the manual cell's output; the execution
# counts show the two cells were run at different times, presumably with different logits.
contrastive_loss=-torch.sum(torch.nn.functional.log_softmax(logits,dim=1)*labels)/2
contrastive_loss

tensor(0.5488)

In [40]:
# Scratch arithmetic; not used by any other cell.
3+-1

2

In [1]:
from scipy.stats import pearsonr,spearmanr

In [2]:
# Toy data for the correlation examples: X holds scores, Y holds ranks 1..5.
X=[0.9,0.7,0.3,0.8,0.6]
Y=[4,5,1,3,2]

In [8]:
# Ascending ranks of X (0.3 -> 1, 0.6 -> 2, 0.7 -> 3, 0.8 -> 4, 0.9 -> 5).
rank_X=[5,3,1,4,2]
# Rank differences computed from the data instead of typed by hand
# (the hand-typed list mixed the signs of rank_X - Y and Y - rank_X).
d_i=[rank_x-rank_y for rank_x,rank_y in zip(rank_X,Y)]
d_i2=[d**2 for d in d_i]
# Spearman's rho for untied ranks: 1 - 6*sum(d^2) / (n*(n^2-1)).
n_obs=len(rank_X)
print("spearman rank correlation coefficient: ",1-(6*sum(d_i2))/(n_obs*(n_obs**2-1)))

spearman rank correlation coefficient:  0.7


In [9]:
# Cross-check the manual value with scipy; element [0] of the result is the coefficient.
print("spearman rank correlation coefficient: ",spearmanr(a=rank_X,b=Y)[0])

spearman rank correlation coefficient:  0.7


In [14]:
# Pearson correlation on the raw scores X (not their ranks) against Y.
print("pearson correlation coefficient: ",pearsonr(X,Y)[0])

pearson correlation coefficient:  0.7554831017177897


# ConSERT模型

In [17]:
class ConSERT(nn.Module):
    """Contrastive sentence encoder whose second view is a shuffled / cut-off copy of the first.

    ``forward`` encodes the first feature batch as is, builds an augmented copy
    of its ``input_ids`` / ``attention_mask`` with ``shuffle_and_cutoff`` and
    encodes that as the second view. The loss is cross-entropy over the
    temperature-scaled cosine-similarity matrix with the diagonal as target.

    Args:
        bert_model_path: checkpoint path. A SentenceTransformer folder when
            ``is_sbert_model`` is True, otherwise a plain transformer checkpoint
            wrapped in ``models.Transformer`` + ``models.Pooling``.
        is_sbert_model: see ``bert_model_path``.
        temperature: divisor applied to the cosine similarities.
        is_distilbert: when True, ``token_type_ids`` is removed from the inputs
            (distilled BERT does not support token_type_ids).
        cutoff_rate: fraction of the real tokens that is masked out in the
            augmented view; ``0`` disables the cut-off.
        device: device handed to SentenceTransformer.
        close_dropout: when True, the augmented view is encoded with dropout
            switched off.

    Note:
        The non-SBERT branch reads the notebook-level global ``max_seq_len``.
    """
    # Token id treated as the separator that must stay at the end of the real tokens.
    SEP_TOKEN_ID=102

    def __init__(self,
                 bert_model_path,
                 is_sbert_model=True,
                temperature=0.05,
                is_distilbert=False,
                 cutoff_rate=0.15,
                device='cpu',
                close_dropout=True):
        super(ConSERT,self).__init__()
        if is_sbert_model:
            self.encoder=SentenceTransformer(model_name_or_path=bert_model_path,device=device)
        else:
            word_embedding_model = models.Transformer(bert_model_path, max_seq_length=max_seq_len)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
            self.encoder=SentenceTransformer(modules=[word_embedding_model, pooling_model],device=device)
        self.temperature=temperature
        self.cutoff_rate=cutoff_rate
        self.is_distilbert=is_distilbert# distilled BERT does not support token_type_ids
        self.close_dropout=close_dropout

    def cal_cos_sim(self,embeddings1,embeddings2):
        """Return the pairwise cosine-similarity matrix of two 2-D embedding batches."""
        embeddings1_norm=torch.nn.functional.normalize(embeddings1,p=2,dim=1)
        embeddings2_norm=torch.nn.functional.normalize(embeddings2,p=2,dim=1)
        return torch.mm(embeddings1_norm,embeddings2_norm.transpose(0,1))#(batch_size,batch_size)

    def shuffle_and_cutoff(self,sentence_feature):
        """Build the augmented view: shuffle the real tokens, then mask out a part of them.

        For every row the number of real tokens is ``attention_mask.sum()``.
        Position 0 always stays in place; if the row contains ``SEP_TOKEN_ID``,
        the last real position stays in place as well. All other real positions
        are permuted with ``random.shuffle``. When ``cutoff_rate > 0`` a
        contiguous window of ``int(num_tokens*(1-cutoff_rate))`` positions
        (start drawn with ``np.random.randint``) is kept together with position
        0 and the last real position; every other position gets input id 0 and
        attention mask 0.

        Rows with at most one real token are returned unchanged because there is
        nothing to shuffle or cut.

        Args:
            sentence_feature: dict with 2-D ``input_ids`` and ``attention_mask`` tensors.

        Returns:
            (input_ids, attention_mask) tensors with the same shape as the inputs.
        """
        input_ids, attention_mask=sentence_feature['input_ids'],sentence_feature['attention_mask']
        bsz, seq_len = input_ids.shape
        shuffled_input_ids=[]
        cutoff_attention_mask=[]
        for bsz_id in range(bsz):
            sample_mask = attention_mask[bsz_id]
            num_tokens = int(sample_mask.sum().item())
            cur_input_ids=input_ids[bsz_id]
            if num_tokens<=1:
                # Degenerate row (empty or a single token): the index arithmetic below
                # would produce a wrong length or an empty random range.
                shuffled_input_ids.append(cur_input_ids)
                cutoff_attention_mask.append(sample_mask)
                continue
            if self.SEP_TOKEN_ID not in cur_input_ids:
                indexes = list(range(num_tokens))[1:]
                random.shuffle(indexes)
                indexes=[0]+indexes# keep position 0 first
            else:
                indexes = list(range(num_tokens))[1:-1]
                random.shuffle(indexes)
                indexes=[0]+indexes+[num_tokens-1]# keep position 0 first and the SEP position last
            rest_indexes = list(range(num_tokens, seq_len))
            total_indexes = indexes + rest_indexes
            shuffled_input_id=cur_input_ids[total_indexes]
            if self.cutoff_rate>0.0:
                # Number of positions to keep, e.g. 32 real tokens at rate 0.15 -> 27.
                sample_len=max(int(num_tokens*(1-self.cutoff_rate)),1)
                # The window never starts at 0, so position 0 is never part of the random choice.
                start_id = int(np.random.randint(1, high=num_tokens-sample_len+1))
                keep=torch.zeros(seq_len,dtype=torch.bool,device=input_ids.device)
                keep[start_id:start_id+sample_len]=True
                keep[0]=True# never mask position 0 (CLS)
                keep[num_tokens-1]=True# never mask the last real position (SEP)
                # masked_fill replaces the positions where the mask is True.
                cutoff_mask=~keep
                shuffled_input_id=shuffled_input_id.masked_fill(cutoff_mask,value=0)
                sample_mask=sample_mask.masked_fill(cutoff_mask,value=0)

            shuffled_input_ids.append(shuffled_input_id)
            cutoff_attention_mask.append(sample_mask)
        shuffled_input_ids=torch.vstack(shuffled_input_ids)
        cutoff_attention_mask=torch.vstack(cutoff_attention_mask)
        return shuffled_input_ids,cutoff_attention_mask

    def forward(self,batch_inputs):
        '''
        Compute the contrastive loss for one batch.

        batch_inputs is a 3-tuple (features_a, features_b, labels). For compatibility
        with the other models the last element must always be present, even if it is
        None; it is ignored here. features_b only serves as a container: its
        'input_ids' and 'attention_mask' are overwritten in place with the augmented
        copy of features_a.

        The encoder output dict is expected to contain 'sentence_embedding' (the
        pooled sentence vector); only that entry is used.

        When close_dropout is True the augmented view is encoded with the encoder in
        eval mode and the previous mode is restored afterwards. Changing the dropout
        probabilities on the model config at this point would have no effect because
        the dropout layers were already created from those values at construction time.
        '''
        batch1_features,batch2_features,_=batch_inputs
        if self.is_distilbert:
            # pop() instead of del: the key may already be missing.
            batch1_features.pop('token_type_ids',None)
            batch2_features.pop('token_type_ids',None)
        batch1_embeddings=self.encoder(batch1_features)['sentence_embedding']
        shuffled_input_ids,cutoff_attention_mask=self.shuffle_and_cutoff(sentence_feature=batch1_features)
        batch2_features['input_ids']=shuffled_input_ids
        batch2_features['attention_mask']=cutoff_attention_mask
        if self.close_dropout:
            was_training=self.encoder.training
            self.encoder.eval()# eval mode disables dropout; gradients still flow
            try:
                batch2_embeddings=self.encoder(batch2_features)['sentence_embedding']
            finally:
                self.encoder.train(was_training)
        else:
            batch2_embeddings=self.encoder(batch2_features)['sentence_embedding']

        cos_sim=self.cal_cos_sim(batch1_embeddings,batch2_embeddings)/self.temperature#(batch_size,batch_size)
        batch_size=cos_sim.size(0)
        assert cos_sim.size()==(batch_size,batch_size)
        # Row i should be most similar to column i.
        labels=torch.arange(batch_size,device=cos_sim.device)
        return torch.nn.functional.cross_entropy(cos_sim,labels)

    def encode(self, sentences,
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        '''
        Encode sentences by delegating to the wrapped SentenceTransformer.encode.

        All arguments are forwarded unchanged. The original note says the sentences
        passed in can only be a single batch (i.e. not sentence pairs).
        '''
        return self.encoder.encode(sentences=sentences,
                                         batch_size=batch_size,
                                         show_progress_bar=show_progress_bar,
                                         output_value=output_value,
                                         convert_to_numpy=convert_to_numpy,
                                         convert_to_tensor=convert_to_tensor,
                                         device=device,
                                         normalize_embeddings=normalize_embeddings)

    def save(self,output_path):
        """Write model_param_config.json and the SentenceTransformer files to output_path."""
        os.makedirs(output_path,exist_ok=True)
        with open(os.path.join(output_path, 'model_param_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(output_path), fOut)
        self.encoder.save(output_path)

    def get_config_dict(self,output_path):
        '''
        Return the constructor keyword arguments needed to re-create this model.

        The keys must match __init__ parameter names because load() calls
        ConSERT(**config).
        '''
        return {'bert_model_path':output_path,'temperature': self.temperature, 'is_distilbert': self.is_distilbert,
               'close_dropout':self.close_dropout,'cutoff_rate':self.cutoff_rate}

    @staticmethod
    def load(input_path):
        """Re-create a ConSERT model from a folder written by save().

        Older configs stored the folder under 'output_path'; that key is mapped
        to 'bert_model_path' before the constructor is called.
        """
        with open(os.path.join(input_path, 'model_param_config.json')) as fIn:
            config = json.load(fIn)
        if 'bert_model_path' not in config:
            config['bert_model_path']=config.pop('output_path')
        return ConSERT(**config)

In [18]:
# Restore a saved ConSERT model (config JSON + SentenceTransformer folder) via ConSERT.load.
consert_closedropout=ConSERT.load("/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unConSERT_STS-B_closedropout/")

2021-10-13 16:38:36 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unConSERT_STS-B_closedropout/
2021-10-13 16:38:36 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/unConSERT_STS-B_closedropout/


In [20]:
# Evaluate the restored ConSERT model with the same evaluator object.
evaluator(consert_closedropout)

2021-10-13 16:39:40 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 16:39:53 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7792	Spearman: 0.7808
2021-10-13 16:39:53 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7560	Spearman: 0.7741
2021-10-13 16:39:53 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7564	Spearman: 0.7744
2021-10-13 16:39:53 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7515	Spearman: 0.7558





0.7807855257752795

In [21]:
# NOTE(review): this builds ConSERT directly from the raw model_path3 checkpoint with
# close_dropout=False; no fine-tuned folder is loaded, so the evaluation that follows is
# presumably of an untrained encoder rather than a model trained without closing dropout — confirm.
consert_donotclosedropout=ConSERT(bert_model_path=model_path3,is_distilbert=False,is_sbert_model=False,close_dropout=False)

Some weights of the model checkpoint at /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Evaluate the ConSERT instance built from the raw checkpoint.
evaluator(consert_donotclosedropout)

2021-10-13 16:40:02 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 16:40:15 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6641	Spearman: 0.6808
2021-10-13 16:40:15 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6740	Spearman: 0.6882
2021-10-13 16:40:15 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6680	Spearman: 0.6824
2021-10-13 16:40:15 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.4705	Spearman: 0.4646





0.6807509857943427

# ESimCSE

In [71]:
from queue import Queue
class ESimCSE(nn.Module):
    """Contrastive sentence encoder with word repetition and a momentum negative queue.

    ``forward`` encodes the first feature batch as is and a word-repeated copy
    of the second batch, then adds embeddings stored in ``self.q`` (produced by
    the momentum encoder on earlier batches) as extra negatives.

    Args:
        bert_model_path: checkpoint path. A SentenceTransformer folder when
            ``is_sbert_model`` is True, otherwise a plain transformer checkpoint
            wrapped in ``models.Transformer`` + ``models.Pooling``.
        q_size: maximum number of queued embeddings used as negatives.
        dup_rate: upper bound (as a fraction of the real tokens) for the number
            of repeated tokens; at least 2 are always allowed.
        is_sbert_model: see ``bert_model_path``.
        temperature: divisor applied to the cosine similarities.
        is_distilbert: when True, ``token_type_ids`` is removed from the inputs
            (distilled BERT does not support token_type_ids).
        gamma: momentum coefficient of the moving-average update.
        device: device handed to SentenceTransformer.

    Note:
        The non-SBERT branch reads the notebook-level global ``max_seq_len``.
    """
    def __init__(self,
                 bert_model_path,
                 q_size=256,
                 dup_rate=0.32,
                 is_sbert_model=True,
                temperature=0.05,
                is_distilbert=False,
                 gamma=0.99,
                device='cpu'):
        super(ESimCSE,self).__init__()

        def build_encoder():
            # Build a fresh SentenceTransformer on every call so the two encoders
            # never share module objects (and therefore never share weights).
            if is_sbert_model:
                return SentenceTransformer(model_name_or_path=bert_model_path,device=device)
            word_embedding_model = models.Transformer(bert_model_path, max_seq_length=max_seq_len)
            pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
            return SentenceTransformer(modules=[word_embedding_model, pooling_model],device=device)

        self.encoder=build_encoder()
        self.moco_encoder=build_encoder()
        # The momentum encoder is only updated by the moving average in forward().
        for moco_param in self.moco_encoder.parameters():
            moco_param.requires_grad_(False)
        self.gamma=gamma
        self.q=[]# queue of 1-D embedding tensors from earlier batches
        self.q_size=q_size
        self.dup_rate=dup_rate
        self.temperature=temperature
        self.is_distilbert=is_distilbert# distilled BERT does not support token_type_ids

    def cal_cos_sim(self,embeddings1,embeddings2):
        """Return the pairwise cosine-similarity matrix of two 2-D embedding batches."""
        embeddings1_norm=torch.nn.functional.normalize(embeddings1,p=2,dim=1)
        embeddings2_norm=torch.nn.functional.normalize(embeddings2,p=2,dim=1)
        return torch.mm(embeddings1_norm,embeddings2_norm.transpose(0,1))#(len(embeddings1),len(embeddings2))

    def word_repetition(self,sentence_feature):
        """Return a copy of the batch in which randomly chosen real tokens appear twice.

        For every row, ``dup_len`` is drawn from ``[0, max(2, int(dup_rate*actual_len))]``
        and capped at the number of candidate positions (the real positions except
        position 0). Each chosen position is emitted twice together with its
        attention-mask (and token-type) value. Rows are then right-padded with 0
        to the longest row of the batch.

        Args:
            sentence_feature: dict with 2-D ``input_ids`` and ``attention_mask``
                tensors; ``token_type_ids`` is optional.

        Returns:
            dict with the same keys as were present in the input, as long tensors
            on the same device as the input ``input_ids``.
        """
        device=sentence_feature['input_ids'].device
        input_ids=sentence_feature['input_ids'].cpu().tolist()
        attention_mask=sentence_feature['attention_mask'].cpu().tolist()
        # token_type_ids is absent for distilled models (forward() removes it).
        has_token_types='token_type_ids' in sentence_feature
        token_type_ids=sentence_feature['token_type_ids'].cpu().tolist() if has_token_types else None
        bsz, seq_len = len(input_ids),len(input_ids[0])
        repetitied_input_ids=[]
        repetitied_attention_mask=[]
        repetitied_token_type_ids=[]
        rep_seq_len=seq_len
        for bsz_id in range(bsz):
            sample_mask = attention_mask[bsz_id]
            actual_len = sum(sample_mask)

            cur_input_id=input_ids[bsz_id]
            candidates=list(range(1,actual_len))
            dup_len=random.randint(a=0,b=max(2,int(self.dup_rate*actual_len)))
            # random.sample raises if asked for more items than exist (very short rows).
            dup_len=min(dup_len,len(candidates))
            dup_word_index=set(random.sample(candidates,k=dup_len))

            r_input_id=[]
            r_attention_mask=[]
            r_token_type_ids=[]
            for index,word_id in enumerate(cur_input_id):
                copies=2 if index in dup_word_index else 1
                r_input_id.extend([word_id]*copies)
                r_attention_mask.extend([sample_mask[index]]*copies)
                if has_token_types:
                    r_token_type_ids.extend([token_type_ids[bsz_id][index]]*copies)

            after_dup_len=len(r_input_id)
            assert after_dup_len==dup_len+seq_len
            repetitied_input_ids.append(r_input_id)
            repetitied_attention_mask.append(r_attention_mask)
            repetitied_token_type_ids.append(r_token_type_ids)
            rep_seq_len=max(rep_seq_len,after_dup_len)

        # Right-pad every row to the longest repeated row.
        for i in range(bsz):
            pad_len=rep_seq_len-len(repetitied_input_ids[i])
            repetitied_input_ids[i]+=[0]*pad_len
            repetitied_attention_mask[i]+=[0]*pad_len
            if has_token_types:
                repetitied_token_type_ids[i]+=[0]*pad_len

        # Create the tensors on the input's device so the encoder does not see a device mismatch.
        repeated={"input_ids":torch.tensor(repetitied_input_ids,dtype=torch.long,device=device),
                  'attention_mask':torch.tensor(repetitied_attention_mask,dtype=torch.long,device=device)}
        if has_token_types:
            repeated['token_type_ids']=torch.tensor(repetitied_token_type_ids,dtype=torch.long,device=device)
        return repeated

    def forward(self,batch_inputs):
        '''
        Compute the contrastive loss for one batch and update the queue / momentum encoder.

        batch_inputs is a 3-tuple (features_a, features_b, labels). For compatibility
        with the other models the last element must always be present, even if it is
        None; it is ignored here.

        The encoder output dict is expected to contain 'sentence_embedding' (the
        pooled sentence vector); only that entry is used.

        Steps: encode features_a and the word-repeated features_b; build the
        temperature-scaled cosine-similarity matrix; append the similarities to the
        queued embeddings as extra negative columns; push the momentum encoder's
        embeddings of features_a to the queue; move the momentum encoder towards the
        online encoder with coefficient gamma; return the cross-entropy loss with the
        diagonal as target.
        '''
        batch1_features,batch2_features,_=batch_inputs
        if self.is_distilbert:
            # pop() instead of del: the key may already be missing.
            batch1_features.pop('token_type_ids',None)
            batch2_features.pop('token_type_ids',None)
        batch1_embeddings=self.encoder(batch1_features)['sentence_embedding']
        batch2_features=self.word_repetition(sentence_feature=batch2_features)
        batch2_embeddings=self.encoder(batch2_features)['sentence_embedding']
        cos_sim=self.cal_cos_sim(batch1_embeddings,batch2_embeddings)/self.temperature#(batch_size,batch_size)
        batch_size=cos_sim.size(0)
        assert cos_sim.size()==(batch_size,batch_size)
        labels=torch.arange(batch_size,device=cos_sim.device)
        negative_samples=None
        if len(self.q)>0:
            negative_samples=torch.vstack(self.q[:self.q_size])#(M,dim), M<=q_size
        if len(self.q)+batch_size>=self.q_size:
            # Drop the oldest entries to make room for the current batch.
            del self.q[:batch_size]

        with torch.no_grad():
            # eval() is what actually switches dropout off; editing the config's dropout
            # probabilities after construction does not change the existing layers.
            # It is re-applied on every call because train() on the parent resets it.
            self.moco_encoder.eval()
            # A shallow copy keeps the momentum encoder from overwriting entries
            # in the caller's feature dict.
            self.q.extend(self.moco_encoder(dict(batch1_features))['sentence_embedding'])

        if negative_samples is not None:
            cos_sim_with_neg=self.cal_cos_sim(batch1_embeddings,negative_samples)/self.temperature#(N,M)
            # The targets stay 0..N-1: the extra M columns are negatives only.
            cos_sim=torch.cat([cos_sim,cos_sim_with_neg],dim=1)#(N,N+M)
        with torch.no_grad():
            # Moving-average update of the momentum encoder.
            for encoder_param,moco_encoder_param in zip(self.encoder.parameters(),self.moco_encoder.parameters()):
                moco_encoder_param.copy_(self.gamma*moco_encoder_param+(1.-self.gamma)*encoder_param)

        return torch.nn.functional.cross_entropy(cos_sim,labels)

    def encode(self, sentences,
               batch_size: int = 32,
               show_progress_bar: bool = None,
               output_value: str = 'sentence_embedding',
               convert_to_numpy: bool = True,
               convert_to_tensor: bool = False,
               device: str = None,
               normalize_embeddings: bool = False):
        '''
        Encode sentences by delegating to the online encoder's SentenceTransformer.encode.

        All arguments are forwarded unchanged. The original note says the sentences
        passed in can only be a single batch (i.e. not sentence pairs).
        '''
        return self.encoder.encode(sentences=sentences,
                                         batch_size=batch_size,
                                         show_progress_bar=show_progress_bar,
                                         output_value=output_value,
                                         convert_to_numpy=convert_to_numpy,
                                         convert_to_tensor=convert_to_tensor,
                                         device=device,
                                         normalize_embeddings=normalize_embeddings)

    def save(self,output_path):
        """Write model_param_config.json and the online encoder's files to output_path.

        The momentum encoder and the queue are not saved.
        """
        os.makedirs(output_path,exist_ok=True)
        with open(os.path.join(output_path, 'model_param_config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(output_path), fOut)
        self.encoder.save(output_path)

    def get_config_dict(self,output_path):
        '''
        Return the constructor keyword arguments needed to re-create this model.

        The keys must match __init__ parameter names because load() calls
        ESimCSE(**config).
        '''
        return {'bert_model_path':output_path,'temperature': self.temperature, 'is_distilbert': self.is_distilbert,
                'q_size':self.q_size,'dup_rate':self.dup_rate,'gamma':self.gamma}

    @staticmethod
    def load(input_path):
        """Re-create an ESimCSE model from a folder written by save()."""
        with open(os.path.join(input_path, 'model_param_config.json')) as fIn:
            config = json.load(fIn)
        return ESimCSE(**config)

In [72]:
device='cpu'
# Load both ESimCSE encoders from the same SentenceTransformer folder (is_sbert_model=True).
esimcse=ESimCSE(bert_model_path='/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE',
                is_distilbert=False,
                is_sbert_model=True,
                dup_rate=0.32,gamma=0.99,
                device=device)

2021-10-18 20:04:03 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-18 20:04:03 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-18 20:04:06 - INFO - __init__ - 41 : Load pretrained SentenceTransformer: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-18 20:04:06 - INFO - __init__ - 107 : Load SentenceTransformer from folder: /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE


In [73]:
# Evaluate the ESimCSE model with the same evaluator object.
evaluator(esimcse)

2021-10-18 20:04:10 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-18 20:04:24 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7900	Spearman: 0.7929
2021-10-18 20:04:24 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7555	Spearman: 0.7709
2021-10-18 20:04:24 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7549	Spearman: 0.7702
2021-10-18 20:04:24 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7507	Spearman: 0.7531





0.7929133672005957

In [96]:
# NOTE: this rebinds sentences1_list / sentences2_list to the *test* pairs; cells that run
# afterwards and read those names will see the test set, not the dev set.
sentences1_list=[sen[0] for sen in test_sentences]
sentences2_list=[sen[1] for sen in test_sentences]
sentences1_embeddings=esimcse.encode(sentences1_list,convert_to_tensor=True)
sentences2_embeddings=esimcse.encode(sentences2_list,convert_to_tensor=True)
# Mean squared L2 distance between the two embeddings of each pair.
print((sentences1_embeddings-sentences2_embeddings).norm(dim=1).pow(2).mean())
# log-mean-exp of -2 * squared pairwise distances within each side, then the average of both sides.
s1=torch.pdist(sentences1_embeddings,p=2).pow(2).mul(-2).exp().mean().log()
print(s1)
s2=torch.pdist(sentences2_embeddings,p=2).pow(2).mul(-2).exp().mean().log()
print(s2)
print((s1+s2)/2)
# Mean of exp(-2 * squared pair distance); the final .log() is commented out here.
print((sentences1_embeddings-sentences2_embeddings).norm(dim=1).pow(2).mul(-2).exp().mean())#.log())
# Per-pair squared distances, kept for comparison with the SimCSE values.
mengbi_esimcse=(sentences1_embeddings-sentences2_embeddings).norm(dim=1).pow(2)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=43.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=43.0), HTML(value='')))


tensor(183.4928)
tensor(-8.0145)
tensor(-9.4073)
tensor(-8.7109)
tensor(0.0132)


In [97]:
def uniform_loss(x, t=2):
    """Uniformity metric: log of the mean Gaussian potential over all row pairs.

    Computes ``log(mean_{i<j} exp(-t * ||x_i - x_j||_2 ** 2))``. More negative
    values mean the rows of ``x`` are more spread out. The formula matches the
    uniformity loss of Wang & Isola (2020), which is stated for L2-normalised
    features; this function does not normalise ``x`` itself.

    Args:
        x: 2-D tensor of shape (N, D), one embedding per row (``torch.pdist``
            requires a 2-D input).
        t: Scale applied to the squared distances inside the kernel.

    Returns:
        A 0-dim tensor with the metric, or NaN when ``x`` has fewer than two
        rows (there are no pairs to average over).
    """
    sq_dists = torch.pdist(x, p=2).pow(2)
    if sq_dists.numel() == 0:
        # No pairs: keep the NaN that the naive mean-of-empty formulation yields.
        return sq_dists.new_tensor(float('nan'))
    log_kernel = sq_dists.mul(-t)
    # log(mean(exp(v))) == logsumexp(v) - log(len(v)). Evaluating it this way avoids
    # exp() underflowing to 0 (and the result collapsing to -inf) when every
    # pairwise distance is large, e.g. for un-normalised embeddings.
    n_pairs = log_kernel.new_tensor(float(log_kernel.numel()))
    return torch.logsumexp(log_kernel, dim=0) - n_pairs.log()

In [98]:
# Recompute the sentence-1 uniformity through the helper; the displayed value
# matches the inline `s1` printed by the ESimCSE metrics cell.
uniform_loss(sentences1_embeddings)

tensor(-8.0145)

In [99]:
# Same check for the sentence-2 side; the displayed value matches the inline `s2`
# printed by the ESimCSE metrics cell.
uniform_loss(sentences2_embeddings)

tensor(-9.4073)

In [100]:
def align_loss(x, y, alpha=2):
    """Alignment metric: mean row-wise L2 distance between ``x`` and ``y``, raised to ``alpha``.

    Args:
        x: Tensor whose rows are the first embedding of each pair.
        y: Tensor whose rows are the second embedding of each pair; row ``i`` is
            compared with row ``i`` of ``x``.
        alpha: Exponent applied to each distance before averaging.

    Returns:
        A 0-dim tensor; smaller values mean paired rows lie closer together.
    """
    pair_distance = torch.norm(x - y, p=2, dim=1)
    return torch.mean(pair_distance ** alpha)

In [101]:
# Alignment of the current sentence-1 / sentence-2 embeddings through the helper; the
# displayed value matches the mean squared pair distance printed by the ESimCSE cell.
align_loss(sentences1_embeddings,sentences2_embeddings)

tensor(183.4928)

In [86]:
# Same alignment / uniformity statistics, now for the SimCSE encoder.
# This rebinds `sentences1_embeddings` / `sentences2_embeddings`.
sentences1_embeddings = simcse.encode(sentences1_list, convert_to_tensor=True)
sentences2_embeddings = simcse.encode(sentences2_list, convert_to_tensor=True)

# Squared L2 distance of every (sentence1, sentence2) pair; kept for the comparison cells.
mengbi_simcse = (sentences1_embeddings - sentences2_embeddings).norm(dim=1) ** 2
print(mengbi_simcse.mean())

# Log of the mean Gaussian kernel exp(-2 * d^2) over all within-side pairs.
s1 = torch.log(torch.exp(-2 * torch.pdist(sentences1_embeddings, p=2) ** 2).mean())
print(s1)
s2 = torch.log(torch.exp(-2 * torch.pdist(sentences2_embeddings, p=2) ** 2).mean())
print(s2)
print((s1 + s2) / 2)

# Log of the mean kernel value over the aligned pairs themselves.
print(torch.log(torch.exp(-2 * mengbi_simcse).mean()))

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))


tensor(194.2286)
tensor(-9.8505)
tensor(-9.6273)
tensor(-9.7389)
tensor(-5.3389)


In [91]:
# Count the pairs whose squared distance is exactly equal under both models.
# The comparison already yields a boolean tensor, so `.bool()` is unnecessary, and
# Tensor.sum() reduces it in one vectorised call instead of iterating element by
# element with Python's built-in sum().
(mengbi_esimcse == mengbi_simcse).sum()

tensor(7)

In [92]:
# Mean Gaussian kernel exp(-2 * d^2) over the ESimCSE squared pair distances.
# NOTE(review): the value displayed under this cell comes from an execution that
# predates the latest run of the ESimCSE metrics cell (which prints a different
# value for the same expression) — re-run to refresh.
mengbi_esimcse.mul(-2).exp().mean()

tensor(0.0048)

In [93]:
# Mean Gaussian kernel exp(-2 * d^2) over the SimCSE squared pair distances
# (the same quantity whose log is printed last in the SimCSE metrics cell).
mengbi_simcse.mul(-2).exp().mean()

tensor(0.0048)

In [77]:
# Wrap the checkpoint at `model_path3` in the SimCSE class (defined outside this
# section) with `is_sbert_model` and `is_distilbert` both False.
# NOTE(review): presumably the un-fine-tuned baseline for the metrics below — confirm.
bert_model=SimCSE(model_path3,is_sbert_model=False,is_distilbert=False)

Some weights of the model checkpoint at /data/nfs14/nfs/aisearch/asr/xhsun/CommonModel/chinese-roberta-wwm/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
# Alignment / uniformity statistics for `bert_model`.
# This rebinds `sentences1_embeddings` / `sentences2_embeddings`.
sentences1_embeddings = bert_model.encode(sentences1_list, convert_to_tensor=True)
sentences2_embeddings = bert_model.encode(sentences2_list, convert_to_tensor=True)

# Mean squared L2 distance between the two sides of each pair.
print(((sentences1_embeddings - sentences2_embeddings).norm(dim=1) ** 2).mean())

# Log of the mean Gaussian kernel exp(-2 * d^2) over all within-side pairs.
s1 = torch.log(torch.exp(-2 * torch.pdist(sentences1_embeddings, p=2) ** 2).mean())
print(s1)
s2 = torch.log(torch.exp(-2 * torch.pdist(sentences2_embeddings, p=2) ** 2).mean())
print(s2)
print((s1 + s2) / 2)

HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=46.0), HTML(value='')))


tensor(58.9933)
tensor(-9.8504)
tensor(-9.6273)
tensor(-9.7389)


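# Train the model

Note: the cells in this section carry lower execution counts (In [42]–[44]) than the evaluation cells above (e.g. In [73]); they were run first and wrote the ESimCSE checkpoint that those evaluation cells load.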

In [42]:
# Training run configuration.
epochs = 5  # number of passes over the training dataloader
# Directory handed to the Trainer as its output path.
output_path = '/data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE'
# TensorBoard log directory, kept inside the output directory.
tensorboard_logdir = os.path.join(output_path, 'log')

## Build the optimizer and learning-rate scheduler

In [43]:
# Optimisation hyper-parameters.
optimizer_type = 'AdamW'
# The scheduler *name* gets its own variable so that `scheduler` is only ever bound
# to the scheduler object, never first to a string and then to an object.
scheduler_type = 'WarmupLinear'
warmup_proportion = 0.1
optimizer_params = {'lr': 2e-5}
weight_decay = 0.01

# Total optimisation steps: batches per epoch times the number of epochs.
num_train_steps = int(len(train_dataloader) * epochs)
# Product with a float proportion, so this is a float.
# NOTE(review): confirm get_scheduler accepts a non-integer warmup_steps.
warmup_steps = num_train_steps * warmup_proportion

optimizer = get_optimizer(
    model=esimcse,
    optimizer_type=optimizer_type,
    weight_decay=weight_decay,
    optimizer_params=optimizer_params,
)
scheduler = get_scheduler(
    optimizer,
    scheduler=scheduler_type,
    warmup_steps=warmup_steps,
    t_total=num_train_steps,
)

## Create the Trainer and run training

In [44]:
# Build the Trainer and launch training.
# Per the captured log, the model is saved to `output_path` whenever the evaluator
# score beats the previous best, and the patience counter drops by one otherwise.
trainer = Trainer(
    epochs=epochs,
    output_path=output_path,
    tensorboard_logdir=tensorboard_logdir,
    early_stop_patience=20,
)
trainer.train(
    train_dataloader=train_dataloader,
    model=esimcse,
    optimizer=optimizer,
    scheduler=scheduler,
    evaluator=evaluator,
)

2021-10-13 20:54:28 - INFO - train - 56 : 一个epoch 下，每隔8个step会输出一次loss，每隔20个step会评估一次模型


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-13 20:54:38 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 20:54:51 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.6641	Spearman: 0.6808
2021-10-13 20:54:51 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.6740	Spearman: 0.6882
2021-10-13 20:54:51 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.6680	Spearman: 0.6824
2021-10-13 20:54:51 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.4705	Spearman: 0.4646
2021-10-13 20:54:51 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 20:54:52 - INFO - train - 98 : In epoch 0, training_step 0, the eval score is 0.6807509857943427, previous eval score is -9999999, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 20:56:12 - INFO - train - 75 : Epoch : 0, train_step : 8/205, loss_value : 1.1475277915596962 
2021-10-13 20:57:38 - INFO - train - 75 : Epoch : 0, train_step : 16/205, loss_value : 0.3286867868155241 
2021-10-13 20:58:28 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 20:58:40 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7551	Spearman: 0.7585
2021-10-13 20:58:40 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7346	Spearman: 0.7449
2021-10-13 20:58:40 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7343	Spearman: 0.7447
2021-10-13 20:58:40 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7145	Spearman: 0.7155
2021-10-13 20:58:40 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 20:58:41 - INFO - train - 98 : In epoch 0, training_step 20, the eval score is 0.7585202474064063, previous eval score is 0.6807509857943427, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 20:59:20 - INFO - train - 75 : Epoch : 0, train_step : 24/205, loss_value : 0.055874085519462824 
2021-10-13 21:00:42 - INFO - train - 75 : Epoch : 0, train_step : 32/205, loss_value : 0.023807268124073744 
2021-10-13 21:02:09 - INFO - train - 75 : Epoch : 0, train_step : 40/205, loss_value : 0.020951081591192633 
2021-10-13 21:02:15 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 0 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:02:28 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7679	Spearman: 0.7693
2021-10-13 21:02:28 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7419	Spearman: 0.7564
2021-10-13 21:02:28 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7411	Spearman: 0.7552
2021-10-13 21:02:28 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7385	Spearman: 0.7403
2021-10-13 21:02:28 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:02:29 - INFO - train - 98 : In epoch 0, training_step 40, the eval score is 0.7693038887787182, previous eval score is 0.7585202474064063, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-13 21:02:39 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:02:52 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7694	Spearman: 0.7710
2021-10-13 21:02:52 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7425	Spearman: 0.7570
2021-10-13 21:02:52 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7418	Spearman: 0.7560
2021-10-13 21:02:52 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7405	Spearman: 0.7427
2021-10-13 21:02:52 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:02:53 - INFO - train - 98 : In epoch 1, training_step 0, the eval score is 0.771035587975385, previous eval score is 0.7693038887787182, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 21:04:09 - INFO - train - 75 : Epoch : 1, train_step : 16/205, loss_value : 0.035574153531342745 
2021-10-13 21:05:37 - INFO - train - 75 : Epoch : 1, train_step : 32/205, loss_value : 0.019113536807708442 
2021-10-13 21:06:29 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:06:41 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7860	Spearman: 0.7889
2021-10-13 21:06:41 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7570	Spearman: 0.7710
2021-10-13 21:06:41 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7565	Spearman: 0.7705
2021-10-13 21:06:41 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7384	Spearman: 0.7409
2021-10-13 21:06:41 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:06:42 - INFO - train - 98 : In epoch 1, training_step 40, the eval score is 0.7888571436807816, previous eval score is 0.771035587975385, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 21:07:19 - INFO - train - 75 : Epoch : 1, train_step : 48/205, loss_value : 0.017784759518690407 
2021-10-13 21:08:46 - INFO - train - 75 : Epoch : 1, train_step : 64/205, loss_value : 0.029313169419765472 
2021-10-13 21:10:11 - INFO - train - 75 : Epoch : 1, train_step : 80/205, loss_value : 0.019131976325297728 
2021-10-13 21:10:17 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 1 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:10:30 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7843	Spearman: 0.7871
2021-10-13 21:10:30 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7514	Spearman: 0.7662
2021-10-13 21:10:30 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7506	Spearman: 0.7653
2021-10-13 21:10:30 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7453	Spearman: 0.7484
2021-10-13 21:10:30 - INFO - train - 102 : No improvement over previous best eval score (0.787076 vs 0.788857), patience = 19






HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-13 21:10:39 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:10:52 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7838	Spearman: 0.7864
2021-10-13 21:10:52 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7508	Spearman: 0.7655
2021-10-13 21:10:52 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7499	Spearman: 0.7646
2021-10-13 21:10:52 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7452	Spearman: 0.7485
2021-10-13 21:10:52 - INFO - train - 102 : No improvement over previous best eval score (0.786445 vs 0.788857), patience = 18





2021-10-13 21:12:13 - INFO - train - 75 : Epoch : 2, train_step : 24/205, loss_value : 0.0334612459409982 
2021-10-13 21:13:36 - INFO - train - 75 : Epoch : 2, train_step : 48/205, loss_value : 0.021292690536938608 
2021-10-13 21:14:28 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:14:41 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7797	Spearman: 0.7828
2021-10-13 21:14:41 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7471	Spearman: 0.7617
2021-10-13 21:14:41 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7460	Spearman: 0.7604
2021-10-13 21:14:41 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7468	Spearman: 0.7492
2021-10-13 21:14:41 - INFO - train - 102 : No improvement over previous best eval score (0.782786 vs 0.788857), patience = 17





2021-10-13 21:15:18 - INFO - train - 75 : Epoch : 2, train_step : 72/205, loss_value : 0.027970600873231888 
2021-10-13 21:16:42 - INFO - train - 75 : Epoch : 2, train_step : 96/205, loss_value : 0.020671935402788222 
2021-10-13 21:18:07 - INFO - train - 75 : Epoch : 2, train_step : 120/205, loss_value : 0.015347252017818391 
2021-10-13 21:18:13 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 2 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:18:25 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7886	Spearman: 0.7917
2021-10-13 21:18:25 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7539	Spearman: 0.7690
2021-10-13 21:18:25 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7531	Spearman: 0.7681
2021-10-13 21:18:25 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7532	Spearman: 0.7554
2021-10-13 21:18:25 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:18:26 - INFO - train - 98 : In epoch 2, training_step 120, the eval score is 0.7916804790959123, previous eval score is 0.7888571436807816, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-13 21:18:38 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 3 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:18:51 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7887	Spearman: 0.7918
2021-10-13 21:18:51 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7540	Spearman: 0.7692
2021-10-13 21:18:51 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7532	Spearman: 0.7683
2021-10-13 21:18:51 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7530	Spearman: 0.7552
2021-10-13 21:18:51 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:18:52 - INFO - train - 98 : In epoch 3, training_step 0, the eval score is 0.7917551052847813, previous eval score is 0.7916804790959123, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 21:20:11 - INFO - train - 75 : Epoch : 3, train_step : 32/205, loss_value : 0.029419887636322528 
2021-10-13 21:21:32 - INFO - train - 75 : Epoch : 3, train_step : 64/205, loss_value : 0.022672737133689225 
2021-10-13 21:22:22 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 3 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:22:34 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7900	Spearman: 0.7929
2021-10-13 21:22:34 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7555	Spearman: 0.7709
2021-10-13 21:22:34 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7549	Spearman: 0.7702
2021-10-13 21:22:34 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7507	Spearman: 0.7531
2021-10-13 21:22:34 - INFO - save - 371 : Save model to /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE





2021-10-13 21:22:35 - INFO - train - 98 : In epoch 3, training_step 80, the eval score is 0.7929133672005957, previous eval score is 0.7917551052847813, model has been saved in /data/nfs14/nfs/aisearch/asr/xhsun/bwbd_recall/unsupervisedSTSModel/ESimCSE
2021-10-13 21:23:11 - INFO - train - 75 : Epoch : 3, train_step : 96/205, loss_value : 0.01519197560264729 
2021-10-13 21:24:40 - INFO - train - 75 : Epoch : 3, train_step : 128/205, loss_value : 0.019646525790449232 
2021-10-13 21:26:10 - INFO - train - 75 : Epoch : 3, train_step : 160/205, loss_value : 0.026167668955167755 
2021-10-13 21:26:15 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 3 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:26:28 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7885	Spearman: 0.7917
2021-10-13 21:26:28 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7542	Spearman: 0.7697
2021-10-13 21:26:28 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7534	Spearman: 0.7692
2021-10-13 21:26:28 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7501	Spearman: 0.7528
2021-10-13 21:26:28 - INFO - train - 102 : No improvement over previous best eval score (0.791657 vs 0.792913), patience = 16






HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

2021-10-13 21:26:38 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 4 after 1 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:26:51 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7885	Spearman: 0.7915
2021-10-13 21:26:51 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7542	Spearman: 0.7697
2021-10-13 21:26:51 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7534	Spearman: 0.7691
2021-10-13 21:26:51 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7502	Spearman: 0.7529
2021-10-13 21:26:51 - INFO - train - 102 : No improvement over previous best eval score (0.791529 vs 0.792913), patience = 15





2021-10-13 21:28:08 - INFO - train - 75 : Epoch : 4, train_step : 40/205, loss_value : 0.030378026654943824 
2021-10-13 21:29:32 - INFO - train - 75 : Epoch : 4, train_step : 80/205, loss_value : 0.017732904729200527 
2021-10-13 21:30:22 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 4 after 21 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:30:35 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7872	Spearman: 0.7902
2021-10-13 21:30:35 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7532	Spearman: 0.7689
2021-10-13 21:30:35 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7524	Spearman: 0.7680
2021-10-13 21:30:35 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7497	Spearman: 0.7522
2021-10-13 21:30:35 - INFO - train - 102 : No improvement over previous best eval score (0.790173 vs 0.792913), patience = 14





2021-10-13 21:31:11 - INFO - train - 75 : Epoch : 4, train_step : 120/205, loss_value : 0.019710425898665562 
2021-10-13 21:32:32 - INFO - train - 75 : Epoch : 4, train_step : 160/205, loss_value : 0.01952880504541099 
2021-10-13 21:33:58 - INFO - train - 75 : Epoch : 4, train_step : 200/205, loss_value : 0.015049356035888195 
2021-10-13 21:34:03 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset in epoch 4 after 41 steps:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 21:34:16 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7869	Spearman: 0.7899
2021-10-13 21:34:16 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7528	Spearman: 0.7684
2021-10-13 21:34:16 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7519	Spearman: 0.7676
2021-10-13 21:34:16 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7489	Spearman: 0.7517
2021-10-13 21:34:16 - INFO - train - 102 : No improvement over previous best eval score (0.789881 vs 0.792913), patience = 13






In [26]:
# Score the model bound to `consert_donotclosedropout` (defined outside this section)
# with the same evaluator. Per the logged output, the value displayed here equals
# the cosine-similarity Spearman correlation.
evaluator(consert_donotclosedropout)

2021-10-13 18:23:51 - INFO - __call__ - 72 : EmbeddingSimilarityEvaluator: Evaluating the model on  dataset:


HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))




HBox(children=(HTML(value='Batches'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

2021-10-13 18:24:04 - INFO - __call__ - 103 : Cosine-Similarity :	Pearson: 0.7764	Spearman: 0.7783
2021-10-13 18:24:04 - INFO - __call__ - 105 : Manhattan-Distance:	Pearson: 0.7496	Spearman: 0.7682
2021-10-13 18:24:04 - INFO - __call__ - 107 : Euclidean-Distance:	Pearson: 0.7499	Spearman: 0.7688
2021-10-13 18:24:04 - INFO - __call__ - 109 : Dot-Product-Similarity:	Pearson: 0.7536	Spearman: 0.7592





0.7783171236503472