In [1]:
import torch
import torch.nn as nn
#from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from transformers import default_data_collator
from torch.utils.data import DataLoader

# Load the supervised SimCSE BERT encoder and its tokenizer. `from_pretrained`
# fetches the checkpoint automatically when it is not cached locally.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
# Prefer the GPU, but fall back to the CPU instead of raising when CUDA is unavailable.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# MNLI portion of the GLUE benchmark.
raw_datasets = load_dataset('glue', 'mnli')

# Integer class ids used in the `label` column mapped to readable names.
id_to_label = {
    0: 'entailment',
    1: 'neutral',
    2: 'contradiction'
}

Found cached dataset glue (/home/ichida/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
def preprocess_p_function(dataset):
    """Tokenize the ``premise`` column of a batch of examples.

    Each text is padded or truncated to exactly 64 tokens, and the tokenizer is
    asked to return PyTorch tensors.
    """
    return tokenizer(
        dataset['premise'],
        truncation=True,
        max_length=64,
        padding="max_length",
        return_tensors="pt",
    )

def preprocess_h_function(dataset):
    """Tokenize the ``hypothesis`` column of a batch of examples.

    Each text is padded or truncated to exactly 64 tokens, and the tokenizer is
    asked to return PyTorch tensors.
    """
    return tokenizer(
        dataset['hypothesis'],
        truncation=True,
        max_length=64,
        padding="max_length",
        return_tensors="pt",
    )

# Split the MNLI training set by gold label: 0 is entailment, anything else is not.
train_dataset = raw_datasets["train"]
entailment_ds = train_dataset.filter(lambda example: example['label'] == 0)
nonentailment_ds = train_dataset.filter(lambda example: example['label'] != 0)

# Tokenize premises and hypotheses into two separate datasets, dropping the
# original columns so that only the tokenizer outputs remain.
p_entailment = entailment_ds.map(
    preprocess_p_function,
    batched=True,
    remove_columns=entailment_ds.column_names,
    desc="Running tokenizer on entailment dataset",
)
h_entailment = entailment_ds.map(
    preprocess_h_function,
    batched=True,
    remove_columns=entailment_ds.column_names,
    desc="Running tokenizer on entailment dataset",
)

# Neither loader shuffles, so both iterate in dataset order with the same batch size.
p_entailment_dl = DataLoader(
    p_entailment,
    batch_size=128,
    shuffle=False,
    collate_fn=default_data_collator,
)
h_entailment_dl = DataLoader(
    h_entailment,
    batch_size=128,
    shuffle=False,
    collate_fn=default_data_collator,
)


  0%|          | 0/393 [00:00<?, ?ba/s]

  0%|          | 0/393 [00:00<?, ?ba/s]

Running tokenizer on entailment dataset:   0%|          | 0/131 [00:00<?, ?ba/s]

Running tokenizer on entailment dataset:   0%|          | 0/131 [00:00<?, ?ba/s]

In [3]:
from tqdm import tqdm
# Number of batches the entailment premise loader yields; used later as the
# progress-bar total.
print(len(p_entailment_dl))

1023


In [4]:
# Cosine similarity between each premise embedding and its hypothesis embedding
# for the entailment pairs. One 1-D tensor of scores is collected per batch in `a`.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
a = []
model.eval()

# Move batches to whichever device the model already lives on instead of
# assuming CUDA, so the loop also runs when the model was placed on the CPU.
device = model.device

# zip() would silently stop at the shorter loader; fail loudly instead.
assert len(p_entailment_dl) == len(h_entailment_dl), "premise/hypothesis loaders differ in length"
total = len(p_entailment_dl)

with torch.no_grad():  # inference only, no gradients needed
    for prem_batch, hyp_batch in tqdm(zip(p_entailment_dl, h_entailment_dl), total=total):
        hyp_batch = {k: b.to(device) for k, b in hyp_batch.items()}
        prem_batch = {k: b.to(device) for k, b in prem_batch.items()}
        # The pooled output is used as the sentence embedding.
        h_embedding = model(**hyp_batch).pooler_output
        p_embedding = model(**prem_batch).pooler_output
        a.append(cos(p_embedding, h_embedding))



100%|██████████| 1023/1023 [08:11<00:00,  2.08it/s]


In [5]:
# Join the per-batch scores into a single tensor and summarise its distribution
# as (mean, std, min, max, median).
similarity_tensor = torch.cat(a)
(
    similarity_tensor.mean(),
    similarity_tensor.std(),
    similarity_tensor.min(),
    similarity_tensor.max(),
    similarity_tensor.median(),
)

(tensor(0.7726, device='cuda:0'),
 tensor(0.1298, device='cuda:0'),
 tensor(-0.1027, device='cuda:0'),
 tensor(1.0000, device='cuda:0'),
 tensor(0.7872, device='cuda:0'))

In [10]:
# Tokenize the premises and hypotheses of the non-entailment rows. The columns to
# drop and the progress description now refer to the dataset actually being
# processed (previously both were copied from the entailment cell).
p = nonentailment_ds.map(
    preprocess_p_function,
    batched=True,
    remove_columns=nonentailment_ds.column_names,
    desc="Running tokenizer on non-entailment dataset",
)

h = nonentailment_ds.map(
    preprocess_h_function,
    batched=True,
    remove_columns=nonentailment_ds.column_names,
    desc="Running tokenizer on non-entailment dataset",
)

# Neither loader shuffles, so both iterate in dataset order with the same batch size.
p_n = DataLoader(p, shuffle=False, collate_fn=default_data_collator, batch_size=128)
h_n = DataLoader(h, shuffle=False, collate_fn=default_data_collator, batch_size=128)

Loading cached processed dataset at /home/ichida/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-6b84bbf329aa34a3.arrow
Loading cached processed dataset at /home/ichida/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-71b2c8270053a9ef.arrow


In [11]:
# Cosine similarity between each premise embedding and its hypothesis embedding
# for the non-entailment pairs. One 1-D tensor of scores is collected per batch
# in `a_ne`.
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
a_ne = []
model.eval()

# Move batches to whichever device the model already lives on instead of
# assuming CUDA, so the loop also runs when the model was placed on the CPU.
device = model.device

# zip() would silently stop at the shorter loader; fail loudly instead.
assert len(p_n) == len(h_n), "premise/hypothesis loaders differ in length"
total = len(h_n)

with torch.no_grad():  # inference only, no gradients needed
    for prem_batch, hyp_batch in tqdm(zip(p_n, h_n), total=total):
        hyp_batch = {k: b.to(device) for k, b in hyp_batch.items()}
        prem_batch = {k: b.to(device) for k, b in prem_batch.items()}
        # The pooled output is used as the sentence embedding.
        h_embedding = model(**hyp_batch).pooler_output
        p_embedding = model(**prem_batch).pooler_output
        a_ne.append(cos(p_embedding, h_embedding))



 18%|█▊        | 361/2046 [04:00<18:41,  1.50it/s]


KeyboardInterrupt: 

In [37]:
# All non-entailment similarity scores gathered so far. This tensor is left
# unfiltered so that cells counting scores above 0.9 still see every value when
# the notebook is run top to bottom.
ne_similarity_tensor = torch.cat(a_ne)

# Summarise only the pairs scoring below 0.9. The filtered view gets its own name
# instead of overwriting `ne_similarity_tensor`.
ne_similarity_filtered = ne_similarity_tensor[ne_similarity_tensor < 0.9]
(
    ne_similarity_filtered.mean(),
    ne_similarity_filtered.std(),
    ne_similarity_filtered.min(),
    ne_similarity_filtered.max(),
    ne_similarity_filtered.median(),
)

(tensor(0.5347, device='cuda:0'),
 tensor(0.1754, device='cuda:0'),
 tensor(-0.2846, device='cuda:0'),
 tensor(0.9000, device='cuda:0'),
 tensor(0.5454, device='cuda:0'))

In [10]:
# Sanity check: number of training examples whose label is 0.
ds = raw_datasets["train"]
ds.filter(lambda example: example['label'] == 0).num_rows

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

130899

In [36]:
# Count how many scores in ne_similarity_tensor exceed 0.9.
(ne_similarity_tensor > .9).sum().item()

2471

In [31]:
# Look at one non-entailment row by position. In the recorded output this row has
# an identical premise and hypothesis yet carries label 1.
# NOTE(review): the index 64487 is hard-coded; how it was chosen is not shown here.
nonentailment_ds[64487]

{'premise': 'mean is we moved closer, in,',
 'hypothesis': 'mean is we moved closer, in,',
 'label': 1,
 'idx': 97443}