In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

## BERT Embeddings

In [3]:
text1 = 'I really liked the movie.'
output1 = model(**tokenizer(text1, return_tensors='pt'))

In [4]:
output1.pooler_output.shape

torch.Size([1, 768])

In [5]:
emb1 = output1.pooler_output

In [6]:
def get_embedding(text):
    output = model(**tokenizer(text, return_tensors='pt').to(model.device))
    return output.pooler_output

# Similarity matching

In [7]:
emb1 = get_embedding('I really liked the movie.')
emb2 = get_embedding('Poverty and wealth disparity all across the world is on the rise.')
cosine_similarity(emb1.detach().numpy(), emb2.detach().numpy()).item()

0.7753926515579224

In [8]:
emb3 = get_embedding('I really hated the movie.')
cosine_similarity(emb1.detach().numpy(), emb3.detach().numpy()).item()

0.9938818216323853

# Finetuning

In [9]:
dataset = load_dataset("stanfordnlp/sst2")

In [10]:
dataset['train']

Dataset({
    features: ['idx', 'sentence', 'label'],
    num_rows: 67349
})

In [11]:
small_dataset = dataset['train'][:10000]

In [12]:
positive_pairs, negative_pairs = [], []
while len(positive_pairs) < 5000 or len(negative_pairs) < 5000:
    idx1 = torch.randint(0, len(small_dataset['idx']), (1,)).item()
    idx2 = torch.randint(0, len(small_dataset['idx']), (1,)).item()
    if idx1 == idx2:
        continue

    if small_dataset['label'][idx1] == small_dataset['label'][idx2]:
        if (idx1, idx2) in positive_pairs or (idx2, idx1) in positive_pairs:
            continue
        print(len(positive_pairs), len(negative_pairs))
        positive_pairs.append((idx1, idx2))
        # positive_pairs.append((small_dataset['sentence'][idx1], small_dataset['sentence'][idx2]))
    else:
        if (idx1, idx2) in negative_pairs or (idx2, idx1) in negative_pairs:
            continue
        print(len(positive_pairs), len(negative_pairs))
        negative_pairs.append((idx1, idx2))
        # negative_pairs.append((small_dataset['sentence'][idx1], small_dataset['sentence'][idx2]))

0 0
0 1
1 1
1 2
1 3
2 3
2 4
2 5
3 5
3 6
3 7
3 8
3 9
4 9
5 9
6 9
7 9
7 10
7 11
8 11
8 12
9 12
10 12
10 13
11 13
11 14
11 15
11 16
11 17
11 18
12 18
12 19
12 20
12 21
13 21
14 21
14 22
15 22
16 22
17 22
18 22
18 23
18 24
19 24
19 25
20 25
20 26
20 27
21 27
21 28
21 29
22 29
23 29
24 29
25 29
26 29
27 29
28 29
29 29
30 29
31 29
32 29
33 29
34 29
35 29
36 29
37 29
38 29
39 29
40 29
41 29
42 29
42 30
42 31
43 31
43 32
44 32
45 32
46 32
47 32
48 32
48 33
49 33
49 34
49 35
50 35
50 36
51 36
52 36
52 37
52 38
53 38
53 39
53 40
53 41
53 42
54 42
55 42
55 43
55 44
56 44
57 44
57 45
58 45
58 46
59 46
60 46
60 47
60 48
60 49
61 49
62 49
62 50
62 51
63 51
64 51
65 51
66 51
66 52
66 53
66 54
66 55
66 56
67 56
67 57
68 57
69 57
70 57
70 58
71 58
71 59
71 60
71 61
72 61
73 61
73 62
74 62
75 62
76 62
77 62
77 63
77 64
78 64
78 65
79 65
79 66
80 66
81 66
81 67
82 67
83 67
83 68
83 69
84 69
84 70
84 71
85 71
86 71
86 72
86 73
86 74
87 74
88 74
88 75
88 76
89 76
89 77
89 78
90 78
90 79
90 80
91 80
91 81
9

In [13]:
len(positive_pairs), len(negative_pairs)

(5141, 5000)

In [14]:
positive_sentences = []
for idx1, idx2 in positive_pairs:
    positive_sentences.append((small_dataset['sentence'][idx1], small_dataset['sentence'][idx2]))

negative_sentences = []
for idx1, idx2 in negative_pairs:
    negative_sentences.append((small_dataset['sentence'][idx1], small_dataset['sentence'][idx2]))

In [15]:
import pandas as pd

In [16]:
pdf = pd.DataFrame(positive_sentences, columns=['sentence1', 'sentence2'])
pdf['label'] = 1
ndf = pd.DataFrame(negative_sentences, columns=['sentence1', 'sentence2'])
ndf['label'] = 0
df = pd.concat([pdf, ndf], ignore_index=True)

In [17]:
df

Unnamed: 0,sentence1,sentence2,label
0,", madonna is one helluva singer .",does pack some serious suspense .,1
1,the soulful gravity of crudup 's anchoring per...,"is a pan-american movie , with moments of genu...",1
2,"kwan is a master of shadow , quietude , and ro...",showing honest emotions,1
3,"none of this sounds promising and , indeed , t...","seen it all before ,",1
4,recommend `` never again,"enormously enjoyable , high-adrenaline documen...",1
...,...,...,...
10136,so many merchandised-to-the-max movies,oft-brilliant,0
10137,* corpus and its amiable jerking,is conversational bordering on confessional,0
10138,green men,immensely entertaining,0
10139,"that 's amusing enough while you watch it , of...",be easy for critics to shred it,0


In [18]:
pair_dataset = Dataset.from_pandas(df)

In [19]:
pair_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10141
})

In [20]:
# Tokenize the dataset. Tokenize each sentence separately. Also, add the labels.
def tokenize_function(examples):
    sentence1 = tokenizer(examples["sentence1"], truncation=True, padding="max_length", max_length=100, return_tensors='pt')
    sentence2 = tokenizer(examples["sentence2"], truncation=True, padding="max_length", max_length=100, return_tensors='pt')
    return_obj = {}
    for key in sentence1:
        return_obj[key + "_1"] = sentence1[key]
    for key in sentence2:
        return_obj[key + "_2"] = sentence2[key]
    return_obj["labels"] = examples["label"]
    return return_obj



In [21]:
tokenized_dataset = pair_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/10141 [00:00<?, ? examples/s]

In [22]:
tokenized_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'input_ids_1', 'token_type_ids_1', 'attention_mask_1', 'input_ids_2', 'token_type_ids_2', 'attention_mask_2', 'labels'],
    num_rows: 10141
})

In [23]:
tokenized_dataset.set_format('torch', columns=['input_ids_1', 'token_type_ids_1', 'attention_mask_1', 'input_ids_2', 'token_type_ids_2', 'attention_mask_2', 'label'])

In [24]:
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2'])

In [25]:
tokenized_dataset

Dataset({
    features: ['label', 'input_ids_1', 'token_type_ids_1', 'attention_mask_1', 'input_ids_2', 'token_type_ids_2', 'attention_mask_2', 'labels'],
    num_rows: 10141
})

In [26]:
# split tokenized dataset into train and eval
train_test_ds = tokenized_dataset.train_test_split(test_size=0.1)

In [27]:
class SiameseBert(torch.nn.Module):
    def __init__(self, model):
        super(SiameseBert, self).__init__()
        self.bert = model
        self.linear = torch.nn.Linear(1, 2)
        # self.linear1 = torch.nn.Linear(768*2, 768)
        # self.linear2 = torch.nn.Linear(768, 2)

    def forward(self, input_ids_1, token_type_ids_1, attention_mask_1, input_ids_2, token_type_ids_2, attention_mask_2, labels=None):
        output1 = self.bert(input_ids=input_ids_1, token_type_ids=token_type_ids_1, attention_mask=attention_mask_1)
        output2 = self.bert(input_ids=input_ids_2, token_type_ids=token_type_ids_2, attention_mask=attention_mask_2)
        output = torch.bmm(output1.pooler_output.unsqueeze(1), output2.pooler_output.unsqueeze(2))
        # print(output.shape)
        output = output.view(-1, 1)
        # print(output.shape)
        output = self.linear(output)
        # print(output.shape)

        # output = torch.cat((output1.pooler_output, output2.pooler_output), dim=1)
        # # output = torch.cat((output1.pooler_output, output2.pooler_output, output1.pooler_output - output2.pooler_output), dim=1)
        # output = self.linear1(output)
        # output = torch.relu(output)
        # output = self.linear2(output)
        # output = torch.softmax(output, dim=1)
        if labels is not None:
            loss = torch.nn.CrossEntropyLoss()(output, labels)
            return loss, output
        return output

In [28]:
siamese_model = SiameseBert(AutoModel.from_pretrained("bert-base-uncased"))

In [29]:
output_model_dir = './siamese_bert'

In [30]:
training_args = TrainingArguments(
    output_dir = output_model_dir,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    learning_rate = 5e-4,
    num_train_epochs = 2,
    bf16 = True, # bfloat16 training 
    optim = "adamw_torch_fused", # improved optimizer 
    # logging & evaluation strategies
    logging_strategy = "steps",
    logging_steps = 50,
    eval_strategy = "steps",
    save_strategy = "steps",
    save_total_limit = 2,
    load_best_model_at_end = True,
    # push to hub parameters
    push_to_hub = False,
    report_to="none",
    
)

In [31]:
trainer = Trainer(
    model = siamese_model,
    args = training_args,
    train_dataset = train_test_ds['train'],
    eval_dataset = train_test_ds['test'],
)

In [32]:

trainer.train()

Step,Training Loss,Validation Loss
50,53.1437,18.495567
100,5.5536,3.342486
150,2.3242,6.434175
200,1.8604,1.284201
250,1.4077,1.082976
300,0.9974,0.802182
350,1.086,0.704649
400,1.1867,1.014576
450,0.8819,0.692045
500,0.7952,0.72389


Could not locate the best model at ./siamese_bert/checkpoint-450/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=572, training_loss=6.145173102825671, metrics={'train_runtime': 184.1999, 'train_samples_per_second': 99.088, 'train_steps_per_second': 3.105, 'total_flos': 0.0, 'train_loss': 6.145173102825671, 'epoch': 2.0})

In [33]:
model = siamese_model.bert
model = model.to('cpu')

In [34]:
emb1 = get_embedding('I really liked the movie.')
emb2 = get_embedding('Poverty and wealth disparity all across the world is on the rise.')
cosine_similarity(emb1.detach().numpy(), emb2.detach().numpy()).item()

0.28789713978767395

In [35]:
emb3 = get_embedding('I really hated the movie.')
cosine_similarity(emb1.detach().numpy(), emb3.detach().numpy()).item()

0.36950868368148804