In [1]:
# Selects which training stages run: later cells test this string for the
# substrings 'NLI' and 'STS', and it is also used as the model save sub-directory.
TASK = 'NLI+STS'

In [2]:
%%capture
!pip install -U sentence-transformers datasets==1.9.0 transformers==4.9.0

In [9]:
from sentence_transformers import (
    InputExample,
    SentencesDataset,
    SentenceTransformer,
    evaluation,
    losses,
    models,
)
import torch
from torch.utils.data import DataLoader
from typing import *

Evaluator = evaluation.EmbeddingSimilarityEvaluator

In [4]:
%reload_ext autoreload
%autoreload 2

## Define a model

In [5]:
class SBert(SentenceTransformer):
    """ Sentence BERT

    A SentenceTransformer model based on BERT, that can be used to map sentences / text to embeddings.

    The model is either loaded from a previously saved directory (``model_path``)
    or assembled from scratch as a ``bert-base-uncased`` transformer followed by
    a mean-pooling layer.

    Args:
        model_path: Location of a saved model to load. When given, the
            transformer/pooling construction arguments below are not used.
        device: Device string forwarded to ``SentenceTransformer`` in both the
            load and the build-from-scratch paths.
        max_seq_length: Forwarded to ``models.Transformer`` when building a new model.
        do_lower_case: Forwarded to ``models.Transformer`` when building a new model.
        cache_dir: Forwarded to ``models.Transformer`` when building a new model.
    """
    tokenizer_name = 'bert-base-uncased'
    model_name = 'bert-base-uncased'
    _cache_dir = 'model_cache'

    def __init__(self,
                 model_path=None,
                 device: Optional[str] = None,
                 max_seq_length: Optional[int] = None,
                 do_lower_case: bool = False,
                 cache_dir: Optional[str] = _cache_dir):
        if model_path is not None:
            # Forward `device` here as well; previously it was silently
            # dropped whenever a saved model was loaded.
            super().__init__(model_path, device=device)
            return

        transformer = models.Transformer(
            model_name_or_path=self.model_name,
            tokenizer_name_or_path=self.tokenizer_name,
            do_lower_case=do_lower_case,
            max_seq_length=max_seq_length,
            cache_dir=cache_dir
        )
        # Sentence embedding = mean over the transformer's token embeddings.
        pooling = models.Pooling(transformer.get_word_embedding_dimension(),
                                 pooling_mode='mean')

        super().__init__(modules=[transformer, pooling],
                         device=device)
        

## Training Data


In [6]:
import pandas as pd
import random
import gzip
import csv

def read_sts_data(path):
  """Read a gzipped, tab-separated STS file and split it into three lists.

  Each row must provide the columns ``score``, ``sentence1``, ``sentence2``
  and ``split``. The score is divided by 5.0 (normalizing it to 0 ... 1) and
  stored as the label of an ``InputExample`` holding the sentence pair.

  Returns:
    A ``(train, dev, test)`` tuple of lists. Rows whose ``split`` is neither
    ``'dev'`` nor ``'test'`` are placed in the train list.
  """
  buckets = {'train': [], 'dev': [], 'test': []}
  with gzip.open(path, 'rt', encoding='utf8') as handle:
    for record in csv.DictReader(handle, delimiter='\t', quoting=csv.QUOTE_NONE):
      normalized = float(record['score']) / 5.0
      pair = InputExample(texts=[record['sentence1'], record['sentence2']],
                          label=normalized)
      split = record['split']
      # Anything that is not explicitly dev/test counts as training data.
      target = split if split in ('dev', 'test') else 'train'
      buckets[target].append(pair)
  return buckets['train'], buckets['dev'], buckets['test']

def read_nli_data(path, type, drop_rate=0.95):
  """Read a random subsample of one split from a gzipped, tab-separated NLI file.

  Args:
    path: Path of the gzipped TSV file with the columns ``split``, ``label``,
      ``sentence1`` and ``sentence2``.
    type: Value of the ``split`` column to keep (e.g. ``'train'``).
    drop_rate: A row is skipped whenever a fresh ``random.random()`` draw is
      below this value; the draw is made for every row, before the split check.

  Returns:
    A list of ``InputExample`` objects whose label is the integer id looked
    up in ``nli_label2int``.
  """
  kept = []
  with gzip.open(path, 'rt', encoding='utf8') as handle:
    for record in csv.DictReader(handle, delimiter='\t', quoting=csv.QUOTE_NONE):
      # Draw first, for every row, so the random stream does not depend on the split.
      survives = not (random.random() < drop_rate)
      if survives and record['split'] == type:
        kept.append(InputExample(
          label=nli_label2int[record['label']],
          texts=[record['sentence1'], record['sentence2']]))
  return kept

# Maps the textual NLI label of a row to the integer class id used for training.
nli_label2int = dict(contradiction=0, entailment=1, neutral=2)

In [7]:
import os
import random

# Download the STS benchmark archive only if it is not already in the working directory.
if not os.path.exists('stsbenchmark.tsv.gz'):
  !wget https://sbert.net/datasets/stsbenchmark.tsv.gz

# Same guard for the AllNLI archive.
if not os.path.exists('AllNLI.tsv.gz'):
  !wget https://sbert.net/datasets/AllNLI.tsv.gz

# The STS dev and test splits are stored under the generic names
# `dev_examples` / `test_examples`; only the train split carries the sts_ prefix.
sts_train_examples, dev_examples, test_examples = \
  read_sts_data('stsbenchmark.tsv.gz')

# NOTE: `nli_train_examples` is only defined when TASK mentions NLI.
if 'NLI' in TASK:
  nli_train_examples = read_nli_data('AllNLI.tsv.gz', type='train')


In [23]:
# print(torch.cuda.memory_summary(device=None, abbreviated=False))
# `nli_train_examples` is only created when TASK includes NLI, so apply the
# same condition here to avoid a NameError on STS-only runs.
if 'NLI' in TASK:
  print('NLI data size', len(nli_train_examples))

NLI data size 19007


## Loss function

In [11]:
import torch
from torch import nn


class CosineSimilarityLoss(nn.Module):
    """
    Regression objective for sentence pairs (used for STS).

    Both sentences of a pair are embedded with the wrapped model; the loss is
    the mean squared error between the cosine similarity of the two embeddings
    and the flattened gold labels.
    """

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model

    def forward(self, sentence_features, labels):
        """Return the MSE between pairwise cosine similarity and ``labels``.

        ``sentence_features`` must contain exactly two feature inputs; each is
        passed to the model and its ``'sentence_embedding'`` output is used.
        """
        encoded = []
        for features in sentence_features:
            encoded.append(self.model(features)['sentence_embedding'])
        left, right = encoded

        predicted = torch.cosine_similarity(left, right)
        target = labels.view(-1)
        return nn.functional.mse_loss(predicted, target)


class SoftmaxLoss(nn.Module):
    """
    Classification objective for sentence pairs (used for NLI).

    The two sentence embeddings u and v are concatenated with their
    element-wise absolute difference |u - v|, fed through a single linear
    layer, and scored with cross-entropy against the integer labels.
    """

    def __init__(self,
                 model: nn.Module,
                 num_labels: int):
        super().__init__()
        self.model = model
        self.num_labels = num_labels
        # The classifier input is (u, v, |u - v|): three embedding-sized chunks.
        feature_width = 3 * model.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(feature_width, num_labels)

    def forward(self, sentence_features, labels):
        """Return the cross-entropy of the pair classifier against ``labels``.

        ``sentence_features`` must contain exactly two feature inputs.
        """
        pooled = []
        for features in sentence_features:
            pooled.append(self.model(features)['sentence_embedding'])
        u, v = pooled

        gap = torch.abs(u - v)
        logits = self.classifier(torch.cat([u, v, gap], dim=1))
        return nn.functional.cross_entropy(logits, labels.view(-1))


## Build SBert Model

In [21]:
# Directory the fine-tuned model is saved to and re-loaded from.
model_save_path = 'sbert_model/' + TASK

try:
  model = SBert(model_path=model_save_path)
  print("Loaded model from", model_save_path)
except Exception as load_error:
  # Catch Exception rather than using a bare `except:` so that
  # KeyboardInterrupt / SystemExit still propagate, and report why loading
  # failed instead of hiding it.
  print("Could not load model from", model_save_path, "-", repr(load_error))
  max_seq_len = None
  model = SBert(max_seq_length=max_seq_len)
  print("Initialized new model")

# print(torch.cuda.memory_summary(device=None, abbreviated=False))

Loaded model from sbert_model/NLI+STS


## Training

In [13]:
def train(model, task, epochs, bs):
  """Fine-tune ``model`` on one task and save it to ``model_save_path``.

  Args:
    model: Model exposing ``fit``; it is also wrapped by the loss module.
    task: ``'STS'`` or ``'NLI'`` (case-insensitive). STS trains on
      ``sts_train_examples`` with ``CosineSimilarityLoss``; NLI trains on
      ``nli_train_examples`` with ``SoftmaxLoss``.
    epochs: Number of passes over the training data.
    bs: Batch size of the shuffled training loader.

  The STS dev set is used as evaluator for both tasks. Warm-up covers one
  tenth of all training steps, and ``evaluation_steps`` is set to one tenth
  of the batches in an epoch.
  """
  task = task.upper()
  assert task in ['STS', 'NLI']

  if task == 'STS':
    examples = sts_train_examples
  else:
    examples = nli_train_examples
  loader = DataLoader(examples, shuffle=True, batch_size=bs)

  if task != 'NLI':
    objective = CosineSimilarityLoss(model=model)
  else:
    objective = SoftmaxLoss(model=model, num_labels=len(nli_label2int))

  dev_evaluator = Evaluator.from_input_examples(dev_examples, name='sts-dev')

  batches_per_epoch = len(loader)
  model.fit(train_objectives=[(loader, objective)],
            epochs=epochs,
            evaluator=dev_evaluator,
            warmup_steps=batches_per_epoch * epochs // 10,
            evaluation_steps=batches_per_epoch // 10,
            output_path=model_save_path)

In [22]:
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [15]:
# Stage 1 (only when TASK mentions NLI): one epoch of NLI classification training.
if 'NLI' in TASK:
  train(model, 'NLI', epochs=1, bs=16)

# Stage 2 (only when TASK mentions STS): STS regression training, with fewer
# epochs when the NLI stage is also part of TASK.
if 'STS' in TASK:
  epochs = 2 if 'NLI' in TASK else 4
  train(model, 'STS', epochs=epochs, bs=16)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1188 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

## Evaluate

In [19]:
# Build an evaluator over the held-out STS test examples and run it on the model.
# The call is the cell's last expression, so its return value is what the cell displays.
test_evaluator = Evaluator.from_input_examples(test_examples, name='sts-test')
test_evaluator(model, output_path=model_save_path)

0.8441489271563647

## Simple Semantic Search

In [20]:
# Default number of hits printed per query.
top_k = 5

def semantic_search(queries, embedder, corpus_embeddings, top_k=top_k,
                    corpus_sentences=None):
  """Print the corpus sentences most similar to each query.

  Args:
    queries: Iterable of query strings.
    embedder: Object whose ``encode(query, convert_to_tensor=True)`` returns
      the query embedding as a tensor.
    corpus_embeddings: Tensor with one embedding row per corpus sentence.
    top_k: Maximum number of hits to print per query. It is clamped to the
      number of corpus embeddings so a small corpus no longer makes
      ``torch.topk`` raise.
    corpus_sentences: Sentences matching the rows of ``corpus_embeddings``.
      Defaults to the notebook-level ``corpus`` variable when omitted.

  Returns:
    None. Results are written to stdout.
  """
  sentences = corpus if corpus_sentences is None else corpus_sentences

  for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus row, then the best k.
    cos_scores = torch.cosine_similarity(query_embedding, corpus_embeddings)
    k = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop %d most similar sentences in corpus:" % k)

    # Convert to plain Python numbers before indexing and formatting.
    hits = zip(top_results.values.tolist(), top_results.indices.tolist())
    for score, idx in hits:
      print(sentences[idx], "(Score: {:.4f})".format(score))


In [32]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip
! unzip "sentiment labelled sentences.zip"

--2021-12-17 22:50:17--  https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84188 (82K) [application/x-httpd-php]
Saving to: ‘sentiment labelled sentences.zip.1’


2021-12-17 22:50:18 (317 KB/s) - ‘sentiment labelled sentences.zip.1’ saved [84188/84188]

Archive:  sentiment labelled sentences.zip
replace sentiment labelled sentences/.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace __MACOSX/sentiment labelled sentences/._.DS_Store? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace sentiment labelled sentences/amazon_cells_labelled.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace sentiment labelled sentences/imdb_labelled.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [36]:
import re

# Build the search corpus from the IMDB file: one entry per line, with the
# trailing run of whitespace and digits removed from each line.
# NOTE(review): the pattern also strips digits that end the sentence itself,
# not only a trailing numeric column - confirm this is acceptable.
with open("sentiment labelled sentences/imdb_labelled.txt",
          encoding='utf8') as f:
    # Raw string: '\s' and '\d' are regex escapes, not Python string escapes.
    corpus = [re.sub(r'[\s\d]+$', '', line) for line in f]

print(len(corpus))
for s in corpus[:5]:
  print(s)

1000
A very, very, very slow-moving, aimless movie about a distressed, drifting young man.
Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.
Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.
Very little music or anything to speak of.
The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.


In [37]:
# Reload the model from the save directory and embed the whole corpus once.
embedder = SBert(model_path=model_save_path)
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Queries are 10 sentences sampled from the corpus itself, so each query also
# appears among the corpus entries being searched. The sample is unseeded and
# changes between runs.
semantic_search(random.sample(corpus, 10), embedder, corpus_embeddings)





Query: I came out of it feeling angry.

Top 5 most similar sentences in corpus:
I came out of it feeling angry. (Score: 1.0000)
What the hell kind of crap is that?! (Score: 0.3636)
I thought it was bad. (Score: 0.3530)
I mean this in a terrible way. (Score: 0.3378)
I saw this movie and I thought this is a stupid movie. (Score: 0.3297)




Query: End of Days is one of the worst big-budget action movies I've ever seen.

Top 5 most similar sentences in corpus:
End of Days is one of the worst big-budget action movies I've ever seen. (Score: 1.0000)
This was one of the worst films i have ever seen. (Score: 0.8118)
One of the most boring,pointless movies I have ever seen. (Score: 0.7391)
This movie is a pure disaster, the story is stupid and the editing is the worst I have seen, it confuses you incredibly. (Score: 0.7239)
Unfortunately, this is a bad movie that is just plain bad. (Score: 0.7166)




Query: It's very slow.

Top 5 most similar sentences in corpus:
It's very slow. (Score: 1