In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/14/9d/abacb6f7bb63df39285c55bb51b6403a7fd93ac2aea48b01f6215175446c/sentence-transformers-1.1.1.tar.gz (81kB)
[K     |████████████████████████████████| 81kB 5.6MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 15.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 32.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)

In [None]:
"""
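This script trains sentence transformers with a triplet loss function.
As corpus, we use the wikipedia sections dataset that was described by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks.
NOTE: the training cells below read Yelp pair and triplet CSV files rather than the wikipedia sections dataset, and also train with a softmax loss.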
"""

from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
from zipfile import ZipFile
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator

import csv
import logging
import os

In [None]:
# Send INFO-level records through sentence-transformers' LoggingHandler,
# prefixing every message with a timestamp.
logging.basicConfig(
    handlers=[LoggingHandler()],
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)
logger = logging.getLogger(__name__)

In [None]:
# Load the rated sentence pairs as InputExamples, one per CSV row.
logging.info("Read test dataset")
test_sts_samples = []
with open("/content/sample_data/sentence_pair_unique.csv", 'rt', encoding='utf8') as fIn:
    for record in csv.DictReader(fIn):
        # Divide the 'avg' rating by 4 (presumably a 0-4 scale, giving 0..1 -- TODO confirm).
        scaled_rating = float(record['avg']) / 4.0
        sentence_pair = [record['sent1_text'], record['sent2_text']]
        test_sts_samples.append(InputExample(texts=sentence_pair, label=scaled_rating))

2021-05-09 11:54:01 - Read test dataset


In [None]:
# Baseline: score the pretrained (not yet fine-tuned) model on the rated sentence pairs.
model_name = 'stsb-bert-base'
output_path = "output/training-wikipedia-sections-"+model_name+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model = SentenceTransformer(model_name)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_sts_samples, name='sts-test')
# The evaluator writes its results CSV into output_path. Nothing has created that
# directory at this point (no training run has saved there), which is what raised
# the FileNotFoundError recorded for this cell.
os.makedirs(output_path, exist_ok=True)
test_evaluator(model, output_path=output_path)

FileNotFoundError: ignored

In [None]:
# numpy is not used in this cell; the import is kept only so the `np` name the
# original cell bound stays available to any later code.
import numpy as np
import pandas as pd
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.spatial.distance` has been loaded.
import scipy.spatial.distance
from sklearn.metrics.pairwise import cosine_similarity

# Load the rated sentence pairs into a pandas dataframe.
df_test = pd.read_csv("/content/sample_data/sentence_pair_unique.csv")

# Report the number of sentence pairs (this is evaluation data, not training data).
print('Number of test sentence pairs: {:,}\n'.format(df_test.shape[0]))

# Get the two sides of every pair as plain Python lists.
sent1 = df_test.sent1_text.tolist()
sent2 = df_test.sent2_text.tolist()

# Encode both sides with whatever `model` is currently loaded.
sent1_embeddings = model.encode(sent1)
sent2_embeddings = model.encode(sent2)

# Cosine similarity of each aligned pair, computed twice (SciPy and scikit-learn)
# so the two columns can be cross-checked against each other.
cos_list = []
cos2_list = []
for emb1, emb2 in zip(sent1_embeddings, sent2_embeddings):
    emb1 = emb1.reshape(1, -1)
    emb2 = emb2.reshape(1, -1)
    cos_list.append(1 - scipy.spatial.distance.cdist(emb1, emb2, "cosine")[0][0])
    cos2_list.append(cosine_similarity(emb1, emb2)[0][0])

# One similarity per dataframe row, otherwise the column assignment would misalign.
assert len(cos_list) == len(df_test)
assert len(cos2_list) == len(df_test)

df_test["model_similarity"] = cos_list
df_test["model_similarity2"] = cos2_list

df_test.to_csv("/content/sample_data/sentence_pair_sbert.csv",
               index=None)

Number of training sentences: 1,635



## Siamese Network Training

In [None]:
### Run configuration for the Siamese (sentence-pair) fine-tuning

model_name = 'stsb-bert-base'
train_batch_size = 8
num_epochs = 1
n_train = 3000  # alternative sizes: 1000, 8000
trip_type = "firstsentence"

# Timestamp the output directory name so separate runs get separate folders.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = "output/training-yelp-pair-" + model_name + "-" + run_stamp

model = SentenceTransformer(model_name)

2021-05-12 04:53:45 - Load pretrained SentenceTransformer: stsb-bert-base
2021-05-12 04:53:45 - Did not find folder stsb-bert-base
2021-05-12 04:53:45 - Search model on server: http://sbert.net/models/stsb-bert-base.zip
2021-05-12 04:53:45 - Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_stsb-bert-base
2021-05-12 04:53:47 - Use pytorch device: cuda


In [None]:
# Build one labelled InputExample per row of the Yelp pair CSV chosen by trip_type.
logger.info("Read Yelp Pair train dataset")
train_examples = []
with open("/content/sample_data/yelp_pair_train_10000_" + trip_type + ".csv", encoding="utf-8") as fIn:
    train_examples.extend(
        InputExample(texts=[record['sent1_text'], record['sent2_text']], label=int(record['label']))
        for record in csv.DictReader(fIn)
    )

2021-05-12 04:53:47 - Read Yelp Pair train dataset


In [None]:
# Carve the loaded examples into test / dev / train. The whole right-hand side is
# evaluated before train_examples is rebound, so all three slices see the full list.
# NOTE: re-running this cell without reloading slices the already-truncated list.
test_examples, dev_examples, train_examples = (
    train_examples[:1000],
    train_examples[1000:2000],
    train_examples[2000:2000 + n_train],
)

In [None]:
# Sizes of the test, dev and train splits, in that order.
for split in (test_examples, dev_examples, train_examples):
    print(len(split))

1000
1000
8000


In [None]:
# Shuffled mini-batches of sentence pairs, trained with a 2-label softmax loss.
embedding_dim = model.get_sentence_embedding_dimension()
train_dataloader = DataLoader(train_examples, batch_size=train_batch_size, shuffle=True)
train_loss = losses.SoftmaxLoss(model=model, num_labels=2, sentence_embedding_dimension=embedding_dim)

2021-05-12 04:53:47 - Softmax loss: #Vectors concatenated: 3


In [None]:
logger.info("Read Yelp Pair dev dataset")

# The evaluator name encodes the training size and trip_type of this run.
dev_eval_name = 'yelp-dev_sbertft_' + str(n_train) + "_" + trip_type + "_run1"
evaluator = BinaryClassificationEvaluator.from_input_examples(dev_examples, name=dev_eval_name)

# Warm up over the first 10% of all training steps.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_train_steps * 0.1)

2021-05-12 04:53:47 - Read Yelp Pair dev dataset


In [None]:
# Echo the timestamped output directory that is passed to model.fit below.
print(output_path)

output/training-yelp-pair-stsb-bert-base-2021-05-12_04-53-45


In [None]:
# Fine-tune on the pair objective, running the dev evaluator every 1000 steps;
# output_path is handed to fit() as the save location.
pair_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=pair_objectives,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=output_path,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1000.0, style=ProgressStyle(description_w…

2021-05-12 05:00:33 - Binary Accuracy Evaluation of the model on yelp-dev_sbertft_8000_entirereview_run1 dataset in epoch 0 after 1000 steps:
2021-05-12 05:00:45 - Accuracy with Cosine-Similarity:           88.70	(Threshold: 0.3976)
2021-05-12 05:00:45 - F1 with Cosine-Similarity:                 88.55	(Threshold: 0.2364)
2021-05-12 05:00:45 - Precision with Cosine-Similarity:          86.13
2021-05-12 05:00:45 - Recall with Cosine-Similarity:             91.12
2021-05-12 05:00:45 - Average Precision with Cosine-Similarity:  94.70

2021-05-12 05:00:45 - Accuracy with Manhatten-Distance:           88.60	(Threshold: 359.7927)
2021-05-12 05:00:45 - F1 with Manhatten-Distance:                 88.31	(Threshold: 421.9900)
2021-05-12 05:00:45 - Precision with Manhatten-Distance:          86.22
2021-05-12 05:00:45 - Recall with Manhatten-Distance:             90.50
2021-05-12 05:00:45 - Average Precision with Manhatten-Distance:  94.60

2021-05-12 05:00:45 - Accuracy with Euclidean-Distance:  

In [None]:
##############################################################################
#
# Load the stored model and evaluate its performance on the held-out Yelp pair test split
#
##############################################################################

logger.info("Read test examples")
model = SentenceTransformer(output_path)
# Use the held-out test split here: dev_examples already drove evaluation during
# model.fit, so scoring them again under a "test" name only reproduces the dev numbers.
test_evaluator = BinaryClassificationEvaluator.from_input_examples(test_examples, name='yelp-test_sbertft_' + str(n_train) + "_" + trip_type + "_run1")
test_evaluator(model, output_path=output_path)

2021-05-12 05:00:59 - Read test examples
2021-05-12 05:00:59 - Load pretrained SentenceTransformer: output/training-yelp-pair-stsb-bert-base-2021-05-12_04-53-45
2021-05-12 05:00:59 - Load SentenceTransformer from folder: output/training-yelp-pair-stsb-bert-base-2021-05-12_04-53-45
2021-05-12 05:01:01 - Use pytorch device: cuda
2021-05-12 05:01:01 - Binary Accuracy Evaluation of the model on yelp-test_sbertft_8000_entirereview_run1 dataset:
2021-05-12 05:01:13 - Accuracy with Cosine-Similarity:           88.70	(Threshold: 0.3976)
2021-05-12 05:01:13 - F1 with Cosine-Similarity:                 88.55	(Threshold: 0.2364)
2021-05-12 05:01:13 - Precision with Cosine-Similarity:          86.13
2021-05-12 05:01:13 - Recall with Cosine-Similarity:             91.12
2021-05-12 05:01:13 - Average Precision with Cosine-Similarity:  94.70

2021-05-12 05:01:13 - Accuracy with Manhatten-Distance:           88.60	(Threshold: 359.7927)
2021-05-12 05:01:13 - F1 with Manhatten-Distance:                 

0.9470137769515997

In [None]:
# numpy is not used in this cell; the import is kept only so the `np` name the
# original cell bound stays available to any later code.
import numpy as np
import pandas as pd
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.spatial.distance` has been loaded.
import scipy.spatial.distance
from sklearn.metrics.pairwise import cosine_similarity

# Load the rated sentence pairs into a pandas dataframe.
df_test = pd.read_csv("/content/sample_data/sentence_pair_unique.csv")

# Report the number of sentence pairs (this is evaluation data, not training data).
print('Number of test sentence pairs: {:,}\n'.format(df_test.shape[0]))

# Get the two sides of every pair as plain Python lists.
sent1 = df_test.sent1_text.tolist()
sent2 = df_test.sent2_text.tolist()

# Encode both sides with whatever `model` is currently loaded.
sent1_embeddings = model.encode(sent1)
sent2_embeddings = model.encode(sent2)

# Cosine similarity of each aligned pair, computed twice (SciPy and scikit-learn)
# so the two columns can be cross-checked against each other.
cos_list = []
cos2_list = []
for emb1, emb2 in zip(sent1_embeddings, sent2_embeddings):
    emb1 = emb1.reshape(1, -1)
    emb2 = emb2.reshape(1, -1)
    cos_list.append(1 - scipy.spatial.distance.cdist(emb1, emb2, "cosine")[0][0])
    cos2_list.append(cosine_similarity(emb1, emb2)[0][0])

# One similarity per dataframe row, otherwise the column assignment would misalign.
assert len(cos_list) == len(df_test)
assert len(cos2_list) == len(df_test)

df_test["model_similarity"] = cos_list
df_test["model_similarity2"] = cos2_list

# The file name records the training size and trip_type of the pair run.
df_test.to_csv("/content/sample_data/sentence_pair_sbert_yelp_"+str(n_train)+"_" + trip_type + "_pair_run1.csv",
               index=None)

Number of training sentences: 1,635



HBox(children=(FloatProgress(value=0.0, description='Batches', max=52.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=52.0, style=ProgressStyle(description_width…




## Triplet Network Training

In [None]:
### Run configuration for the triplet fine-tuning

model_name = 'stsb-bert-base'
train_batch_size = 8
num_epochs = 1
n_train = 3000  # alternative sizes: 1000, 8000
trip_margin = 9.0
trip_type = "firstsentence"

# Timestamp the output directory name so separate runs get separate folders.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_path = "output/training-yelp-triplets-" + model_name + "-" + run_stamp

model = SentenceTransformer(model_name)

2021-05-13 03:09:57 - Load pretrained SentenceTransformer: stsb-bert-base
2021-05-13 03:09:57 - Did not find folder stsb-bert-base
2021-05-13 03:09:57 - Search model on server: http://sbert.net/models/stsb-bert-base.zip
2021-05-13 03:09:57 - Downloading sentence transformer model from http://sbert.net/models/stsb-bert-base.zip and saving it at /root/.cache/torch/sentence_transformers/sbert.net_models_stsb-bert-base


HBox(children=(FloatProgress(value=0.0, max=405233603.0), HTML(value='')))


2021-05-13 03:10:26 - Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_stsb-bert-base


Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/sbert.net_models_stsb-bert-base/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2021-05-13 03:10:27 - Use pytorch device: cuda


In [None]:
# Build one (anchor, same, diff) InputExample per row of the Yelp triplet CSV;
# every example gets the constant label 0.
logger.info("Read Triplet train dataset")
train_examples = []
with open("/content/sample_data/yelp_triplets_train_10000_" + trip_type + ".csv", encoding="utf-8") as fIn:
    train_examples.extend(
        InputExample(texts=[record['anchor_text'], record['same_text'], record['diff_text']], label=0)
        for record in csv.DictReader(fIn)
    )

2021-05-13 03:10:27 - Read Triplet train dataset


In [None]:
# Number of examples held in train_examples at this point.
len(train_examples)

10000

In [None]:
# Carve the loaded triplets into test / dev / train. The whole right-hand side is
# evaluated before train_examples is rebound, so all three slices see the full list.
# NOTE: re-running this cell without reloading slices the already-truncated list.
test_examples, dev_examples, train_examples = (
    train_examples[:1000],
    train_examples[1000:2000],
    train_examples[2000:2000 + n_train],
)

In [None]:
# Sizes of the test, dev and train splits, in that order.
for split in (test_examples, dev_examples, train_examples):
    print(len(split))

1000
1000
3000


In [None]:
# Shuffled mini-batches of triplets, trained with a triplet loss at the configured margin.
train_dataloader = DataLoader(train_examples, batch_size=train_batch_size, shuffle=True)
train_loss = losses.TripletLoss(triplet_margin=trip_margin, model=model)

In [None]:
# Number of mini-batches the DataLoader yields per pass over train_examples.
len(train_dataloader)

375

In [None]:
logger.info("Read Yelp Triplet dev dataset")

# The evaluator name encodes the training size, trip_type and integer margin of this run.
dev_eval_name = 'yelp-dev_sbertft_' + str(n_train) + "_" + trip_type + "_margin" + str(int(trip_margin)) + "_run1"
evaluator = TripletEvaluator.from_input_examples(dev_examples, name=dev_eval_name)

# Warm up over the first 10% of all training steps.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_train_steps * 0.1)


2021-05-13 03:10:53 - Read Yelp Triplet dev dataset


In [None]:
# Ask PyTorch to release its cached, currently unused GPU memory before training starts.
import torch
torch.cuda.empty_cache()

In [None]:
# Fine-tune on the triplet objective, running the dev evaluator every 200 steps;
# output_path is handed to fit() as the save location.
triplet_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=triplet_objectives,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=200,
    output_path=output_path,
)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=375.0, style=ProgressStyle(description_wi…

2021-05-13 03:11:53 - TripletEvaluator: Evaluating the model on yelp-dev_sbertft_3000_firstsentence_margin9_run1 dataset in epoch 0 after 200 steps:
2021-05-13 03:11:57 - Accuracy Cosine Distance:   	70.10
2021-05-13 03:11:57 - Accuracy Manhatten Distance:	70.30
2021-05-13 03:11:57 - Accuracy Euclidean Distance:	70.30

2021-05-13 03:11:57 - Save model to output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57

2021-05-13 03:12:38 - TripletEvaluator: Evaluating the model on yelp-dev_sbertft_3000_firstsentence_margin9_run1 dataset after epoch 0:
2021-05-13 03:12:42 - Accuracy Cosine Distance:   	71.20
2021-05-13 03:12:42 - Accuracy Manhatten Distance:	72.10
2021-05-13 03:12:42 - Accuracy Euclidean Distance:	71.90

2021-05-13 03:12:42 - Save model to output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57



In [None]:
##############################################################################
#
# Reload the model saved at output_path and score it on the held-out triplet test split
#
##############################################################################

logger.info("Read test examples")
model = SentenceTransformer(output_path)
test_eval_name = 'yelp-test_sbertft_' + str(n_train) + "_" + trip_type + "_margin" + str(int(trip_margin)) + "_run1"
test_evaluator = TripletEvaluator.from_input_examples(test_examples, name=test_eval_name)
test_evaluator(model, output_path=output_path)

2021-05-13 03:12:44 - Read test examples
2021-05-13 03:12:44 - Load pretrained SentenceTransformer: output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57
2021-05-13 03:12:44 - Load SentenceTransformer from folder: output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57
2021-05-13 03:12:45 - Use pytorch device: cuda
2021-05-13 03:12:45 - TripletEvaluator: Evaluating the model on yelp-test_sbertft_3000_firstsentence_margin9_run1 dataset:
2021-05-13 03:12:50 - Accuracy Cosine Distance:   	71.30
2021-05-13 03:12:50 - Accuracy Manhatten Distance:	70.90
2021-05-13 03:12:50 - Accuracy Euclidean Distance:	70.70



0.713

In [None]:
## Baseline for SBERT ##
# Disabled: uncomment the lines below to score the pretrained model (model_name)
# on the triplet test split.

# logger.info("Read test examples")
# model = SentenceTransformer(model_name)
# test_evaluator = TripletEvaluator.from_input_examples(test_examples, name='yelp-test')
# test_evaluator(model, output_path=output_path)

In [None]:
# numpy is not used in this cell; the import is kept only so the `np` name the
# original cell bound stays available to any later code.
import numpy as np
import pandas as pd
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.spatial.distance` has been loaded.
import scipy.spatial.distance
from sklearn.metrics.pairwise import cosine_similarity

# Reload the model stored at output_path.
model = SentenceTransformer(output_path)

# Load the rated sentence pairs into a pandas dataframe.
df_test = pd.read_csv("/content/sample_data/sentence_pair_unique.csv")

# Report the number of sentence pairs (this is evaluation data, not training data).
print('Number of test sentence pairs: {:,}\n'.format(df_test.shape[0]))

# Get the two sides of every pair as plain Python lists.
sent1 = df_test.sent1_text.tolist()
sent2 = df_test.sent2_text.tolist()

# Encode both sides with the reloaded model.
sent1_embeddings = model.encode(sent1)
sent2_embeddings = model.encode(sent2)

# Cosine similarity of each aligned pair, computed twice (SciPy and scikit-learn)
# so the two columns can be cross-checked against each other.
cos_list = []
cos2_list = []
for emb1, emb2 in zip(sent1_embeddings, sent2_embeddings):
    emb1 = emb1.reshape(1, -1)
    emb2 = emb2.reshape(1, -1)
    cos_list.append(1 - scipy.spatial.distance.cdist(emb1, emb2, "cosine")[0][0])
    cos2_list.append(cosine_similarity(emb1, emb2)[0][0])

# One similarity per dataframe row, otherwise the column assignment would misalign.
assert len(cos_list) == len(df_test)
assert len(cos2_list) == len(df_test)

df_test["model_similarity"] = cos_list
df_test["model_similarity2"] = cos2_list

# The file name records the training size, trip_type and integer margin of the triplet run.
df_test.to_csv("/content/sample_data/sentence_pair_sbert_yelp_"+str(n_train)+"_" + trip_type + "_triplet_margin" + str(int(trip_margin)) + "_run1.csv",
               index=None)

2021-05-13 03:12:50 - Load pretrained SentenceTransformer: output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57
2021-05-13 03:12:50 - Load SentenceTransformer from folder: output/training-yelp-triplets-stsb-bert-base-2021-05-13_03-09-57
2021-05-13 03:12:51 - Use pytorch device: cuda
Number of training sentences: 1,635



HBox(children=(FloatProgress(value=0.0, description='Batches', max=52.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=52.0, style=ProgressStyle(description_width…




In [None]:
import gzip

# Fetch the STS benchmark archive only when it is not already on disk.
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

logging.info("Read STSbenchmark train dataset")
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
# Rows marked 'dev' or 'test' go to their own list; every other split value is treated as train.
samples_by_split = {'dev': dev_sts_samples, 'test': test_sts_samples}
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for record in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Divide the score by 5 (presumably a 0-5 scale, giving 0..1 -- TODO confirm).
        scaled_score = float(record['score']) / 5.0
        sample = InputExample(texts=[record['sentence1'], record['sentence2']], label=scaled_score)
        samples_by_split.get(record['split'], train_sts_samples).append(sample)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################

model = SentenceTransformer(output_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_sts_samples, name='sts-test')
test_evaluator(model, output_path=output_path)


2021-05-03 02:44:58 - Read STSbenchmark train dataset
2021-05-03 02:44:58 - Load pretrained SentenceTransformer: output/training-wikipedia-sections-distilbert-base-uncased-2021-05-03_01-58-18
2021-05-03 02:44:58 - Load SentenceTransformer from folder: output/training-wikipedia-sections-distilbert-base-uncased-2021-05-03_01-58-18
2021-05-03 02:44:59 - Use pytorch device: cuda
2021-05-03 02:44:59 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2021-05-03 02:45:01 - Cosine-Similarity :	Pearson: 0.5180	Spearman: 0.5217
2021-05-03 02:45:01 - Manhattan-Distance:	Pearson: 0.5411	Spearman: 0.5348
2021-05-03 02:45:01 - Euclidean-Distance:	Pearson: 0.5412	Spearman: 0.5353
2021-05-03 02:45:01 - Dot-Product-Similarity:	Pearson: 0.3125	Spearman: 0.2936


0.5353330741570357

In [None]:
# Number of examples currently held in test_sts_samples.
len(test_sts_samples)

1635