In [1]:
import numpy as np
from texttable import Texttable

from dataset_handler import *
from semantic_search import *

In [2]:
# `os` is used below but was never imported explicitly in this notebook; it was
# only reachable if one of the star imports above happened to expose it.
import os

# Directory the dataset is loaded from, and directory prefix under which the
# search indices are stored; both are relative to the working directory.
dataset_dir = os.path.join('data', 'dataset')
indices_dir = os.path.join('data', 'indices')

In [3]:
def get_search_results(query_embeddings, index, num_threads=2):
    """Look up the single nearest indexed item for every query embedding.

    Args:
        query_embeddings: Batch of query vectors, handed unchanged to
            ``index.knnQueryBatch``.
        index: Object exposing ``knnQueryBatch(queries, k=..., num_threads=...)``
            that yields one ``(ids, scores)`` pair per query (the notebook
            passes an nmslib index).
        num_threads: Thread count forwarded to ``knnQueryBatch``. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        ``(output_ids, match_scores)``: two lists with one entry per query,
        holding the best hit's position in the index shifted by +1 and the
        score the index reported for that hit.

    NOTE(review): the indices built in this notebook report
    ``space='cosinesimil'``; nmslib presumably returns a distance for that
    space, so a *lower* match score would mean a closer match - confirm.
    """
    neighbors = index.knnQueryBatch(query_embeddings, k=1, num_threads=num_threads)

    output_ids = []
    match_scores = []
    for neighbor in neighbors:
        # +1 turns the 0-based index position into a 1-based id; presumably
        # clause ids start at 1 and follow insertion order - confirm.
        output_ids.append(neighbor[0][0] + 1)
        match_scores.append(neighbor[1][0])

    return output_ids, match_scores

In [4]:
def get_accuracy(true_ids, output_ids):
    """Return the fraction of positions at which the two id sequences agree.

    Args:
        true_ids: Sequence of expected ids.
        output_ids: Sequence of predicted ids, aligned element-wise with
            ``true_ids``.

    Returns:
        Share of matching positions as a float; NaN when both inputs are
        empty (there is nothing to score).

    Raises:
        ValueError: If the two inputs do not have the same shape. Without
            this check a length mismatch either produced a silent 0.0 or an
            opaque comparison error, depending on the NumPy version.
    """
    expected = np.asarray(true_ids)
    predicted = np.asarray(output_ids)

    if expected.shape != predicted.shape:
        raise ValueError(
            "true_ids and output_ids must have the same shape, got "
            f"{expected.shape} and {predicted.shape}"
        )

    n = len(true_ids)
    if n == 0:
        # Avoid the 0/0 division (and its RuntimeWarning); the result stays NaN.
        return np.float64(np.nan)

    acc = np.sum(expected == predicted) / n

    return acc

In [5]:
def print_result(clause_ids, output_ids, match_scores):
    """Print a text table with one line per query.

    Args:
        clause_ids: Expected clause id for each query.
        output_ids: Id returned by the search for each query.
        match_scores: Score returned by the search for each query.

    Returns:
        None. The table is written to standard output.
    """
    column_titles = ['clause_id', 'output_id', 'match_score']
    data_rows = list(zip(clause_ids, output_ids, match_scores))

    result_table = Texttable()
    # The titles are added as an ordinary first row (header=False), not as a
    # Texttable header, so they are drawn like every other row.
    result_table.add_rows([column_titles] + data_rows, header=False)
    print(result_table.draw())

In [6]:
# Load the dataset. Only `query_clauses` is used in the cells shown here: a
# table with one row per query (columns id, clause_id, query, clause in the
# preview below).
clauses_dict, query_clauses = load_dataset(dataset_dir=dataset_dir)
print(query_clauses.shape)
query_clauses.head()

(49, 4)


Unnamed: 0,id,clause_id,query,clause
0,100,1,Remove any major changes to house before leaving.,The tenant shall at the termination of this ag...
1,200,2,Take license if you're carrying out business.,The tenant shall himself obtain the license fo...
2,201,2,Make sure you're having a license if doing som...,The tenant shall himself obtain the license fo...
3,300,3,I won't provide any insurance or security cover.,All kinds of security arrangements insurances ...
4,400,4,Stay good with neighbors.,The tenant shall keep good relationship with n...


In [7]:
# Split the query table into two parallel lists: the queries to search with
# and the clause_id each query is expected to retrieve.
queries = query_clauses['query'].tolist()
clause_ids = query_clauses['clause_id'].tolist()
len(queries), len(clause_ids)

(49, 49)

In [8]:
# Load the clauses that get embedded and indexed for every model below
# (45 in the recorded run).
clauses = load_clauses(dataset_dir=dataset_dir)
len(clauses)

45

## For SRoBERTa-NLI-STSb

In [13]:
# Load the tokenizer/model pair using load_models' default model (the recorded
# output shows sentence-transformers/roberta-base-nli-stsb-mean-tokens).
# NOTE(review): the trailing expression prints the full model repr; showing
# only the class names would keep the output short.
tokenizer, model = load_models()
tokenizer, model

(PreTrainedTokenizerFast(name_or_path='sentence-transformers/roberta-base-nli-stsb-mean-tokens', vocab_size=50262, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}),
 RobertaModel(
   (embeddings): RobertaEmbeddings(
     (word_embeddings): Embedding(50265, 768, padding_idx=1)
     (position_embeddings): Embedding(514, 768, padding_idx=1)
     (token_type_embeddings): Embedding(1, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): RobertaEncoder(
     (layer): ModuleList(
       (0): RobertaLayer(
         (attention): RobertaAttention(
           (self): RobertaSelfAttention(
             (query): Linear(in_features=768, out

In [14]:
# Embed every clause with the current tokenizer/model (one row per clause).
clause_embeddings = get_embeddings(clauses, tokenizer, model)
clause_embeddings.shape

torch.Size([45, 768])

In [15]:
# Build an index over the clause embeddings under indices_dir, then load it
# back by the same name for querying (presumably a round trip through disk -
# helper bodies are not shown here).
create_and_store_index(clause_embeddings, name=os.path.join(indices_dir, 'roberta_base_nli_stsb'))
index = load_index(name=os.path.join(indices_dir, 'roberta_base_nli_stsb'))
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x5640a0cc01f0>

In [16]:
# Embed the queries with the same tokenizer/model used for the clauses.
query_embeddings = get_embeddings(queries, tokenizer, model)
query_embeddings.shape

torch.Size([49, 768])

In [17]:
# Retrieve the top-1 hit for every query and tabulate expected id vs. returned id.
output_ids, match_scores = get_search_results(query_embeddings, index)
print_result(clause_ids, output_ids, match_scores)

+-----------+-----------+-------------+
| clause_id | output_id | match_score |
+-----------+-----------+-------------+
| 1         | 44        | 0.397       |
+-----------+-----------+-------------+
| 2         | 2         | 0.365       |
+-----------+-----------+-------------+
| 2         | 2         | 0.423       |
+-----------+-----------+-------------+
| 3         | 12        | 0.474       |
+-----------+-----------+-------------+
| 4         | 4         | 0.100       |
+-----------+-----------+-------------+
| 5         | 5         | 0.272       |
+-----------+-----------+-------------+
| 6         | 6         | 0.268       |
+-----------+-----------+-------------+
| 7         | 36        | 0.253       |
+-----------+-----------+-------------+
| 8         | 8         | 0.478       |
+-----------+-----------+-------------+
| 9         | 9         | 0.325       |
+-----------+-----------+-------------+
| 10        | 10        | 0.184       |
+-----------+-----------+-------------+


In [18]:
# Top-1 accuracy: share of queries whose returned id equals the expected clause_id.
get_accuracy(clause_ids, output_ids)

0.7346938775510204

## For ALBERT

In [9]:
# Rebind tokenizer/model to albert-base-v2.
# NOTE(review): the trailing expression prints the full model repr; showing
# only the class names would keep the output short.
tokenizer, model = load_models(model_name='albert-base-v2')
tokenizer, model

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(PreTrainedTokenizerFast(name_or_path='albert-base-v2', vocab_size=30000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False)}),
 AlbertModel(
   (embeddings): AlbertEmbeddings(
     (word_embeddings): Embedding(30000, 128, padding_idx=0)
     (position_embeddings): Embedding(512, 128)
     (token_type_embeddings): Embedding(2, 128)
     (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0, inplace=False)
   )
   (encoder): AlbertTransformer(
     (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
     (albert_layer_groups): ModuleList(
       (0): AlbertLayerGroup(
         (albert_layers): ModuleList(
           (0): AlbertLayer(
             (full_layer_layer_norm

In [10]:
# Embed every clause with the ALBERT tokenizer/model (one row per clause).
clause_embeddings = get_embeddings(clauses, tokenizer, model)
clause_embeddings.shape

torch.Size([45, 768])

In [11]:
# Build the ALBERT index under indices_dir, then load it back by the same name.
create_and_store_index(clause_embeddings, name=os.path.join(indices_dir, 'albertv2'))
index = load_index(name=os.path.join(indices_dir, 'albertv2'))
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x55dd0b6c5b90>

In [12]:
# Embed the queries with the same ALBERT tokenizer/model used for the clauses.
query_embeddings = get_embeddings(queries, tokenizer, model)
query_embeddings.shape

torch.Size([49, 768])

In [13]:
# Retrieve the top-1 hit for every query and tabulate expected id vs. returned id.
output_ids, match_scores = get_search_results(query_embeddings, index)
print_result(clause_ids, output_ids, match_scores)

+-----------+-----------+-------------+
| clause_id | output_id | match_score |
+-----------+-----------+-------------+
| 1         | 41        | 0.146       |
+-----------+-----------+-------------+
| 2         | 3         | 0.153       |
+-----------+-----------+-------------+
| 2         | 3         | 0.136       |
+-----------+-----------+-------------+
| 3         | 3         | 0.151       |
+-----------+-----------+-------------+
| 4         | 4         | 0.150       |
+-----------+-----------+-------------+
| 5         | 4         | 0.117       |
+-----------+-----------+-------------+
| 6         | 41        | 0.127       |
+-----------+-----------+-------------+
| 7         | 38        | 0.093       |
+-----------+-----------+-------------+
| 8         | 41        | 0.138       |
+-----------+-----------+-------------+
| 9         | 3         | 0.157       |
+-----------+-----------+-------------+
| 10        | 41        | 0.134       |
+-----------+-----------+-------------+


In [14]:
# Top-1 accuracy for ALBERT: share of queries whose returned id equals the expected clause_id.
get_accuracy(clause_ids, output_ids)

0.3673469387755102

## For Universal Sentence Encoder

In [15]:
# NOTE(review): this import sits mid-notebook; moving it to the first cell
# would keep all dependencies in one place.
from tensorflow_hub import load
# TF-Hub handle of the Universal Sentence Encoder (large, version 5).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
# Rebinds the global `model` name (also used by the other sections) to the
# loaded TF-Hub module; `embed` below reads this global.
model = load( module_url )


INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 100.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 190.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 270.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 340.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 390.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 430.04MB
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder-large/5: 510.04MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.


In [17]:
def embed(input):
    """Call the globally bound ``model`` on ``input`` and return its result unchanged.

    Args:
        input: Whatever the current ``model`` accepts; the notebook passes
            lists of strings.

    Returns:
        The value produced by ``model(input)``.
    """
    embeddings = model(input)
    return embeddings

# Embed every clause with the Universal Sentence Encoder
# (512 dimensions per clause in the recorded output).
clause_embeddings = embed(clauses)
clause_embeddings.shape

TensorShape([45, 512])

In [18]:
# Build the Universal Sentence Encoder index under indices_dir, then load it back by the same name.
create_and_store_index(clause_embeddings, name=os.path.join(indices_dir, 'UniversalSentenceEncoderLarge'))
index = load_index(name=os.path.join(indices_dir, 'UniversalSentenceEncoderLarge'))
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x55dd35d0edf0>

In [20]:
# Embed the queries with the same encoder used for the clauses.
query_embeddings = embed(queries)
query_embeddings.shape

TensorShape([49, 512])

In [21]:
# Retrieve the top-1 hit for every query and tabulate expected id vs. returned id.
output_ids, match_scores = get_search_results(query_embeddings, index)
print_result(clause_ids, output_ids, match_scores)

+-----------+-----------+-------------+
| clause_id | output_id | match_score |
+-----------+-----------+-------------+
| 1         | 33        | 0.727       |
+-----------+-----------+-------------+
| 2         | 2         | 0.518       |
+-----------+-----------+-------------+
| 2         | 2         | 0.656       |
+-----------+-----------+-------------+
| 3         | 26        | 0.626       |
+-----------+-----------+-------------+
| 4         | 4         | 0.345       |
+-----------+-----------+-------------+
| 5         | 5         | 0.662       |
+-----------+-----------+-------------+
| 6         | 6         | 0.621       |
+-----------+-----------+-------------+
| 7         | 7         | 0.609       |
+-----------+-----------+-------------+
| 8         | 8         | 0.545       |
+-----------+-----------+-------------+
| 9         | 9         | 0.786       |
+-----------+-----------+-------------+
| 10        | 10        | 0.488       |
+-----------+-----------+-------------+


In [22]:
# Top-1 accuracy for the Universal Sentence Encoder: share of queries whose returned id equals the expected clause_id.
get_accuracy(clause_ids, output_ids)



0.8367346938775511

## For BERT

In [14]:
# Rebind tokenizer/model to bert-base-uncased.
# NOTE(review): the trailing expression prints the full model repr; showing
# only the class names would keep the output short.
tokenizer, model = load_models(model_name='bert-base-uncased')
tokenizer, model

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

(PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}),
 BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 768, padding_idx=0)
     (position_embeddings): Embedding(512, 768)
     (token_type_embeddings): Embedding(2, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0): BertLayer(
         (attention): BertAttention(
           (self): BertSelfAttention(
             (query): Linear(in_features=768, out_features=768, bias=True)
             (key): Linear(in_features=768, out_features=768, bias=True)
             (value): Linear(in_features=768, out_features=768, bias=True)
             (dropout): Dropout(p=0.1, 

In [15]:
# Embed every clause with the BERT tokenizer/model (one row per clause).
clause_embeddings = get_embeddings(clauses, tokenizer, model)
clause_embeddings.shape

torch.Size([45, 768])

In [16]:
# Build the BERT index under indices_dir, then load it back by the same name.
create_and_store_index(clause_embeddings, name=os.path.join(indices_dir, 'bert-base'))
index = load_index(name=os.path.join(indices_dir, 'bert-base'))
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x561879205a70>

In [17]:
# Embed the queries with the same BERT tokenizer/model used for the clauses.
query_embeddings = get_embeddings(queries, tokenizer, model)
query_embeddings.shape

torch.Size([49, 768])

In [18]:
# Retrieve the top-1 hit for every query and tabulate expected id vs. returned id.
output_ids, match_scores = get_search_results(query_embeddings, index)
print_result(clause_ids, output_ids, match_scores)

+-----------+-----------+-------------+
| clause_id | output_id | match_score |
+-----------+-----------+-------------+
| 1         | 41        | 0.264       |
+-----------+-----------+-------------+
| 2         | 2         | 0.307       |
+-----------+-----------+-------------+
| 2         | 23        | 0.303       |
+-----------+-----------+-------------+
| 3         | 23        | 0.296       |
+-----------+-----------+-------------+
| 4         | 4         | 0.248       |
+-----------+-----------+-------------+
| 5         | 4         | 0.354       |
+-----------+-----------+-------------+
| 6         | 41        | 0.272       |
+-----------+-----------+-------------+
| 7         | 25        | 0.195       |
+-----------+-----------+-------------+
| 8         | 22        | 0.310       |
+-----------+-----------+-------------+
| 9         | 9         | 0.311       |
+-----------+-----------+-------------+
| 10        | 10        | 0.230       |
+-----------+-----------+-------------+


In [19]:
# Top-1 accuracy for BERT: share of queries whose returned id equals the expected clause_id.
get_accuracy(clause_ids, output_ids)

0.4897959183673469

## For RoBERTa

In [9]:
# Rebind tokenizer/model to roberta-base.
# NOTE(review): the trailing expression prints the full model repr; showing
# only the class names would keep the output short.
tokenizer, model = load_models(model_name='roberta-base')
tokenizer, model

(PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}),
 RobertaModel(
   (embeddings): RobertaEmbeddings(
     (word_embeddings): Embedding(50265, 768, padding_idx=1)
     (position_embeddings): Embedding(514, 768, padding_idx=1)
     (token_type_embeddings): Embedding(1, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): RobertaEncoder(
     (layer): ModuleList(
       (0): RobertaLayer(
         (attention): RobertaAttention(
           (self): RobertaSelfAttention(
             (query): Linear(in_features=768, out_features=768, bias=True)
             (key): Linear(in_features=768, o

In [10]:
# Embed every clause with the RoBERTa tokenizer/model (one row per clause).
clause_embeddings = get_embeddings(clauses, tokenizer, model)
clause_embeddings.shape

torch.Size([45, 768])

In [11]:
# Build the RoBERTa index under indices_dir, then load it back by the same name.
create_and_store_index(clause_embeddings, name=os.path.join(indices_dir, 'roberta-base'))
index = load_index(name=os.path.join(indices_dir, 'roberta-base'))
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x55b8acfa5e30>

In [12]:
# Embed the queries with the same RoBERTa tokenizer/model used for the clauses.
query_embeddings = get_embeddings(queries, tokenizer, model)
query_embeddings.shape

torch.Size([49, 768])

In [13]:
# Retrieve the top-1 hit for every query and tabulate expected id vs. returned id.
output_ids, match_scores = get_search_results(query_embeddings, index)
print_result(clause_ids, output_ids, match_scores)

+-----------+-----------+-------------+
| clause_id | output_id | match_score |
+-----------+-----------+-------------+
| 1         | 37        | 0.025       |
+-----------+-----------+-------------+
| 2         | 4         | 0.025       |
+-----------+-----------+-------------+
| 2         | 4         | 0.028       |
+-----------+-----------+-------------+
| 3         | 37        | 0.023       |
+-----------+-----------+-------------+
| 4         | 4         | 0.023       |
+-----------+-----------+-------------+
| 5         | 5         | 0.025       |
+-----------+-----------+-------------+
| 6         | 41        | 0.026       |
+-----------+-----------+-------------+
| 7         | 19        | 0.018       |
+-----------+-----------+-------------+
| 8         | 4         | 0.025       |
+-----------+-----------+-------------+
| 9         | 3         | 0.026       |
+-----------+-----------+-------------+
| 10        | 39        | 0.020       |
+-----------+-----------+-------------+


In [14]:
# Top-1 accuracy for RoBERTa: share of queries whose returned id equals the expected clause_id.
get_accuracy(clause_ids, output_ids)

0.2653061224489796