In [1]:
import torch
import torch.nn as nn
from torch import tensor 
from transformers import BertModel, BertTokenizer
import gzip
import pandas as pd

In [2]:
class EmbeddingModel(nn.Module):
    def __init__(self, bertName = "bert-base-uncased"): # other bert models can also be supported
        super().__init__()
        self.bertName = bertName
        # use BERT model
        self.tokenizer = BertTokenizer.from_pretrained(self.bertName)
        self.model = BertModel.from_pretrained(self.bertName)        
       
    def forward(self, s, device = "cuda"):
        # get tokens, which also include attention_mask
        tokens = self.tokenizer(s, return_tensors='pt', padding = "max_length", truncation = True, max_length = 256).to(device)
        
        # get token embeddings
        output = self.model(**tokens)
        tokens_embeddings = output.last_hidden_state
        #print("tokens_embeddings:" + str(tokens_embeddings.shape))
        
        # mean pooling to get text embedding
        embeddings = tokens_embeddings * tokens.attention_mask[...,None] # [B, T, emb]
        #print("embeddings:" + str(embeddings.shape))
        
        embeddings = embeddings.sum(1) # [B, emb]
        valid_tokens = tokens.attention_mask.sum(1) # [B]
        embeddings = embeddings / valid_tokens[...,None] # [B, emb]    
        
        return embeddings

    # from scratch: nn.CosineSimilarity(dim = 1)(q,a)
    def cos_score(self, q, a): 
        q_norm = q / (q.pow(2).sum(dim=1, keepdim=True).pow(0.5))
        r_norm = a / (a.pow(2).sum(dim=1, keepdim=True).pow(0.5))
        return (q_norm @ r_norm.T).diagonal()

In [3]:
# contrastive training
class TrainModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.m = EmbeddingModel("bert-base-uncased")

    def forward(self, s1, s2, score):        
        cos_score = self.m.cos_score(self.m(s1), self.m(s2))
        loss = nn.MSELoss()(cos_score, score)
        return loss, cos_score

In [4]:
training_load=torch.load("myTextEmbedding.pt")
m=training_load.m
#m = EmbeddingModel().to("cuda")

In [5]:
import requests

In [6]:
def searchWiki(s):
    response = requests.get(
            'https://en.wikipedia.org/w/api.php',
            params={
                'action': 'query',
                'format': 'json',
                'titles': s,
                'prop': 'extracts',
                'exintro': True,
                'explaintext': True,
            }
        ).json()
    page = next(iter(response['query']['pages'].values()))
    return page['extract']

In [7]:
concepts = ["vector database", "Similarity search", "Sentence embedding"]
wiki_data = [searchWiki(c).replace("\n","") for c in concepts]

In [8]:
chunk_size = 256 # as EmbeddingModel default context length, i.e.max_length

In [9]:
# fixed size chunking
def chunk(w):
    return [w[i:i+chunk_size] for i in range(0,len(w),chunk_size)]
chunk(wiki_data[0])        
#len(wiki_data[0])

['A vector database management system (VDBMS) or simply vector database or vector store is a database that can store vectors (fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more Approximate Nearest Nei',
 'ghbor (ANN) algorithms, so that one can search the database with a query vector to retrieve the closest matching database records.Vectors are mathematical representations of data in a high-dimensional space. In this space, each dimension corresponds to a f',
 "eature of the data, with the number of dimensions ranging from few hundreds to tens of thousands, depending on the complexity of the data being represented. A vector's position in this space represents its characteristics. Words, phrases, or entire documen",
 'ts, and images, audio, and other types of data can all be vectorized. These feature vectors may be computed from the raw data using machine learning methods such as feature extraction algorithms, word embeddings or 

In [10]:
# sentense chunking
def chunk(w):
    return w.split(".")
chunk(wiki_data[0])   

['A vector database management system (VDBMS) or simply vector database or vector store is a database that can store vectors (fixed-length lists of numbers) along with other data items',
 ' Vector databases typically implement one or more Approximate Nearest Neighbor (ANN) algorithms, so that one can search the database with a query vector to retrieve the closest matching database records',
 'Vectors are mathematical representations of data in a high-dimensional space',
 ' In this space, each dimension corresponds to a feature of the data, with the number of dimensions ranging from few hundreds to tens of thousands, depending on the complexity of the data being represented',
 " A vector's position in this space represents its characteristics",
 ' Words, phrases, or entire documents, and images, audio, and other types of data can all be vectorized',
 ' These feature vectors may be computed from the raw data using machine learning methods such as feature extraction algorithms, word embed

In [11]:
chunk_data = []
for w in wiki_data:
    chunk_data = chunk_data + chunk(w) 

chunk_data = [c.strip()+"." for c in chunk_data]
while '.' in chunk_data:
    chunk_data.remove('.')
chunk_data

['A vector database management system (VDBMS) or simply vector database or vector store is a database that can store vectors (fixed-length lists of numbers) along with other data items.',
 'Vector databases typically implement one or more Approximate Nearest Neighbor (ANN) algorithms, so that one can search the database with a query vector to retrieve the closest matching database records.',
 'Vectors are mathematical representations of data in a high-dimensional space.',
 'In this space, each dimension corresponds to a feature of the data, with the number of dimensions ranging from few hundreds to tens of thousands, depending on the complexity of the data being represented.',
 "A vector's position in this space represents its characteristics.",
 'Words, phrases, or entire documents, and images, audio, and other types of data can all be vectorized.',
 'These feature vectors may be computed from the raw data using machine learning methods such as feature extraction algorithms, word embe

In [12]:
chunk_emb = m(chunk_data)

In [13]:
question = ["what is embedding"]
result_score = m.cos_score(m(question).expand(chunk_emb.shape),chunk_emb)

In [15]:
chunk_emb.shape

torch.Size([35, 768])

In [16]:
m(question).shape

torch.Size([1, 768])

In [17]:
_,idxs = torch.topk(result_score,3)

In [17]:
result_score.flatten()

tensor([0.4591, 0.5320, 0.5176, 0.5351, 0.5761, 0.5203, 0.5421, 0.5658, 0.5095,
        0.5275, 0.5734, 0.5198, 0.5224, 0.5555, 0.5569, 0.5305, 0.6105, 0.5481,
        0.5712, 0.5381, 0.4875, 0.5586, 0.5119, 0.5479, 0.6304, 0.5918, 0.5842,
        0.5537, 0.6217, 0.6240, 0.5312, 0.5952, 0.5661, 0.5491, 0.5608],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [18]:
result_score.flatten()[24]

tensor(0.4386, device='cuda:0', grad_fn=<SelectBackward0>)

In [19]:
idxs.flatten().tolist()

[24, 31, 27]

In [20]:
[chunk_data[idx] for idx in idxs.flatten().tolist()]

['State of the art embeddings are based on the learned hidden layer representation of dedicated sentence transformer models.',
 'An alternative direction is to aggregate word embeddings, such as those returned by Word2vec, into sentence embeddings.',
 "SBERT later achieved superior sentence embedding performance by fine tuning BERT's [CLS] token embeddings through the usage of a siamese neural network architecture on the SNLI dataset."]

In [20]:
[chunk_data[idx] for idx in idxs.flatten().tolist()]

['State of the art embeddings are based on the learned hidden layer representation of dedicated sentence transformer models.',
 'Skip-Thought trains an encoder-decoder structure for the task of neighboring sentences predictions.',
 'Other approaches are loosely based on the idea of distributional semantics applied to sentences.']

In [25]:
def search_document(s, topk=3):
    question = [s]
    result_score = m.cos_score(m(question).expand(chunk_emb.shape),chunk_emb)
    print(result_score)
    _,idxs = torch.topk(result_score,topk)
    print([result_score.flatten()[idx] for idx in idxs.flatten().tolist()])
    return [chunk_data[idx] for idx in idxs.flatten().tolist()]

In [26]:
search_document("what are State of the art embeddings")

tensor([0.1173, 0.1579, 0.1608, 0.2874, 0.1524, 0.2882, 0.3161, 0.2539, 0.1580,
        0.2775, 0.3162, 0.1425, 0.2240, 0.2439, 0.3948, 0.2428, 0.1651, 0.3433,
        0.2615, 0.1924, 0.2228, 0.1930, 0.1392, 0.2549, 0.5884, 0.2730, 0.2193,
        0.3320, 0.3042, 0.2103, 0.3500, 0.3279, 0.1144, 0.3512, 0.3090],
       device='cuda:0', grad_fn=<DiagonalBackward0>)
[tensor(0.5884, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.3948, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.3512, device='cuda:0', grad_fn=<SelectBackward0>)]


['State of the art embeddings are based on the learned hidden layer representation of dedicated sentence transformer models.',
 'This is becoming increasingly important in an age of large information repositories where the objects contained do not possess any natural order, for example large collections of images, sounds and other sophisticated digital objects.',
 'However, more elaborate solutions based on word vector quantization have also been proposed.']

In [22]:
search_document("what are State of the art embeddings")

[tensor(0.7679, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.6845, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.6748, device='cuda:0', grad_fn=<SelectBackward0>)]


['State of the art embeddings are based on the learned hidden layer representation of dedicated sentence transformer models.',
 'This is becoming increasingly important in an age of large information repositories where the objects contained do not possess any natural order, for example large collections of images, sounds and other sophisticated digital objects.',
 'Text documents describing the domain of interest are collected and for each document a feature vector (known as an "embedding") is computed, typically using a deep learning network, and stored in a vector database.']

In [27]:
search_document("what is SISAP foundation")

tensor([ 0.0689,  0.0256,  0.1134,  0.1077,  0.1104,  0.1202, -0.0071,  0.1059,
         0.1150,  0.0014,  0.0659,  0.0246,  0.0981,  0.1139,  0.0393,  0.0760,
         0.0595,  0.0616,  0.0125,  0.0545,  0.0927,  0.1908,  0.5429,  0.1074,
         0.0997,  0.1076,  0.1373,  0.1768,  0.1730,  0.1053,  0.1760,  0.1794,
         0.0870,  0.0704,  0.1369], device='cuda:0',
       grad_fn=<DiagonalBackward0>)
[tensor(0.5429, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.1908, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.1794, device='cuda:0', grad_fn=<SelectBackward0>)]


['This resulted in the formation of the SISAP foundation, whose main activity is a series of annual international conferences on the generic topic.',
 'In 2008 a few leading researchers in the field felt strongly that the subject should be a research topic in its own right, to allow focus on the general issues applicable across the many diverse domains of its use.',
 'An alternative direction is to aggregate word embeddings, such as those returned by Word2vec, into sentence embeddings.']

In [23]:
search_document("what is SISAP foundation")

[tensor(0.5501, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.5260, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.5019, device='cuda:0', grad_fn=<SelectBackward0>)]


['This resulted in the formation of the SISAP foundation, whose main activity is a series of annual international conferences on the generic topic.',
 'Skip-Thought trains an encoder-decoder structure for the task of neighboring sentences predictions.',
 'In 2008 a few leading researchers in the field felt strongly that the subject should be a research topic in its own right, to allow focus on the general issues applicable across the many diverse domains of its use.']

In [28]:
search_document("what are sentence embeddings?")

tensor([0.2074, 0.1640, 0.2663, 0.2546, 0.1525, 0.5036, 0.4202, 0.4270, 0.3133,
        0.2790, 0.3995, 0.2420, 0.4209, 0.3161, 0.2915, 0.2927, 0.2304, 0.1892,
        0.2789, 0.2547, 0.2093, 0.1940, 0.1916, 0.6079, 0.5657, 0.4848, 0.3760,
        0.4391, 0.6059, 0.5217, 0.1623, 0.7555, 0.2698, 0.4316, 0.4551],
       device='cuda:0', grad_fn=<DiagonalBackward0>)
[tensor(0.7555, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.6079, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.6059, device='cuda:0', grad_fn=<SelectBackward0>)]


['An alternative direction is to aggregate word embeddings, such as those returned by Word2vec, into sentence embeddings.',
 'In natural language processing, a sentence embedding refers to a numeric representation of a sentence in the form of a vector of real numbers which encodes meaningful semantic information.',
 'Other approaches are loosely based on the idea of distributional semantics applied to sentences.']

In [24]:
search_document("what are sentence embeddings?")

[tensor(0.7916, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.7334, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.7326, device='cuda:0', grad_fn=<SelectBackward0>)]


['An alternative direction is to aggregate word embeddings, such as those returned by Word2vec, into sentence embeddings.',
 'In natural language processing, a sentence embedding refers to a numeric representation of a sentence in the form of a vector of real numbers which encodes meaningful semantic information.',
 'Skip-Thought trains an encoder-decoder structure for the task of neighboring sentences predictions.']

In [29]:
search_document("what is BERT?")

tensor([0.1262, 0.0500, 0.1203, 0.1252, 0.1615, 0.1370, 0.0351, 0.0883, 0.0687,
        0.0639, 0.0649, 0.0890, 0.0863, 0.0819, 0.0777, 0.1128, 0.0566, 0.1082,
        0.1061, 0.0951, 0.0455, 0.0086, 0.0588, 0.1971, 0.0516, 0.3951, 0.4957,
        0.3823, 0.1412, 0.1151, 0.1914, 0.1348, 0.1328, 0.1268, 0.0983],
       device='cuda:0', grad_fn=<DiagonalBackward0>)
[tensor(0.4957, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.3951, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.3823, device='cuda:0', grad_fn=<SelectBackward0>)]


["In practice however, BERT's sentence embedding with the [CLS] token achieves poor performance, often worse than simply averaging non-contextual word embeddings.",
 'BERT pioneered an approach involving the use of a dedicated [CLS] token prepended to the beginning of each sentence inputted into the model; the final hidden state vector of this token encodes information about the sentence and can be fine-tuned for use in sentence classification tasks.',
 "SBERT later achieved superior sentence embedding performance by fine tuning BERT's [CLS] token embeddings through the usage of a siamese neural network architecture on the SNLI dataset."]

In [30]:
search_document("what is Nearest neighbor search?")

tensor([0.2179, 0.5065, 0.1234, 0.2750, 0.2229, 0.1145, 0.2659, 0.3370, 0.3362,
        0.2488, 0.2640, 0.2444, 0.1571, 0.4875, 0.2580, 0.5563, 0.4745, 0.2181,
        0.3890, 0.4118, 0.3547, 0.1441, 0.1433, 0.1586, 0.1362, 0.1867, 0.2015,
        0.1898, 0.1985, 0.1981, 0.1363, 0.1715, 0.2007, 0.2436, 0.1812],
       device='cuda:0', grad_fn=<DiagonalBackward0>)
[tensor(0.5563, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.5065, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.4875, device='cuda:0', grad_fn=<SelectBackward0>)]


['Nearest neighbor search and range queries are important subclasses of similarity search, and a number of solutions exist.',
 'Vector databases typically implement one or more Approximate Nearest Neighbor (ANN) algorithms, so that one can search the database with a query vector to retrieve the closest matching database records.',
 'Similarity search is the most general term used for a range of mechanisms which share the principle of searching (typically very large) spaces of objects where the only available comparator is the similarity between any pair of objects.']

In [31]:
search_document("how to do similarity search?")

tensor([0.3254, 0.5237, 0.2649, 0.3176, 0.2846, 0.2579, 0.3631, 0.4937, 0.5642,
        0.3998, 0.3535, 0.3247, 0.1850, 0.6416, 0.2925, 0.5873, 0.6323, 0.2343,
        0.5752, 0.5734, 0.5358, 0.2268, 0.1483, 0.2711, 0.1813, 0.2128, 0.1444,
        0.2308, 0.2644, 0.2422, 0.1182, 0.2163, 0.2448, 0.2656, 0.2814],
       device='cuda:0', grad_fn=<DiagonalBackward0>)
[tensor(0.6416, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.6323, device='cuda:0', grad_fn=<SelectBackward0>), tensor(0.5873, device='cuda:0', grad_fn=<SelectBackward0>)]


['Similarity search is the most general term used for a range of mechanisms which share the principle of searching (typically very large) spaces of objects where the only available comparator is the similarity between any pair of objects.',
 'Research in similarity search is dominated by the inherent problems of searching over complex objects.',
 'Nearest neighbor search and range queries are important subclasses of similarity search, and a number of solutions exist.']