In [23]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


# Sentences we want sentence embeddings for
sentences = [
    'A hamburger costs 10 dollars',
    'A hotdog costs 5 dollars',
    'A pizza costs 15 dollars',
    'A taco costs 2 dollars',
    'A soda costs 1 dollar',
    "we don't sell coke",
]

# Single source of truth for the checkpoint id, so the tokenizer and the model
# are always loaded from the same repository.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

In [61]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable whose first element holds the token embeddings.
        attention_mask: Mask with one entry per token; it is broadcast over the
            embedding axis and cast to float before weighting.

    Returns:
        The mask-weighted sum over dim 1 divided by the per-feature mask total.
        The divisor is clamped to at least 1e-9, so a fully masked sequence
        does not divide by zero.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the embeddings' shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts

def get_sentences_embeddings(sentences: list):
    """Embed `sentences` with the module-level `tokenizer` and `model`.

    The batch is tokenized with padding and truncation, run through the model
    without gradient tracking, mean-pooled using the attention mask, and
    L2-normalized along dim 1.

    Args:
        sentences: The texts to embed.

    Returns:
        The normalized pooled embeddings (presumably one row per sentence).
    """
    batch = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only: gradients are not needed.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)

In [80]:
from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

def insert_embeddings(
    embeddings_dict: dict,
    db_url: str = 'postgresql://postgres:postgres@localhost:5432/vector_db',
):
    """Insert every entry of `embeddings_dict` into `business_embeddings`.

    Args:
        embeddings_dict: Mapping whose values are dicts with an 'embedding'
            key and a 'content' key. The mapping's own keys are ignored.
        db_url: SQLAlchemy connection URL. The default targets a local
            development database; pass a URL built from environment or secret
            configuration for anything else, rather than hard-coding it.

    Returns:
        None. All rows are written in one transaction and committed together.
    """
    rows = [
        dict(vector=value['embedding'], content=value['content'])
        for value in embeddings_dict.values()
    ]
    if not rows:
        # Nothing to write: skip opening a connection.
        return

    # Built once, not once per row.
    query = text("""
                INSERT INTO business_embeddings (embedding, content) VALUES
                (:vector, :content)
            """)

    engine = create_engine(db_url)
    try:
        with Session(engine) as session:
            # A list of parameter dicts makes SQLAlchemy run an executemany.
            session.execute(query, rows)
            session.commit()
    finally:
        # The engine only lives for this call; release its pooled connections.
        engine.dispose()

In [62]:
def transform_embeddings_to_str(embeddings: torch.Tensor):
    """Serialize each row of a 2-D tensor as the `str()` of its Python list.

    Args:
        embeddings: Tensor iterated along its first dimension; each row is
            expected to be a 1-D tensor of scalars.

    Returns:
        A list with one string per row, e.g. ``'[1.0, 2.5]'``. An input with
        zero rows yields an empty list.
    """
    # Tensor.tolist() converts a whole row to Python scalars in one call,
    # instead of building a 0-d tensor per element and calling .item() on it.
    return [str(row.tolist()) for row in embeddings]

In [63]:
# Map a running index to each sentence's serialized embedding and its original text.
embeddings_dict = {
    position: {'embedding': vector_str, 'content': sentence}
    for position, (vector_str, sentence) in enumerate(
        zip(transform_embeddings_to_str(get_sentences_embeddings(sentences)), sentences)
    )
}

In [66]:
# Inspect the serialized embedding stored for the first sentence (index 0).
embeddings_dict[0]['embedding']

'[-0.04257448762655258, 0.0365988090634346, -0.0030170485842972994, -0.028192119672894478, 0.013431346043944359, 0.0034944803919643164, 0.0211520716547966, 0.08243482559919357, 0.04188685864210129, 0.06088932231068611, 0.045378293842077255, -0.04801978915929794, -0.023994581773877144, 0.003970183897763491, -0.04590587317943573, -0.11710409075021744, 0.08210884779691696, -0.0015061248559504747, -0.012893849052488804, -0.012492349371314049, -0.021876046434044838, -0.03169425576925278, -0.05823637917637825, 0.04573022201657295, 0.045724429190158844, -0.07080163806676865, 0.05294065177440643, -0.08163420110940933, -0.05111908167600632, 0.004920905455946922, -0.022669702768325806, -0.07914390414953232, -0.08523766696453094, 0.05273035541176796, -0.0005112505168654025, -0.11234196275472641, 0.08515442907810211, 0.016209090128540993, -0.09812614321708679, 0.02992318384349346, 0.011334304697811604, -0.0033304214011877775, 0.01637197472155094, -0.08399442583322525, 0.0514669194817543, -0.016521

In [81]:
# Persist the embeddings. NOTE: insert_embeddings issues plain INSERTs, so
# re-running this cell sends the same rows to the table again.
insert_embeddings(embeddings_dict)

In [91]:
def retrieve_embeddings(
    p_embedding: torch.Tensor,
    limit: int = 2,
    db_url: str = 'postgresql://postgres:postgres@localhost:5432/vector_db',
):
    """Fetch the stored rows closest to a query embedding.

    Rows of `business_embeddings` are ordered by `<query> <-> embedding` and
    the first `limit` are returned. NOTE(review): `<->` is presumably
    pgvector's Euclidean-distance operator; confirm against the table schema.

    Args:
        p_embedding: Tensor of query embeddings. Only its first row is used.
        limit: Maximum number of rows to return (default 2).
        db_url: SQLAlchemy connection URL. The default targets a local
            development database; pass a URL built from environment or secret
            configuration for anything else, rather than hard-coding it.

    Returns:
        The list of result rows from `fetchall()`, in ascending `<->` order.
    """
    # Keep the tensor parameter and its serialized form under separate names.
    query_vector = transform_embeddings_to_str(p_embedding)[0]
    query = text("""
            SELECT * FROM business_embeddings ORDER BY :p_embedding <-> embedding LIMIT :limit;
        """)

    engine = create_engine(db_url)
    try:
        with Session(engine) as session:
            # fetchall() materializes the rows before the session closes.
            return session.execute(
                query, dict(p_embedding=query_vector, limit=limit)
            ).fetchall()
    finally:
        # The engine only lives for this call; release its pooled connections.
        engine.dispose()

In [104]:
# Embed the question, then fetch the stored rows that retrieve_embeddings ranks first for it.
prompt = 'how much does a burger cost?'
result_embeddings = retrieve_embeddings(get_sentences_embeddings([prompt]))

In [105]:
# Third column of the top-ranked row. The query is `SELECT *`, so index 2 being the
# content text depends on the table's column order (presumably id, embedding, content;
# confirm against the schema).
result_embeddings[0][2]

'A hamburger costs 10 dollars'

In [45]:
import torch.nn.functional as F
import torch

# Small float32 scratch tensor: four rows, two columns.
t = torch.tensor(
    [
        [2.0, 5.0],
        [35.0, 5.0],
        [440.0, 5.0],
        [-5.0, 4.0],
    ],
    dtype=torch.float32,
)
t.shape

torch.Size([4, 2])

In [46]:
# Show the tensor's values.
t

tensor([[  2.,   5.],
        [ 35.,   5.],
        [440.,   5.],
        [ -5.,   4.]])

In [53]:
# dim=0 normalizes down each column: every entry of a 2x2 all-ones float tensor
# becomes 1 / sqrt(1^2 + 1^2), approx. 0.7071.
F.normalize(torch.ones(2, 2), dim=0)

tensor([[0.7071, 0.7071],
        [0.7071, 0.7071]])