In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from unidecode import unidecode
import pandas as pd

from sklearn.metrics.pairwise import euclidean_distances

In [2]:
# Stopwords are read from a local text file into a NumPy array of strings;
# `preprocess` tests membership against this array.
stopwords = np.loadtxt(
    fname="romanian_stopwords.txt",
    dtype=str,
)

In [3]:
# Two pretrained BERT checkpoints are loaded side by side: a Romanian-specific
# cased model and the generic multilingual cased model.
#   https://github.com/dumitrescustefan/Romanian-Transformers
#   https://huggingface.co/dumitrescustefan/bert-base-romanian-cased-v1
ROMANIAN_BERT = "dumitrescustefan/bert-base-romanian-cased-v1"
MULTILINGUAL_BERT = "bert-base-multilingual-cased"

# Romanian-specific tokenizer/model pair.
tokenizer = AutoTokenizer.from_pretrained(ROMANIAN_BERT)
model = AutoModel.from_pretrained(ROMANIAN_BERT)

# Multilingual tokenizer/model pair (the one used for the embeddings below).
tokenizer2 = AutoTokenizer.from_pretrained(MULTILINGUAL_BERT)
model2 = AutoModel.from_pretrained(MULTILINGUAL_BERT)

Some weights of the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initiali

In [4]:
# Sample corpus of ten Romanian ingredient lists. The first five read like
# pizza descriptions and the last five like pasta dishes (judging by the text).
docs = [
    "Sos roșii, mozzarella, sare de mare, blat crocant la foc cu lemne",
    "Sos de roșii, mozzarella, șuncă, porumb, măsline, ciuperci, condimente",
    "Sos de roșii, mozzarella, salam picant, gorgonzola, hribi",
    "Sos de roșii, mozzarella, salam, condimente",
    "Sos de roșii, mozzarella, salam picant, gorgonzola, ceapă roșie, peperoncini",
    "Penne, sos de roșii, carne de vită, condimente",
    "Penne sau spaghete, bacon, parmezan, smântână dulce, condimente",
    "Paste de casă cu sos de roșii picant ( roșii cherry, green chilli prospăt trase la tigaie în ulei de măsline extravirgin, fierte cu pulpă de roșii și fulgi de peperoncino)",
    "Spaghetti (normale/integrale), somon, smantana lichida, marar, parmezan, sare, piper",
    "Tagliatelle, pui, usturoi, ciuperci, baby spanac, parmesan, smântână lichidă, sare, piper",
]

# Keep the individual names available for cells that refer to a single document.
doc1, doc2, doc3, doc4, doc5, doc6, doc7, doc8, doc9, doc10 = docs

In [5]:
# `stopwords` is already loaded in an earlier cell, so the file is not re-read here.
def preprocess(tokens):
    """Normalise a raw ingredient string before it is embedded.

    The text is transliterated to ASCII with ``unidecode``, lower-cased and
    split on whitespace. Non-alphabetic characters act as word separators,
    and every word found in the module-level ``stopwords`` collection is
    dropped.

    Args:
        tokens: Raw document text (a single string, despite the name).

    Returns:
        The surviving words joined by single spaces, or ``""`` when nothing
        survives.
    """
    kept = []
    for raw in unidecode(tokens).lower().split():
        # Whole-token check first, so any stopword entry that itself contains
        # punctuation still matches before the token is broken apart.
        if raw in stopwords:
            continue
        # Every non-letter becomes a separator.
        cleaned = ''.join(c if c.isalpha() else ' ' for c in raw)
        # Splitting again discards the empty pieces left by punctuation-only
        # tokens such as "(", which would otherwise be emitted as empty words
        # and show up as doubled spaces. Each piece is re-checked so that a
        # stopword followed by punctuation ("de,") is filtered like "de".
        kept.extend(word for word in cleaned.split() if word not in stopwords)
    return " ".join(kept)

# Clean every document; the result stays index-aligned with `docs`.
preprocess_docs = [preprocess(text) for text in docs]
preprocess_docs

['sos rosii mozzarella sare mare blat crocant foc lemne',
 'sos rosii mozzarella sunca porumb masline ciuperci condimente',
 'sos rosii mozzarella salam picant gorgonzola hribi',
 'sos rosii mozzarella salam condimente',
 'sos rosii mozzarella salam picant gorgonzola ceapa rosie peperoncini',
 'penne sos rosii carne vita condimente',
 'penne spaghete bacon parmezan smantana dulce condimente',
 'paste casa sos rosii picant  rosii cherry green chilli prospat trase tigaie ulei masline extravirgin fierte pulpa rosii fulgi peperoncino',
 'spaghetti normale integrale somon smantana lichida marar parmezan sare piper',
 'tagliatelle pui usturoi ciuperci baby spanac parmesan smantana lichida sare piper']

In [6]:
# Build one fixed-size vector per cleaned document by mean-pooling the
# multilingual BERT token vectors (special tokens included).
embeddings = []
# Inference only: disabling autograd avoids building a graph for every
# document and makes the explicit .detach() unnecessary.
with torch.no_grad():
    for doc in preprocess_docs:
        input_ids = tokenizer2.encode(doc, add_special_tokens=True, return_tensors="pt")
        # The first model output is the last hidden state; for a single input
        # sequence it carries a leading batch axis of size 1.
        token_vectors = model2(input_ids)[0]
        # Drop the batch axis by index instead of a bare squeeze(), which
        # would also collapse any other axis that happens to have size 1.
        doc_vector = np.mean(token_vectors[0].numpy(), axis=0)
        embeddings.append(doc_vector)
# Stack into a 2-D array with one row per document.
embeddings = np.array(embeddings)

In [7]:
# Distance between the first two document embeddings. One-row slices keep
# both operands two-dimensional, so the result is a 1x1 matrix.
euclidean_distances(embeddings[:1], embeddings[1:2])

array([[5.7212086]], dtype=float32)

In [8]:
# Render floats with two decimals, then show the full document-by-document
# distance matrix as a table.
pd.set_option("display.float_format", "{:,.2f}".format)
distance_matrix = euclidean_distances(embeddings, embeddings)
pd.DataFrame(distance_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,5.72,5.35,7.76,5.93,7.53,7.72,5.78,7.07,6.15
1,5.72,0.0,4.93,6.22,5.53,6.48,7.02,5.46,7.04,6.13
2,5.35,4.93,0.0,6.95,4.12,7.36,7.43,5.92,7.01,6.34
3,7.76,6.22,6.95,0.0,7.03,7.45,6.68,8.38,9.13,9.06
4,5.93,5.53,4.12,7.03,0.0,7.2,7.62,6.02,7.1,6.43
5,7.53,6.48,7.36,7.45,7.2,0.0,7.44,7.0,8.1,8.14
6,7.72,7.02,7.43,6.68,7.62,7.44,0.0,7.68,7.01,7.31
7,5.78,5.46,5.92,8.38,6.02,7.0,7.68,0.0,7.01,5.98
8,7.07,7.04,7.01,9.13,7.1,8.1,7.01,7.01,0.0,5.15
9,6.15,6.13,6.34,9.06,6.43,8.14,7.31,5.98,5.15,0.0
