In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the clustered grocery reviews; later cells read its `labels` and `review` columns.
df = pd.read_parquet(path='samples/grocery_clustered_embeddings.parquet')

In [8]:
# Review texts of every row whose cluster label is 'coffee' (a Series of strings).
coffee = df.loc[df['labels'] == 'coffee', 'review']

In [13]:
# Build the keyword candidate vocabulary from the coffee reviews:
# English stop words are dropped and a term must occur in at least 5 reviews.
count = CountVectorizer(stop_words='english', min_df=5)
count.fit(coffee)
candidates = count.get_feature_names_out()

CPU times: total: 219 ms
Wall time: 208 ms


In [14]:
# Load the pretrained sentence-embedding model used for both reviews and candidate terms
# (the files are downloaded on first use, as the progress output below shows).
model = SentenceTransformer(model_name_or_path='distilbert-base-nli-mean-tokens')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [15]:
# Persist the loaded model locally under 'model.pt'.
# NOTE(review): despite the '.pt' suffix this is presumably written as a directory,
# not a single checkpoint file -- confirm against the sentence-transformers docs.
model.save(
    'model.pt',
    model_name='distilbert-base-nli-mean-tokens',
)

In [17]:
%time doc_embedding = model.encode(coffee.tolist())

CPU times: total: 25min 38s
Wall time: 6min 36s


In [19]:
# Sanity check: one row per coffee review, one column per embedding dimension.
doc_embedding.shape, doc_embedding

((7731, 768),
 array([[ 0.09277777, -0.19423124,  0.91040546, ...,  0.09022616,
          0.3006782 , -0.89581615],
        [-0.29280344,  0.12678905,  0.28470254, ...,  0.8807069 ,
          0.13547674, -0.7031219 ],
        [-0.26900193,  0.11505219,  0.28977126, ...,  0.4623891 ,
         -0.1142708 , -0.43770787],
        ...,
        [-0.67804015, -0.60036194,  0.45906696, ...,  0.22135499,
         -0.07449716, -0.12734628],
        [-0.29627576, -0.18420197,  0.5372846 , ...,  0.44228598,
         -0.44503224, -0.20379664],
        [-0.26966068, -0.5385037 ,  0.4138214 , ...,  0.1294331 ,
         -0.5458137 ,  0.00194951]], dtype=float32))

In [32]:
# Average over the review axis to get a single vector representing the whole coffee cluster.
mean_doc_embedding = doc_embedding.mean(0)

In [26]:
%time candidate_embeddings = model.encode(candidates)

CPU times: total: 33.4 s
Wall time: 8.51 s


In [28]:
# Sanity check: one row per candidate term, same embedding width as the review vectors.
candidate_embeddings.shape, candidate_embeddings

((2541, 768),
 array([[-0.3524073 , -0.11575425,  0.38016495, ..., -0.6573786 ,
         -0.40317225, -0.6600991 ],
        [-1.1718513 , -0.35939643,  0.63253826, ..., -0.38831976,
          0.09106359, -0.98302585],
        [-0.5647676 , -0.05328847,  0.52299094, ..., -0.59879243,
         -0.38728356, -0.5681657 ],
        ...,
        [-0.50929326, -0.3594223 ,  0.2125237 , ...,  0.10140759,
         -0.34987637, -0.21217924],
        [-0.7713096 , -0.07744987,  0.24580972, ..., -0.05702488,
         -0.26014346, -0.47629043],
        [-0.5781346 , -0.14670385,  0.03196179, ..., -0.510243  ,
         -0.07463825, -0.8674555 ]], dtype=float32))

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import util

In [39]:
# Confirm the cluster vector is a single row whose width matches the candidate matrix
# before computing cosine similarity between them.
mean_doc_embedding.reshape(1,-1).shape, candidate_embeddings.shape

((1, 768), (2541, 768))

In [42]:
# Pick the candidate terms closest to the cluster's mean embedding.
top_n = 5

# Despite the name, `distances` holds cosine *similarities* (higher means closer),
# laid out as one row with one column per candidate term.
distances = util.cos_sim(
    mean_doc_embedding.reshape(1, -1),
    candidate_embeddings,
)

# Ascending sort: the last `top_n` positions are the most similar terms,
# so the resulting list runs from weakest to strongest match.
keywords = [
    candidates[index]
    for index in distances[0].argsort()[-top_n:]
]

In [43]:
# Top terms for the coffee cluster, ordered from least to most similar (best match last).
keywords

['coffee', 'flavorable', 'coffeemaker', 'flavorfull', 'starbucks']

In [50]:
# Every distinct cluster label, in order of first appearance; indexed by position below.
L = df['labels'].unique().tolist()

In [72]:
%%time 
i = 11
x = df[df.labels==L[i]].review
mean_doc_embedding = model.encode(x.sample(200).tolist()).mean(axis=0).reshape(1,-1)
distances = util.cos_sim(mean_doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords, L[i]

CPU times: total: 33 s
Wall time: 8.61 s


(['tasty', 'chocolatey', 'flavorable', 'flavorful', 'flavorfull'], 'powder')

In [73]:
# Peek at the reviews of the cluster selected in the previous cell.
x

18       Five Stars Great product and fast shipping. Re...
25       High quality I am in love with this item. I lo...
73       Great flavor, but it clumps terribly Great fla...
84       Delicious and full of fiber Tasty raw with a n...
99                                  great yum, great stuff
                               ...                        
99652    Very expensive This is a very good product you...
99655    Great cocoa for smoothies! Tasty!  Have only u...
99686    Quality product, great price This stuff is exc...
99803                       What a deal:))) Supper happy!!
99957    Simply the best - for as litle as most of us u...
Name: review, Length: 806, dtype: object