<a href="https://colab.research.google.com/github/ta269uec/semantic-search/blob/develop/k_means_sentence_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the required packages and load the data

In [11]:
!pip install datasets transformers torch sentence-transformers --quiet

In [12]:
from google.colab import drive
# Mount Google Drive at /content/drive; the repository path and the output
# folder used by the following cells both live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Root of the semantic-search repository checkout on the mounted Drive.
LIB_LOCATION = "/content/drive/MyDrive/git-repos/semantic-search/"
# Folder for generated artifacts (pickled embeddings and clusters), directly
# under the repository root.
OUT_DATA_LOCATION = f"{LIB_LOCATION}data/"

In [4]:
# Run once: uncomment the line below to clone the repository into Drive if it is not already there.
#!git clone -b develop https://github.com/ta269uec/semantic-search.git /content/drive/MyDrive/git-repos/semantic-search/

In [14]:
import sys
# Make the repository's `src` package importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
if LIB_LOCATION not in sys.path:
  sys.path.append(LIB_LOCATION)

In [15]:
from src.sentence_encoder import SentenceEncoder
from src.sst2 import get_reviews_sst2
from src.kmeans import cluster_k_means

In [16]:
# Load the SST-2 review sentences as two collections. By the unpacking order the
# first is treated as the positive reviews and the second as the negative ones
# (presumably — confirm against src/sst2.py).
pos_sentences, neg_sentences = get_reviews_sst2()



In [7]:
# Instantiate the project's sentence encoder. On a fresh runtime this downloads
# the model files (see the progress output below).
st = SentenceEncoder()

Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [17]:
# Embed each polarity's sentences with the shared encoder: positive first,
# then negative.
pos_embeddings = st.encode(pos_sentences)
neg_embeddings = st.encode(neg_sentences)

In [26]:
import os
import pickle
# Persist the raw sentence embeddings (one pickle per polarity) under
# OUT_DATA_LOCATION. Create the folder first: on a fresh checkout or runtime it
# may not exist yet, in which case open(..., "wb") would fail.
os.makedirs(OUT_DATA_LOCATION, exist_ok=True)
file_name_pos = os.path.join(OUT_DATA_LOCATION, 'sst2_train_pos_embeddings.pkl')
file_name_neg = os.path.join(OUT_DATA_LOCATION, 'sst2_train_neg_embeddings.pkl')
with open(file_name_pos, "wb") as fd:
  pickle.dump(pos_embeddings, fd)
with open(file_name_neg, "wb") as fd:
  pickle.dump(neg_embeddings, fd)

# Build K-Means Clusters

In [18]:
def build_clusters(N, pos_sentences, pos_embeddings, neg_sentences, neg_embeddings):
  """Cluster the positive and the negative corpus separately into ``N`` groups.

  Each corpus is passed to ``cluster_k_means`` with the same cluster count.
  The embeddings are moved to the CPU (``.cpu()``) before being handed over.

  Args:
    N: Number of clusters requested for each corpus.
    pos_sentences: Sentences of the positive corpus.
    pos_embeddings: Embeddings of ``pos_sentences``; must expose ``.cpu()``.
    neg_sentences: Sentences of the negative corpus.
    neg_embeddings: Embeddings of ``neg_sentences``; must expose ``.cpu()``.

  Returns:
    A pair ``((pos_sentences, pos_embeddings), (neg_sentences, neg_embeddings))``
    holding the two values ``cluster_k_means`` returned for each corpus.
  """
  def _cluster(texts, vectors):
    # One clustering run; unpacked so a malformed result fails here, as before.
    grouped_texts, grouped_vectors = cluster_k_means(
        corpus=texts,
        corpus_embeddings=vectors.cpu(),
        num_clusters=N,
    )
    return grouped_texts, grouped_vectors

  positive = _cluster(pos_sentences, pos_embeddings)
  negative = _cluster(neg_sentences, neg_embeddings)
  return positive, negative


In [24]:
import os
import pickle
def serialize_clusters(root, ds_name, label, n_clusters, results):
  """Pickle one clustering result as two files under ``root``.

  Writes ``{ds_name}_{label}_{n_clusters}_sentences.pkl`` and
  ``{ds_name}_{label}_{n_clusters}_embeddings.pkl``.

  Args:
    root: Output directory; created if it does not exist yet.
    ds_name: Dataset tag used as the leading part of the file names.
    label: Class tag placed after the dataset tag (the notebook passes
      "pos" / "neg").
    n_clusters: Cluster count, embedded in the file names.
    results: Indexable whose element 0 is the clustered sentences and
      element 1 the clustered embeddings.

  Returns:
    None.
  """
  file_prefix = f"{ds_name}_{label}_{n_clusters}_"
  sentences, embeddings = results[0], results[1]
  if root:
    # An empty root means "current directory"; makedirs("") would raise.
    os.makedirs(root, exist_ok=True)
  # Each payload goes to its own file. The embeddings file previously received
  # the sentences by mistake, so the embeddings were never saved.
  payloads = (("sentences", sentences), ("embeddings", embeddings))
  for kind, payload in payloads:
    with open(os.path.join(root, f"{file_prefix}{kind}.pkl"), 'wb') as handle:
      pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
root = OUT_DATA_LOCATION
# `ds_name` was used below without being assigned in any visible cell, so a
# fresh Restart & Run All raised NameError here.
# NOTE(review): "sst2_train" mirrors the prefix of the embedding pickles written
# earlier in this notebook — confirm it matches the name used for existing files.
ds_name = "sst2_train"
# For each cluster count, cluster both polarities and write the results to disk.
for N in [10, 25, 50, 100, 250]:
  print(f"Processing cluster size {N}")
  res_pos, res_neg = build_clusters(N, pos_sentences, pos_embeddings, neg_sentences, neg_embeddings)
  serialize_clusters(root, ds_name, label="pos", n_clusters=N, results=res_pos)
  serialize_clusters(root, ds_name, label="neg", n_clusters=N, results=res_neg)

Processing cluster size 10




Processing cluster size 25




Processing cluster size 50




Processing cluster size 100




Processing cluster size 250




## Sanity Check 1: Are the sentences within a cluster semantically similar?

In [52]:
# `clustered_pos_embeddings` only exists as a local inside build_clusters, so it
# has to be bound at notebook scope before it can be inspected here.
# NOTE(review): this takes `res_pos` as left by the clustering loop, i.e. its
# last iteration (N=250) — confirm that this is the cluster count meant to be checked.
clustered_pos_sentences, clustered_pos_embeddings = res_pos
# Number of embeddings that ended up in each positive cluster.
clusters = [len(clustered_pos_embeddings[i]) for i in range(len(clustered_pos_embeddings))]

In [53]:
import torch
# Stack the embeddings of the first positive cluster (index 0) into a single
# tensor so they can be compared pairwise.
X = torch.stack(clustered_pos_embeddings[0])

In [54]:
from sentence_transformers import util

# Cosine similarity of every embedding in the cluster against every embedding in
# the same cluster, converted to a NumPy array.
cos_scores = util.cos_sim(X, X).numpy()

In [55]:
# Display the pairwise similarity matrix; the diagonal compares each embedding
# with itself.
cos_scores

array([[1.        , 0.56275344, 0.16633949, ..., 0.34173852, 0.33133203,
        0.33750123],
       [0.56275344, 1.0000001 , 0.25965416, ..., 0.31524807, 0.38929945,
        0.3176574 ],
       [0.16633949, 0.25965416, 0.9999997 , ..., 0.18074846, 0.2621564 ,
        0.18604575],
       ...,
       [0.34173855, 0.315248  , 0.18074849, ..., 1.0000002 , 0.377027  ,
        0.32030904],
       [0.33133203, 0.38929948, 0.26215637, ..., 0.37702698, 1.        ,
        0.29153726],
       [0.33750117, 0.31765735, 0.18604569, ..., 0.32030913, 0.29153726,
        0.99999976]], dtype=float32)