# Semantic search with sentence embedding

# Load data

In [1]:
%reset -f 
import pdb

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, Dataset

from QuranCorpus import QuranCorpus

%matplotlib inline
# Display settings: show up to 400 characters per cell and up to 500 rows per table.
pd.options.display.max_colwidth = 400
pd.options.display.max_rows = 500

In [176]:
# Build the corpus reader. is_remove_basamal=True presumably drops the opening
# basmala line of each surah -- confirm in QuranCorpus.
quran_corpus = QuranCorpus(is_remove_basamal=True)
# Read the text into the corpus object; the next cell reads documents_by_verse,
# i_surah and i_verse from it.
quran_corpus.read_in_quran()

In [177]:
# Collect the three per-verse lists once; they must have equal length to form a table.
verse_columns = {
    'text': quran_corpus.documents_by_verse,
    'surah': quran_corpus.i_surah,
    'verse': quran_corpus.i_verse,
}
# Sanity check: print the length of each column (text, surah, verse).
print(*(len(column) for column in verse_columns.values()))
df0 = pd.DataFrame(verse_columns)

6235 6235 6235


In [178]:
# Wrap the verse table in a Dataset so that .map() and the FAISS index can be used later on.
data = Dataset.from_pandas(df0)

In [179]:
# Peek at the first record to check the fields.
data[0]

{'text': 'praise be to allah, lord of the worlds.', 'surah': 1, 'verse': 2}

# Create embeddings

Instructions: https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search

### Preprocessing

In [180]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier passed to both loaders, so the tokenizer and the model match.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [181]:
import torch

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

#### CLS pooling
Pooling is the process of converting a sequence of token embeddings into a single sentence embedding.  
One way to do this is CLS pooling: take the last hidden state of the special [CLS] token.  
 - CLS token: a special [CLS] token is added at the start of every sequence. This special token is meant to capture the sequence-level information. 
 - During training, a sentence-level classification task (such as next sentence prediction) based on this CLS embedding tunes the CLS token representation via backpropagation.  
  
From [article of pooling methods](https://blog.ml6.eu/the-art-of-pooling-embeddings-c56575114cf8)


In [182]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every item in the batch.

    `model_output` must expose a `last_hidden_state` attribute that can be
    indexed as [batch, position, ...]. The result keeps the batch axis and
    drops the position axis.
    """
    hidden_states = model_output.last_hidden_state
    first_token_state = hidden_states[:, 0]
    return first_token_state

### Embedding

In [183]:
def get_embeddings(text_list):
    """Embed one string or a list of strings with the loaded model.

    The input is tokenized (padded to the longest item and truncated to the
    tokenizer's limit), moved to `device`, passed through `model`, and reduced
    to one vector per input with `cls_pooling`.

    Parameters
    ----------
    text_list : str or list of str
        Text(s) to embed.

    Returns
    -------
    torch.Tensor
        Tensor on `device` with one row per input text. It carries no autograd
        history, so callers may still call `.detach()` on it but do not need to.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call would build and keep an
    # autograd graph, wasting memory across thousands of calls.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [184]:
# Compute one doc
# Embed a single verse as a smoke test
text_input = data['text'][0]
# get_embeddings returns a tensor on `device`: drop the autograd link,
# copy it to host memory, then convert it to a numpy array
embedding = get_embeddings(text_input).detach().cpu().numpy()

print(text_input, '\n', embedding.shape)

praise be to allah, lord of the worlds. 
 (1, 768)


In [185]:
# Compute everything
def _embed_verse(row):
    """Return the embedding of one dataset row under the 'embeddings' key."""
    batch_embedding = get_embeddings(row['text']).detach().cpu().numpy()
    # A single text yields a batch of one; keep only that one vector
    return {'embeddings': batch_embedding[0]}


# Compute everything (map is called without batching, so one row per call)
embeddings_dataset = data.map(_embed_verse)

Map:   0%|          | 0/6235 [00:00<?, ? examples/s]

# Clustering by Surah

In [191]:
# Convert to a pandas DataFrame so that groupby/shift can be applied to the embeddings.
dfe = embeddings_dataset.to_pandas()

## Hierarchical clustering

In [192]:
from sklearn.cluster import AgglomerativeClustering

def get_cluster_labels(x):
    """Cluster a group of embedding vectors and return one label per vector.

    Parameters
    ----------
    x : pandas.Series
        Series whose values are equal-length 1-D embedding arrays.

    Returns
    -------
    numpy.ndarray
        Integer cluster label for each element of `x`, in the same order.
        The number of clusters is floor(log2(len(x))).

    Notes
    -----
    A group with fewer than two vectors cannot be clustered: log2(1) asks for
    zero clusters and log2(0) is not finite. Every element of such a group is
    assigned to cluster 0 instead of raising.
    """
    n_samples = len(x)
    if n_samples < 2:
        return np.zeros(n_samples, dtype=int)
    # The cluster count grows logarithmically with the size of the group
    n_clusters = int(np.log2(n_samples))
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    # Stack the per-row arrays into one (n_samples, dim) matrix and fit
    clustering.fit(np.stack(x.values))
    return clustering.labels_



In [193]:
# Cluster the verses of each surah separately. Labels restart in every surah,
# so 'topic' is only meaningful together with 'surah'.
dfe['topic'] = dfe.groupby('surah')['embeddings'].transform(get_cluster_labels)

## Use Euclidean distance to find topic transition
https://datascience.stackexchange.com/questions/73151/how-to-identify-topic-transition-in-consecutive-sentences-using-python

In [199]:
# Pair every verse with the embedding of the row directly above it
dfe['previous_emb'] = dfe['embeddings'].shift(1)
# Length of the difference vector between consecutive embeddings
embedding_step = dfe['embeddings'] - dfe['previous_emb']
dfe['dist_eu'] = embedding_step.apply(np.linalg.norm)
# Row 0 has no predecessor, so its distance is set to 0
dfe.loc[0, 'dist_eu'] = 0

In [210]:
# Distances strictly above this cut-off are flagged (1), all others are 0
dist_threshold = 6.0
dfe['is_over_threshold'] = dfe['dist_eu'].gt(dist_threshold).astype(int)

In [211]:
# Running count of flagged rows: the id increases by one at every flagged row,
# so consecutive unflagged rows share the id of the last flagged row before them.
dfe['topic_group'] = dfe['is_over_threshold'].cumsum()

In [213]:
# Inspect the last ten verses together with their assigned topic group.
dfe[['text', 'topic_group', 'surah', 'verse']].tail(10)

Unnamed: 0,text,topic_group,surah,verse
6225,from the evil of what he created.,4103,113,3
6226,and from the evil of the darkness as it gathers.,4103,113,4
6227,and from the evil of those who practice sorcery.,4103,113,5
6228,and from the evil of an envious when he envies.,4103,113,6
6229,"say, i seek refuge in the lord of mankind.",4104,114,2
6230,the king of mankind.,4105,114,3
6231,the god of mankind.,4105,114,4
6232,from the evil of the sneaky whisperer.,4106,114,5
6233,who whispers into the hearts of people.,4107,114,6
6234,from among jinn and among people.,4108,114,7


# FAISS similarity search
Using [FAISS](https://faiss.ai/) for efficient similarity search


In [33]:
# Requires the faiss package (e.g. `pip install faiss-cpu`).
# Only the column name is passed, so the index type and metric are the library defaults.
# NOTE(review): the checkpoint name contains "dot" -- confirm the default metric is the intended one.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/7 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'surah', 'verse', 'embeddings'],
    num_rows: 6348
})

In [85]:
question = 'What is the story of yusuf'
# Embed the question the same way as the verses and move the result to host memory
question_embedding = get_embeddings([question]).cpu().detach().numpy()
# Retrieve the 5 verses closest to the question in embedding space
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The index is built without a metric argument, i.e. with the default L2 metric,
# so the scores are distances and smaller means closer. Sort ascending so the
# best match is listed first.
samples_df = samples_df.sort_values("scores", ascending=True)

In [86]:
# Newline kept in a variable so it can be used inside f-string expressions
backslash_char = '\n'
for _, row in samples_df.iterrows():
    # Context window: from the verse before the hit up to two verses after it, in the same surah
    context_verses = list(range(max(0, row.verse - 1), row.verse + 3))
    in_context = (df0['surah'] == row.surah) & (df0['verse'].isin(context_verses))
    context_text = df0[in_context]['text'].str.cat(sep=backslash_char)

    print(f"Text: {row.text}")
    print(f"Context: {backslash_char}{context_text}")
    print(f"SCORE: {row.scores}")
    print(f"Surah:Verse: {row.surah}:{row.verse}")
    print("=" * 50)
    print()

Text: i am to you a faithful messenger.
Context: 
their brother noah said to them, do you not fear?
i am to you a faithful messenger.
so fear allah, and obey me.
i ask of you no payment for this. my payment is only from the lord of the worlds.
SCORE: 35.835548400878906
Surah:Verse: 26:108

Text: i am to you a faithful messenger.
Context: 
when their brother hud said to them, do you not fear?
i am to you a faithful messenger.
so fear allah, and obey me.
i ask of you no payment for this. my payment is only from the lord of the worlds.
SCORE: 35.835548400878906
Surah:Verse: 26:126

Text: i am to you a faithful messenger.
Context: 
when their brother saleh said to them, do you not fear?
i am to you a faithful messenger.
so fear allah, and obey me.
i ask of you no payment for it. my payment is only from the lord of the worlds.
SCORE: 35.835548400878906
Surah:Verse: 26:144

Text: i am to you a faithful messenger.
Context: 
when their brother lot said to them, do you not fear?
i am to you a f

# Next: Get embedding of groups of sentences 
Concat N=4 sentences and get embedding accordingly