In [7]:
# Name of the sentence-transformers checkpoint to load; the same string is
# reused further down as the file name of the cached embeddings pickle.
model_name = 'all-mpnet-base-v2'

## Importing model

In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Instantiate the encoder from the checkpoint named by model_name.
model = SentenceTransformer(model_name)



## Loading and Cleaning Ayahs

In [4]:
import pickle
# Deserialize the ayahs collection from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("ayahs.pkl", mode="rb") as ayah_file:
    ayahs = pickle.load(ayah_file)

In [5]:
import re
import numpy as np
import string

def clean_text(text):
    """Normalise a string for embedding.

    The text is lowercased, every hyphen is replaced by a space, and any
    character that is neither a word character nor whitespace is removed.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Hyphens become spaces first so hyphenated words are not glued together
    # when punctuation is stripped in the next step.
    dehyphenated = lowered.replace('-', ' ')
    # Drop all remaining punctuation (anything outside \w and \s).
    return re.sub(r'[^\w\s]', '', dehyphenated)

# Wrap clean_text with np.vectorize so it is applied to every element of ayahs
# (presumably an array/sequence of strings — confirm against ayahs.pkl).
vfunc = np.vectorize(clean_text)
cleaned_ayahs = vfunc(ayahs)
# Display the first cleaned entry as a quick sanity check.
cleaned_ayahs[0]

'in the name of allah most benevolent ever merciful'

## Creating Embeddings

In [12]:
# Load cached embeddings if they exist; otherwise generate them and cache the
# result so later runs can skip the encoding step.
import os

embeddings_path = f'./Embeddings/{model_name}.pkl'
if os.path.isfile(embeddings_path):
    # NOTE: pickle.load can execute arbitrary code; only use cache files that
    # were written by this notebook.
    with open(embeddings_path, 'rb') as cache_file:
        embeddings = pickle.load(cache_file)
else:
    embeddings = model.encode(cleaned_ayahs)
    # Create the cache directory on first run so writing the pickle cannot
    # fail with FileNotFoundError.
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    # Context manager guarantees the file is flushed and closed.
    with open(embeddings_path, 'wb') as cache_file:
        pickle.dump(embeddings, cache_file)

## Creating Indexes

In [13]:
import faiss

In [18]:
# Width of one embedding vector (second axis of the embeddings matrix).
embedding_dim = embeddings.shape[1]
# Flat L2 index sized to the embedding width.
index = faiss.IndexFlatL2(embedding_dim)
# Add every embedding row; search() later uses the returned ids to index into
# ayahs, so row order is assumed to match ayah order.
index.add(embeddings)
# Display how many vectors the index now holds.
index.ntotal

6236

## Implementing Search

In [19]:
def search(query, model, index, ayahs, k=10):
    """Look up the verses whose embeddings the index ranks closest to a query.

    Args:
        query: Text to search for.
        model: Object exposing ``encode(list_of_texts)`` that returns one
            embedding row per input text.
        index: Object exposing ``search(embeddings, k)`` that returns
            ``(distances, indices)``, each holding one row per query.
        ayahs: Sequence of verses, addressed by the ids stored in ``index``.
        k: Maximum number of results to request from the index.

    Returns:
        A tuple ``(similar_verses, distances_similar_verses)``:
        ``similar_verses`` is a list of ``(ayah, idx)`` pairs and
        ``distances_similar_verses`` is a list of ``(distance, ayah, idx)``
        triples, both in the order returned by the index. Either list may be
        shorter than ``k`` if the index had fewer than ``k`` hits.
    """
    # Encode the query as a batch of one so the index receives a 2-D input.
    query_embedding = model.encode([query])

    # Ask the index for the top k candidates for that single query row.
    distances, indices = index.search(query_embedding, k)

    # FAISS pads the result with id -1 when fewer than k vectors are
    # available. Skip those slots: ayahs[-1] would otherwise silently return
    # the last verse as a bogus hit.
    hits = [
        (dist, idx)
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

    similar_verses = [(ayahs[idx], idx) for _, idx in hits]
    distances_similar_verses = [(dist, ayahs[idx], idx) for dist, idx in hits]
    return similar_verses, distances_similar_verses

## Example Searches

In [29]:
# Sample queries; pass a different one to search() below to try it out.
query1 = "give me ayahs related to importance of salah"
query2 = "god has purchased the lives of believers for jannah"
query3 = "send peace on prophet and you will get"
query4 = "description of jannah in quran"
query5 = "can you tell me the verse related to giving zakat to which people is obligatory"
query6 = "fasting is compulsory for muslims"
query7 = "Fast a (fixed) number of days, but if someone is ill or is travelling (he should complete) the number of days (he had missed); and those who find it hard to fast should expiate by feeding a poor person. For the good they do with a little hardship is better for men. And if you fast it is good for you, if you knew."

# NOTE(review): the index was built from cleaned_ayahs, while the query is
# encoded as typed; consider passing clean_text(query) so both sides share the
# same preprocessing.
results, distances = search(
    query=query6,
    model=model,
    index=index,
    ayahs=ayahs,
    k=20,
)

# `distances` holds (distance, ayah, idx) triples in the same order as
# `results`, so each hit can be printed straight from it.
for i, (dist, ayah, idx) in enumerate(distances):
    print(f"Result {i+1}:\nAyah: {ayah}\nIndex: {idx}\nDistance: {dist}\n")

Result 1:
Ayah: Fast a (fixed) number of days, but if someone is ill or is travelling (he should complete) the number of days (he had missed); and those who find it hard to fast should expiate by feeding a poor person. For the good they do with a little hardship is better for men. And if you fast it is good for you, if you knew.
Index: 190
Distance: 0.2231331318616867

Result 2:
Ayah: O believers, fasting is enjoined on you as it was on those before you, so that you might become righteous.
Index: 189
Distance: 0.7952790856361389

Result 3:
Ayah: Ramadan is the month in which the Qur'an was revealed as guidance to man and clear proof of the guidance, and criterion (of falsehood and truth). So when you see the new moon you should fast the whole month; but a person who is ill or travelling (and fails to do so) should fast on other days, as God wishes ease and not hardship for you, so that you complete the (fixed) number (of fasts), and give glory to God for the guidance, and be grateful.
