In [1]:
# Sentence-Transformers model identifier: it is passed to SentenceTransformer()
# and also names the embeddings cache file (./Embeddings/<model_name>.pkl).
model_name = 'all-mpnet-base-v2'

## Importing model

In [2]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Instantiate the sentence encoder selected by `model_name`.
# NOTE(review): presumably this fetches the weights on first use - confirm
# what happens when the machine is offline.
model = SentenceTransformer(model_name)



## Loading and Cleaning Ayahs

In [4]:
import pickle
# Load the ayahs from the pickle file in the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only use an ayahs.pkl that comes from a trusted source.
with open("ayahs.pkl", "rb") as f:
    ayahs = pickle.load(f)

In [5]:
import re
import numpy as np
import string

def clean_text(text):
    """Normalise a single text before it is embedded.

    The text is lower-cased, every hyphen is turned into a space, and then
    each character that is neither a word character nor whitespace is
    dropped. That removes all punctuation (colons and semicolons included);
    underscores survive because the regex counts them as word characters.
    """
    dehyphenated = re.sub(r'-', ' ', text.lower())
    return re.sub(r'[^\w\s]', '', dehyphenated)

# Wrap clean_text so it can be applied to every string in the ayahs array.
vfunc = np.vectorize(clean_text)
cleaned_ayahs = vfunc(ayahs)
# Display the first cleaned ayah as a quick sanity check.
cleaned_ayahs[0]

'in the name of allah most benevolent ever merciful'

## Creating Embeddings

In [6]:
# If cached embeddings exist for this model, load them; otherwise generate
# them from the cleaned ayahs and write the cache for the next run.
# NOTE(review): the cache is keyed only by model name, so it is not refreshed
# when ayahs.pkl or clean_text changes - delete the file to force a rebuild.
import os

embeddings_path = f'./Embeddings/{model_name}.pkl'

if os.path.isfile(embeddings_path):
    # NOTE(review): unpickling can run arbitrary code; only load cache files
    # produced by this notebook or another trusted source.
    with open(embeddings_path, 'rb') as cache_file:
        embeddings = pickle.load(cache_file)
else:
    embeddings = model.encode(cleaned_ayahs)
    # Create the cache directory first: without it a fresh checkout would
    # spend the time encoding and then fail with FileNotFoundError on dump.
    os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
    # The with-block guarantees the file is flushed and closed.
    with open(embeddings_path, 'wb') as cache_file:
        pickle.dump(embeddings, cache_file)

## Creating Indexes

In [7]:
import faiss

In [8]:
# Width of one embedding vector (size of the second axis of `embeddings`).
embedding_dim = embeddings.shape[1]
# print(embedding_dim)
# Build a flat L2-distance index of that width and add every ayah embedding.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
# Display how many vectors the index now holds.
index.ntotal

6236

## Query Optimizing

In [9]:
import nltk
# Fetch the WordNet corpus that expand_query reads; when the package is
# already present nltk only reports that it is up to date.
nltk.download('wordnet')
from nltk.corpus import wordnet
from itertools import chain

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zubair\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
from nltk.stem import PorterStemmer

def stem_query(query):
    """Return `query` with each whitespace-separated token Porter-stemmed.

    The stemmed tokens are re-joined with single spaces, so any run of
    whitespace in the input collapses to one space.
    """
    porter = PorterStemmer()
    return ' '.join(porter.stem(token) for token in query.split())

In [11]:
def expand_query(query):
    """Expand each word of `query` with one WordNet synonym.

    For every whitespace-separated word, the lemma names of the first synset
    WordNet returns are scanned in order, and the first one that differs from
    the word (ignoring case, with underscores shown as spaces) is emitted in
    parentheses before the word, e.g. ``"(synonym) word"``. A word without a
    distinct synonym is emitted unchanged.

    The earlier implementation picked the "synonym" from a set, so the result
    depended on string hash ordering (different between kernel sessions) and
    was frequently the word itself, producing ``"(word) word"``.

    Args:
        query: Free-text query string.

    Returns:
        The expanded query as a single space-joined string.
    """
    expanded_words = []
    for word in query.split():
        synsets = wordnet.synsets(word)
        # Only the first synset is consulted, matching the original scope.
        candidates = synsets[0].lemma_names() if synsets else []
        readable = (name.replace('_', ' ') for name in candidates)
        # Deterministic choice: first lemma, in WordNet order, that is not
        # simply the query word again.
        synonym = next(
            (name for name in readable if name.lower() != word.lower()),
            None,
        )
        if synonym is None:
            expanded_words.append(word)
        else:
            expanded_words.append(f'({synonym}) {word}')
    return ' '.join(expanded_words)

## Implementing Search

In [12]:
def search(query, model, index, ayahs, k=10):
    """Return the ayahs whose embeddings are closest to `query`.

    Args:
        query: Query text to embed.
        model: Encoder exposing ``encode(list_of_texts)``; it must produce
            vectors compatible with `index`.
        index: Index exposing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair with one row per query.
        ayahs: Indexable collection of verses, aligned with the positions
            stored in `index`.
        k: Maximum number of neighbours to retrieve.

    Returns:
        A pair ``(similar_verses, distances_similar_verses)``:
        ``similar_verses`` is a list of ``(ayah, idx)`` tuples and
        ``distances_similar_verses`` is a list of ``(distance, ayah, idx)``
        tuples, both in the order returned by the index. Either list can be
        shorter than `k` when the index holds fewer than `k` vectors.
    """
    # Convert the query to an embedding (a batch of one).
    query_embedding = model.encode([query])

    # Search the index for the top k most similar embeddings.
    distances, indices = index.search(query_embedding, k)

    # FAISS pads missing neighbours with the label -1. Indexing ayahs[-1]
    # would silently return the last ayah as a bogus hit, so drop those slots.
    hits = [(dist, idx) for dist, idx in zip(distances[0], indices[0]) if idx >= 0]

    # Retrieve the corresponding verses and their indices.
    similar_verses = [(ayahs[idx], idx) for _, idx in hits]
    distances_similar_verses = [(dist, ayahs[idx], idx) for dist, idx in hits]
    return similar_verses, distances_similar_verses

## Example Searches

In [14]:
# Sample queries: mostly short topical questions; query7 is a full-length
# passage instead of a short question.
query1 = "give me ayahs related to importance of salah"
query2 = "god has purchased the lives of believers for jannah"
query3 = "send peace on prophet and you will get"
query4 = "description of jannah in quran"
query5 = "can you tell me the verse related to giving zakat to which people is obligatory"
query6 = "fasting is compulsory for muslims"
query7 = "Fast a (fixed) number of days, but if someone is ill or is travelling (he should complete) the number of days (he had missed); and those who find it hard to fast should expiate by feeding a poor person. For the good they do with a little hardship is better for men. And if you fast it is good for you, if you knew."
query8 = "give zakat to which people is obligatory"
# Optional query rewriting, currently disabled (the raw query is searched):
# exp_query = expand_query(query2)
# st_query = stem_query(query2)
# print("Query: ", query2)
# print("Expanded Query: ", exp_query)
# print("Stemmed Query: ", st_query)
# Retrieve the 20 nearest ayahs for query2. Despite its name, `distances`
# holds (distance, ayah, index) tuples aligned with `results`.
results, distances = search(query=query2,model=model,index=index,ayahs=ayahs,k=20)
# Show ayahs, their indexes and their distances
for i, (ayah, idx) in enumerate(results):
    # distances[i] belongs to results[i]; element 0 of that tuple is the distance.
    print(f"Result {i+1}:\nAyah: {ayah}\nIndex: {idx}\nDistance: {distances[i][0]}\n")

Result 1:
Ayah: God has verily bought the souls and possessions of the faithful in exchange for a promise of Paradise. They fight in the cause of God, and kill and are killed. This is a promise incumbent on Him, as in the Torah, so the Gospel and the Qur'an. And who is more true to his promise than God? So rejoice at the bargain you have made with Him; for this will be triumph supreme.
Index: 1345
Distance: 0.8081139326095581

Result 2:
Ayah: Those who accepted the faith and left their homes and fought in the way of God, wealth and soul, have a greater reward with God, and will be successful.
Index: 1254
Distance: 0.9154306650161743

Result 3:
Ayah: Those among the migrants (from Makkah) and helpers (in Madina) who were the first to believe, and those who followed them in goodness, have been accepted by God and they follow His way. For them He has gardens with streams of running water where they will abide for ever; and that is happiness supreme.
Index: 1334
Distance: 0.939133942127227