In [1]:
import mysql.connector

Connecting to the database

In [2]:
import db_credentials as creds
# Open the MySQL session; every connection setting is read from the
# db_credentials module (imported as creds) rather than written inline.
mydb = mysql.connector.connect(
    database=creds.database,
    host=creds.host,
    port=creds.port,
    user=creds.user,
    password=creds.password,
)

In [3]:
# Cursor through which every later SQL statement in this notebook is executed
mycursor = mydb.cursor()

In [4]:
# Smoke test: list the tables to confirm the connection works
mycursor.execute("SHOW TABLES")

In [5]:
# Iterating the cursor yields the result rows of the last executed statement,
# one tuple per row.
for table_row in mycursor:
    print(table_row)

('ayah_edition',)
('ayahs',)
('unique_eng_words',)


## Term document matrix

In [6]:
# Zero-initialised term-document matrix with 6236 rows and 5480 columns.
# Rows are later indexed by ayah position and columns by vocabulary position.
import numpy as np
term_matrix = np.zeros(shape=(6236, 5480), dtype=np.float64)
term_matrix.shape

(6236, 5480)

In [7]:
# Select every column of every row in ayah_edition; the rows are fetched in the next cell
mycursor.execute("SELECT * FROM ayah_edition")

In [8]:
# Pull the pending result set and wrap it in a 2-D numpy array
fetched_rows = mycursor.fetchall()
ayahs_records = np.array(fetched_rows)
# Peek at column 3 of the first record, which holds the ayah text
ayahs_records[0, 3]

'In the name of Allah, most benevolent, ever-merciful.'

## Getting just ayah strings from records

In [9]:
# Keep only column 3 of every record: the translated ayah text
ayahs = ayahs_records[:,3]
ayahs

array(['In the name of Allah, most benevolent, ever-merciful.',
       'ALL PRAISE BE to Allah, Lord of all the worlds,',
       'Most beneficent, ever-merciful,', ...,
       'From the evil of him who breathes temptations into the minds of men,',
       'Who suggests evil thoughts to the hearts of men --',
       'From among the jinns and men.'], dtype=object)

In [10]:
# Persist the ayah strings to ayahs.pkl so they can be reused without the database
import pickle
with open("ayahs.pkl", "wb") as pickle_out:
    pickle.dump(ayahs, pickle_out)

# Read the same file straight back into `ayahs`
with open("ayahs.pkl", "rb") as pickle_in:
    ayahs = pickle.load(pickle_in)

## Removing symbols from ayah strings

In [11]:
# Lower-case every ayah and use a regex to strip punctuation (commas, periods, quotation marks, apostrophes, ...), applying the cleaner across the whole numpy array
import re
import numpy as np
import string

def clean_text(text):
    """Normalise a string for word matching.

    The text is lower-cased, every hyphen becomes a space (so hyphenated
    compounds split into separate words), and any remaining character that
    is neither a word character nor whitespace is removed.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The normalised text. Whitespace is not collapsed, so a replaced
        hyphen can leave consecutive spaces behind.
    """
    lowered = text.lower()
    # Hyphens turn into spaces first; otherwise the punctuation pass below
    # would delete them and glue the two halves of a compound together.
    dehyphenated = re.sub(r'-', ' ', lowered)
    # Drop all punctuation (commas, periods, colons, semicolons, quotes, ...).
    return re.sub(r'[^\w\s]', '', dehyphenated)

# np.vectorize wraps clean_text so it can be called on the whole array of ayahs at once
vfunc = np.vectorize(clean_text)
cleaned_ayahs = vfunc(ayahs)
# Show the first cleaned ayah as a sanity check
cleaned_ayahs[0]

'in the name of allah most benevolent ever merciful'

## Getting separate words

In [12]:
# Split every cleaned ayah on whitespace, giving one list of words per ayah
words_array = np.char.split(cleaned_ayahs.astype(str))
words_array

array([list(['in', 'the', 'name', 'of', 'allah', 'most', 'benevolent', 'ever', 'merciful']),
       list(['all', 'praise', 'be', 'to', 'allah', 'lord', 'of', 'all', 'the', 'worlds']),
       list(['most', 'beneficent', 'ever', 'merciful']), ...,
       list(['from', 'the', 'evil', 'of', 'him', 'who', 'breathes', 'temptations', 'into', 'the', 'minds', 'of', 'men']),
       list(['who', 'suggests', 'evil', 'thoughts', 'to', 'the', 'hearts', 'of', 'men']),
       list(['from', 'among', 'the', 'jinns', 'and', 'men'])],
      dtype=object)

## Getting unique words from the database

In [13]:
# Select the whole vocabulary table; the rows are fetched in the next cell
mycursor.execute("SELECT * FROM unique_eng_words")

In [14]:
# Each fetched row is (id, word); the word is in column 1.
# In the array shown below both fields are stored as strings (dtype '<U15').
unique_words_arr = np.array(mycursor.fetchall())
unique_words_arr

array([['1', 'a'],
       ['2', 'aaron'],
       ['3', 'abandon'],
       ...,
       ['5478', 'zakat'],
       ['5479', 'zaqqum'],
       ['5480', 'zodiac']], dtype='<U15')

In [15]:
# Persist the vocabulary array to unique_words.pkl
with open('unique_words.pkl', 'wb') as pickle_out:
    pickle.dump(unique_words_arr, pickle_out)

# Read the same file straight back into `unique_words_arr`
with open('unique_words.pkl', 'rb') as pickle_in:
    unique_words_arr = pickle.load(pickle_in)

# Filling term document matrix
Remember that ids in the database start at 1, whereas array indices in the code start at 0. For example, the word 'the' has id 4874 in the `unique_eng_words` table, but its index in the code is 4873.

In [16]:
# Build a lookup from each vocabulary word to the column(s) it occupies, once.
# Previously every word of every ayah scanned the whole vocabulary twice
# (the `in` membership test and then np.where); now it is one dictionary probe.
word_to_columns = {}
for column, vocab_word in enumerate(unique_words_arr[:, 1].tolist()):
    # A list of columns preserves the earlier behaviour of marking every
    # matching column, should the vocabulary ever contain a repeated word.
    word_to_columns.setdefault(vocab_word, []).append(column)

# Row = position of the ayah in words_array, column = position of the word in
# the vocabulary. The matrix is binary: 1 means "word occurs in this ayah",
# and repeated occurrences are not counted.
for idx, ayah in enumerate(words_array):
    for word in ayah:
        columns = word_to_columns.get(word)
        if columns is not None:  # words missing from the vocabulary are skipped
            term_matrix[idx, columns] = 1

term_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Spot check: row 0 is the first ayah and column 4873 is the word 'the'
# (database id 4874, shifted by one because array indices start at 0).
term_matrix[0, 4873]

1.0

## Storing the term document matrix in pickle file

In [18]:
import os

number_of_chunks = 20

# open() does not create missing folders, so make sure the target directory
# exists; otherwise the first write fails with FileNotFoundError on a fresh checkout.
os.makedirs('term_matrix', exist_ok=True)

# Split the matrix row-wise into number_of_chunks roughly equal parts and
# pickle each part to its own file.
chunks = np.array_split(term_matrix, number_of_chunks)

for i, chunk in enumerate(chunks):
    with open(f'term_matrix/term_matrix_part_{i}.pkl', 'wb') as f:
        pickle.dump(chunk, f)

# loading matrix back
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
loaded_chunks = []
for i in range(number_of_chunks):
    with open(f'term_matrix/term_matrix_part_{i}.pkl', 'rb') as f:
        loaded_chunks.append(pickle.load(f))

# Concatenate all chunks back into one array, in the order they were written
term_matrix = np.concatenate(loaded_chunks, axis=0)

In [19]:
# load term matrix from pickle file
# with open('term_matrix.pickle', 'rb') as f:
#     term_matrix = pickle.load(f)
# term_matrix

## Getting the Query

In [20]:
# Alternative inputs kept for experimentation:
# query = input("Your Question: ")
# query = "a the most gracious and the most merciful"
# Normalise the question with the same cleaner used for the ayahs, then split it into words
query = clean_text("jannah is for those who believe in allah")
query_words = query.split()
query_words

['jannah', 'is', 'for', 'those', 'who', 'believe', 'in', 'allah']

## Filling the query vector

In [21]:
# Binary column vector over the vocabulary: entry i is 1 when vocabulary
# word i occurs in the query, 0 otherwise.
vocabulary = unique_words_arr[:, 1]
query_vector = np.zeros((5480, 1))
for term in query_words:
    matching_rows = np.where(vocabulary == term)[0]
    # Query words that are not in the vocabulary produce no match and are skipped
    if matching_rows.size:
        query_vector[matching_rows] = 1

query_vector

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

## Multiplying the query vector with the term document matrix

In [22]:
# Matrix-vector product of the binary term matrix and the binary query vector:
# each entry is the number of distinct query words found in that ayah.
result = term_matrix @ query_vector
result

array([[2.],
       [1.],
       [0.],
       ...,
       [1.],
       [1.],
       [0.]])

## Getting the top 10 ayahs

In [23]:
# argsort orders the row indices by ascending score, so reverse the order
# and keep the first ten to get the best-scoring ayahs.
ascending_order = np.argsort(result, axis=0)
top_ayahs = ascending_order[::-1][:10]
top_ayahs

array([[ 224],
       [5095],
       [ 887],
       [1367],
       [1706],
       [3383],
       [2852],
       [1099],
       [1878],
       [ 223]], dtype=int64)

## Getting the ayahs from the database

In [24]:
# Each entry of top_ayahs is a one-element index array, so indexing `ayahs`
# with it returns a one-element array; [0] extracts the ayah text itself.
for ayah_index in top_ayahs:
    print(ayah_index, ayahs[ayah_index][0])

[224] Surely those who believe, and those who leave their homes and fight in the way of God, may hope for His benevolence, for God is forgiving and kind.
[5095] Hasten for the forgiveness of your Lord and Paradise whose expanse is as wide as that of the heavens and the earth, which has been prepared for those who believe in God and His apostles. This is the bounty of God which He bestows on whosoever He please; and the bounty of God is infinite.
[887] It is He who sends down water from the skies, and brings out of it everything that grows, the green foliage, the grain lying close, the date palm trees with clusters of dates, and the gardens of grapes, and of olives and pomegranates, so similar yet so unlike. Look at the fruits, how they appear on the trees, and they ripen. In all these are signs for those who believe.
[1367] To Him will you all return: God's promise is true. It is He who originates creation, then will revert it, so that He may reward those who believe and do good things

In [25]:
# Fetch one row of the ayahs table by id. The id is passed as a bound
# parameter (%s placeholder plus a tuple) instead of being formatted into the
# SQL string, so the query stays safe when the id comes from a variable.
# Database ids start at 1, whereas array positions in this notebook start at 0.
ayah_id = 1
mycursor.execute("SELECT * FROM ayahs WHERE id = %s", (ayah_id,))
ayat = mycursor.fetchall()
ayat

[(1,
  1,
  '\ufeffبِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
  1,
  1,
  1,
  1,
  1,
  0,
  datetime.datetime(2018, 6, 7, 8, 6, 54),
  datetime.datetime(2018, 6, 7, 8, 6, 54))]

In [26]:
# Repackage the first fetched row as a dictionary. Columns 2, 3 and 5 of the
# row feed the text, verse_id and surah_id entries respectively.
row = ayat[0]
verse_id, surah_id = row[3], row[5]
ayat_obj = {
    "text": row[2],
    "verse_id": verse_id,
    'surah_id': surah_id,
    "ayat_number": f"{surah_id}:{verse_id}",
}
ayat_obj

{'text': '\ufeffبِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
 'verse_id': 1,
 'surah_id': 1,
 'ayat_number': '1:1'}