## Description

##### This notebook processes `Quran` and `Quran-Tafsir` data from different sources to build a complete Quran data file. 
##### It prepares the data, merges the sources together, and applies some preprocessing to the final data so that it is ready to use in the `Qurani` application. 

## Libraries Used

In [1]:
import pandas as pd

## Data Preprocessing

In [2]:
# Load the two raw CSV sources (Quran dataset and tafsir file)
quran_csv = 'data/The Quran Dataset.csv'
tafsir_csv = 'data/mokhtaser_tafseer.csv'
quraan_data = pd.read_csv(quran_csv)
tafsir_data = pd.read_csv(tafsir_csv)

In [3]:
# Preview the first three rows of the raw Quran dataset
quraan_data.head(3)

Unnamed: 0,surah_no,surah_name_en,surah_name_ar,surah_name_roman,ayah_no_surah,ayah_no_quran,ayah_ar,ayah_en,ruko_no,juz_no,...,hizb_quarter,total_ayah_surah,total_ayah_quran,place_of_revelation,sajah_ayah,sajdah_no,no_of_word_ayah,list_of_words,Ayah_without_tashkil,Unnamed: 20
0,1,The Opener,الفاتحة,Al-Fatihah,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"In the Name of Allah—the Most Compassionate, M...",1,1,...,1,7,6236,Meccan,False,,4,"[بِسْمِ,ٱللَّهِ,ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]",بسم الله الرحمن الرحيم,
1,1,The Opener,الفاتحة,Al-Fatihah,2,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,"All praise is for Allah—Lord of all worlds,",1,1,...,1,7,6236,Meccan,False,,4,"[ٱلْحَمْدُ,لِلَّهِ,رَبِّ,ٱلْعَٰلَمِينَ]",الحمد لله رب العالمين,
2,1,The Opener,الفاتحة,Al-Fatihah,3,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"the Most Compassionate, Most Merciful,",1,1,...,1,7,6236,Meccan,False,,2,"[ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]",الرحمن الرحيم,


In [4]:
# Preview the first three rows of the raw tafsir dataset
tafsir_data.head(3)

Unnamed: 0.1,Unnamed: 0,id,sura,aya,arabic_text,translation,footnotes
0,0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,باسم الله أبدأ قراءة القرآن، مستعينًا به تعالى...,
1,1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,الثناء الكامل، وجميع أنواع المحامد من صفات الج...,
2,2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,ثناء على الله تعالى بعد حمده في الآية السابقة.,


In [5]:
# Give the tafsir columns more readable names
tafsir_column_names = {
    'id': 'ayah_no_quran',
    'sura': 'surah_no',
    'aya': 'ayah_no',
    'arabic_text': 'ayah_ar',
    'translation': 'tafsir',
}
tafsir_data = tafsir_data.rename(columns=tafsir_column_names)
tafsir_data.head(3)

Unnamed: 0.1,Unnamed: 0,ayah_no_quran,surah_no,ayah_no,ayah_ar,tafsir,footnotes
0,0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,باسم الله أبدأ قراءة القرآن، مستعينًا به تعالى...,
1,1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,الثناء الكامل، وجميع أنواع المحامد من صفات الج...,
2,2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,ثناء على الله تعالى بعد حمده في الآية السابقة.,


In [6]:
# Give the Quran columns more readable names
quran_column_names = {
    'hizb_quarter': 'quarter_no',
    'sajah_ayah': 'has_sajdah',
}
quraan_data = quraan_data.rename(columns=quran_column_names)
quraan_data.head(3)

Unnamed: 0,surah_no,surah_name_en,surah_name_ar,surah_name_roman,ayah_no_surah,ayah_no_quran,ayah_ar,ayah_en,ruko_no,juz_no,...,quarter_no,total_ayah_surah,total_ayah_quran,place_of_revelation,has_sajdah,sajdah_no,no_of_word_ayah,list_of_words,Ayah_without_tashkil,Unnamed: 20
0,1,The Opener,الفاتحة,Al-Fatihah,1,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"In the Name of Allah—the Most Compassionate, M...",1,1,...,1,7,6236,Meccan,False,,4,"[بِسْمِ,ٱللَّهِ,ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]",بسم الله الرحمن الرحيم,
1,1,The Opener,الفاتحة,Al-Fatihah,2,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,"All praise is for Allah—Lord of all worlds,",1,1,...,1,7,6236,Meccan,False,,4,"[ٱلْحَمْدُ,لِلَّهِ,رَبِّ,ٱلْعَٰلَمِينَ]",الحمد لله رب العالمين,
2,1,The Opener,الفاتحة,Al-Fatihah,3,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"the Most Compassionate, Most Merciful,",1,1,...,1,7,6236,Meccan,False,,2,"[ٱلرَّحْمَٰنِ,ٱلرَّحِيمِ]",الرحمن الرحيم,


In [7]:
# Keep only the fields needed from each file
tafsir_fields = ['ayah_no_quran', 'tafsir']
quran_fields = [
    'surah_no',
    'surah_name_ar',
    'surah_name_roman',
    'ayah_no_surah',
    'ayah_no_quran',
    'ayah_ar',
    'ayah_en',
    'Ayah_without_tashkil',
    'juz_no',
    'quarter_no',
    'total_ayah_surah',
    'total_ayah_quran',
    'place_of_revelation',
    'has_sajdah',
    'sajdah_no',
]
tafsir_data = tafsir_data[tafsir_fields]
quraan_data = quraan_data[quran_fields]

In [8]:
# Ensure the `ayah_no_quran` field is the same in the two data files.
# The printed min/max give a quick visual comparison of the key range.
print(tafsir_data['ayah_no_quran'].min(), tafsir_data['ayah_no_quran'].max())
print(quraan_data['ayah_no_quran'].min(), quraan_data['ayah_no_quran'].max())
# Matching extremes cannot reveal gaps or mismatched keys, so compare the
# full key sets before merging on this field.
assert set(tafsir_data['ayah_no_quran']) == set(quraan_data['ayah_no_quran']), \
    "`ayah_no_quran` values differ between the tafsir and Quran data files"

1 6236
1 6236


In [9]:
# Merge the two data files on the `ayah_no_quran` field.
# `validate='one_to_one'` makes pandas raise a MergeError if the key is
# duplicated on either side, instead of silently multiplying rows.
merged_data = pd.merge(
    tafsir_data,
    quraan_data,
    on='ayah_no_quran',
    how='inner',
    validate='one_to_one',
)
# An inner join silently drops unmatched ayahs; make sure none were lost.
assert len(merged_data) == len(quraan_data), \
    "Merge dropped rows: some ayahs have no matching tafsir entry"
merged_data.head(3)

Unnamed: 0,ayah_no_quran,tafsir,surah_no,surah_name_ar,surah_name_roman,ayah_no_surah,ayah_ar,ayah_en,Ayah_without_tashkil,juz_no,quarter_no,total_ayah_surah,total_ayah_quran,place_of_revelation,has_sajdah,sajdah_no
0,1,باسم الله أبدأ قراءة القرآن، مستعينًا به تعالى...,1,الفاتحة,Al-Fatihah,1,بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"In the Name of Allah—the Most Compassionate, M...",بسم الله الرحمن الرحيم,1,1,7,6236,Meccan,False,
1,2,الثناء الكامل، وجميع أنواع المحامد من صفات الج...,1,الفاتحة,Al-Fatihah,2,ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ,"All praise is for Allah—Lord of all worlds,",الحمد لله رب العالمين,1,1,7,6236,Meccan,False,
2,3,ثناء على الله تعالى بعد حمده في الآية السابقة.,1,الفاتحة,Al-Fatihah,3,ٱلرَّحْمَٰنِ ٱلرَّحِيمِ,"the Most Compassionate, Most Merciful,",الرحمن الرحيم,1,1,7,6236,Meccan,False,


In [10]:
# Count how many rows carry each `place_of_revelation` value
merged_data['place_of_revelation'].value_counts()

place_of_revelation
Meccan     4613
Medinan    1623
Name: count, dtype: int64

In [11]:
# Add a boolean `is_meccan` column: True where `place_of_revelation` is 'Meccan'
merged_data['is_meccan'] = merged_data['place_of_revelation'].eq('Meccan')

In [13]:
# Count True/False values in the new `is_meccan` column
merged_data['is_meccan'].value_counts()

is_meccan
True     4613
False    1623
Name: count, dtype: int64

In [16]:
# Count the missing values in `sajdah_no` (this cell only counts; it does not fill them)
merged_data['sajdah_no'].isna().sum()

np.int64(6221)

In [14]:
# Inspect the non-null `sajdah_no` values (value_counts skips NaN by default)
merged_data['sajdah_no'].value_counts()

sajdah_no
1.0     1
2.0     1
3.0     1
4.0     1
5.0     1
6.0     1
7.0     1
8.0     1
9.0     1
10.0    1
11.0    1
12.0    1
13.0    1
14.0    1
15.0    1
Name: count, dtype: int64

In [15]:
# Replace missing `sajdah_no` values with 0, then cast the column to integers
sajdah_no_filled = merged_data['sajdah_no'].fillna(0)
merged_data['sajdah_no'] = sajdah_no_filled.astype(int)

In [16]:
# Re-check `sajdah_no` after filling: the former missing values now appear as 0
merged_data['sajdah_no'].value_counts()

sajdah_no
0     6221
1        1
2        1
3        1
4        1
5        1
6        1
7        1
8        1
9        1
10       1
11       1
12       1
13       1
14       1
15       1
Name: count, dtype: int64

In [17]:
# Check that the `total_ayah_quran` column holds the same value on every row
# (a single entry in the counts means the column is constant)
merged_data['total_ayah_quran'].value_counts()

total_ayah_quran
6236    6236
Name: count, dtype: int64

In [18]:
# Ensure `juz_no` values are from 1 to 30
juz_in_range = merged_data['juz_no'].between(1, 30)
# Filtering and printing can never fail, so assert the range explicitly;
# the printed Series is kept as a visual confirmation.
assert juz_in_range.all(), "`juz_no` contains values outside 1..30"
filtered_juz_no = merged_data['juz_no'][juz_in_range]
print(filtered_juz_no)

0        1
1        1
2        1
3        1
4        1
        ..
6231    30
6232    30
6233    30
6234    30
6235    30
Name: juz_no, Length: 6236, dtype: int64


In [19]:
# Ensure `quarter_no` values are from 1 to 240
quarter_in_range = merged_data['quarter_no'].between(1, 240)
# Filtering and printing can never fail, so assert the range explicitly;
# the printed Series is kept as a visual confirmation.
assert quarter_in_range.all(), "`quarter_no` contains values outside 1..240"
filtered_quarter_no = merged_data['quarter_no'][quarter_in_range]
print(filtered_quarter_no)

0         1
1         1
2         1
3         1
4         1
       ... 
6231    240
6232    240
6233    240
6234    240
6235    240
Name: quarter_no, Length: 6236, dtype: int64


In [21]:
# Summarise the final frame: column names, non-null counts and dtypes
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6236 entries, 0 to 6235
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ayah_no_quran         6236 non-null   int64 
 1   tafsir                6236 non-null   object
 2   surah_no              6236 non-null   int64 
 3   surah_name_ar         6236 non-null   object
 4   surah_name_roman      6236 non-null   object
 5   ayah_no_surah         6236 non-null   int64 
 6   ayah_ar               6236 non-null   object
 7   ayah_en               6236 non-null   object
 8   Ayah_without_tashkil  6236 non-null   object
 9   juz_no                6236 non-null   int64 
 10  quarter_no            6236 non-null   int64 
 11  total_ayah_surah      6236 non-null   int64 
 12  total_ayah_quran      6236 non-null   int64 
 13  place_of_revelation   6236 non-null   object
 14  has_sajdah            6236 non-null   bool  
 15  sajdah_no             6236 non-null   

## Save Preprocessed data

In [22]:
# Save the processed data. `index=False` keeps the RangeIndex out of the file;
# otherwise it is written as an unnamed first column that reappears as
# `Unnamed: 0` when the CSV is read back.
merged_data.to_csv('data/processed_data.csv', index=False)