# EDA
Version: 02.04.2025

In [1]:
# imports
import os
import matplotlib.pyplot as plt
import re
import pandas as pd
import numpy as np

# Apply matplotlib's "ggplot" style sheet; it affects every figure drawn after this cell
plt.style.use('ggplot')


# Avg. token count per sura

In [None]:
# Load the per-sura statistics; the plotting cell below reads the
# "sura_id" and "avg_tokens_sura" columns from this frame.
df = pd.read_csv("stats/avg_sura_tokens.csv")

In [None]:
# Quick schema check: column names, dtypes and non-null counts
df.info()

In [None]:
# Horizontal bar chart of the average token count, one bar per sura (ordered by sura_id).
df_sorted = df.sort_values("sura_id")

# DataFrame.plot returns the Axes it drew on; working on that object (instead of
# the implicit "current figure" of pyplot) keeps this cell independent of any
# other figure that may still be open in the kernel.
ax = df_sorted.plot(kind="barh", x="sura_id", y="avg_tokens_sura", color="skyblue", figsize=(10, 25))
ax.set_title("Average Tokens per Sura")
ax.set_xlabel("Avg. Tokens")
ax.set_ylabel("Sura")
# Same effect as plt.xticks(rotation=45, ha='right'): restyle the existing x tick labels
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig = ax.figure
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs("images", exist_ok=True)
fig.savefig("images/avg_sura_tokens.jpg")

# Avg. tokens per school of theology

In [None]:
# Load the per-school statistics; the plotting cell below reads the
# "label" and "avg_token_count" columns from this frame.
# Note: this rebinds `df`, replacing the frame loaded in the previous section.
df = pd.read_csv("stats/avg_tokens_school_of_theology.csv")

In [None]:
# Quick schema check: column names, dtypes and non-null counts
df.info()

In [None]:
# Vertical bar chart of the average tafsir token count per school, smallest to largest.
df_sorted = df.sort_values("avg_token_count")

# DataFrame.plot returns the Axes it drew on; working on that object (instead of
# the implicit "current figure" of pyplot) keeps this cell independent of any
# other figure that may still be open in the kernel.
ax = df_sorted.plot(kind="bar", x="label", y="avg_token_count", color="skyblue")
ax.set_title("Average Tafsir Tokens per School of Theology")
ax.set_xlabel("School of Theology")
# NOTE(review): the label says "in Millions" but avg_token_count is plotted
# unscaled in this cell — confirm the unit stored in the CSV.
ax.set_ylabel("Average Tokens (in Millions)")
# Same effect as plt.xticks(rotation=45, ha='right'): slant the school names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig = ax.figure
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists
os.makedirs("images", exist_ok=True)
fig.savefig("images/avg_tokens_sot.jpg")

In [None]:
# Show the loaded frame itself for reference next to the chart
df

# Avg. quotes per author

In [30]:
# Load the quote statistics: per the df.info()/df.head() output below, one row per
# (tafsir_id, type) pair with quote_count and the tafsir's word_count.
# Note: this rebinds `df`, replacing the frame loaded in the previous section.
df = pd.read_csv("stats/quote_distribution.csv")

In [31]:
# Quick schema check: column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tafsir_id           230 non-null    int64  
 1   type                230 non-null    object 
 2   quote_count         230 non-null    int64  
 3   word_count          230 non-null    int64  
 4   quote_per_1k_token  230 non-null    float64
 5   title_dmg           230 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 10.9+ KB


In [32]:
# Peek at the first rows to see how quote types are laid out per tafsir
df.head()

Unnamed: 0,tafsir_id,type,quote_count,word_count,quote_per_1k_token,title_dmg
0,1,Hadith,3391,2719940,1.246719,Ǧāmiʿ al-bayān ʿan taʾwīl āy al-Qurʾān
1,1,Poetry,1904,2719940,0.700015,Ǧāmiʿ al-bayān ʿan taʾwīl āy al-Qurʾān
2,1,Quran,59727,2719940,21.95894,Ǧāmiʿ al-bayān ʿan taʾwīl āy al-Qurʾān
3,2,Hadith,1016,662449,1.533703,al-Kaššāf ʿan ḥaqāʾiq ġawāmiḍ at-tanzīl wa-...
4,2,Poetry,856,662449,1.292175,al-Kaššāf ʿan ḥaqāʾiq ġawāmiḍ at-tanzīl wa-...


In [33]:
# Absolute quote count per tafsir: add up the quote_count of all quote types
# (Quran + Hadith + Poetry) that share a tafsir_id.
# as_index=False keeps tafsir_id as a regular column in the result.
df_abs_quotes = df.groupby("tafsir_id", as_index=False)["quote_count"].sum()

In [34]:
# Inspect the aggregated frame (one row per tafsir_id)
df_abs_quotes

Unnamed: 0,tafsir_id,quote_count
0,1,65022
1,2,20555
2,3,32686
3,4,56219
4,5,40597
...,...,...
75,101,4157
76,102,32480
77,103,17762
78,104,13664


In [35]:
# Attach each tafsir's word count to the aggregated frame.
# The value is taken from the first row of that tafsir_id in df. Building a
# tafsir_id -> word_count lookup once and mapping through it yields the same
# values as filtering df inside a lambda, without rescanning df for every row.
word_count_by_tafsir = df.drop_duplicates("tafsir_id").set_index("tafsir_id")["word_count"]
df_abs_quotes["word_count"] = df_abs_quotes["tafsir_id"].map(word_count_by_tafsir)

In [36]:
# Inspect the frame after the word_count column has been added
df_abs_quotes

Unnamed: 0,tafsir_id,quote_count,word_count
0,1,65022,2719940
1,2,20555,662449
2,3,32686,1283581
3,4,56219,2869452
4,5,40597,1929863
...,...,...,...
75,101,4157,139513
76,102,32480,1253097
77,103,17762,1876028
78,104,13664,199112


In [37]:
# Relative quote density: total quotes divided by the word count, scaled to
# "quotes per 1,000 tokens" so tafsirs of different length are comparable.
df_abs_quotes["rel_quotes_per_1k_tokens"] = (
    df_abs_quotes["quote_count"].div(df_abs_quotes["word_count"]).mul(1000)
)

In [38]:
# Inspect the frame after the relative quote density has been added
df_abs_quotes

Unnamed: 0,tafsir_id,quote_count,word_count,rel_quotes_per_1k_tokens
0,1,65022,2719940,23.905674
1,2,20555,662449,31.028804
2,3,32686,1283581,25.464696
3,4,56219,2869452,19.592243
4,5,40597,1929863,21.036208
...,...,...,...,...
75,101,4157,139513,29.796506
76,102,32480,1253097,25.919781
77,103,17762,1876028,9.467876
78,104,13664,199112,68.624694


In [39]:
# Sanity check: rank the tafsirs from highest to lowest quote density
# (display only — df_abs_quotes itself is left in its original order).
df_abs_quotes.sort_values(by="rel_quotes_per_1k_tokens", ascending=False)

Unnamed: 0,tafsir_id,quote_count,word_count,rel_quotes_per_1k_tokens
9,10,29860,219007,136.342674
7,8,25178,186564,134.956369
50,60,16870,168095,100.359916
69,95,40851,477252,85.596289
64,89,2944,38567,76.334690
...,...,...,...,...
54,68,4896,385548,12.698808
44,51,2576,244890,10.519009
35,40,5911,601338,9.829746
77,103,17762,1876028,9.467876


In [40]:
# Persist the ranking (highest quote density first) for use outside the notebook.
# The row index is written as the first, unnamed CSV column (pandas default).
(
    df_abs_quotes
    .sort_values(by="rel_quotes_per_1k_tokens", ascending=False)
    .to_csv("stats/abs_quote_distribution.csv")
)