In [5]:
import json

# Read the nested source file: a list of surahs, each carrying its own "verses" list.
with open("quran_en.json", "r", encoding="utf-8") as source_file:
    documents_raw = json.load(source_file)

# Flatten to one record per ayah, copying the parent surah's metadata onto each record.
documents = [
    {
        "surah_number": surah["id"],
        "surah_name": surah["transliteration"],
        "surah_translation": surah["translation"],
        "ayah_number": ayah["id"],
        "reference": f"{surah['id']}:{ayah['id']}",
        "text": ayah["translation"],
        "language": "English",
    }
    for surah in documents_raw
    for ayah in surah["verses"]
]

# Persist the flat list; ensure_ascii=False keeps non-ASCII characters unescaped.
with open("quran_flat.json", "w", encoding="utf-8") as flat_file:
    json.dump(documents, flat_file, ensure_ascii=False, indent=2)


In [7]:
import requests
import json
import time

# Tafsir resource id for the quran.com API; the comment on get_tafsir's default says 169 = Ibn Kathir.
# NOTE(review): nothing visible in this notebook reads this constant — get_tafsir
# hard-codes 169 as its own default; confirm whether it should be wired in.
TAFSIR_ID = 169

In [21]:
# Maps each surah number (1-114) to its verse count; laid out ten surahs per row.
SURAH_VERSES = {
    1: 7, 2: 286, 3: 200, 4: 176, 5: 120, 6: 165, 7: 206, 8: 75, 9: 129, 10: 109,
    11: 123, 12: 111, 13: 43, 14: 52, 15: 99, 16: 128, 17: 111, 18: 110, 19: 98, 20: 135,
    21: 112, 22: 78, 23: 118, 24: 64, 25: 77, 26: 227, 27: 93, 28: 88, 29: 69, 30: 60,
    31: 34, 32: 30, 33: 73, 34: 54, 35: 45, 36: 83, 37: 182, 38: 88, 39: 75, 40: 85,
    41: 54, 42: 53, 43: 89, 44: 59, 45: 37, 46: 35, 47: 38, 48: 29, 49: 18, 50: 45,
    51: 60, 52: 49, 53: 62, 54: 55, 55: 78, 56: 96, 57: 29, 58: 22, 59: 24, 60: 13,
    61: 14, 62: 11, 63: 11, 64: 18, 65: 12, 66: 12, 67: 30, 68: 52, 69: 52, 70: 44,
    71: 28, 72: 28, 73: 20, 74: 56, 75: 40, 76: 31, 77: 50, 78: 40, 79: 46, 80: 42,
    81: 29, 82: 19, 83: 36, 84: 25, 85: 22, 86: 17, 87: 19, 88: 26, 89: 30, 90: 20,
    91: 15, 92: 21, 93: 11, 94: 8, 95: 8, 96: 19, 97: 5, 98: 8, 99: 8, 100: 11,
    101: 11, 102: 8, 103: 3, 104: 9, 105: 5, 106: 4, 107: 7, 108: 3, 109: 6, 110: 3,
    111: 5, 112: 4, 113: 5, 114: 6,
}

In [23]:
import requests
from bs4 import BeautifulSoup

def clean_html(raw_html):
    """Return the text content of ``raw_html`` with the markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` and the
    text is extracted via ``get_text`` using a single space as the separator
    and with stripping enabled.
    """
    return BeautifulSoup(raw_html, "html.parser").get_text(separator=" ", strip=True)

def get_tafsir(surah, ayah, tafsir_id=169, timeout=30):  # 169 = Ibn Kathir
    """Fetch the tafsir for one ayah from the quran.com v4 API as plain text.

    Args:
        surah: Surah number, used as the first half of the ``surah:ayah`` key.
        ayah: Ayah number, used as the second half of the ``surah:ayah`` key.
        tafsir_id: Tafsir resource id placed in the request path.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang a long download loop indefinitely.

    Returns:
        A dict with the keys ``surah``, ``ayah``, ``reference``,
        ``tafsir_source`` and ``tafsir_text`` (HTML stripped via
        ``clean_html``), or ``None`` when the request fails, the status is not
        200, the body is not valid JSON, or the payload has no tafsir. A
        warning is printed in every ``None`` case.
    """
    url = f"https://api.quran.com/api/v4/tafsirs/{tafsir_id}/by_ayah/{surah}:{ayah}"

    # A network failure on one ayah must not abort the caller's whole loop.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"⚠️ Request failed for {surah}:{ayah}: {exc}")
        return None

    if response.status_code != 200:
        print(f"⚠️ HTTP {response.status_code} for {surah}:{ayah}")
        return None

    # response.json() raises a ValueError subclass when the body is not JSON.
    try:
        data = response.json()
    except ValueError:
        print(f"⚠️ Invalid JSON for {surah}:{ayah}")
        return None

    t = data.get("tafsir") if isinstance(data, dict) else None
    if not t:
        print(f"⚠️ No tafsir found for {surah}:{ayah}")
        return None

    # `or` (rather than a .get default) also covers keys present with a null
    # value, which would otherwise reach clean_html / the merge key as None.
    cleaned_text = clean_html(t.get("text") or "")
    return {
        "surah": surah,
        "ayah": ayah,
        "reference": t.get("verse_key") or f"{surah}:{ayah}",
        "tafsir_source": t.get("resource_name") or "Ibn Kathir",
        "tafsir_text": cleaned_text
    }

In [31]:
def download_tafsir():
    """Download the tafsir for every ayah and save the results as JSON.

    Walks the ``SURAH_VERSES`` table, calls ``get_tafsir`` for each
    ``surah:ayah`` pair, keeps the non-empty results, and writes them to
    ``tafsir_ibn_kathir.json`` once the loop finishes. Ayahs for which
    ``get_tafsir`` returns nothing are skipped.
    """
    tafsir_data = []

    # SURAH_VERSES is the table this notebook defines; the previous name
    # (SURAH_AYAH_COUNTS) is not defined anywhere in it and raised NameError
    # on a fresh kernel.
    for surah, ayah_count in SURAH_VERSES.items():
        print(f"Processing Surah {surah} ({ayah_count} ayahs)...")
        for ayah in range(1, ayah_count + 1):
            tafsir = get_tafsir(surah, ayah)
            if tafsir:
                tafsir_data.append(tafsir)
            # Short pause between requests to avoid hammering the API.
            time.sleep(0.2)

    with open("tafsir_ibn_kathir.json", "w", encoding="utf-8") as f:
        json.dump(tafsir_data, f, ensure_ascii=False, indent=2)
    print("All tafsir saved to tafsir_ibn_kathir.json")

In [35]:
# Script-style entry point: start the full tafsir download when this code runs
# as the main module.
if __name__ == "__main__":
    download_tafsir()

Processing Surah 1 (7 ayahs)...
Processing Surah 2 (286 ayahs)...
Processing Surah 3 (200 ayahs)...
Processing Surah 4 (176 ayahs)...
Processing Surah 5 (120 ayahs)...
Processing Surah 6 (165 ayahs)...
Processing Surah 7 (206 ayahs)...
Processing Surah 8 (75 ayahs)...
Processing Surah 9 (129 ayahs)...
Processing Surah 10 (109 ayahs)...
Processing Surah 11 (123 ayahs)...
Processing Surah 12 (111 ayahs)...
Processing Surah 13 (43 ayahs)...
Processing Surah 14 (52 ayahs)...
Processing Surah 15 (99 ayahs)...
Processing Surah 16 (128 ayahs)...
Processing Surah 17 (111 ayahs)...
Processing Surah 18 (110 ayahs)...
Processing Surah 19 (98 ayahs)...
Processing Surah 20 (135 ayahs)...
Processing Surah 21 (112 ayahs)...
Processing Surah 22 (78 ayahs)...
Processing Surah 23 (118 ayahs)...
Processing Surah 24 (64 ayahs)...
Processing Surah 25 (77 ayahs)...
Processing Surah 26 (227 ayahs)...
Processing Surah 27 (93 ayahs)...
Processing Surah 28 (88 ayahs)...
Processing Surah 29 (69 ayahs)...
Process

In [None]:
import json

# Flat per-ayah verse records (the file written by the flattening cell).
with open("quran_flat.json", "r", encoding="utf-8") as quran_file:
    quran_data = json.load(quran_file)

# Per-ayah tafsir records (the file written by download_tafsir).
with open("tafsir_ibn_kathir.json", "r", encoding="utf-8") as tafsir_file:
    tafsir_data = json.load(tafsir_file)

In [None]:
# Index tafsir records by their verse reference; if a reference repeats,
# the later record replaces the earlier one.
tafsir_map = {}
for record in tafsir_data:
    tafsir_map[record["reference"]] = record

# Attach tafsir fields to every verse. The one-element inner `for` binds the
# looked-up record once per verse; verses without a match get the defaults.
merged = [
    {
        **verse,
        "tafsir_text": match.get("tafsir_text", ""),
        "tafsir_source": match.get("tafsir_source", "N/A"),
    }
    for verse in quran_data
    for match in (tafsir_map.get(verse["reference"], {}),)
]

with open("quran_with_tafsir.json", "w", encoding="utf-8") as out_file:
    json.dump(merged, out_file, ensure_ascii=False, indent=2)
    