# Quran Word Aligner (Skip Bismillah Version)
Generate word-level timestamps for Quran recitations WITHOUT Bismillah.

**Use this when:**
- The reciter's audio does NOT include the Bismillah at the start of each surah's first verse
- The original aligner created bad timing because it tried to force Bismillah onto the audio

**Instructions:**
1. Go to Runtime > Change runtime type > Select GPU (T4)
2. Run all cells in order
3. Before running the Configuration cell, set RECITER, SURAH_START, SURAH_END in it
4. Download the output JSON

In [None]:
# Mount Google Drive - saves progress permanently!
# The processing cell writes its output JSON under /content/drive, so this
# mount has to succeed before that cell is run.
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted!")

In [None]:
# Install dependencies
# openai-whisper is imported as `whisper` (speech model), python-Levenshtein as
# `Levenshtein` (word similarity during alignment), tqdm for progress bars.
!pip install -q openai-whisper python-Levenshtein tqdm

In [None]:
# Check GPU
# Report whether CUDA is usable and, if so, which device index 0 is.
import torch

gpu_ready = torch.cuda.is_available()
print(f"GPU available: {gpu_ready}")
if gpu_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Download Quran text
# Fetches the Uthmani text from tanzil.net into the working directory as
# quran-uthmani.txt, the file name the loading cell reads later.
import urllib.request

tanzil_url = "https://tanzil.net/pub/download/index.php?quranType=uthmani&outType=txt-2&agree=true"
print("Downloading Quran text...")
urllib.request.urlretrieve(tanzil_url, "quran-uthmani.txt")
print("Done!")

In [None]:
# Aligner code WITH BISMILLAH SKIP OPTION
import json
import os
import re
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Tuple
from tqdm import tqdm
import whisper
import Levenshtein

@dataclass
class WordSegment:
    """One word produced by speech recognition, together with its timing."""
    # Recognized word text (the processing cell strips surrounding whitespace)
    word: str
    # Start and end of the word within the audio clip, in milliseconds
    start_ms: int
    end_ms: int

@dataclass
class AlignedSpan:
    """Timing assigned to a range of reference-word indices."""
    # Range of reference-word indices; align_words always builds single-word
    # spans with index_end == index_start + 1, i.e. index_end is exclusive.
    index_start: int
    index_end: int
    # Start and end time of the span, in milliseconds
    start_ms: int
    end_ms: int

def normalize_arabic(text: str) -> str:
    """Reduce Arabic text to a bare-letter form suitable for fuzzy comparison.

    The result is only meant for matching a recognized word against a
    reference word; it is not a faithful transliteration.

    Steps applied, in order:
      1. Remove combining marks U+064B-U+065F, the dagger alif U+0670 and the
         Quranic annotation signs U+06D6-U+06ED.
      2. Fold the alef variants (madda, hamza above, hamza below, wasla) onto
         bare alef U+0627.
      3. Map teh marbuta to heh and alef maksura to yeh.
      4. Remove tatweel (U+0640) and surrounding whitespace.

    Args:
        text: A word or phrase in Arabic script.

    Returns:
        The normalized string (possibly empty).
    """
    # Marks that decorate letters but are absent from plain transcriptions.
    # U+06D6-U+06ED covers the small-high pause/recitation signs found in
    # Uthmani-script text.
    text = re.sub(r'[\u064B-\u065F\u0670\u06D6-\u06ED]', '', text)
    # U+0622 alef madda, U+0623 hamza above, U+0625 hamza below,
    # U+0671 alef wasla -> U+0627 bare alef. Alef wasla is how Uthmani script
    # writes e.g. the definite article, so leaving it unfolded makes most
    # reference words differ from their transcribed form.
    text = re.sub(r'[\u0622\u0623\u0625\u0671\u0627]', '\u0627', text)
    text = text.replace('\u0629', '\u0647')  # teh marbuta -> heh
    text = text.replace('\u0649', '\u064A')  # alef maksura -> yeh
    text = text.replace('\u0640', '')        # tatweel (elongation filler)
    return text.strip()

def load_quran_text(quran_file: str, skip_bismillah: bool = False) -> Dict[int, str]:
    """Load a ``surah|ayah|text`` file into a dict keyed by ``surah * 1000 + ayah``.

    Blank lines, lines starting with ``#`` and lines with fewer than three
    ``|``-separated fields are ignored.

    Args:
        quran_file: Path to the pipe-delimited text file.
        skip_bismillah: When True, drop the first four words of ayah 1 of every
            surah except 1 and 9, provided the ayah has more than four words.

    Returns:
        Mapping from ``surah * 1000 + ayah`` to the ayah text.

    Raises:
        ValueError: If the surah or ayah field of a data line is not an integer.
        OSError: If the file cannot be opened.
    """
    bismillah_word_count = 4
    # Surahs whose first ayah is left untouched even when skipping is on.
    keep_first_ayah_intact = (1, 9)

    quran_text: Dict[int, str] = {}
    # utf-8-sig reads plain UTF-8 too, but also swallows a leading BOM; with
    # plain 'utf-8' a BOM would stick to the first field and break int().
    with open(quran_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            parts = line.split('|')
            if len(parts) < 3:
                continue

            surah = int(parts[0])
            ayah = int(parts[1])
            text = parts[2]

            if skip_bismillah and ayah == 1 and surah not in keep_first_ayah_intact:
                words = text.split()
                # Only strip when something remains afterwards.
                if len(words) > bismillah_word_count:
                    text = ' '.join(words[bismillah_word_count:])

            quran_text[surah * 1000 + ayah] = text
    return quran_text

def _match_cost(rec_word: str, ref_word: str) -> float:
    """Return the cost of pairing a recognized word with a reference word.

    Both words are expected to be normalized already. Identical words cost 0;
    otherwise the cost steps up as the Levenshtein ratio drops.
    """
    if rec_word == ref_word:
        return 0
    ratio = Levenshtein.ratio(rec_word, ref_word)
    if ratio > 0.8:
        return 0.1
    if ratio > 0.6:
        return 0.4
    if ratio > 0.4:
        return 0.7
    return 1.0


def _fill_gaps(spans: List[AlignedSpan], word_count: int) -> List[AlignedSpan]:
    """Return one span per reference word, estimating those that have none.

    ``spans`` must be non-empty and ordered by ``index_start``.
    """
    span_by_idx = {s.index_start: s for s in spans}
    first_covered = spans[0].index_start
    filled: List[AlignedSpan] = []

    # Words before the first matched word have no earlier span to extend from,
    # so share out the audio between 0 ms and the first match evenly among
    # them. (If the first match starts at 0 ms they get zero-length spans.)
    anchor_ms = spans[0].start_ms
    for ref_idx in range(first_covered):
        filled.append(AlignedSpan(
            index_start=ref_idx,
            index_end=ref_idx + 1,
            start_ms=anchor_ms * ref_idx // first_covered,
            end_ms=anchor_ms * (ref_idx + 1) // first_covered
        ))

    for ref_idx in range(first_covered, word_count):
        span = span_by_idx.get(ref_idx)
        if span is None:
            # Continue from the previous word, reusing its duration but never
            # less than 300 ms.
            prev_span = filled[-1]
            duration = max(prev_span.end_ms - prev_span.start_ms, 300)
            span = AlignedSpan(
                index_start=ref_idx,
                index_end=ref_idx + 1,
                start_ms=prev_span.end_ms,
                end_ms=prev_span.end_ms + duration
            )
        filled.append(span)
    return filled


def align_words(recognized: List[WordSegment], reference_words: List[str]) -> List[AlignedSpan]:
    """Assign timestamps from recognized words to the reference words.

    The two word sequences are aligned with an edit-distance style dynamic
    program over their normalized forms. Pairing two words costs between 0
    and 1.0 depending on similarity, leaving a recognized word unpaired costs
    1.2 and leaving a reference word unpaired costs 0.8. Every paired
    reference word takes the timing of its recognized partner; reference
    words left unpaired receive estimated timings.

    Args:
        recognized: Recognized words with timings, in spoken order.
        reference_words: The expected words, in order.

    Returns:
        Single-word spans ordered by reference index. Empty if either input is
        empty; otherwise one span per reference word.
    """
    if not recognized or not reference_words:
        return []

    rec_normalized = [normalize_arabic(w.word) for w in recognized]
    ref_normalized = [normalize_arabic(w) for w in reference_words]
    n, m = len(rec_normalized), len(ref_normalized)

    skip_reference_cost = 0.8   # reference word with no recognized partner
    skip_recognized_cost = 1.2  # recognized word with no reference partner
    tolerance = 0.001           # float slack when retracing the chosen move

    # The fill and the backtrack need the same pair costs; compute them once.
    pair_cost = [[_match_cost(rec, ref) for ref in ref_normalized]
                 for rec in rec_normalized]

    INF = float('inf')
    dp = [[INF] * (m + 1) for _ in range(n + 1)]
    dp[0][0] = 0
    for j in range(1, m + 1):
        dp[0][j] = j * skip_reference_cost
    for i in range(1, n + 1):
        dp[i][0] = i * skip_recognized_cost

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dp[i][j] = min(
                dp[i-1][j-1] + pair_cost[i-1][j-1],
                dp[i-1][j] + skip_recognized_cost,
                dp[i][j-1] + skip_reference_cost
            )

    # Walk back from the end, preferring a pairing, then a skipped recognized
    # word, then a skipped reference word. Only pairings are recorded.
    pairs = []
    i, j = n, m
    while i > 0 or j > 0:
        if (i > 0 and j > 0
                and abs(dp[i][j] - (dp[i-1][j-1] + pair_cost[i-1][j-1])) < tolerance):
            pairs.append((i - 1, j - 1))
            i -= 1
            j -= 1
        elif i > 0 and abs(dp[i][j] - (dp[i-1][j] + skip_recognized_cost)) < tolerance:
            i -= 1
        elif j > 0:
            j -= 1
        else:
            break
    pairs.reverse()

    spans = [
        AlignedSpan(
            index_start=ref_idx,
            index_end=ref_idx + 1,
            start_ms=recognized[rec_idx].start_ms,
            end_ms=recognized[rec_idx].end_ms
        )
        for rec_idx, ref_idx in pairs
    ]

    # Fill gaps
    if spans and len(spans) < len(reference_words):
        spans = _fill_gaps(spans, len(reference_words))

    return spans

print("Aligner code loaded! (Skip Bismillah version)")

In [None]:
# Load Whisper model and Quran text (BISMILLAH SKIPPED)
print("Loading Whisper large-v3 model...")
model = whisper.load_model("large-v3")
print("Model loaded!")

# Reference text, with the Bismillah removed from each surah's first ayah
quran_text = load_quran_text("quran-uthmani.txt", skip_bismillah=True)
ayah_total = len(quran_text)
print(f"Loaded {ayah_total} ayahs (Bismillah skipped for first verses)")

# Sanity check: show the first five words of two first ayahs
sample_words = quran_text[2001].split()[:5]
print(f"\nSample - Surah 2:1 words: {sample_words}...")
sample_words = quran_text[10001].split()[:5]
print(f"Sample - Surah 10:1 words: {sample_words}...")

In [None]:
# Configuration - EDIT THIS
# RECITER is inserted into the EveryAyah download URL by the processing cell;
# SURAH_START and SURAH_END are both inclusive.
RECITER = "Abdullah_Basfar_64kbps"  # Reciter folder name on EveryAyah
SURAH_START = 1   # Start surah (1-114)
SURAH_END = 114   # End surah (1-114)

# Save to Google Drive so progress is never lost!
# The processing cell also reads this file at start-up to resume earlier work.
OUTPUT_FILE = "/content/drive/MyDrive/alignment_skip_bismillah.json"

# Surah ayah counts
# 114 entries; the count for surah N is AYAH_COUNTS[N - 1].
AYAH_COUNTS = [
    7, 286, 200, 176, 120, 165, 206, 75, 129, 109, 123, 111, 43, 52, 99, 128,
    111, 110, 98, 135, 112, 78, 118, 64, 77, 227, 93, 88, 69, 60, 34, 30, 73,
    54, 45, 83, 182, 88, 75, 85, 54, 53, 89, 59, 37, 35, 38, 29, 18, 45, 60,
    49, 62, 55, 78, 96, 29, 22, 24, 13, 14, 11, 11, 18, 12, 12, 30, 52, 52,
    44, 28, 28, 20, 56, 40, 31, 50, 40, 46, 42, 29, 19, 36, 25, 22, 17, 19,
    26, 30, 20, 15, 21, 11, 8, 8, 19, 5, 8, 8, 11, 11, 8, 3, 9, 5, 4, 7, 3,
    6, 3, 5, 4, 5, 6
]

print(f"Will process surahs {SURAH_START} to {SURAH_END}")
print(f"Output file: {OUTPUT_FILE}")
print("\n⚠️ BISMILLAH WILL BE SKIPPED FOR FIRST AYAHS ⚠️")

In [None]:
# Download audio and process (with auto-save)
#
# For every ayah in SURAH_START..SURAH_END: fetch the reciter's mp3, transcribe
# it with word timestamps, align the transcript to the reference words and
# append the result to `results`. Progress is written to OUTPUT_FILE every
# 10 ayahs and after each surah, and is reloaded from there on the next run.
import urllib.request
import os


def _save_results(rows, path):
    """Write ``rows`` to ``path`` as JSON without exposing a partial file.

    The data goes to ``<path>.tmp`` first and is then renamed over ``path``,
    so an interruption mid-write leaves the previous progress file intact
    instead of a truncated one that would break resuming.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(rows, f, ensure_ascii=False)
    os.replace(tmp_path, path)


# Fail fast on a bad range: 0 would silently index AYAH_COUNTS[-1], and a
# value above the table length would raise IndexError only once reached.
if not 1 <= SURAH_START <= SURAH_END <= len(AYAH_COUNTS):
    raise ValueError(
        f"SURAH_START/SURAH_END must satisfy 1 <= start <= end <= "
        f"{len(AYAH_COUNTS)}, got {SURAH_START}..{SURAH_END}"
    )

os.makedirs("audio", exist_ok=True)

# Load existing progress if any
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
        results = json.load(f)
    processed = {(r['surah'], r['ayah']) for r in results}
    last_surah = max(r['surah'] for r in results) if results else 0
    print(f"RESUMING - already have {len(results)} ayahs (up to surah {last_surah})")
else:
    results = []
    processed = set()
    print("Starting fresh")

save_counter = 0

for surah in range(SURAH_START, SURAH_END + 1):
    num_ayahs = AYAH_COUNTS[surah - 1]

    # Check if surah already complete
    surah_ayahs_done = sum(1 for s, a in processed if s == surah)
    if surah_ayahs_done == num_ayahs:
        print(f"Surah {surah} already complete, skipping")
        continue

    print(f"\nProcessing Surah {surah} ({num_ayahs} ayahs)...")

    for ayah in tqdm(range(1, num_ayahs + 1), desc=f"Surah {surah}"):
        # Skip if already processed
        if (surah, ayah) in processed:
            continue

        filename = f"{surah:03d}{ayah:03d}.mp3"
        url = f"https://everyayah.com/data/{RECITER}/{filename}"
        local_path = f"audio/{filename}"

        # Download if not exists. The transfer goes to a .part file that is
        # renamed only on success; otherwise an interrupted download would
        # leave a truncated mp3 that later runs treat as already fetched.
        if not os.path.exists(local_path):
            part_path = f"{local_path}.part"
            try:
                urllib.request.urlretrieve(url, part_path)
                os.replace(part_path, local_path)
            except Exception as e:
                print(f"Failed to download {filename}: {e}")
                if os.path.exists(part_path):
                    os.remove(part_path)
                continue

        # Get reference text (BISMILLAH ALREADY SKIPPED)
        key = surah * 1000 + ayah
        if key not in quran_text:
            print(f"No reference text for {surah}:{ayah}")
            continue

        reference = quran_text[key]
        reference_words = reference.split()

        # Transcribe. One bad ayah must not abort a multi-hour batch, so any
        # failure is reported and the loop moves on; the ayah stays
        # unprocessed and is retried on the next run.
        try:
            result = model.transcribe(local_path, language="ar", word_timestamps=True)

            words = []
            for segment in result.get("segments", []):
                for word_info in segment.get("words", []):
                    word = word_info.get("word", "").strip()
                    if word:
                        # Whisper reports seconds; store integer milliseconds.
                        words.append(WordSegment(
                            word=word,
                            start_ms=int(word_info["start"] * 1000),
                            end_ms=int(word_info["end"] * 1000)
                        ))

            spans = align_words(words, reference_words)

            results.append({
                "surah": surah,
                "ayah": ayah,
                "segments": [[s.index_start, s.index_end, s.start_ms, s.end_ms] for s in spans]
            })
            processed.add((surah, ayah))
            save_counter += 1

            # Auto-save every 10 ayahs
            if save_counter >= 10:
                _save_results(results, OUTPUT_FILE)
                save_counter = 0

        except Exception as e:
            print(f"Error processing {surah}:{ayah}: {e}")

    # Also save after each surah completes
    _save_results(results, OUTPUT_FILE)
    save_counter = 0
    print(f"Surah {surah} done! Total: {len(results)} ayahs saved")

print(f"\n=== COMPLETE! Processed {len(results)} ayahs ===")

In [None]:
# Save results
# Final write of everything in `results`, pretty-printed this time.
pretty_json = json.dumps(results, ensure_ascii=False, indent=2)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
    out.write(pretty_json)

print(f"Saved to {OUTPUT_FILE}")

# Download
# Hands the saved file to the browser via Colab's download helper.
from google.colab import files
files.download(OUTPUT_FILE)

## Notes

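This version **skips Bismillah** (the first 4 words) in the reference text for the first ayah of each surah,
except Surah 1 (Al-Fatiha), where the Bismillah is itself ayah 1, and Surah 9 (At-Tawbah), which does not begin with a Bismillah.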

Use this for reciters whose audio does NOT have Bismillah at the start of each surah.

### Available Reciters on EveryAyah
- `Abdullah_Basfar_64kbps`
- `Ghamadi_40kbps`
- `Akram_AlAlaqimy_128kbps`
- And many more at https://everyayah.com/data/