In [None]:
# Install necessary libraries
!pip install flask pyngrok sentence-transformers

import flask
from flask import Flask, request, jsonify
import json
import os
from sentence_transformers import SentenceTransformer, util
from pyngrok import ngrok

# Define the Flask app
app = Flask(__name__)

# Paths to local JSON files
books = {
    "bukhari": "content/bukhari.json",
    "muslim": "content/muslim.json",
    "malik": "content/malik.json",
    "darimi": "content/darimi.json",
    "tirmidhi": "content/tirmidhi.json",
    "ibnulmaja": "content/ibnmajah.json",
}

# Cache for Hadith data
hadith_cache = {}

# Load and cache Hadith data from local files
def load_hadith_data(book_slug):
    if book_slug not in hadith_cache:
        file_path = books.get(book_slug)
        if not file_path or not os.path.exists(file_path):
            raise Exception(f"File for {book_slug} not found at {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                hadith_cache[book_slug] = json.load(f)
        except Exception as e:
            raise Exception(f"Failed to load data for {book_slug}: {str(e)}")
    return hadith_cache[book_slug]

# Load semantic search model
print("Loading semantic search model (this may take a moment)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# Semantic search function
def semantic_search(query, selected_books, top_k=5):
    results = []
    query_embedding = model.encode(query, convert_to_tensor=True)

    for book_slug in selected_books:
        book_data = load_hadith_data(book_slug)
        hadiths = book_data.get('hadiths', [])

        # Combine text and arabic for encoding
        texts = [f"{h['text']} {h['arabic']}" for h in hadiths]
        embeddings = model.encode(texts, convert_to_tensor=True)

        # Compute cosine similarities
        similarities = util.cos_sim(query_embedding, embeddings)[0]

        # Pair hadiths with their scores
        for idx, score in enumerate(similarities):
            if idx < len(hadiths):
                results.append({
                    "book": book_slug,
                    "reference": hadiths[idx].get('reference', ''),
                    "text": hadiths[idx].get('text', ''),
                    "arabic": hadiths[idx].get('arabic', ''),
                    "score": float(score)
                })

    # Sort by score and return top_k results
    results = sorted(results, key=lambda x: x['score'], reverse=True)[:top_k]
    return results

# API endpoint for search
@app.route('/search', methods=['GET'])
def search():
    query = request.args.get('query', '').strip()
    books_param = request.args.get('books', '')
    top_k = int(request.args.get('top_k', 5))
    selected_books = books_param.split(',') if books_param else list(books.keys())

    if not query:
        return jsonify({"error": "Query parameter is required"}), 400

    # Validate selected books
    invalid_books = [b for b in selected_books if b not in books]
    if invalid_books:
        return jsonify({"error": f"Invalid book slugs: {', '.join(invalid_books)}"}), 400

    try:
        print(f"Performing semantic search for query: '{query}'...")
        results = semantic_search(query, selected_books, top_k)
        return jsonify({"results": results})
    except Exception as e:
        return jsonify({"error": f"Search failed: {str(e)}"}), 500

# Start the Flask server
if __name__ == '__main__':
    # Start ngrok tunnel
    public_url = ngrok.connect(5000)
    print(f"Public URL: {public_url}")

    # Run Flask app
    app.run(port=5000)

In [4]:
# Install necessary libraries
!pip install flask pyngrok sentence-transformers

import flask
from flask import Flask, request, jsonify
import json
import os
from sentence_transformers import SentenceTransformer, util
from pyngrok import ngrok

# Define the Flask app
app = Flask(__name__)

# Paths to local JSON files
books = {
    "bukhari": "content/bukhari.json",
    "muslim": "content/muslim.json",
    "malik": "content/malik.json",
    "darimi": "content/darimi.json",
    "tirmidhi": "content/tirmidhi.json",
    "ibnulmaja": "content/ibnulmaja.json",
}

# Cache for Hadith data
hadith_cache = {}

# Load and cache Hadith data from local files
def load_hadith_data(book_slug):
    if book_slug not in hadith_cache:
        file_path = books.get(book_slug)
        if not file_path or not os.path.exists(file_path):
            raise Exception(f"File for {book_slug} not found at {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                hadith_cache[book_slug] = json.load(f)
        except Exception as e:
            raise Exception(f"Failed to load data for {book_slug}: {str(e)}")
    return hadith_cache[book_slug]

# Load semantic search model
print("Loading semantic search model (this may take a moment)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("Model loaded successfully.")

# Semantic search function
def semantic_search(query, selected_books, top_k=5):
    results = []
    query_embedding = model.encode(query, convert_to_tensor=True)

    for book_slug in selected_books:
        book_data = load_hadith_data(book_slug)
        hadiths = book_data.get('hadiths', [])

        # Combine text and arabic for encoding
        texts = [f"{h['text']} {h['arabic']}" for h in hadiths]
        embeddings = model.encode(texts, convert_to_tensor=True)

        # Compute cosine similarities
        similarities = util.cos_sim(query_embedding, embeddings)[0]

        # Pair hadiths with their scores
        for idx, score in enumerate(similarities):
            if idx < len(hadiths):
                results.append({
                    "book": book_slug,
                    "reference": hadiths[idx].get('reference', ''),
                    "text": hadiths[idx].get('text', ''),
                    "arabic": hadiths[idx].get('arabic', ''),
                    "score": float(score)
                })

    # Sort by score and return top_k results
    results = sorted(results, key=lambda x: x['score'], reverse=True)[:top_k]
    return results

# API endpoint for search
@app.route('/search', methods=['GET'])
def search():
    query = request.args.get('query', '').strip()
    books_param = request.args.get('books', '')
    top_k = int(request.args.get('top_k', 5))
    selected_books = books_param.split(',') if books_param else list(books.keys())

    if not query:
        return jsonify({"error": "Query parameter is required"}), 400

    # Validate selected books
    invalid_books = [b for b in selected_books if b not in books]
    if invalid_books:
        return jsonify({"error": f"Invalid book slugs: {', '.join(invalid_books)}"}), 400

    try:
        print(f"Performing semantic search for query: '{query}'...")
        results = semantic_search(query, selected_books, top_k)
        return jsonify({"results": results})
    except Exception as e:
        return jsonify({"error": f"Search failed: {str(e)}"}), 500


        # Start the Flask server
if __name__ == '__main__':
    # Start ngrok tunnel
    # Replace "YOUR_AUTHTOKEN" with the actual authtoken from your ngrok dashboard
    # This line configures pyngrok with your authentication token
    ngrok.set_auth_token("2wwoXpT4cJkm5oFI6tbeH3zYJPR_871PxxvpSbr1aXdwxGY3e")

    public_url = ngrok.connect(5000)
    print(f"Public URL: {public_url}")

    # Run Flask app
    app.run(port=5000)


Loading semantic search model (this may take a moment)...
Model loaded successfully.
Public URL: NgrokTunnel: "https://1c3a-34-80-58-154.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [21/May/2025 16:43:07] "[35m[1mGET /search?query=prayer HTTP/1.1[0m" 500 -


Performing semantic search for query: 'prayer'...


INFO:werkzeug:127.0.0.1 - - [21/May/2025 16:43:08] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [21/May/2025 16:43:11] "[35m[1mGET /search?query=prayer HTTP/1.1[0m" 500 -


Performing semantic search for query: 'prayer'...


INFO:werkzeug:127.0.0.1 - - [21/May/2025 16:44:11] "[31m[1mPOST /search?query=prayer HTTP/1.1[0m" 405 -
INFO:werkzeug:127.0.0.1 - - [21/May/2025 16:44:19] "[33mPOST / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [21/May/2025 17:12:36] "[33mGET / HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [21/May/2025 17:12:51] "[35m[1mGET /search?query=prayer HTTP/1.1[0m" 500 -


Performing semantic search for query: 'prayer'...




In [8]:
from sentence_transformers import SentenceTransformer

# Download and save the model locally
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
model.save('./local_model')  # Save the model to a local folder
print(f"Model '{model_name}' saved to './semanticmdl'")


Model 'all-MiniLM-L6-v2' saved to './semanticmdl'


In [9]:
import re
import requests
import torch
from sentence_transformers import SentenceTransformer, util

def fetch_data(url):
    """
    Fetch data from a raw GitHub URL.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print("Error: Failed to retrieve data.")
        return None

def semantic_search(data, query, model, top_k=5):
    """
    Perform semantic search over the text.
    Splits the data into non-empty lines, computes embeddings,
    and returns the top_k lines most similar to the query.
    """
    lines = [line for line in data.splitlines() if line.strip() != ""]
    embeddings = model.encode(lines, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)

    results = []
    for score, idx in zip(top_results[0], top_results[1]):
        results.append((lines[idx], score.item()))
    return results

def extract_surah_info(data, surah_number):
    """
    Extract surah information from the markdown data.
    Captures the header (in the form '# <number>') that matches the surah_number
    and all subsequent lines until the next such header is encountered.
    """
    lines = data.splitlines()
    results = []
    capture = False
    header_pattern = re.compile(r"^#\s*(\d+)")

    for line in lines:
        line_stripped = line.strip()
        if line_stripped.startswith("#"):
            match = header_pattern.match(line_stripped)
            if match:
                num = match.group(1)
                if num == surah_number:
                    capture = True
                    results.append(line_stripped)
                    continue
                elif capture:
                    # We've already captured our surah and now hit the next surah header.
                    break
        elif capture:
            results.append(line)
    return results

def main():
    print("Select what you want to search:")
    print("1: Ayats (Uthmani text)")
    print("2: English Translation (Sahih)")
    print("3: Urdu Translation (Qadri)")
    print("4: Surah Information (Select by Surah Number)")

    option = input("Enter option number (1/2/3/4): ").strip()

    # Define the raw GitHub URLs for each option
    if option == "1":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-text/uthmani-tanzil.md"
    elif option == "2":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/en-sahih-tanzil.md"
    elif option == "3":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/ur-qadri-tanzil.md"
    elif option == "4":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/surah-info/ur-qurancom.md"
    else:
        print("Invalid option!")
        return

    print("\nFetching data, please wait...")
    data = fetch_data(data_url)
    if not data:
        return

    if option == "4":
        # Surah Information extraction
        while True:
            surah_number = input("Enter Surah number (1-114): ").strip()
            if surah_number.isdigit() and 1 <= int(surah_number) <= 114:
                break
            else:
                print("Invalid surah number. Please enter a number between 1 and 114.")

        info = extract_surah_info(data, surah_number)
        if info and len(info) > 1:
            print("\nSurah Information:")
            print("\n".join(info))
        else:
            # If only the header is found or nothing extra is available, print a message.
            print("No detailed information found for Surah " + surah_number)
    else:
        # For Ayats or translations, perform semantic search.
        query = input("Enter your search query: ").strip()
        print("\nLoading semantic search model (this may take a moment)...")
        model = SentenceTransformer('/content/local_model')
        print("Performing semantic search, please wait...")
        results = semantic_search(data, query, model, top_k=5)
        print("\nSemantic search results:")
        for line, score in results:
            print(f"Score: {score:.2f} -> {line}")

if __name__ == "__main__":
    main()


Select what you want to search:
1: Ayats (Uthmani text)
2: English Translation (Sahih)
3: Urdu Translation (Qadri)
4: Surah Information (Select by Surah Number)
Enter option number (1/2/3/4): 2

Fetching data, please wait...
Enter your search query: allah

Loading semantic search model (this may take a moment)...
Performing semantic search, please wait...

Semantic search results:
Score: 0.75 -> Allah, the Eternal Refuge.
Score: 0.71 -> Allah - there is no deity except Him, Lord of the Great Throne."
Score: 0.69 -> Allah, to whom belongs whatever is in the heavens and whatever is on the earth. And woe to the disbelievers from a severe punishment
Score: 0.68 -> Allah - there is no deity except Him. And upon Allah let the believers rely.
Score: 0.66 -> Allah - there is no deity except Him, the Ever-Living, the Sustainer of existence.


In [None]:
import re
import requests
import torch
from sentence_transformers import SentenceTransformer, util

def fetch_data(url):
    """
    Fetch data from a raw GitHub URL.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print("Error: Failed to retrieve data.")
        return None

def semantic_search(data, query, model, top_k=5):
    """
    Perform semantic search over the text.
    Splits the data into non-empty lines, computes embeddings,
    and returns the top_k lines most similar to the query.
    """
    lines = [line for line in data.splitlines() if line.strip() != ""]
    embeddings = model.encode(lines, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)

    results = []
    for score, idx in zip(top_results[0], top_results[1]):
        results.append((lines[idx], score.item()))
    return results

def extract_surah_info(data, surah_number):
    """
    Extract surah information from the markdown data.
    Captures the header (in the form '# <number>') that matches the surah_number
    and all subsequent lines until the next such header is encountered.
    """
    lines = data.splitlines()
    results = []
    capture = False
    header_pattern = re.compile(r"^#\s*(\d+)")

    for line in lines:
        line_stripped = line.strip()
        if line_stripped.startswith("#"):
            match = header_pattern.match(line_stripped)
            if match:
                num = match.group(1)
                if num == surah_number:
                    capture = True
                    results.append(line_stripped)
                    continue
                elif capture:
                    # We've already captured our surah and now hit the next surah header.
                    break
        elif capture:
            results.append(line)
    return results

def main():
    print("Select what you want to search:")
    print("1: Ayats (Uthmani text)")
    print("2: English Translation (Sahih)")
    print("3: Urdu Translation (Qadri)")
    print("4: Surah Information (Select by Surah Number)")

    option = input("Enter option number (1/2/3/4): ").strip()

    # Define the raw GitHub URLs for each option
    if option == "1":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-text/uthmani-tanzil.md"
    elif option == "2":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/en-sahih-tanzil.md"
    elif option == "3":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/ur-qadri-tanzil.md"
    elif option == "4":
        data_url = "https://raw.githubusercontent.com/hablullah/data-quran/master/surah-info/ur-qurancom.md"
    else:
        print("Invalid option!")
        return

    print("\nFetching data, please wait...")
    data = fetch_data(data_url)
    if not data:
        return

    if option == "4":
        # Surah Information extraction
        while True:
            surah_number = input("Enter Surah number (1-114): ").strip()
            if surah_number.isdigit() and 1 <= int(surah_number) <= 114:
                break
            else:
                print("Invalid surah number. Please enter a number between 1 and 114.")

        info = extract_surah_info(data, surah_number)
        if info and len(info) > 1:
            print("\nSurah Information:")
            print("\n".join(info))
        else:
            # If only the header is found or nothing extra is available, print a message.
            print("No detailed information found for Surah " + surah_number)
    else:
        # For Ayats or translations, perform semantic search.
        query = input("Enter your search query: ").strip()
        print("\nLoading semantic search model (this may take a moment)...")
        model = SentenceTransformer('/content/local_model')
        print("Performing semantic search, please wait...")
        results = semantic_search(data, query, model, top_k=5)
        print("\nSemantic search results:")
        for line, score in results:
            print(f"Score: {score:.2f} -> {line}")

if __name__ == "__main__":
    main()


In [12]:
!pip install fastapi uvicorn nest-asyncio pyngrok sentence-transformers torch requests




In [13]:
%%writefile app.py

import re
import requests
import torch
from fastapi import FastAPI, Query
from sentence_transformers import SentenceTransformer, util
from pydantic import BaseModel
from typing import List

app = FastAPI()

model = SentenceTransformer('/content/local_model')  # Make sure your model is here

DATA_URLS = {
    "1": "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-text/uthmani-tanzil.md",
    "2": "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/en-sahih-tanzil.md",
    "3": "https://raw.githubusercontent.com/hablullah/data-quran/master/ayah-translation/ur-qadri-tanzil.md",
    "4": "https://raw.githubusercontent.com/hablullah/data-quran/master/surah-info/ur-qurancom.md"
}


def fetch_data(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None


def semantic_search(data, query, model, top_k=5):
    lines = [line for line in data.splitlines() if line.strip() != ""]
    embeddings = model.encode(lines, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    cosine_scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cosine_scores, k=top_k)
    results = []
    for score, idx in zip(top_results[0], top_results[1]):
        results.append({"line": lines[idx], "score": round(score.item(), 4)})
    return results


def extract_surah_info(data, surah_number):
    lines = data.splitlines()
    results = []
    capture = False
    header_pattern = re.compile(r"^#\s*(\d+)")
    for line in lines:
        line_stripped = line.strip()
        if line_stripped.startswith("#"):
            match = header_pattern.match(line_stripped)
            if match:
                num = match.group(1)
                if num == surah_number:
                    capture = True
                    results.append(line_stripped)
                    continue
                elif capture:
                    break
        elif capture:
            results.append(line)
    return results


class SearchRequest(BaseModel):
    option: str
    query: str
    top_k: int = 5


@app.get("/")
def root():
    return {"message": "Quran Semantic Search API is running."}


@app.post("/search")
def search(req: SearchRequest):
    if req.option not in ["1", "2", "3"]:
        return {"error": "Invalid option. Choose from 1, 2, 3."}
    data = fetch_data(DATA_URLS[req.option])
    if not data:
        return {"error": "Failed to fetch data."}
    results = semantic_search(data, req.query, model, req.top_k)
    return {"results": results}


@app.get("/surah-info")
def get_surah_info(surah_number: int = Query(..., ge=1, le=114)):
    data = fetch_data(DATA_URLS["4"])
    if not data:
        return {"error": "Failed to fetch Surah info."}
    results = extract_surah_info(data, str(surah_number))
    return {"surah_number": surah_number, "info": results}


Writing app.py


In [14]:
import nest_asyncio
import uvicorn
from pyngrok import ngrok

# Allow nested event loops in Colab
nest_asyncio.apply()

# Create public URL
public_url = ngrok.connect(8000)
print("Public URL:", public_url)

# Run the FastAPI app
uvicorn.run("app:app", host="0.0.0.0", port=8000)


Public URL: NgrokTunnel: "https://0e86-34-80-58-154.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [150]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     182.190.160.19:0 - "GET / HTTP/1.1" 200 OK
INFO:     182.190.160.19:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     182.190.160.19:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     182.190.160.19:0 - "GET /openapi.json HTTP/1.1" 200 OK


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at /usr/local/lib/python3.11/dist-packages/uvicorn/server.py:68> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/main.py", line 580, in run
    server.run()
  File "/usr/local/lib/python3.11/dist-packages/uvicorn/server.py", line 66, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run
    se

INFO:     182.190.160.19:0 - "POST /search HTTP/1.1" 200 OK
INFO:     182.190.160.19:0 - "GET /%27%3B HTTP/1.1" 404 Not Found
INFO:     182.190.160.19:0 - "GET /%27%3B HTTP/1.1" 404 Not Found


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [150]
