In [None]:
"""
A two-stage program for answering Islam-related questions by leveraging a large language model and searching through authoritative Islamic sources.

The program first accepts a question, determines its language, and generates optimized search queries in Arabic for multiple sources, including Hadith, Quran, and the Encyclopedia of Islamic Jurisprudence (Mawsuah). It does not provide a direct answer at this stage.

In the second stage, the program utilizes the original question, its language, and the search results from the selected sources to generate a factually based final answer. The answer is provided in the same language as the input question and begins by proofreading the question without altering its semantic meaning. The program exclusively follows the Sunni tradition and cites relevant classical scholars, as well as modern scholars to ensure a comprehensive and accurate response.
"""


In [None]:
# TODO:
# 2. adapt the current signature descriptions and answer requirements to the original Ansari prompt
# 3. put the whole thing in one dspy.Module
# 4. provide ~50 example questions
# 5. define a metric for evaluation
# 6. run COPRO or MIPRO optimization

In [None]:
from env_vars import *

In [None]:
from enum import Enum
from pprint import pprint

import dspy
from pydantic import BaseModel, Field

from tools.search_hadith import SearchHadith
from tools.search_quran import SearchQuran
from tools.search_mawsuah import SearchMawsuah

In [None]:
# Stage 1
# Sources that stage 1 can request a search for. The `str` mixin makes every
# member compare equal to its plain string value.
# NOTE(review): documented with comments rather than a class docstring on
# purpose; pydantic may copy an enum docstring into the generated JSON schema,
# which presumably ends up in the LLM prompt. Confirm before converting.
class SearchSource(str, Enum):
    HADITH = "Hadith"
    QURAN = "Quran"
    # Encyclopedia of Islamic Jurisprudence.
    MAWSUAH = "Mawsuah"
    # No search tool is wired to this member in process_question; its result
    # list therefore always stays empty.
    NONE = "None"

class Stage1InputQuery(BaseModel):
    # Input payload of stage 1: just the user's question as typed.
    # Kept as comments (not a docstring) so the model's schema is unchanged.
    query: str = Field(
        description="Islam-related question",
    )

class Stage1OutputResult(BaseModel):
    # Output payload of stage 1: the detected language of the question plus
    # one search query per source, keyed by SearchSource.
    # Kept as comments (not a docstring) so the model's schema is unchanged.
    language: str = Field(
        description="Language of the question",
    )
    search_queries: dict[SearchSource, str] = Field(
        description="Dictionary containing search queries for each source in Arabic",
    )


# dspy signature for stage 1: question in, language + per-source search
# queries out.
# NOTE: in dspy the signature docstring serves as the task instructions, so the
# docstring below is prompt text; editing it changes what the model is asked.
class GenerateStage1Output(dspy.Signature):
    """Determine search requirements and generate search queries for multiple Islamic sources based on a given question.

    Accept an Islam-related question, which may not be in Arabic, and generate optimized search queries in Arabic for each required source.
    """
    # The field names `input` / `output` are part of the calling convention:
    # process_question passes `input=` and reads `.output`.
    input: Stage1InputQuery = dspy.InputField()
    output: Stage1OutputResult = dspy.OutputField()


In [None]:
# Stage 2
class Stage2InputData(BaseModel):
    # Input payload of stage 2: the original question, the language detected
    # in stage 1, and the raw search hits grouped by SearchSource.
    # Kept as comments (not a docstring) so the model's schema is unchanged.
    question: str = Field(
        description="An Islam-related question",
    )
    language: str = Field(
        description="Language of the question",
    )
    search_results: dict[SearchSource, list[str]] = Field(
        description="Dictionary containing search results for each source",
    )

class Stage2OutputAnswer(BaseModel):
    # Output payload of stage 2: the final answer text.
    # The description doubles as formatting instructions for the model. It is
    # split into adjacent literals (one sentence each) purely for readability;
    # the concatenated string is unchanged.
    answer: str = Field(
        description=(
            "Final answer to the question in the same language as the question, based on the relevant search results from Hadith, Quran, or Mawsuah. "
            "The answer is concise, evidence-based, and cites classical and modern scholars from the Sunni tradition. "
            "When presenting the Qur'an, include the ayah number, Arabic text, and translation (if the user's language is different from Arabic). "
            "Only use hadith that are the result of the search. "
            "If the hadith is from the search results, present it with the collection, LK id, text, and grade. "
            "Otherwise, present the hadith as 'I believe (though not 100% sure of the reference) there is a hadith that says: [text of hadith]'."
        ),
    )

# dspy signature for stage 2: question + language + search results in, final
# answer out.
# NOTE: in dspy the signature docstring serves as the task instructions, so the
# docstring below is prompt text; editing it changes what the model is asked.
class GenerateStage2FinalAnswer(dspy.Signature):
    """Produce a factually based final answer to an Islam-related question using search results from multiple sources.

    Accept an input containing the original question, its language, and the search results from Hadith, Quran, and Mawsuah. Begin by proofreading the question without altering its semantic meaning. Then, provide a concise, evidence-based answer that exclusively follows the Sunni tradition, citing relevant classical scholars such as Al Ghazali, Ibn Al Qayyim, Ibn Taymiyah, Imam Shafiee, Imam Nawawi, Imam Abu Hanifah, Ibn Hajr al Asqalani, Imam Ahmad bin Hanbal, Imam Malik, and Ibn Hazm, as well as modern scholars like Yusuf Al Qaradawi, Yasir Qadhi, Ma'in Al Qudah, Shu'aib Al Arnaout, Hamza Yusuf, Zaid Shakir, Taqiuddin Usmani, Muhammad Shinqeeti, Ismail Menk, Omar Suleiman, Salman Al-Awdah, Jamaaluddin Zarabozo, and Yaser Birjas. Be particularly careful about matters that are obligatory or prohibited, ensuring that evidences directly support the assertion. Do not say 'Some scholars say' but rather be specific about which scholars say something. Utilize search results from multiple sources as necessary to ensure a comprehensive and accurate response.
    """

    # The field names `input` / `output` are part of the calling convention:
    # process_question passes `input=` and reads `.output`.
    input: Stage2InputData = dspy.InputField()
    output: Stage2OutputAnswer = dspy.OutputField()

In [None]:
def process_question(question: str, llm_gpt4, stage1_predictor, hs, qs, ms, stage2_predictor):
    """Answer ``question`` with the two-stage search-then-answer pipeline.

    Stage 1 asks ``stage1_predictor`` for the language of the question and one
    search query per source, then sends each query to the matching search tool.
    Stage 2 passes the question, the language and the collected results to
    ``stage2_predictor`` and returns the answer text it produced.

    Args:
        question: The user's question.
        llm_gpt4: Language model activated through ``dspy.context`` for both
            stages (whatever LM the caller passes is used, regardless of the
            parameter name).
        stage1_predictor: Called as ``stage1_predictor(input=Stage1InputQuery)``;
            its result must expose ``.output.language`` and
            ``.output.search_queries``.
        hs: Search tool used for ``SearchSource.HADITH`` queries.
        qs: Search tool used for ``SearchSource.QURAN`` queries.
        ms: Search tool used for ``SearchSource.MAWSUAH`` queries.
        stage2_predictor: Called as ``stage2_predictor(input=Stage2InputData)``;
            its result must expose ``.output.answer``.

    Returns:
        The ``answer`` attribute of the stage 2 prediction output.
    """
    # Which tool serves which source. SearchSource.NONE deliberately has no
    # entry, so a query emitted for it is ignored.
    tool_by_source = {
        SearchSource.HADITH: hs,
        SearchSource.QURAN: qs,
        SearchSource.MAWSUAH: ms,
    }

    # Stage 1: detect the language and generate the search queries
    # (no answer is produced at this point).
    with dspy.context(lm=llm_gpt4):
        stage1_prediction = stage1_predictor(input=Stage1InputQuery(query=question))
    stage1_output = stage1_prediction.output

    # Every source starts with an empty hit list so stage 2 always receives a
    # key for each SearchSource member, searched or not.
    results_by_source = {}
    for source in SearchSource:
        results_by_source[source] = []

    # Run the searches in the order the queries were produced.
    for source, arabic_query in stage1_output.search_queries.items():
        if source in tool_by_source:
            results_by_source[source] = tool_by_source[source].run_as_list(arabic_query)

    # Stage 2: write the final answer from the question and the search hits.
    with dspy.context(lm=llm_gpt4):
        stage2_prediction = stage2_predictor(
            input=Stage2InputData(
                question=question,
                language=stage1_output.language,
                search_results=results_by_source,
            )
        )
    return stage2_prediction.output.answer

In [None]:
# One typed predictor per stage; both share the same retry settings, so they
# are built from a single expression (stage 1 first, then stage 2).
stage1_predictor, stage2_predictor = (
    dspy.TypedPredictor(signature, max_retries=5, explain_errors=True)
    for signature in (GenerateStage1Output, GenerateStage2FinalAnswer)
)

In [None]:
# OpenAI-backed language models. The credential names used in this cell are
# not defined in the notebook itself; presumably they come from the
# `env_vars` star import.
llm_gpt4 = dspy.OpenAI(
    model="gpt-4-turbo-2024-04-09",
    api_key=openai_key,
)
llm_gpt3 = dspy.OpenAI(
    model="gpt-3.5-turbo-0125",
    api_key=openai_key,
)
llm_gpt4o = dspy.OpenAI(
    model="gpt-4o-2024-05-13",
    api_key=openai_key,
)

# Search tools: Mawsuah uses the Vectara credentials, while Hadith and Quran
# share the same Kalimat API key.
ms = SearchMawsuah(
    auth_token=VECTARA_AUTH_TOKEN,
    customer_id=VECTARA_CUSTOMER_ID,
    corpus_id=VECTARA_CORPUS_ID,
)
hs = SearchHadith(
    api_key=kalimat_api_key,
)
qs = SearchQuran(
    api_key=kalimat_api_key,
)

In [None]:
# TODO: convert the process_question function into a dspy.Module
# TODO: the generated queries are not good: they mention the source name inside the query text and they don't maximize retrieval recall. Ask what makes a good query, then update the description accordingly
# TODO: the final answer format is not working; do something about it

In [None]:
# Smoke test with a Turkish question (ASCII-folded; roughly "Is it obligatory
# for women to wear a headscarf?").
question = "kadinlarin basortusu takmalari sart mi?"
final_answer = process_question(
    question=question,
    llm_gpt4=llm_gpt4o,  # the parameter is named llm_gpt4, but gpt-4o is what runs here
    stage1_predictor=stage1_predictor,
    hs=hs,
    qs=qs,
    ms=ms,
    stage2_predictor=stage2_predictor,
)
pprint(final_answer)

In [None]:
# Debugging aid: show the prompt/completion history recorded by the gpt-4o
# client that the run above used.
# NOTE(review): called without arguments, so presumably only the most recent
# call is shown; confirm against the dspy version in use.
llm_gpt4o.inspect_history()