In [1]:
! pip3 install -qU python-dotenv PyPDF2 langchain langchain-community langchain-core langchain-text-splitters langchain_upstage oracledb python-dotenv langchain-upstage

In [2]:
! pip install wikipedia-api



# Import Libraries

In [3]:
import numpy as np
import pandas as pd
import wikipediaapi
import os
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# langchain_upstage
from langchain_upstage import UpstageLayoutAnalysisLoader
from langchain_upstage import UpstageEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_upstage import ChatUpstage
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

API KEY

In [None]:
# Read the Upstage API key from the UPSTAGE_API_KEY environment variable so a real
# key never has to be saved in the notebook; the placeholder stays as the fallback.
api_key = os.environ.get("UPSTAGE_API_KEY", "YOUR_API")

# Data Preprocessing

## Load Data

In [None]:
# NOTE(review): the cells visible in this notebook open their files by bare file
# name and do not read data_path — confirm whether it is still needed.
data_path = "YOUR_PATH" # folder path containing necessary files

# Name under which the key is handed to the Upstage loaders, embeddings and chat model.
UPSTAGE_API_KEY = api_key

In [6]:
# Parse ewha.pdf with the Upstage layout-analysis loader, requesting plain-text output.
layzer = UpstageLayoutAnalysisLoader(
    file_path='ewha.pdf',
    output_type="text",
    api_key=UPSTAGE_API_KEY,
)
docs = layzer.load()



In [7]:
# Parse law.pdf with the Upstage layout-analysis loader, requesting plain-text output.
layzer_law = UpstageLayoutAnalysisLoader(
    file_path='law.pdf',
    output_type="text",
    api_key=UPSTAGE_API_KEY,
)
docs_law = layzer_law.load()

In [11]:
# Parse business-math.pdf with the Upstage layout-analysis loader, requesting plain-text output.
layzer_business = UpstageLayoutAnalysisLoader(
    file_path='business-math.pdf',
    output_type="text",
    api_key=UPSTAGE_API_KEY,
)
docs_busi = layzer_business.load()

In [14]:
# Parse Psychology.pdf with the Upstage layout-analysis loader, requesting plain-text output.
lazer_psychology = UpstageLayoutAnalysisLoader(
    file_path='Psychology.pdf',
    output_type="text",
    api_key=UPSTAGE_API_KEY,
)
docs_psy = lazer_psychology.load()

In [8]:
# Parse philosophy.pdf with the Upstage layout-analysis loader, requesting plain-text output.
lazer_philosophy = UpstageLayoutAnalysisLoader(
    file_path='philosophy.pdf',
    output_type="text",
    api_key=UPSTAGE_API_KEY,
)
docs_phil = lazer_philosophy.load()

## Load sample data

In [15]:
def read_data(data_path):
    """Load the evaluation set from a CSV file.

    Args:
        data_path: Path of a CSV file that has ``prompts`` and ``answers`` columns.

    Returns:
        A ``(prompts, answers)`` pair holding those two columns of the loaded frame.
    """
    frame = pd.read_csv(data_path)
    return frame['prompts'], frame['answers']

In [16]:
# Questions and their reference answers from the test set in the working directory.
prompts, answers = read_data('./testset.csv')

## Split Data

In [17]:
# Chunking: 1000-character chunks that overlap by 100 characters.
# NOTE(review): the loaders request output_type="text" while the splitter is built
# with Language.HTML separators — confirm this pairing is intended.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML,
    chunk_size=1000,
    chunk_overlap=100,
)

# One list of chunks per loaded PDF, in the order the PDFs were loaded above.
splits, splits_law, splits_busi, splits_psy, splits_phil = (
    text_splitter.split_documents(loaded_docs)
    for loaded_docs in (docs, docs_law, docs_busi, docs_psy, docs_phil)
)

In [18]:
# Report how many chunks each document produced.
for label, chunks in (
    ("Splits:", splits),
    ("Splits_law:", splits_law),
    ("Splits_business:", splits_busi),
    ("Splits_psychology:", splits_psy),
    ("Splits_philosophy:", splits_phil),
):
    print(label, len(chunks))

Splits: 53
Splits_law: 245
Splits_business: 2242
Splits_psychology: 1602
Splits_philosophy: 476


## Embedding

In [19]:
# Upstage embedding clients: one model for passages (document chunks, keyword
# lists) and a separate one for queries (prompts).
passage_embeddings = UpstageEmbeddings(
    model="solar-embedding-1-large-passage",
    api_key=UPSTAGE_API_KEY,
)
query_embeddings = UpstageEmbeddings(
    model="solar-embedding-1-large-query",
    api_key=UPSTAGE_API_KEY,
)

In [20]:
# Pull the raw text out of every chunk, producing one list of strings per document.
split_texts, split_texts_law, split_texts_busi, split_texts_psy, split_texts_phil = (
    [chunk.page_content for chunk in chunks]
    for chunks in (splits, splits_law, splits_busi, splits_psy, splits_phil)
)

In [21]:
# Embeddings of extracted contents
# Each call embeds one document's chunk texts with the passage model; the results
# are index-aligned with the corresponding splits_* list, which the retrieval
# functions rely on when mapping top indices back to chunk text.
split_embeddings = passage_embeddings.embed_documents(split_texts)
split_embeddings_law = passage_embeddings.embed_documents(split_texts_law)
split_embeddings_busi = passage_embeddings.embed_documents(split_texts_busi)
split_embeddings_psy = passage_embeddings.embed_documents(split_texts_psy)
split_embeddings_phil = passage_embeddings.embed_documents(split_texts_phil)

In [22]:
# Embedding of queries: embed_query takes a single string, so every prompt is
# embedded on its own (the same call the response loop makes per prompt).
prompt_embeddings = [query_embeddings.embed_query(prompt) for prompt in prompts]

## Domain Embedding

In [23]:
# Related keywords for each domain

# Keywords describing the law domain. Each entry is embedded separately and the
# prompt's mean similarity to all entries decides whether the domain is chosen.
# Fixed: a missing comma used to fuse 'innoscent' and 'bill' into one bogus
# keyword; 'innoscent' and 'rule of raw' were misspellings.
law_words = ['law', 'jurisprudence', 'constitution', 'state', 'canon law', 'civil law', 'common law', 'natural law', 'rule', 'legislation',
             'lawyer', 'police', 'practice of law', 'public law', 'case law', 'justice', 'court', 'recusation', 'law enforcement', 'law of nature',
             'sharia law', 'regulation', 'international law', 'legal profession', 'torah', 'rule of law', 'sharia', 'tort', 'precedent',
             'old testament', 'courts', 'derogation', 'lawmaking', 'impoundment', 'code of hammurabi', 'guilty', 'non guilty', 'innocent',
             'bill', 'statute', 'ordinance', 'amendment', 'constitution', 'enactment', 'prohibition', 'defendant', 'man', 'woman', 'contract',
             'friend', 'admissible', 'owner', 'company', 'evidence', 'property', 'inadmissible', 'car', 'right', 'trial', 'tenant', 'federal', 'murder',
             'prevail', 'buyer', 'city', 'victim', 'attorney', 'son', 'jury', 'land', 'statement', 'recover', 'store', 'neighbor', 'homeowner', 'student',
             'testimony', 'case', 'witness', 'wife', 'plaintiff', 'rights', 'defense', 'deed', 'divorce']

In [24]:
# Keywords describing the psychology domain; embedded entry by entry further below.
# 'psychology' and 'cognitive' each appear twice, so they count twice in the
# mean similarity that find_domain computes over this list.
psy_words = ['psychology', 'cognitive science', 'physiology', 'neurobiology', 'psychologist', 'social science', 'psychopathology', 'developmental psychology',
             'humanities', 'psychiatry', 'emotion', 'behaviour', 'psychoanalysis', 'clinical psychology', 'ethology', 'applied psychology',
             'sociology', 'neuroscience', 'phrenology', 'psychotherapy', 'neuropsychology', 'experimental psychology', 'phenomenology',
             'anthropology', 'linguistics', 'biophysics', 'biology', 'comparative psychology', 'theory', 'therapy', 'test', 'disorder', 'learning', 'client',
             'intelligence', 'cognitive', 'stimulus', 'memory', 'group', 'personality', 'social', 'development', 'validity', 'conditioning', 'time', 'research',
             'self', 'response', 'language', 'children', 'information', 'reinforcement', 'job', 'study', 'individual', 'age', 'performance', 'person',
             'aggression', 'disorders', 'ability', 'emotional', 'patient', 'psychology', 'mode', 'sample', 'anxiety',
             'mental', 'cognitive', 'cerebral', 'internal', 'intellectual', 'inner', 'conscious', 'psychic', 'thinking']

In [25]:
# Keywords describing the business domain; embedded entry by entry further below.
# 'stock' appears twice, so it counts twice in the mean similarity that
# find_domain computes over this list.
busi_words = ['business', 'jurisdiction', 'shareholder', 'management', 'stock', 'asset', 'manufacturing', 'finance', 'marketing', 'product',
              'income', 'property', 'price', 'year', 'rate', 'cost', 'years', 'days', 'sales', 'annual', 'value', 'company', 'total', 'month',
              'share', 'stock', 'pay', 'net', 'discount', 'purchased', 'paid', 'selling', 'tax', 'monthly', 'percent', 'policy', 'loan', 'hours',
              'week', 'markup', 'day', 'plan', 'market', 'inventory', 'payment', 'installment', 'store', 'charge', 'discounted', 'sold', 'compounded']

In [26]:
# Keywords describing the philosophy domain; embedded entry by entry further below.
# 'logic' and 'ethics' each appear twice, so they count twice in the mean
# similarity that find_domain computes over this list.
phil_words = ['philosophy', 'metaphysics', 'humanism', 'ethics', 'dualism', 'logic', 'plato', 'aristotle', 'aesthetics', 'kant', 'singer',
              'epistemology', 'theology', 'philosopher', 'immanuel kant', 'rationalism', 'creed', 'Descartes', 'Socrates', 'Spinoza', 'Nietzsche',
              'john locke', 'theory', 'dialectic', 'naturalism', 'marxism', 'doctrine', 'thomas hobbes', 'pragmatism', 'socratic method',
              'formalism', 'belief', 'realism', 'thomas aquinas', 'francis bacon', 'hume', 'hegel', 'marx', 'heidegger', 'moral', 'fallacy', 'argument', 'invalid',
              'person', 'appeal', 'human', 'good', 'claims', 'valid', 'logic', 'life', 'based', 'translation', 'predicate', 'select', 'right', 'consistent', 'action',
              'conclusion', 'people', 'arguing', 'personal', 'actions', 'man', 'reasoning', 'rights', 'virtue', 'straw', 'self', 'truth', 'slope', 'claim', 'justice',
              'evidence', 'ethics', 'happiness', 'natural']

In [27]:
# Keywords describing the history domain; embedded entry by entry further below.
# 'ago', 'ancient' and 'century' each appear twice, so they count twice in the
# mean similarity that find_domain computes over this list.
his_words = ['history', 'story', 'chronicle', 'historiography', 'account', 'past', 'annals', 'prehistory', 'renaissance', 'historical', 'record',
             'culture', 'antiquity', 'geography', 'saracen', 'humanities', 'book', 'life', 'case history', 'narrative', 'historic period',
             'renascence', 'etymology', 'herodotus', 'thucydides', 'philosophy', 'prehistoric','art', 'ancient greek', 'ago', 'ancient',
             'career', 'society', 'literature', 'century', 'origins', 'modern', 'homo', 'emperor', 'australopithecus', 'afarensis', 'people',
             'war', 'years', 'states', 'king', 'america', 'information', 'new', 'united', 'ago', 'government', 'world', 'great', 'religious',
             'europe', 'africa', 'american', 'trade', 'rights', 'century', 'power', 'ancient', 'political', 'man', 'state', 'civil', 'south', 'men',
             'act', 'chinese', 'china', 'north', 'right', 'peace', 'complex', 'human', 'religion', 'time', 'movement', 'period', 'social', 'development', 'dynasty']

In [28]:
# Embeddings for each domain
# Every keyword is embedded as its own "document" with the passage model, giving
# one vector per keyword; these lists become the values of domain_embeddings_dict.
law_ebd = passage_embeddings.embed_documents(law_words)
psy_ebd = passage_embeddings.embed_documents(psy_words)
busi_ebd = passage_embeddings.embed_documents(busi_words)
phil_ebd = passage_embeddings.embed_documents(phil_words)
his_ebd = passage_embeddings.embed_documents(his_words)

# Load Wikipedia Data

In [29]:
# Wikipedia API
# Shared client that get_wikipedia_context uses to look pages up by keyword.
# NOTE(review): the two positional arguments are presumably the user agent and
# the language code — confirm against the installed wikipedia-api version.
wiki_wiki = wikipediaapi.Wikipedia('NLP_Project/1.0', 'en')

In [30]:
# Download nltk stop words
nltk.download('stopwords')
# `stopwords` is read by extract_keywords to obtain the English stop-word list.
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LeeJiin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def extract_keywords(text, max_keywords=5):
    """Pick search keywords out of the question part of a multiple-choice prompt.

    The question is the text before the first ``(A)`` option marker, with
    everything up to the first ``)`` (e.g. a ``QUESTION1)`` label) removed.
    Words of four or more characters are kept; adjacent word pairs that contain
    no stop word are appended as well, which raises the term frequency of those
    words. The tokens are then ranked by their TF-IDF score.

    Args:
        text: Full prompt, i.e. the question followed by its ``(A)``... options.
        max_keywords: Number of keywords after which collection stops.

    Returns:
        A list of lower-cased keywords, highest score first. The list is empty
        when the prompt has no ``(A)`` marker or when no token survives the
        stop-word filtering.
    """
    # Find the first index of option (A)
    first_option_index = text.find('(A)')
    if first_option_index == -1:
        # Return an empty list rather than a message string: callers iterate
        # over the result, and iterating a string yields single characters.
        return []

    # Extract and preprocess query
    question = text[:first_option_index].strip()
    question = question.split(')', 1)[-1].strip()

    # Keep lower-cased words of at least four characters
    one_words = [word.lower() for word in question.split() if len(word) >= 4]

    # Generate bi-grams from two adjacent words
    adjacent_bigrams = [' '.join(one_words[i:i+2]) for i in range(len(one_words)-1)]

    # Retrieve stop words list
    stop_words = set(stopwords.words('english'))
    additional_stopwords = {
        'below', 'above', 'around', 'near', 'between', 'within', 'among', 'along', 'behind', 'beside',
        'under', 'over', 'across', 'against', 'without', 'throughout', 'amongst', 'before', 'after',
        'during', 'between', 'till', 'until', 'through', 'although', 'unless', 'whether', 'though',
        'until', 'whether', 'if', 'even', 'because', 'as', 'so', 'although', 'unless', 'despite',
        'unless', 'regardless', 'among', 'beside', 'amongst', 'towards', 'with', 'in', 'out',
        'around', 'since', 'both', 'nor', 'neither', 'either', 'also', 'further', 'more', 'even',
        'just', 'so', 'very', 'too', 'such', 'now', 'always', 'never', 'often', 'sometimes', 'usually',
        'rather', 'quite', 'some', 'any', 'none', 'each', 'every', 'some', 'another', 'more', 'other',
        'many', 'few', 'lot', 'enough', 'much', 'all', 'anybody', 'everyone', 'nobody', 'somebody',
        'each', 'those', 'these', 'this', 'that', 'there', 'here', 'which', 'who', 'whom', 'whose',
        'what', 'where', 'when', 'why', 'how', 'could', 'would', 'should', 'might', 'may', 'will',
        'can', 'must', 'shall', 'ought', 'shall', 'do', 'does', 'did', 'done', 'having', 'being',
        'been', 'has', 'have', 'had', 'am', 'is', 'are', 'was', 'were', 'be', 'being', 'been',
        'could', 'would', 'should', 'might', 'may', 'will', 'can', 'must', 'shall', 'ought', 'be', 'an', 'a',
        'is', 'are', 'following', 'true', 'which', 'false', 'of', 'about', 'the', 'can', 'cannot'
    }

    new_stopwords = stop_words.union(additional_stopwords)
    # The vectorizer is given a list; sorting only makes its order deterministic.
    sw_list = sorted(new_stopwords)

    # Keep only the bi-grams in which neither word is a stop word
    # (set membership instead of repeated list scans and list.remove calls).
    two_words = [
        bigram for bigram in adjacent_bigrams
        if not any(part in new_stopwords for part in bigram.split())
    ]

    combined_words = one_words + two_words

    # Define TF-IDF vectorizer
    tfidf = TfidfVectorizer(
        stop_words=sw_list,
        token_pattern=r'\b\w{3,}\b'  # tokens are single words of 3+ word characters
    )

    # Generate TF-IDF matrix
    try:
        tfidf_matrix = tfidf.fit_transform([' '.join(combined_words)])
    except ValueError:
        # scikit-learn raises ValueError (empty vocabulary) when every token
        # was filtered out, e.g. a question made of short words or stop words.
        return []
    feature_names = tfidf.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]

    # Sort in descending order
    sorted_indices = scores.argsort()[::-1]
    sorted_keywords = [feature_names[i] for i in sorted_indices]

    # Collect keywords, skipping any that were already taken
    keywords = []
    seen = set()

    for keyword in sorted_keywords:
        if keyword not in seen:
            keywords.append(keyword)
            seen.update(keyword.split())
        if len(keywords) >= max_keywords:
            break

    return keywords

In [32]:
# Retrieve related contexts from wikipedia
def get_wikipedia_context(query, num=7):
    """Build a context string from the Wikipedia pages of the query's keywords.

    Args:
        query: Prompt text; keywords are taken from it with ``extract_keywords``.
        num: Maximum number of sections read from each page.

    Returns:
        For every keyword whose page exists, a newline followed by the text of
        the page's first ``num`` sections (joined by newlines), concatenated in
        keyword order. An empty string when no keyword leads to an existing page.
    """
    keywords = extract_keywords(query)
    if isinstance(keywords, str):
        # Guard against a non-list result (e.g. a message string): iterating a
        # string would trigger one Wikipedia lookup per character.
        return ""

    pieces = []
    for keyword in keywords:
        page = wiki_wiki.page(keyword)
        if not page.exists():
            continue

        # Use the first `num` sections of the page
        section_texts = "\n".join(section.text for section in page.sections[:num])
        pieces.append(f"\n{section_texts}")

    return "".join(pieces)

# Retrieve Context

In [33]:
# Calculate similarity between prompt embedding and split embeddings
def calculate_similarity(prompt_embeddings, split_embeddings):
    """Dot-product similarity between prompt embedding(s) and a set of embeddings.

    Args:
        prompt_embeddings: Embedding vector (or matrix of vectors) of the prompt.
        split_embeddings: Sequence of embedding vectors to compare against.

    Returns:
        The matrix product of the prompt embeddings with the transposed
        ``split_embeddings``, i.e. one score per entry of ``split_embeddings``.
    """
    query_side = np.array(prompt_embeddings)
    passage_side = np.array(split_embeddings)
    return np.matmul(query_side, passage_side.T)

In [34]:
# Select the top n contexts for each prompt and return the indices of the contexts
def get_top_indices(similarity_scores, top_n=7): # for mmlu dataset
    """Return the indices of the ``top_n`` highest scores, best first.

    Args:
        similarity_scores: Array of scores (must provide ``argsort``).
        top_n: How many indices to keep.
    """
    ascending_order = similarity_scores.argsort()
    best_first = ascending_order[::-1]
    return best_first[:top_n]

def get_top_indices_sc(similarity_scores, top_n=3): # for ewha dataset
    """Same selection as ``get_top_indices`` with a smaller default ``top_n``."""
    return get_top_indices(similarity_scores, top_n=top_n)

In [35]:
# Find the best domain that best correlates with the query
def find_domain(prompt_embeddings, domain_embeddings_dict):
    """Choose the domain whose keyword embeddings are most similar to the prompt.

    Args:
        prompt_embeddings: Embedding of the prompt.
        domain_embeddings_dict: Mapping of domain name to that domain's keyword
            embeddings.

    Returns:
        The domain name with the highest mean similarity score; on a tie the
        domain that comes first in the mapping wins.
    """
    mean_score_by_domain = {
        name: calculate_similarity(prompt_embeddings, keyword_vectors).mean()
        for name, keyword_vectors in domain_embeddings_dict.items()
    }
    return max(mean_score_by_domain, key=mean_score_by_domain.get)

In [36]:
# Find the best contexts that correlates with the query for each domain

def get_law_context(prompt_embeddings, split_embeddings_law):
    """Join the law chunks that score highest against the prompt.

    Args:
        prompt_embeddings: Embedding of the prompt.
        split_embeddings_law: Embeddings of the law chunks, index-aligned with
            the module-level ``splits_law`` list the text is read from.

    Returns:
        The selected chunk texts, best match first, separated by newlines.
    """
    scores = calculate_similarity(prompt_embeddings, split_embeddings_law)
    return "\n".join(
        splits_law[index].page_content for index in get_top_indices(scores)
    )

In [37]:
def get_business_context(prompt_embeddings, split_embeddings_busi):
    """Join the business chunks that score highest against the prompt.

    Args:
        prompt_embeddings: Embedding of the prompt.
        split_embeddings_busi: Embeddings of the business chunks, index-aligned
            with the module-level ``splits_busi`` list the text is read from.

    Returns:
        The selected chunk texts, best match first, separated by newlines.
    """
    scores = calculate_similarity(prompt_embeddings, split_embeddings_busi)
    return "\n".join(
        splits_busi[index].page_content for index in get_top_indices(scores)
    )

In [38]:
def get_psychology_context(prompt_embeddings, split_embeddings_psychology):
    """Join the psychology chunks that score highest against the prompt.

    Args:
        prompt_embeddings: Embedding of the prompt.
        split_embeddings_psychology: Embeddings of the psychology chunks,
            index-aligned with the module-level ``splits_psy`` list the text is
            read from.

    Returns:
        The selected chunk texts, best match first, separated by newlines.
    """
    scores = calculate_similarity(prompt_embeddings, split_embeddings_psychology)
    return "\n".join(
        splits_psy[index].page_content for index in get_top_indices(scores)
    )

In [39]:
def get_philosophy_context(prompt_embeddings, split_embeddings_philosophy):
    """Join the philosophy chunks that score highest against the prompt.

    Args:
        prompt_embeddings: Embedding of the prompt.
        split_embeddings_philosophy: Embeddings of the philosophy chunks,
            index-aligned with the module-level ``splits_phil`` list the text is
            read from.

    Returns:
        The selected chunk texts, best match first, separated by newlines.
    """
    scores = calculate_similarity(prompt_embeddings, split_embeddings_philosophy)
    return "\n".join(
        splits_phil[index].page_content for index in get_top_indices(scores)
    )

# LLM

In [40]:
# Chat model that answers the prompts; temperature 0 is requested for every call.
llm = ChatUpstage(
    temperature=0,
    api_key=UPSTAGE_API_KEY,
)

## Templates

In [41]:
# Define prompting templates adjusted for each domain

# Prompt used when the ewha.pdf chunks are similar enough to the question
# (see the response loop). Placeholders: {question} and {context}.
school_template = """
Analyze the provided context carefully and extract the most correct answer. Follow these guidelines strictly:

1. Read the entire context thoroughly and identify key facts that align with the question.
2. Ensure the extracted answer strictly matches the facts in the context. If there are conflicting statements, choose the one most consistent with the majority of the context or state that the answer cannot be determined.
3. Avoid assumptions or interpretations not explicitly supported by the context.
4. Judge the right choices and the wrong choices exactly.
For each option, perform the following steps:

Verify if the option aligns with the context or principles of the question.
Eliminate options that contain errors, inconsistencies, or lack relevance.
If two or more options are valid, choose the one that is more specific or comprehensive.

    ---
    Question & Options : {question}
    ---
    Context: {context}
    ---

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context]."
"""

In [42]:
# Prompt for questions routed to the "Law" domain. Placeholders: {question} and
# {context}. The output format now names the Law domain (it previously said
# "Business", a copy-paste leftover).
law_template = """
You are an expert in constitutional law, legal principles, and case law. Analyze and solve the following legal question systematically:

1. Understand the Problem:
- Identify the core legal issue in the question (e.g., constitutional law, criminal law, contract law).
- Determine the context, such as applicable statutes, precedents, or legal doctrines.

2. Analyze the Situation:
- Break down the problem into key components.
- Apply relevant legal principles, statutes, or case law to assess each option.

3. Evaluate the Options:
- For each option, determine its correctness based on legal grounds.
- Use logical reasoning to support or refute each option.

4. Provide a Clear Answer:
- Select the option that aligns best with legal principles and the context.
For each option, perform the following steps:
- Verify its consistency with the question.
- Eliminate options containing legal inaccuracies or irrelevance.
- Choose the most precise and legally valid option.

### Example:
Question: Which amendment to the U.S. Constitution guarantees freedom of speech? Options: (A) First Amendment, (B) Fourth Amendment, (C) Tenth Amendment, (D) Fourteenth Amendment.
Context:
1. The First Amendment guarantees fundamental freedoms, including freedom of speech, press, assembly, and religion.
2. The Fourth Amendment protects against unreasonable searches and seizures.
3. The Tenth Amendment reserves powers to the states or people.
4. The Fourteenth Amendment ensures equal protection under the law.
Analysis:
1. Identify the core issue: Freedom of speech.
2. Evaluate the options:
   - (A) Correct, as it explicitly guarantees freedom of speech.
   - (B) Incorrect, as it concerns searches and seizures.
   - (C) Incorrect, as it relates to state powers.
   - (D) Incorrect, as it pertains to equal protection.
[ANSWER]: (A), Domain: Law, reason: The First Amendment explicitly guarantees freedom of speech as stated in the context.

---

### Question & Options: {question}
Context: {context}

---

### Instructions:
1. Identify the core legal issue in the question.
2. Use the context, statutes, or precedents provided to analyze the options.
3. For each option:
   - Verify its consistency with the context or legal principles.
   - Eliminate irrelevant or incorrect options.
4. Select the most accurate and legally valid answer.

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), Domain : Law, reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context, do not include alphabet of prompt]."
"""

In [43]:
# Prompt for questions routed to the "Psychology" domain.
# Placeholders: {question} and {context}.
psy_template = """
You are an expert in psychology, well-versed in various psychological theories, concepts, and research methodologies. Using your deep understanding of psychology, answer the following question systematically by following these steps:

1. Understand the Question:
   - Identify the key psychological concept, theory, or research focus in the question.
   - Determine the context (e.g., clinical, cognitive, behavioral, developmental, social psychology).

2. Analyze the Problem:
   - Break down the question into its essential components.
   - Apply relevant psychological theories, principles, or research findings to interpret the problem.

3. Evaluate the Options:
   - For each option, determine its relevance and accuracy based on established psychological knowledge.

4. Provide a Clear Answer:
   - State the most accurate answer or conclusion.
   For each option, perform the following steps:
   Verify if the option aligns with the context or principles of the question.
   Eliminate options that contain errors, inconsistencies, or lack relevance.
   If two or more options are valid, choose the one that is more specific or comprehensive.

    ---
    Question & Options : {question}
    ---
    Context: {context}
    ---

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), Domain : Psychology, reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context, do not include alphabet of prompt]."
"""

In [45]:
# Prompt for questions routed to the "Business" domain. Placeholders: {question}
# and {context}. The worked example is internally consistent: 10,000 * 1.02^20
# is 14,859.47 and the matching option is the one selected (the earlier version
# picked an option that was not the closest to its own result).
busi_template = """
You are an expert in business strategy, management, finance, marketing, and operations. With a comprehensive understanding of business concepts and practices, systematically analyze and solve the following question using these steps:

1. Understand the Problem:
- Identify the core business issue (e.g., finance, marketing, operations, strategy, organizational behavior).
- Define the context, such as industry, market conditions, or organizational goals.
- If the question addresses numbers, it is likely to be a math calculation problem, so conduct addition, subtraction, multiplication or division appropriately.

2. Analyze the Situation:
- Break down the problem into key components.
- Apply relevant business theories, frameworks, or case studies (e.g., SWOT analysis, Porter’s Five Forces, financial ratios).

3. Evaluate the Options:
- If multiple options are presented, assess each option logically.
- Use quantitative and qualitative reasoning to determine the viability and alignment of each option with the problem.

4. Provide a Clear Answer:
- Select the best solution or strategy based on the analysis.
For each option, perform the following steps:
Verify if the option aligns with the context or principles of the question.
Eliminate options that contain errors, inconsistencies, or lack relevance.
If two or more options are valid, choose the one that is more specific or comprehensive.

### Example:
Question: A company takes out a loan of $10,000 with an annual interest rate of 8% compounded quarterly. If the loan is to be paid off in 5 years, what is the total amount payable at the end of the term? Options: (A) $12,000, (B) $14,859, (C) $15,000, (D) $16,500.
Context: 
1. The formula for compound interest is:
FV = PV * (1 + r/n)^(n * t)
where:
- FV = Future Value
- PV = Present Value (Loan Amount)
- r = Annual Interest Rate (Decimal)
- n = Number of Compounding Periods Per Year
- t = Time in Years
2. Substituting the values:
FV = 10,000 * (1 + 0.08/4)^(4 * 5)
3. Calculation:
FV = 10,000 * (1.02)^20 = 14,859.47
4. The closest answer is (B) $14,859.
Analysis:
1. Identify the formula for compound interest based on the given data.
2. Substitute the values into the formula and solve systematically.
3. Compare the calculated value to the given options and select the closest answer.
   [ANSWER]: (B), Domain: Business, reason: The calculated future value of about $14,859 matches option (B), based on the compound interest formula provided.

    ---

    ### Question & Options: {question}
    Context: {context}

    ---

### Instructions:
1. Identify the business concept or problem in the question.
2. Use the context, formulas, or principles provided to analyze the options.
3. For each option:
- Verify if it aligns with the context or formula provided.
- Eliminate inconsistent or irrelevant options.
4. Select the most accurate and specific answer based on business principles.

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), Domain : Business, reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context, do not include alphabet of prompt]."
"""

In [46]:
# Prompt for questions routed to the "Philosophy" domain.
# Placeholders: {question} and {context}.
phil_template = """
You are an expert in philosophy, specializing in ethics, metaphysics, and logic. Analyze and solve the following philosophical question systematically:

1. Understand the Problem:
- Identify the philosophical concept, argument, or theory in the question.
- Determine the context, such as ethical frameworks, metaphysical assumptions, or logical structures.
- If the question addresses an ethical dilemma, fallacy, or metaphysical issue, specify the primary philosophical frameworks that would traditionally be applied to such problems.

2. Analyze the Situation:
- Break down the question into its components.
- Apply relevant philosophical arguments, theories, or frameworks.
- Reference specific philosophers or schools (e.g., Kant, Hume, Nietzsche, Singer, Stoicism, etc.) when relevant to analyze how they would approach the issue.

3. Evaluate the Options:
- For each option, assess its philosophical validity or coherence.
- Use logical reasoning or philosophical texts to support or refute each option.
- Reference philosophers or key philosophical texts (e.g., Aristotle’s "Nicomachean Ethics," Descartes' "Meditations," Mill’s "Utilitarianism") when evaluating the options.

4. Provide a Clear Answer:
- Select the option that aligns best with the context and philosophical principles.

### Example:
Question: According to utilitarianism, which action is morally right? Options: (A) Maximizes happiness, (B) Adheres to divine commands, (C) Respects virtues, (D) Follows categorical imperatives.
Context:
1. Utilitarianism judges morality by the consequences of an action, aiming for the greatest happiness for the greatest number.
2. Divine command theory bases morality on divine will.
3. Virtue ethics focuses on character and virtues.
4. Kantian ethics emphasizes categorical imperatives.
Analysis:
1. Identify the framework: Utilitarianism.
2. Evaluate the options:
   - (A) Correct, as it aligns with utilitarian principles.
   - (B) Incorrect, as it relates to divine command theory.
   - (C) Incorrect, as it relates to virtue ethics.
   - (D) Incorrect, as it relates to Kantian ethics.
[ANSWER]: (A), Domain: Philosophy, reason: Utilitarianism emphasizes actions that maximize happiness.

---

### Question & Options: {question}
Context: {context}

---

### Instructions:
1. Identify the philosophical concept in the question.
2. Use the context, arguments, or frameworks provided to analyze the options.
3. For each option:
   - Verify its alignment with philosophical principles.
   - Eliminate irrelevant or incoherent options.
4. Select the most valid answer.

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), Domain : Philosophy, reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context, do not include alphabet of prompt]."
"""

In [47]:
# Prompt for questions routed to the "History" domain. Placeholders: {question}
# and {context}. The domain label in the output format is capitalised like the
# "History" dictionary key and the labels of the other templates.
his_template = """
You are a historian with a deep understanding of historical events, figures, contexts, and trends. Use your expertise to analyze and answer the following question systematically by following these steps:

1. **Understand the Question**:
   - Identify the key historical topic, event, or figure the question refers to.
   - Determine the time period, geographical region, and cultural or political context.

2. **Analyze the Historical Context**:
   - Break down the question into its core components.
   - Apply relevant historical knowledge, such as causes, consequences, and connections to other events or trends.

3. **Evaluate the Options**:
   - If multiple options are presented, critically assess each option.
   - Determine the accuracy and relevance of each option based on historical evidence and interpretations.

4. **Provide a Clear Answer**:
   - Select the most accurate or relevant answer based on historical facts and reasoning.
   For each option, perform the following steps:
   Verify if the option aligns with the context or principles of the question.
   Eliminate options that contain errors, inconsistencies, or lack relevance.
   If two or more options are valid, choose the one that is more specific or comprehensive.

    ---
    Question & Options : {question}
    ---
    Context: {context}
    ---

### Output Format (Must be strictly adhered to / Do not include the question & options in the beginning of output.):
"[ANSWER]: (alphabet), Domain : History, reason: [Provide a short explanation for your choice, referring to specific phrases or facts in the context, do not include alphabet of prompt]."
"""

In [48]:
# Lookup tables keyed by domain name: the keyword embeddings used to pick a
# domain, and the prompt template used once a domain has been picked.

domain_embeddings_dict = dict(
    Law=law_ebd,
    Psychology=psy_ebd,
    Business=busi_ebd,
    Philosophy=phil_ebd,
    History=his_ebd,
)

domain_template_dict = dict(
    Law=law_template,
    Psychology=psy_template,
    Business=busi_template,
    Philosophy=phil_template,
    History=his_template,
)

## Responses

In [49]:
# Dot-product score at or above which the ewha.pdf chunks are used as context
threshold = 0.3

# Domains that have their own PDF: retrieval function plus the chunk embeddings it searches
pdf_retrievers = {
    "Law": (get_law_context, split_embeddings_law),
    "Business": (get_business_context, split_embeddings_busi),
    "Psychology": (get_psychology_context, split_embeddings_psy),
    "Philosophy": (get_philosophy_context, split_embeddings_phil),
}

responses = []

for prompt in prompts:
    prompt_embeddings = query_embeddings.embed_query(prompt)
    similarity_scores = calculate_similarity(prompt_embeddings, split_embeddings)

    if (similarity_scores >= threshold).any():
        # The question matches ewha.pdf: answer from its best-matching chunks
        top_indices = get_top_indices_sc(similarity_scores)
        context = "\n".join(splits[i].page_content for i in top_indices)
        prompt_template = PromptTemplate.from_template(school_template)
    else:
        # Otherwise route by domain: chunks of the domain's PDF (when it has one)
        # followed by text retrieved from Wikipedia
        domain = find_domain(prompt_embeddings, domain_embeddings_dict)

        context = ""
        if domain in pdf_retrievers:
            retrieve, domain_split_embeddings = pdf_retrievers[domain]
            context = retrieve(prompt_embeddings, domain_split_embeddings)

        context += get_wikipedia_context(prompt)
        prompt_template = PromptTemplate.from_template(domain_template_dict[domain])

    chain = prompt_template | llm

    print(f"{prompt} 에 대한 context : {context}")
    print("-" * 50)

    # Call the model
    response = chain.invoke({"question": prompt, "context": context})
    responses.append(response.content)

QUESTION1) 재학 중인 학생이 휴학을 하려면 학기 개시일로부터 며칠 이내에 휴학을 신청하야하나요?
(A) 30일
(B) 45일 
(C) 60일
(D) 90일 에 대한 context : .21)
⑥ 휴학기간은 재학연한에 산입하지 아니한다. (개정 2014.11.21)제7장 교내전학제25조(전과) ① 학생이 소속 학부 또는 학과의 변경(이하 “전과”라 한다)을 원할 때에는 「고
등교육법 시행령」 제29조제3항의 규정이 정하는 범위내에서 허가할 수 있다. (개정
1999.2.9., 2015.9.18.)② 전과의 허가는 매 학년도말에 할 수 있다. (개정 1999.2.9., 2017.8.16.)
③ 전과의 기준․대상․절차 등 세부사항은 총장이 따로 정한다.
(개정 1992.3.16)제8장 휴학, 복학, 제적, 자퇴 및 재입학 (개정 2017.8.16.)제26조(휴학) ① 질병 기타 부득이한 사정으로 3주일 이상 수강할 수 없는 자는 총장의 허가
를 얻어 휴학할 수 있다.
② 총장은 건강상의 이유로 정상적인 수업을 받을 수 없다고 인정되는 자에 대하여 휴학을
명할 수 있다. (개정 1988.7.28)
③ 1회의 휴학기간은 1년 이내로 한다. 다만, 교과과정상의 필요에 따라 총장이 지정하는
학부, 학과 또는 전공에 있어서는 이를 1년으로 한다. (개정 1996.2.15)
④ 휴학기간은 통산하여 3년(건축학전공의 경우 4년, 의예과의 경우 3학기)을 초과할 수
없다. 다만, 임신, 출산 및 육아로 인한 휴학, 창업으로 인한 휴학은 2년 이내의 기간을,
군복무로 인한 휴학은 의무복무기간을 추가 휴학기간으로 허가할 수 있다. (개정 2013.
2.25., 2015.9.18., 2016.2.16.)
⑤ 삭제 (1985.9.9.)
⑥ 재학 중인 자가 휴학을 하고자 하는 경우 학기개시일로부터 90일 이내에 휴학을 신청하
여야 한다. (신설 2015.9.18.)
⑦ 신입생, 편입학한 학생, 재입학한 학생은 중대한 질병 및 그에 준하는 사유를 제외하고
는 입학 후 첫 학기를 휴

In [50]:
# Show every response preceded by its 1-based position
for position, response_text in enumerate(responses, start=1):
    print(position)
    print(response_text)
    print('-'*10)

1
[ANSWER]: (D), reason: "휴학기간은 통산하여 3년(건축학전공의 경우 4년, 의예과의 경우 3학기)을 초과할 수 없다. 다만, 임신, 출산 및 육아로 인한 휴학, 창업으로 인한 휴학은 2년 이내의 기간을, 군복무로 인한 휴학은 의무복무기간을 추가 휴학기간으로 허가할 수 있다. 재학 중인 자가 휴학을 하고자 하는 경우 학기개시일로부터 90일 이내에 휴학을 신청하여야 한다."
----------
2
[ANSWER]: (B), reason: "제28조제4호에 의하여 제적된 자는 제적된 날부터 1년이 경과한 후 재입학 할 수 있다."라는 문장에서 a는 1, b는 1이므로 a+b의 값은 2가 됩니다.
----------
3
[ANSWER]: (B), reason: "The context states that a student can be recognized as having completed a minor by taking 21 or more credits in a major subject other than their own major or department, as determined by the president. Therefore, the correct answer is (B) 21학점."
----------
4
[ANSWER]: (D), reason: 로라는 휴학기간 경과 후 3주가 지났으나 큰 사고가 났다는 정당한 이유 때문에 복학하지 못하였으므로 제적을 당하지 않습니다.
----------
5
[ANSWER]: (C), reason: 휴먼기계바이오공학부의 입학 정원은 110명으로 명시되어 있습니다.
----------
6
[ANSWER]: (B), reason: The context states that for students who entered before the 1980 academic year, the grade-based performance points are as follows: A+ 4, A 4, A- 3.7, B+ 3.3

In [51]:
import re

def extract_answer(response):
    """
    Extract the answer letter from a model response.

    The expected response format is "[ANSWER]: (A) convolutional networks":
    the literal tag "[ANSWER]:", optional whitespace, then a single letter
    A-J wrapped in parentheses. Only the letter itself is returned.

    When the response does not contain that pattern, the work is handed to
    extract_again(response) and whatever it returns is passed back.
    """
    found = re.search(r"\[ANSWER\]:\s*\(([A-J])\)", response)

    # Guard clause: no tagged answer, so defer to the looser fallback.
    if found is None:
        return extract_again(response)

    # Group 1 is the bare letter between the parentheses.
    return found.group(1)


def extract_again(response):
    """
    Fallback extraction: return the last standalone capital letter A-J.

    The whole response is scanned, across line breaks, and the final match
    wins. Word boundaries are evaluated in ASCII mode, so a letter that is
    directly followed or preceded by non-ASCII text (e.g. Hangul, as in
    "B입니다") still counts as standalone.

    Note that any isolated capital in the A-J range matches, including
    ordinary English words such as "A" or "I".

    Returns:
        The matched letter as a one-character string, or None when the
        response contains no standalone letter in A-J.
    """
    # re.ASCII makes \b treat only [a-zA-Z0-9_] as word characters; without
    # it Hangul counts as a word character and "B입니다" would never match.
    letters = re.findall(r"\b[A-J]\b", response, flags=re.ASCII)

    # findall scans the entire string (newlines included), so the last item
    # is the last candidate in the response rather than in a single line.
    return letters[-1] if letters else None

In [52]:
# Print accuracy
# Walk the reference answers and model responses in parallel, show what was
# extracted from each response, and count the ones that hit the reference.

cnt = 0  # responses whose extracted letter occurs inside the reference answer

for answer, response in zip(answers, responses):
    print("-"*10)
    generated_answer = extract_answer(response)
    print(response)

    # Report the extraction result for this response
    if not generated_answer:
        print("extraction fail")
    else:
        print(f"generated answer: {generated_answer}, answer: {answer}")

    # A failed extraction (None) never scores; otherwise the letter only has
    # to appear inside the reference string (e.g. "D" in "(D)").
    if generated_answer is not None and generated_answer in answer:
        cnt += 1

print()
# Accuracy is taken over all reference answers, not just the zipped pairs
print(f"acc: {(cnt/len(answers))*100}%")

----------
[ANSWER]: (D), reason: "휴학기간은 통산하여 3년(건축학전공의 경우 4년, 의예과의 경우 3학기)을 초과할 수 없다. 다만, 임신, 출산 및 육아로 인한 휴학, 창업으로 인한 휴학은 2년 이내의 기간을, 군복무로 인한 휴학은 의무복무기간을 추가 휴학기간으로 허가할 수 있다. 재학 중인 자가 휴학을 하고자 하는 경우 학기개시일로부터 90일 이내에 휴학을 신청하여야 한다."
generated answer: D, answer: (D)
----------
[ANSWER]: (B), reason: "제28조제4호에 의하여 제적된 자는 제적된 날부터 1년이 경과한 후 재입학 할 수 있다."라는 문장에서 a는 1, b는 1이므로 a+b의 값은 2가 됩니다.
generated answer: B, answer: (A)
----------
[ANSWER]: (B), reason: "The context states that a student can be recognized as having completed a minor by taking 21 or more credits in a major subject other than their own major or department, as determined by the president. Therefore, the correct answer is (B) 21학점."
generated answer: B, answer: (C)
----------
[ANSWER]: (D), reason: 로라는 휴학기간 경과 후 3주가 지났으나 큰 사고가 났다는 정당한 이유 때문에 복학하지 못하였으므로 제적을 당하지 않습니다.
generated answer: D, answer: (D)
----------
[ANSWER]: (C), reason: 휴먼기계바이오공학부의 입학 정원은 110명으로 명시되어 있습니다.
generated answer: C, answer: (C)
----------
[ANSWER]: (B), 