## Setup & Imports

In [None]:
import kaggle_benchmarks as kbench
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional
import re

# Show which model identifiers kaggle_benchmarks exposes in this session
print("Available models:", list(kbench.llms.keys()), sep="\n")

---
## Task 1: Logical Deduction (Syllogisms)

Tests the model's ability to apply logical rules and deduce conclusions from premises.

In [None]:
# Dataset for logical deduction.
# One row per problem:
#   premises    - statements the model must reason from
#   question    - yes/no question about what the premises entail
#   answer      - expected verdict, "yes" or "no"
#   explanation - reference reasoning, shown when an assertion is reported
# Questions whose premises do NOT entail the conclusion are phrased as
# "definitely" / "can we conclude" so that "no" means "not entailed".
logic_problems = pd.DataFrame([
    # Original problems
    {
        "premises": "All mammals are warm-blooded. All dogs are mammals.",
        "question": "Are all dogs warm-blooded?",
        "answer": "yes",
        "explanation": "Dogs are mammals, and all mammals are warm-blooded, so dogs must be warm-blooded."
    },
    {
        "premises": "No reptiles are mammals. All snakes are reptiles.",
        "question": "Are any snakes mammals?",
        "answer": "no",
        "explanation": "Snakes are reptiles, and no reptiles are mammals, so snakes cannot be mammals."
    },
    {
        "premises": "All programmers use computers. Some artists are programmers.",
        "question": "Do some artists use computers?",
        "answer": "yes",
        "explanation": "Since some artists are programmers, and all programmers use computers, those artists must use computers."
    },
    {
        "premises": "All squares are rectangles. All rectangles have four sides.",
        "question": "Do all squares have four sides?",
        "answer": "yes",
        "explanation": "Squares are rectangles, rectangles have four sides, therefore squares have four sides."
    },
    {
        "premises": "If it rains, the ground gets wet. The ground is wet.",
        "question": "Did it definitely rain?",
        "answer": "no",
        "explanation": "The ground being wet doesn't prove rain - it could have other causes (sprinklers, etc.). This is the fallacy of affirming the consequent."
    },
    # New problems - Modus Ponens
    {
        "premises": "If a person is a bachelor, then they are unmarried. John is a bachelor.",
        "question": "Is John unmarried?",
        "answer": "yes",
        "explanation": "By modus ponens: If P then Q, P is true, therefore Q must be true."
    },
    # Modus Tollens
    {
        "premises": "If it is a weekday, the office is open. The office is not open.",
        "question": "Is it a weekday?",
        "answer": "no",
        "explanation": "By modus tollens: If P then Q, Q is false, therefore P must be false."
    },
    # Disjunctive Syllogism
    {
        "premises": "Either the package was delivered or it was lost. The package was not delivered.",
        "question": "Was the package lost?",
        "answer": "yes",
        "explanation": "By disjunctive syllogism: Either A or B, not A, therefore B."
    },
    # Hypothetical Syllogism
    {
        "premises": "If it snows, the schools close. If the schools close, children stay home.",
        "question": "If it snows, do children stay home?",
        "answer": "yes",
        "explanation": "By hypothetical syllogism: If A then B, If B then C, therefore If A then C."
    },
    # Existential Fallacy
    # The question asks about entailment ("can we conclude"): the premises do
    # not establish that any unicorn exists, but they do not rule it out either.
    {
        "premises": "All unicorns have horns. All unicorns are magical creatures.",
        "question": "Can we conclude that there are magical creatures with horns?",
        "answer": "no",
        "explanation": "This is the existential fallacy - we cannot conclude existence from universal statements about possibly empty sets."
    },
    # Complex Syllogism
    {
        "premises": "No honest politician takes bribes. Some senators are honest politicians.",
        "question": "Do some senators not take bribes?",
        "answer": "yes",
        "explanation": "Some senators are honest politicians, and no honest politician takes bribes, so those senators don't take bribes."
    },
    # Denying the Antecedent (Fallacy)
    {
        "premises": "If you study hard, you will pass the exam. You did not study hard.",
        "question": "Can we conclude you will fail the exam?",
        "answer": "no",
        "explanation": "This is the fallacy of denying the antecedent. Not studying hard doesn't guarantee failure - you might pass anyway."
    },
    # Categorical Syllogism (universal premise applied to a named class)
    {
        "premises": "All birds have feathers. Penguins are birds.",
        "question": "Do penguins have feathers?",
        "answer": "yes",
        "explanation": "Penguins are birds, all birds have feathers, therefore penguins have feathers."
    },
    # Exclusive Disjunction
    {
        "premises": "A number is either even or odd, but not both. The number 7 is not even.",
        "question": "Is 7 odd?",
        "answer": "yes",
        "explanation": "Since 7 is not even, and a number must be either even or odd, 7 must be odd."
    },
    # Universal Negative
    {
        "premises": "No fish can breathe air directly. Salmon are fish.",
        "question": "Can salmon breathe air directly?",
        "answer": "no",
        "explanation": "Salmon are fish, and no fish can breathe air directly, so salmon cannot breathe air directly."
    },
])

print(f"Logical deduction dataset: {len(logic_problems)} problems")
logic_problems.head()

In [None]:
# Advanced Logical Deduction Problems (10 harder problems).
# Same columns as logic_problems: premises, question, answer ("yes"/"no"), explanation.
# Logical symbols in the explanations are stored as real Unicode characters.
advanced_logic_problems = pd.DataFrame([
    {"premises": "A shape is a square if and only if it has four equal sides and four right angles. This shape has four equal sides and four right angles.", "question": "Is this shape a square?", "answer": "yes", "explanation": "Biconditional: If and only if means both directions hold."},
    {"premises": "If it's Monday, then if it's raining, the market is closed. It's Monday and it's raining.", "question": "Is the market closed?", "answer": "yes", "explanation": "Nested conditional with both antecedents satisfied."},
    {"premises": "If I study, I'll pass. If I work, I'll earn money. I will either study or work.", "question": "Will I either pass or earn money?", "answer": "yes", "explanation": "Constructive dilemma: (P→Q) ∧ (R→S) ∧ (P∨R) → (Q∨S)"},
    {"premises": "If the car starts, the battery is charged. If the lights work, the battery is charged. The battery is not charged.", "question": "Can we conclude neither the car starts nor the lights work?", "answer": "yes", "explanation": "Destructive dilemma: contrapositive of both conditionals."},
    {"premises": "All A are B. All B are C. All C are D. All D are E. X is an A.", "question": "Is X an E?", "answer": "yes", "explanation": "Sorites: transitive chain through multiple categories."},
    {"premises": "All cats are mammals. No non-mammals are cats.", "question": "Are these two statements logically equivalent?", "answer": "yes", "explanation": "Obversion: 'All S are P' is equivalent to 'No S are non-P'."},
    {"premises": "All dogs are animals. No cats are dogs.", "question": "Can we conclude that no cats are animals?", "answer": "no", "explanation": "Illicit major fallacy: the major term is undistributed in premise but distributed in conclusion."},
    {"premises": "All cats are mammals. All dogs are mammals.", "question": "Can we conclude anything about the relationship between cats and dogs?", "answer": "no", "explanation": "Undistributed middle: the middle term 'mammals' is not distributed in either premise."},
    {"premises": "Every person loves someone. There is someone who is loved by every person.", "question": "Do these two statements mean the same thing?", "answer": "no", "explanation": "Different quantifier scope: ∀x∃y vs ∃y∀x - the first allows different people for each person."},
    {"premises": "It is necessary that if it rains, the ground gets wet. It is possible that it rains.", "question": "Is it possible that the ground gets wet?", "answer": "yes", "explanation": "Modal logic: necessary conditional + possible antecedent → possible consequent."},
])

# Combine original and advanced logic problems.
# The result overwrites logic_problems, so re-running this cell would append
# the advanced rows a second time; drop_duplicates keeps the cell idempotent.
logic_problems = pd.concat(
    [logic_problems, advanced_logic_problems], ignore_index=True
).drop_duplicates(subset=["premises", "question"], ignore_index=True)
# Counts are derived from the data so the message stays true if rows are added.
print(
    f"Extended logical deduction dataset: {len(logic_problems)} problems "
    f"({len(logic_problems) - len(advanced_logic_problems)} original + "
    f"{len(advanced_logic_problems)} advanced)"
)

In [None]:
# Response schema passed as `schema=` to llm.prompt() by logical_deduction_task.
# NOTE(review): the class docstring may be forwarded to the model as part of the
# schema description - confirm before rewording it.
@dataclass
class LogicalAnswer:
    """Structured response for logical deduction."""
    answer: str  # Verdict; the prompt asks for "yes" or "no" and scoring compares it case-insensitively
    reasoning: str  # Step-by-step reasoning; not read by the scoring code


@kbench.task(name="logical_deduction")
def logical_deduction_task(llm, premises: str, question: str, answer: str, explanation: str) -> bool:
    """
    Evaluate LLM's ability to perform logical deduction from given premises.

    The model must:
    1. Understand the logical structure of the premises
    2. Apply valid logical rules
    3. Arrive at the correct conclusion

    Args:
        llm: Model handle; must provide ``prompt(text, schema=...)``.
        premises: Statements the model reasons from.
        question: Yes/no question about the premises.
        answer: Expected verdict ("yes" or "no").
        explanation: Reference reasoning, included in the assertion message.

    Returns:
        True when the model's verdict equals the expected one after
        normalization, otherwise False.
    """
    prompt = f"""You are a logical reasoning expert. Given the following premises, answer the question.

PREMISES:
{premises}

QUESTION:
{question}

Think step-by-step and provide your answer as either "yes" or "no", along with your reasoning.
"""

    response = llm.prompt(prompt, schema=LogicalAnswer)

    # Normalize both sides: ignore case, surrounding whitespace and wrapping
    # punctuation/quotes, so 'Yes.' or '"no"' are not scored as wrong answers.
    wrapper_chars = "\"'`.,;:!?*"
    model_answer = response.answer.strip().strip(wrapper_chars).strip().lower()
    expected_answer = answer.strip().strip(wrapper_chars).strip().lower()

    is_correct = model_answer == expected_answer

    # Record assertion for visibility
    kbench.assertions.assert_true(
        is_correct,
        expectation=f"Model answered '{model_answer}', expected '{expected_answer}'. Correct reasoning: {explanation}"
    )

    return is_correct


# Smoke test: run the logical deduction task once on a hand-written syllogism
logical_deduction_task.run(
    llm=kbench.llm,
    **{
        "premises": "All mammals are warm-blooded. All dogs are mammals.",
        "question": "Are all dogs warm-blooded?",
        "answer": "yes",
        "explanation": "Dogs are mammals, and all mammals are warm-blooded.",
    },
)

---
## Task 2: Mathematical Word Problems

Tests multi-step mathematical reasoning with real-world context.

In [None]:
# Dataset for math word problems.
# One row per problem:
#   problem - text sent to the model
#   answer  - expected numeric result
#   steps   - reference solution, shown when an assertion is reported
math_problems = pd.DataFrame([
    # Original problems
    {
        "problem": "A store sells apples for $2 each and oranges for $3 each. If Sarah buys 4 apples and 5 oranges, and pays with a $50 bill, how much change does she receive?",
        "answer": 27,
        "steps": "Apples: 4×$2=$8, Oranges: 5×$3=$15, Total: $8+$15=$23, Change: $50-$23=$27"
    },
    {
        "problem": "A train travels at 60 mph for 2 hours, then at 80 mph for 1.5 hours. What is the total distance traveled?",
        "answer": 240,
        "steps": "First leg: 60×2=120 miles, Second leg: 80×1.5=120 miles, Total: 120+120=240 miles"
    },
    {
        "problem": "If 3 workers can complete a job in 12 days, how many days would it take 4 workers to complete the same job, assuming they work at the same rate?",
        "answer": 9,
        "steps": "Total work = 3×12 = 36 worker-days, With 4 workers: 36÷4 = 9 days"
    },
    {
        "problem": "A rectangle has a perimeter of 36 cm. If its length is twice its width, what is the area of the rectangle in square centimeters?",
        "answer": 72,
        "steps": "Let width=w, length=2w. Perimeter: 2(w+2w)=36, so 6w=36, w=6. Length=12. Area=6×12=72"
    },
    {
        "problem": "A tank is 1/3 full. After adding 40 liters, it becomes 2/3 full. What is the total capacity of the tank?",
        "answer": 120,
        "steps": "40 liters fills 2/3 - 1/3 = 1/3 of the tank. So full capacity = 40 × 3 = 120 liters"
    },
    # New problems - Percentage
    {
        "problem": "A shirt originally costs $80. It's on sale for 25% off. What is the sale price?",
        "answer": 60,
        "steps": "Discount: 80 × 0.25 = $20, Sale price: $80 - $20 = $60"
    },
    # Simple Interest
    {
        "problem": "You invest $1000 at 5% simple interest per year. How much money will you have after 3 years?",
        "answer": 1150,
        "steps": "Interest per year: 1000 × 0.05 = $50, Total interest: $50 × 3 = $150, Final: $1000 + $150 = $1150"
    },
    # Age Problem
    # Solving 2x+5 = 1.5(x+5) gives x = 5; x = 10 fails the check (25/15 is not 1.5).
    {
        "problem": "Tom is twice as old as Jerry. In 5 years, Tom will be 1.5 times as old as Jerry. How old is Jerry now?",
        "answer": 5,
        "steps": "Let Jerry=x, Tom=2x. In 5 years: 2x+5 = 1.5(x+5) → 2x+5 = 1.5x+7.5 → 0.5x = 2.5 → x = 5. Check: Jerry=5, Tom=10; in 5 years Jerry=10, Tom=15, and 15/10 = 1.5"
    },
    # Distance/Speed/Time
    {
        "problem": "Two cars start from the same point traveling in opposite directions. Car A travels at 50 mph and Car B at 70 mph. After how many hours will they be 360 miles apart?",
        "answer": 3,
        "steps": "Combined speed: 50 + 70 = 120 mph, Time: 360 ÷ 120 = 3 hours"
    },
    # Ratio Problem
    {
        "problem": "The ratio of boys to girls in a class is 3:5. If there are 24 students total, how many girls are there?",
        "answer": 15,
        "steps": "Total parts: 3+5=8, Each part: 24÷8=3 students, Girls: 5×3=15"
    },
    # Mixture Problem
    {
        "problem": "How many liters of a 20% salt solution must be mixed with 10 liters of a 50% salt solution to get a 30% salt solution?",
        "answer": 20,
        "steps": "Let x = liters of 20% solution. 0.2x + 0.5(10) = 0.3(x+10), 0.2x + 5 = 0.3x + 3, 2 = 0.1x, x = 20"
    },
    # Profit/Loss
    {
        "problem": "A merchant buys an item for $200 and sells it for $250. What is the profit percentage?",
        "answer": 25,
        "steps": "Profit: $250 - $200 = $50, Profit %: (50/200) × 100 = 25%"
    },
    # Geometry - Circle
    {
        "problem": "A circular garden has a radius of 7 meters. What is its area in square meters? (Use π = 22/7)",
        "answer": 154,
        "steps": "Area = πr² = (22/7) × 7² = (22/7) × 49 = 22 × 7 = 154 sq meters"
    },
    # Average
    {
        "problem": "The average of 5 numbers is 20. If one number is removed, the average becomes 15. What number was removed?",
        "answer": 40,
        "steps": "Sum of 5 numbers: 5 × 20 = 100, Sum of 4 numbers: 4 × 15 = 60, Removed number: 100 - 60 = 40"
    },
    # Sequence
    {
        "problem": "What is the sum of the first 10 positive even numbers?",
        "answer": 110,
        "steps": "Even numbers: 2,4,6,8,10,12,14,16,18,20. Sum = n(n+1) where n=10: 10×11 = 110. Or: 2+4+6+8+10+12+14+16+18+20 = 110"
    },
])

print(f"Math problems dataset: {len(math_problems)} problems")
math_problems.head()

In [None]:
# Advanced Math Problems (10 harder problems).
# Same columns as math_problems: problem (sent to the model), answer, steps.
# Math symbols are stored as real Unicode characters so the prompts are readable.
advanced_math_problems = pd.DataFrame([
    {"problem": "A farmer has 100 meters of fencing. What is the maximum rectangular area (in square meters) that can be enclosed?", "answer": 625, "steps": "For max area, rectangle should be square. Perimeter=100, side=25, area=25²=625"},
    {"problem": "A bag contains 3 red balls and 2 blue balls. If two balls are drawn without replacement, what is the probability (as a percentage) that both are red?", "answer": 30, "steps": "P(both red) = (3/5) × (2/4) = 6/20 = 0.30 = 30%"},
    {"problem": "A bacteria colony doubles every 4 hours. If there are 100 bacteria initially, how many will there be after 12 hours?", "answer": 800, "steps": "12 hours = 3 doubling periods. 100 × 2³ = 100 × 8 = 800"},
    {"problem": "How many different 3-letter arrangements can be made from the letters A, B, C, D, E if no letter can be repeated?", "answer": 60, "steps": "Permutation P(5,3) = 5 × 4 × 3 = 60"},
    {"problem": "What is the greatest common divisor (GCD) of 84 and 126?", "answer": 42, "steps": "84 = 2² × 3 × 7, 126 = 2 × 3² × 7. GCD = 2 × 3 × 7 = 42"},
    {"problem": "If log₁₀(x) = 3, what is x?", "answer": 1000, "steps": "log₁₀(x) = 3 means 10³ = x, so x = 1000"},
    {"problem": "In a right triangle, one leg is 3 cm and the hypotenuse is 5 cm. What is the length of the other leg in cm?", "answer": 4, "steps": "Pythagorean theorem: 3² + b² = 5², 9 + b² = 25, b² = 16, b = 4"},
    {"problem": "Simplify: (x² - 9)/(x - 3) when x = 5. What is the result?", "answer": 8, "steps": "(x² - 9)/(x - 3) = (x+3)(x-3)/(x-3) = x+3. When x=5: 5+3=8"},
    {"problem": "A ball is thrown upward with initial velocity 40 m/s. Using h = 40t - 5t², at what time t (in seconds) does it reach maximum height?", "answer": 4, "steps": "Maximum at vertex: t = -b/(2a) = -40/(2×-5) = 40/10 = 4 seconds"},
    {"problem": "What is the sum of the interior angles of a hexagon in degrees?", "answer": 720, "steps": "Sum = (n-2) × 180 = (6-2) × 180 = 4 × 180 = 720 degrees"},
])

# Combine original and advanced math problems.
# The result overwrites math_problems, so re-running this cell would append
# the advanced rows a second time; drop_duplicates keeps the cell idempotent.
math_problems = pd.concat(
    [math_problems, advanced_math_problems], ignore_index=True
).drop_duplicates(subset=["problem"], ignore_index=True)
# Counts are derived from the data so the message stays true if rows are added.
print(
    f"Extended math dataset: {len(math_problems)} problems "
    f"({len(math_problems) - len(advanced_math_problems)} original + "
    f"{len(advanced_math_problems)} advanced)"
)

In [None]:
# Response schema passed as `schema=` to llm.prompt() by math_reasoning_task.
# NOTE(review): the class docstring may be forwarded to the model as part of the
# schema description - confirm before rewording it.
@dataclass
class MathSolution:
    """Structured response for math problems."""
    step_by_step: str  # Detailed solution steps; not read by the scoring code
    final_answer: float  # Numerical answer; scored against the expected value with a 0.01 absolute tolerance


@kbench.task(name="math_word_problems")
def math_reasoning_task(llm, problem: str, answer: float, steps: str) -> bool:
    """
    Score one math word problem.

    The model is prompted with ``problem`` and must reply in the MathSolution
    schema. Its ``final_answer`` counts as correct when it differs from
    ``answer`` by less than 0.01. ``steps`` is the reference solution and is
    only used in the assertion message.

    Returns True for a correct answer, False otherwise.
    """
    tolerance = 0.01  # absorbs floating point noise in the model's number

    prompt_text = (
        "Solve this math word problem step by step.\n"
        "\n"
        "PROBLEM:\n"
        f"{problem}\n"
        "\n"
        "Show your work clearly, then provide the final numerical answer.\n"
    )
    solution = llm.prompt(prompt_text, schema=MathSolution)

    within_tolerance = abs(solution.final_answer - answer) < tolerance

    # Surface the comparison (and the reference steps) in the benchmark report
    kbench.assertions.assert_true(
        within_tolerance,
        expectation=f"Model answered {solution.final_answer}, expected {answer}. Correct steps: {steps}",
    )
    return within_tolerance


# Test with a single example (the steps string uses a real multiplication sign)
math_reasoning_task.run(
    llm=kbench.llm,
    problem="A store sells apples for $2 each and oranges for $3 each. If Sarah buys 4 apples and 5 oranges, and pays with a $50 bill, how much change does she receive?",
    answer=27,
    steps="Apples: 4×$2=$8, Oranges: 5×$3=$15, Total: $23, Change: $27"
)

---
## Task 3: Causal Reasoning

Tests understanding of cause-and-effect relationships and counterfactual thinking.

In [None]:
# Causal reasoning dataset.
# Rows are written as (scenario, question, answer, reasoning_type) tuples and
# zipped into records, so the column names appear only once.
causal_problems = pd.DataFrame([
    dict(zip(("scenario", "question", "answer", "reasoning_type"), row))
    for row in (
        # --- Original problems ---
        (
            "John was late to work because his alarm didn't go off. His alarm didn't go off because there was a power outage overnight.",
            "What was the root cause of John being late?",
            "power outage",
            "causal chain",
        ),
        (
            "Plants in Garden A received fertilizer and grew 30% taller than plants in Garden B which received no fertilizer. Both gardens had identical soil, sunlight, and water.",
            "What caused the difference in plant height?",
            "fertilizer",
            "controlled experiment",
        ),
        (
            "Every time it rains, the street gets wet. The street is currently wet.",
            "Can we conclude it rained?",
            "no",
            "correlation vs causation",
        ),
        (
            "A factory produces widgets. Machine A breaks down. Production stops. Machine A is repaired. Production resumes.",
            "What would have happened if Machine A hadn't been repaired?",
            "production would not have resumed",
            "counterfactual",
        ),
        (
            "Ice cream sales and drowning incidents both increase in summer. Someone claims eating ice cream causes drowning.",
            "Is this causal claim valid?",
            "no",
            "spurious correlation",
        ),
        # --- New problems ---
        # Confounding variable
        (
            "A study finds that people who drink coffee live longer. However, coffee drinkers also tend to be wealthier and have better access to healthcare.",
            "Can we conclude coffee causes longer life?",
            "no",
            "confounding variable",
        ),
        # Necessary condition
        (
            "To start a car, you need a key (or key fob). The car started.",
            "Was a key used?",
            "yes",
            "necessary condition",
        ),
        # Sufficient condition
        (
            "Getting 100% on the final exam guarantees passing the course. Maria passed the course.",
            "Did Maria definitely get 100% on the final exam?",
            "no",
            "sufficient but not necessary",
        ),
        # Reverse causation
        (
            "Countries with more hospitals have higher death rates. Someone concludes hospitals cause death.",
            "Is this reasoning correct?",
            "no",
            "reverse causation",
        ),
        # Multiple causes
        (
            "A fire requires heat, fuel, and oxygen. A fire started in a warehouse that had all three elements.",
            "Would removing the oxygen have prevented the fire?",
            "yes",
            "necessary conditions",
        ),
        # Natural experiment
        (
            "Two identical twin cities exist on opposite sides of a border. One city banned smoking in public, the other didn't. After 10 years, the city with the ban had 20% fewer lung cancer cases.",
            "What likely caused the difference in lung cancer rates?",
            "smoking ban",
            "natural experiment",
        ),
        # Causal chain (complex)
        (
            "Deforestation leads to soil erosion. Soil erosion causes rivers to become silted. Silted rivers lead to flooding. A region experienced severe flooding.",
            "Could deforestation be a root cause of the flooding?",
            "yes",
            "complex causal chain",
        ),
        # Selection bias
        (
            "A survey of gym members found that 90% exercise regularly. The researcher concluded that 90% of the population exercises regularly.",
            "Is this conclusion valid?",
            "no",
            "selection bias",
        ),
        # Post hoc fallacy
        (
            "I wore my lucky socks and my team won. Therefore, my lucky socks caused the win.",
            "Is this causal reasoning valid?",
            "no",
            "post hoc fallacy",
        ),
        # Intervention vs observation
        (
            "Observational data shows that students who sit in the front row get better grades. A student decides to sit in the front row expecting better grades.",
            "Will sitting in the front row definitely improve grades?",
            "no",
            "intervention vs observation",
        ),
    )
])

print(f"Causal reasoning dataset: {len(causal_problems)} problems")
causal_problems

In [None]:
# Advanced causal reasoning problems (10 harder items).
# Rows are (scenario, question, answer, reasoning_type) tuples zipped into
# records with the same columns as causal_problems.
advanced_causal_problems = pd.DataFrame([
    dict(zip(("scenario", "question", "answer", "reasoning_type"), row))
    for row in (
        (
            "Treatment A has better success rates than Treatment B for both mild and severe cases when analyzed separately. However, when all cases are combined, Treatment B appears more successful overall.",
            "Is Treatment B actually better overall?",
            "no",
            "simpson's paradox",
        ),
        (
            "Education level is associated with higher income. Education also leads to better job positions, and better positions lead to higher income.",
            "Is job position a mediating variable between education and income?",
            "yes",
            "mediating variable",
        ),
        (
            "A coach punishes players after their worst performances. Players tend to perform better after being punished. The coach concludes punishment improves performance.",
            "Is this conclusion justified?",
            "no",
            "regression to the mean",
        ),
        (
            "A researcher studies successful startups and finds they all took big risks. The researcher concludes that taking big risks leads to success.",
            "Is this conclusion valid?",
            "no",
            "survivorship bias",
        ),
        (
            "In a hospital, there appears to be a negative correlation between two diseases - patients with Disease A are less likely to have Disease B. Both diseases independently cause hospitalization.",
            "Does Disease A protect against Disease B in the general population?",
            "no",
            "berkson's paradox",
        ),
        (
            "To study if education causes higher earnings (controlling for motivation), a researcher uses distance to college as an instrument. Distance affects education but doesn't directly affect earnings.",
            "Is distance to college a valid instrumental variable?",
            "yes",
            "instrumental variable",
        ),
        (
            "Increased police presence leads to more arrests. More arrests lead to increased crime statistics. Higher crime statistics lead to more police funding.",
            "Does this prove that more police causes more crime?",
            "no",
            "feedback loop",
        ),
        (
            "Countries with higher chocolate consumption have more Nobel Prize winners per capita. Therefore, eating chocolate makes individuals smarter.",
            "Is this individual-level conclusion valid?",
            "no",
            "ecological fallacy",
        ),
        (
            "A study finds that firefighters are present at larger fires. Conclusion: firefighters cause fires to be larger.",
            "Is this causal claim valid?",
            "no",
            "omitted variable bias",
        ),
        (
            "In a study, depression was measured at the same time as social media use. High social media use correlated with more depression.",
            "Can we conclude social media causes depression?",
            "no",
            "temporal precedence",
        ),
    )
])

# Combine original and advanced causal problems.
# The result overwrites causal_problems, so re-running this cell would append
# the advanced rows a second time; drop_duplicates keeps the cell idempotent.
causal_problems = pd.concat(
    [causal_problems, advanced_causal_problems], ignore_index=True
).drop_duplicates(subset=["scenario", "question"], ignore_index=True)
# Counts are derived from the data so the message stays true if rows are added.
print(
    f"Extended causal reasoning dataset: {len(causal_problems)} problems "
    f"({len(causal_problems) - len(advanced_causal_problems)} original + "
    f"{len(advanced_causal_problems)} advanced)"
)

In [None]:
@kbench.task(name="causal_reasoning")
def causal_reasoning_task(llm, scenario: str, question: str, answer: str, reasoning_type: str) -> bool:
    """
    Evaluate LLM's ability to understand causal relationships.

    Tests:
    - Causal chain identification
    - Distinguishing correlation from causation
    - Counterfactual reasoning
    - Identifying spurious correlations

    Args:
        llm: Model handle; ``prompt(text)`` is used as free text here.
        scenario: Description of the situation to analyze.
        question: Question about the causal structure of the scenario.
        answer: Key word or phrase the response must contain.
        reasoning_type: Label for the kind of reasoning, used in the report.

    Returns:
        True when the expected answer occurs in the response as a whole
        word/phrase (case-insensitive), otherwise False.
    """
    prompt = f"""Analyze the following scenario and answer the question about causality.

SCENARIO:
{scenario}

QUESTION:
{question}

Provide a clear, concise answer with brief reasoning.
"""

    response = llm.prompt(prompt)

    # Check if the key answer concept is present as a whole word/phrase.
    # A plain substring test would accept "no" inside "not", "cannot" or
    # "know", so nearly every response would pass a "no" item.
    # This is still a presence check: it does not verify that the matched
    # word is the response's actual verdict.
    answer_pattern = rf"(?<!\w){re.escape(answer.strip().lower())}(?!\w)"
    is_correct = re.search(answer_pattern, response.lower()) is not None

    kbench.assertions.assert_true(
        is_correct,
        expectation=f"[{reasoning_type}] Expected answer to contain '{answer}'. Model response: {response[:200]}..."
    )

    return is_correct


# Smoke test: run the causal reasoning task once on the ice-cream example
causal_reasoning_task.run(
    llm=kbench.llm,
    **{
        "scenario": "Ice cream sales and drowning incidents both increase in summer. Someone claims eating ice cream causes drowning.",
        "question": "Is this causal claim valid?",
        "answer": "no",
        "reasoning_type": "spurious correlation",
    },
)

---
## Task 4: Analogical Reasoning

Tests the ability to identify patterns and complete analogies (A:B :: C:?).

In [None]:
# Dataset for analogical reasoning.
# One row per problem:
#   analogy      - "A : B :: C : ?" text sent to the model
#   options      - candidate completions offered to the model
#   answer       - the single option scored as correct
#   relationship - reference relationship, shown when an assertion is reported
# Each item must have exactly one defensible option, because scoring is an
# exact match against `answer`.
analogy_problems = pd.DataFrame([
    {
        "analogy": "Painter : Canvas :: Writer : ?",
        "options": ["Book", "Paper", "Pen", "Story"],
        "answer": "Paper",
        "relationship": "Artist : Medium they work on"
    },
    {
        "analogy": "Fish : Swim :: Bird : ?",
        "options": ["Feather", "Fly", "Nest", "Beak"],
        "answer": "Fly",
        "relationship": "Animal : Primary mode of locomotion"
    },
    # "Movie" was removed from the options: a scene is a component of a movie
    # just as much as of a play, so both would have been correct.
    {
        "analogy": "Chapter : Book :: Scene : ?",
        "options": ["Audience", "Play", "Actor", "Stage"],
        "answer": "Play",
        "relationship": "Component : Larger work it belongs to"
    },
    {
        "analogy": "Thermometer : Temperature :: Speedometer : ?",
        "options": ["Car", "Velocity", "Dashboard", "Needle"],
        "answer": "Velocity",
        "relationship": "Instrument : What it measures"
    },
    {
        "analogy": "Caterpillar : Butterfly :: Tadpole : ?",
        "options": ["Fish", "Pond", "Frog", "Egg"],
        "answer": "Frog",
        "relationship": "Juvenile form : Adult form"
    },
    {
        "analogy": "Doctor : Hospital :: Teacher : ?",
        "options": ["Student", "School", "Classroom", "Education"],
        "answer": "School",
        "relationship": "Professional : Their workplace"
    },
])

print(f"Analogy dataset: {len(analogy_problems)} problems")
analogy_problems

In [None]:
# Advanced analogical reasoning problems (10 items).
# Rows are (analogy, options, answer, relationship) tuples zipped into records
# with the same columns as analogy_problems.
advanced_analogy_problems = pd.DataFrame([
    dict(zip(("analogy", "options", "answer", "relationship"), row))
    for row in (
        (
            "Sword : Knight :: Stethoscope : ?",
            ["Hospital", "Doctor", "Patient", "Medicine"],
            "Doctor",
            "Tool : Professional who uses it",
        ),
        (
            "Telescope : Stars :: Microscope : ?",
            ["Bacteria", "Laboratory", "Scientist", "Glass"],
            "Bacteria",
            "Instrument : What it's used to observe",
        ),
        (
            "Conductor : Orchestra :: Director : ?",
            ["Film", "Actor", "Camera", "Script"],
            "Film",
            "Leader : What they lead/create",
        ),
        (
            "Hunger : Eat :: Thirst : ?",
            ["Water", "Drink", "Throat", "Dehydration"],
            "Drink",
            "Sensation : Action to satisfy it",
        ),
        (
            "Seed : Tree :: Egg : ?",
            ["Nest", "Bird", "Shell", "Hatch"],
            "Bird",
            "Beginning stage : Fully developed form",
        ),
        (
            "Petal : Flower :: Feather : ?",
            ["Wing", "Bird", "Flight", "Nest"],
            "Bird",
            "Component : The whole organism",
        ),
        (
            "Verse : Poem :: Movement : ?",
            ["Dance", "Symphony", "Orchestra", "Conductor"],
            "Symphony",
            "Section : Complete artistic work",
        ),
        (
            "Author : Novel :: Composer : ?",
            ["Music", "Symphony", "Piano", "Orchestra"],
            "Symphony",
            "Creator : Their major work type",
        ),
        (
            "Dilute : Concentrate :: Expand : ?",
            ["Contract", "Grow", "Stretch", "Increase"],
            "Contract",
            "Antonyms/Opposites",
        ),
        (
            "Marathon : Sprint :: Novel : ?",
            ["Book", "Story", "Short Story", "Author"],
            "Short Story",
            "Long form : Short form of same type",
        ),
    )
])

# Combine original and advanced analogy problems.
# The result overwrites analogy_problems, so re-running this cell would append
# the advanced rows a second time; drop_duplicates keeps the cell idempotent.
# Deduplication uses only the "analogy" column because "options" holds lists,
# which are unhashable.
analogy_problems = pd.concat(
    [analogy_problems, advanced_analogy_problems], ignore_index=True
).drop_duplicates(subset=["analogy"], ignore_index=True)
# Counts are derived from the data so the message stays true if rows are added.
print(
    f"Extended analogy dataset: {len(analogy_problems)} problems "
    f"({len(analogy_problems) - len(advanced_analogy_problems)} original + "
    f"{len(advanced_analogy_problems)} advanced)"
)

In [None]:
# Response schema passed as `schema=` to llm.prompt() by analogy_task.
# NOTE(review): the class docstring may be forwarded to the model as part of the
# schema description - confirm before rewording it.
@dataclass
class AnalogyAnswer:
    """Structured response for analogy problems."""
    selected_answer: str  # Chosen option; scored case-insensitively against the expected answer
    relationship_explanation: str  # Relationship the model identified; not read by the scoring code


@kbench.task(name="analogical_reasoning")
def analogy_task(llm, analogy: str, options: List[str], answer: str, relationship: str) -> bool:
    """
    Evaluate LLM's ability to complete analogies.

    The model must:
    1. Identify the relationship between the first pair
    2. Apply the same relationship to find the missing term
    3. Select the correct answer from options

    Args:
        llm: Model handle; must provide ``prompt(text, schema=...)``.
        analogy: Analogy text in "A : B :: C : ?" form.
        options: Candidate completions, listed in the prompt.
        answer: The option scored as correct.
        relationship: Reference relationship, included in the assertion message.

    Returns:
        True when the model's selection equals the expected option after
        normalization, otherwise False.
    """
    options_str = ", ".join(options)

    prompt = f"""Complete this analogy by selecting the best option.

ANALOGY:
{analogy}

OPTIONS:
{options_str}

First identify the relationship in the analogy, then select the answer that best completes it.
"""

    response = llm.prompt(prompt, schema=AnalogyAnswer)

    # Normalize both sides: ignore case, surrounding whitespace and wrapping
    # punctuation/quotes, so 'Fly.' or '"Fly"' still match the option "Fly".
    wrapper_chars = "\"'`.,;:!?*"
    selected = response.selected_answer.strip().strip(wrapper_chars).strip().lower()
    expected = answer.strip().strip(wrapper_chars).strip().lower()

    is_correct = selected == expected

    kbench.assertions.assert_true(
        is_correct,
        expectation=f"Model selected '{response.selected_answer}', expected '{answer}'. Relationship: {relationship}"
    )

    return is_correct


# Test with a single example: one hand-written analogy, run once against kbench.llm
analogy_task.run(
    llm=kbench.llm,
    analogy="Fish : Swim :: Bird : ?",
    options=["Feather", "Fly", "Nest", "Beak"],
    answer="Fly",
    relationship="Animal : Primary mode of locomotion"
)

---
## Task 5: Multi-Step Planning (River Crossing Puzzle)

Tests complex sequential reasoning with constraints.

In [None]:
@dataclass
class PlanningAnswer:
    """Structured response for planning problems."""
    steps: List[str]  # Ordered list of steps; planning_task scans these for the item names
    total_crossings: int  # Crossing count reported by the model; compared with the 7-crossing minimum
    explanation: str  # Free-text rationale; planning_task does not inspect it


@kbench.task(name="multi_step_planning")
def planning_task(llm) -> bool:
    """
    River crossing puzzle: transport wolf, goat, cabbage across river one at a time without leaving incompatible pairs alone.

    The model's structured answer is checked heuristically, by keyword:
    1. The first step mentions the goat
    2. The steps mention the wolf, the goat, and the cabbage
    3. The reported crossing count is at least the 7-crossing minimum

    Each check is recorded as an assertion.

    Args:
        llm: Model under test; prompted once with ``PlanningAnswer`` as the schema.

    Returns:
        True only when all three checks pass.
    """
    prompt = """Solve this classic river crossing puzzle:

A farmer needs to cross a river with three items: a wolf, a goat, and a cabbage.
The farmer has a small boat that can only carry himself and ONE item at a time.

CONSTRAINTS:
- The wolf cannot be left alone with the goat (the wolf will eat the goat)
- The goat cannot be left alone with the cabbage (the goat will eat the cabbage)
- The farmer must be present to prevent any eating

How can the farmer get all three items across the river safely?

Provide the step-by-step solution with each river crossing.
"""

    response = llm.prompt(prompt, schema=PlanningAnswer)

    # The minimum solution requires 7 crossings
    min_crossings = 7

    # Steps are declared as strings, but tolerate dict-shaped steps as well.
    normalized_steps = []
    for step in response.steps or []:
        if isinstance(step, dict):
            step = step.get('description', step.get('step', str(step)))
        normalized_steps.append(str(step))

    solution_text = " ".join(normalized_steps).lower()

    # The goat must go first (critical insight)
    goat_first = bool(normalized_steps) and "goat" in normalized_steps[0].lower()

    # Every item has to be mentioned somewhere in the plan.
    all_items_mentioned = all(
        item in solution_text for item in ("wolf", "goat", "cabbage")
    )

    # Guard the comparison so a missing or non-numeric count fails the check
    # instead of raising.
    reported_crossings = response.total_crossings
    enough_crossings = (
        isinstance(reported_crossings, (int, float))
        and reported_crossings >= min_crossings
    )

    kbench.assertions.assert_true(
        goat_first,
        expectation="The goat must be taken across first (critical insight for this puzzle)"
    )

    kbench.assertions.assert_true(
        all_items_mentioned,
        expectation="Solution must address transporting all three items: wolf, goat, and cabbage"
    )

    kbench.assertions.assert_true(
        enough_crossings,
        expectation=f"Minimum valid solution requires {min_crossings} crossings. Model proposed {response.total_crossings}."
    )

    # The crossing-count check is part of the verdict too: a plan reporting
    # fewer than the minimum number of crossings cannot be valid.
    return goat_first and all_items_mentioned and enough_crossings


# Run the planning task once against kbench.llm (the task takes no dataset arguments)
planning_task.run(llm=kbench.llm)

---
## Task 6: Comprehensive Reasoning Evaluation

This is the **main task** that combines all reasoning types and evaluates across datasets.

In [None]:
# Combine all datasets with task type labels
# Define datasets inline to ensure availability in Kaggle Benchmarks execution
# Logic set: one row per problem with columns premises, question, answer ("yes"/"no")
# and a short explanation naming the inference rule or fallacy involved.
_logic_problems = pd.DataFrame([
    {"premises": "All mammals are warm-blooded. All dogs are mammals.", "question": "Are all dogs warm-blooded?", "answer": "yes", "explanation": "Dogs are mammals, and all mammals are warm-blooded."},
    {"premises": "No reptiles are mammals. All snakes are reptiles.", "question": "Are any snakes mammals?", "answer": "no", "explanation": "Snakes are reptiles, and no reptiles are mammals."},
    {"premises": "All programmers use computers. Some artists are programmers.", "question": "Do some artists use computers?", "answer": "yes", "explanation": "Some artists are programmers who use computers."},
    {"premises": "All squares are rectangles. All rectangles have four sides.", "question": "Do all squares have four sides?", "answer": "yes", "explanation": "Squares are rectangles with four sides."},
    {"premises": "If it rains, the ground gets wet. The ground is wet.", "question": "Did it definitely rain?", "answer": "no", "explanation": "Affirming the consequent fallacy."},
    {"premises": "If a person is a bachelor, then they are unmarried. John is a bachelor.", "question": "Is John unmarried?", "answer": "yes", "explanation": "Modus ponens."},
    {"premises": "If it is a weekday, the office is open. The office is not open.", "question": "Is it a weekday?", "answer": "no", "explanation": "Modus tollens."},
    {"premises": "Either the package was delivered or it was lost. The package was not delivered.", "question": "Was the package lost?", "answer": "yes", "explanation": "Disjunctive syllogism."},
    {"premises": "If it snows, the schools close. If the schools close, children stay home.", "question": "If it snows, do children stay home?", "answer": "yes", "explanation": "Hypothetical syllogism."},
    {"premises": "All unicorns have horns. All unicorns are magical creatures.", "question": "Are there magical creatures with horns?", "answer": "no", "explanation": "Existential fallacy."},
    {"premises": "No honest politician takes bribes. Some senators are honest politicians.", "question": "Do some senators not take bribes?", "answer": "yes", "explanation": "Valid syllogism."},
    {"premises": "If you study hard, you will pass the exam. You did not study hard.", "question": "Can we conclude you will fail the exam?", "answer": "no", "explanation": "Denying the antecedent fallacy."},
    {"premises": "All birds have feathers. Penguins are birds.", "question": "Do penguins have feathers?", "answer": "yes", "explanation": "Valid syllogism."},
    {"premises": "A number is either even or odd, but not both. The number 7 is not even.", "question": "Is 7 odd?", "answer": "yes", "explanation": "Exclusive disjunction."},
    {"premises": "No fish can breathe air directly. Salmon are fish.", "question": "Can salmon breathe air directly?", "answer": "no", "explanation": "Universal negative."},
    # Advanced logic problems
    {"premises": "A shape is a square if and only if it has four equal sides and four right angles. This shape has four equal sides and four right angles.", "question": "Is this shape a square?", "answer": "yes", "explanation": "Biconditional."},
    {"premises": "If it's Monday, then if it's raining, the market is closed. It's Monday and it's raining.", "question": "Is the market closed?", "answer": "yes", "explanation": "Nested conditional."},
    {"premises": "If I study, I'll pass. If I work, I'll earn money. I will either study or work.", "question": "Will I either pass or earn money?", "answer": "yes", "explanation": "Constructive dilemma."},
    {"premises": "If the car starts, the battery is charged. If the lights work, the battery is charged. The battery is not charged.", "question": "Can we conclude neither the car starts nor the lights work?", "answer": "yes", "explanation": "Destructive dilemma."},
    {"premises": "All A are B. All B are C. All C are D. All D are E. X is an A.", "question": "Is X an E?", "answer": "yes", "explanation": "Sorites chain."},
    {"premises": "All cats are mammals. No non-mammals are cats.", "question": "Are these two statements logically equivalent?", "answer": "yes", "explanation": "Obversion."},
    {"premises": "All dogs are animals. No cats are dogs.", "question": "Can we conclude that no cats are animals?", "answer": "no", "explanation": "Illicit major fallacy."},
    {"premises": "All cats are mammals. All dogs are mammals.", "question": "Can we conclude anything about the relationship between cats and dogs?", "answer": "no", "explanation": "Undistributed middle."},
    {"premises": "Every person loves someone. There is someone who is loved by every person.", "question": "Do these two statements mean the same thing?", "answer": "no", "explanation": "Quantifier scope difference."},
    {"premises": "It is necessary that if it rains, the ground gets wet. It is possible that it rains.", "question": "Is it possible that the ground gets wet?", "answer": "yes", "explanation": "Modal logic."},
])

# Math set: one row per word problem with columns problem (text sent to the model),
# answer (numeric ground truth) and steps (reference working, not sent to the model
# by create_combined_dataset). Symbols such as ×, ÷, π, ², ³ and subscripts are
# written as real Unicode characters so the model sees the intended notation.
_math_problems = pd.DataFrame([
    {"problem": "A store sells apples for $2 each and oranges for $3 each. If Sarah buys 4 apples and 5 oranges, and pays with a $50 bill, how much change does she receive?", "answer": 27, "steps": "4×$2=$8, 5×$3=$15, Total=$23, Change=$27"},
    {"problem": "A train travels at 60 mph for 2 hours, then at 80 mph for 1.5 hours. What is the total distance traveled?", "answer": 240, "steps": "60×2=120, 80×1.5=120, Total=240"},
    {"problem": "If 3 workers can complete a job in 12 days, how many days would it take 4 workers?", "answer": 9, "steps": "3×12=36 worker-days, 36÷4=9 days"},
    {"problem": "A rectangle has a perimeter of 36 cm. If its length is twice its width, what is the area?", "answer": 72, "steps": "w=6, l=12, Area=72"},
    {"problem": "A tank is 1/3 full. After adding 40 liters, it becomes 2/3 full. What is the total capacity?", "answer": 120, "steps": "40L = 1/3, capacity=120L"},
    {"problem": "A shirt originally costs $80. It's on sale for 25% off. What is the sale price?", "answer": 60, "steps": "80×0.25=$20, $80-$20=$60"},
    {"problem": "You invest $1000 at 5% simple interest per year. How much after 3 years?", "answer": 1150, "steps": "$50×3=$150, $1000+$150=$1150"},
    {"problem": "Tom is twice as old as Jerry. In 5 years, Tom will be 1.5 times as old as Jerry. How old is Jerry?", "answer": 5, "steps": "2x+5=1.5(x+5), x=5"},
    {"problem": "Two cars travel opposite directions at 50 and 70 mph. When are they 360 miles apart?", "answer": 3, "steps": "120 mph combined, 360÷120=3 hours"},
    {"problem": "The ratio of boys to girls is 3:5. If there are 24 students, how many girls?", "answer": 15, "steps": "24÷8=3, 5×3=15"},
    {"problem": "Mix 20% salt solution with 10L of 50% to get 30%. How many liters of 20%?", "answer": 20, "steps": "0.2x+5=0.3(x+10), x=20"},
    {"problem": "Buy for $200, sell for $250. What is the profit percentage?", "answer": 25, "steps": "50/200×100=25%"},
    {"problem": "A circular garden has radius 7m. What is its area? (π=22/7)", "answer": 154, "steps": "(22/7)×49=154"},
    {"problem": "Average of 5 numbers is 20. Remove one, average becomes 15. What was removed?", "answer": 40, "steps": "100-60=40"},
    {"problem": "What is the sum of the first 10 positive even numbers?", "answer": 110, "steps": "2+4+...+20=110"},
    # Advanced math problems
    {"problem": "A farmer has 100 meters of fencing. What is the maximum rectangular area (in square meters) that can be enclosed?", "answer": 625, "steps": "Square side=25, area=625"},
    {"problem": "A bag contains 3 red and 2 blue balls. If two balls drawn without replacement, what is the probability both are red (as percentage)?", "answer": 30, "steps": "(3/5)×(2/4)=30%"},
    {"problem": "A bacteria colony doubles every 4 hours. Starting with 100, how many after 12 hours?", "answer": 800, "steps": "3 doublings: 100×8=800"},
    {"problem": "How many 3-letter arrangements from A,B,C,D,E with no repetition?", "answer": 60, "steps": "5×4×3=60"},
    {"problem": "What is the GCD of 84 and 126?", "answer": 42, "steps": "2×3×7=42"},
    {"problem": "If log₁₀(x) = 3, what is x?", "answer": 1000, "steps": "10³=1000"},
    {"problem": "Right triangle: one leg is 3cm, hypotenuse is 5cm. What is the other leg in cm?", "answer": 4, "steps": "3²+b²=5², b=4"},
    {"problem": "Simplify (x²-9)/(x-3) when x=5. What is the result?", "answer": 8, "steps": "(x+3)=8"},
    {"problem": "Ball thrown up with v=40m/s. Using h=40t-5t², when does it reach max height (seconds)?", "answer": 4, "steps": "t=-40/(2×-5)=4"},
    {"problem": "What is the sum of interior angles of a hexagon in degrees?", "answer": 720, "steps": "(6-2)×180=720"},
])

# Causal set: one row per scenario with columns scenario, question, answer (expected
# phrase or "yes"/"no") and reasoning_type (label of the causal concept being probed).
# Causal chains are written with a real "→" arrow so the model sees the intended notation.
_causal_problems = pd.DataFrame([
    {"scenario": "John was late because his alarm didn't go off due to a power outage.", "question": "What was the root cause?", "answer": "power outage", "reasoning_type": "causal chain"},
    {"scenario": "Garden A with fertilizer grew 30% taller than Garden B without. Same soil, sun, water.", "question": "What caused the difference?", "answer": "fertilizer", "reasoning_type": "controlled experiment"},
    {"scenario": "Every time it rains, the street gets wet. The street is wet.", "question": "Can we conclude it rained?", "answer": "no", "reasoning_type": "correlation vs causation"},
    {"scenario": "Machine A breaks, production stops. Machine A repaired, production resumes.", "question": "What if Machine A wasn't repaired?", "answer": "production would not have resumed", "reasoning_type": "counterfactual"},
    {"scenario": "Ice cream sales and drowning both increase in summer. Claim: ice cream causes drowning.", "question": "Is this causal claim valid?", "answer": "no", "reasoning_type": "spurious correlation"},
    {"scenario": "Coffee drinkers live longer but are also wealthier with better healthcare.", "question": "Can we conclude coffee causes longer life?", "answer": "no", "reasoning_type": "confounding variable"},
    {"scenario": "To start a car, you need a key. The car started.", "question": "Was a key used?", "answer": "yes", "reasoning_type": "necessary condition"},
    {"scenario": "100% on final guarantees passing. Maria passed.", "question": "Did Maria get 100%?", "answer": "no", "reasoning_type": "sufficient but not necessary"},
    {"scenario": "Countries with more hospitals have higher death rates.", "question": "Do hospitals cause death?", "answer": "no", "reasoning_type": "reverse causation"},
    {"scenario": "Fire needs heat, fuel, oxygen. All three present, fire started.", "question": "Would removing oxygen prevent fire?", "answer": "yes", "reasoning_type": "necessary conditions"},
    {"scenario": "Twin cities, one banned smoking. 10 years later, 20% fewer lung cancer cases.", "question": "What caused the difference?", "answer": "smoking ban", "reasoning_type": "natural experiment"},
    {"scenario": "Deforestation→erosion→silted rivers→flooding.", "question": "Could deforestation cause flooding?", "answer": "yes", "reasoning_type": "complex causal chain"},
    {"scenario": "Survey of gym members: 90% exercise. Conclusion: 90% of population exercises.", "question": "Is this valid?", "answer": "no", "reasoning_type": "selection bias"},
    {"scenario": "Wore lucky socks, team won. Lucky socks caused win.", "question": "Is this valid?", "answer": "no", "reasoning_type": "post hoc fallacy"},
    {"scenario": "Front row students get better grades. Student sits in front expecting better grades.", "question": "Will grades improve?", "answer": "no", "reasoning_type": "intervention vs observation"},
    # Advanced causal problems
    {"scenario": "Treatment A better than B for mild and severe cases separately, but B better overall.", "question": "Is Treatment B actually better overall?", "answer": "no", "reasoning_type": "simpson's paradox"},
    {"scenario": "Education→higher income. Education→better jobs→higher income.", "question": "Is job position a mediating variable?", "answer": "yes", "reasoning_type": "mediating variable"},
    {"scenario": "Coach punishes players after worst performances. Players improve after punishment.", "question": "Is punishment improving performance?", "answer": "no", "reasoning_type": "regression to mean"},
    {"scenario": "Researcher studies successful startups: all took big risks. Concludes big risks lead to success.", "question": "Is this conclusion valid?", "answer": "no", "reasoning_type": "survivorship bias"},
    {"scenario": "In hospital, Disease A patients less likely to have Disease B. Both cause hospitalization.", "question": "Does Disease A protect against B in general population?", "answer": "no", "reasoning_type": "berkson's paradox"},
    {"scenario": "Distance to college affects education but not earnings directly. Used to study education→earnings.", "question": "Is distance a valid instrumental variable?", "answer": "yes", "reasoning_type": "instrumental variable"},
    {"scenario": "More police→more arrests→higher crime stats→more police funding.", "question": "Does this prove more police causes more crime?", "answer": "no", "reasoning_type": "feedback loop"},
    {"scenario": "Countries with more chocolate have more Nobel winners. Conclusion: chocolate makes individuals smarter.", "question": "Is this individual-level conclusion valid?", "answer": "no", "reasoning_type": "ecological fallacy"},
    {"scenario": "Firefighters present at larger fires. Conclusion: firefighters cause larger fires.", "question": "Is this causal claim valid?", "answer": "no", "reasoning_type": "omitted variable bias"},
    {"scenario": "Depression and social media use measured simultaneously. High correlation found.", "question": "Can we conclude social media causes depression?", "answer": "no", "reasoning_type": "temporal precedence"},
])

# Analogy set: one row per analogy with columns analogy ("A : B :: C : ?"),
# options (list of candidate completions), answer (the expected option) and
# relationship (short label of the relation linking each pair).
_analogy_problems = pd.DataFrame([
    {"analogy": "Painter : Canvas :: Writer : ?", "options": ["Book", "Paper", "Pen", "Story"], "answer": "Paper", "relationship": "Artist : Medium"},
    {"analogy": "Fish : Swim :: Bird : ?", "options": ["Feather", "Fly", "Nest", "Beak"], "answer": "Fly", "relationship": "Animal : Locomotion"},
    {"analogy": "Chapter : Book :: Scene : ?", "options": ["Movie", "Play", "Actor", "Stage"], "answer": "Play", "relationship": "Part : Whole"},
    {"analogy": "Thermometer : Temperature :: Speedometer : ?", "options": ["Car", "Velocity", "Dashboard", "Needle"], "answer": "Velocity", "relationship": "Instrument : Measurement"},
    {"analogy": "Caterpillar : Butterfly :: Tadpole : ?", "options": ["Fish", "Pond", "Frog", "Egg"], "answer": "Frog", "relationship": "Juvenile : Adult"},
    {"analogy": "Doctor : Hospital :: Teacher : ?", "options": ["Student", "School", "Classroom", "Education"], "answer": "School", "relationship": "Professional : Workplace"},
    # Advanced analogies
    {"analogy": "Sword : Knight :: Stethoscope : ?", "options": ["Hospital", "Doctor", "Patient", "Medicine"], "answer": "Doctor", "relationship": "Tool : Professional"},
    {"analogy": "Telescope : Stars :: Microscope : ?", "options": ["Bacteria", "Laboratory", "Scientist", "Glass"], "answer": "Bacteria", "relationship": "Instrument : Observed"},
    {"analogy": "Conductor : Orchestra :: Director : ?", "options": ["Film", "Actor", "Camera", "Script"], "answer": "Film", "relationship": "Leader : Creation"},
    {"analogy": "Hunger : Eat :: Thirst : ?", "options": ["Water", "Drink", "Throat", "Dehydration"], "answer": "Drink", "relationship": "Sensation : Action"},
    {"analogy": "Seed : Tree :: Egg : ?", "options": ["Nest", "Bird", "Shell", "Hatch"], "answer": "Bird", "relationship": "Beginning : Developed"},
    {"analogy": "Petal : Flower :: Feather : ?", "options": ["Wing", "Bird", "Flight", "Nest"], "answer": "Bird", "relationship": "Part : Whole"},
    {"analogy": "Verse : Poem :: Movement : ?", "options": ["Dance", "Symphony", "Orchestra", "Conductor"], "answer": "Symphony", "relationship": "Section : Work"},
    {"analogy": "Author : Novel :: Composer : ?", "options": ["Music", "Symphony", "Piano", "Orchestra"], "answer": "Symphony", "relationship": "Creator : Work"},
    {"analogy": "Dilute : Concentrate :: Expand : ?", "options": ["Contract", "Grow", "Stretch", "Increase"], "answer": "Contract", "relationship": "Antonyms"},
    {"analogy": "Marathon : Sprint :: Novel : ?", "options": ["Book", "Story", "Short Story", "Author"], "answer": "Short Story", "relationship": "Long : Short"},
])

def create_combined_dataset(logic_problems, math_problems, causal_problems, analogy_problems):
    """Merge the four per-category problem tables into one evaluation table.

    The input frames are not modified. The result has exactly three columns,
    ``task_type``, ``prompt_template`` and ``expected``, with rows ordered
    logic, math, causal, analogy and re-indexed from 0. Math answers are
    converted to strings; the other answers are passed through as given.
    """
    shared = ['task_type', 'prompt_template', 'expected']

    # Logic: premises and question rendered on two lines.
    logic_part = logic_problems.assign(
        task_type='logical_deduction',
        prompt_template=lambda frame: frame.apply(
            lambda row: f"Premises: {row['premises']}\nQuestion: {row['question']}", axis=1
        ),
        expected=lambda frame: frame['answer'],
    )

    # Math: the problem text is the prompt; the numeric answer becomes text.
    math_part = math_problems.assign(
        task_type='math_reasoning',
        prompt_template=lambda frame: frame['problem'],
        expected=lambda frame: frame['answer'].astype(str),
    )

    # Causal: scenario and question rendered on two lines.
    causal_part = causal_problems.assign(
        task_type='causal_reasoning',
        prompt_template=lambda frame: frame.apply(
            lambda row: f"Scenario: {row['scenario']}\nQuestion: {row['question']}", axis=1
        ),
        expected=lambda frame: frame['answer'],
    )

    # Analogy: the option list is flattened into a comma-separated line.
    analogy_part = analogy_problems.assign(
        task_type='analogical_reasoning',
        prompt_template=lambda frame: frame.apply(
            lambda row: f"Analogy: {row['analogy']}\nOptions: {', '.join(row['options'])}", axis=1
        ),
        expected=lambda frame: frame['answer'],
    )

    # Keep only the columns every category shares, then stack in a fixed order.
    pieces = [part[shared] for part in (logic_part, math_part, causal_part, analogy_part)]
    return pd.concat(pieces, ignore_index=True)


# Build the unified table from the inline datasets and report its composition.
combined_dataset = create_combined_dataset(
    logic_problems=_logic_problems,
    math_problems=_math_problems,
    causal_problems=_causal_problems,
    analogy_problems=_analogy_problems,
)
print(f"Combined dataset: {len(combined_dataset)} reasoning problems")
print("\nTask type distribution:")
print(combined_dataset['task_type'].value_counts())

In [None]:
def _answer_in_response(expected: str, response: str) -> bool:
    """Return True when *expected* occurs in *response* as a whole token.

    Plain substring matching is too permissive: "no" is found inside "not"
    or "know", and "5" inside "15" or "50". Matching is case-insensitive and:

    * numeric answers must not be embedded in a longer number ("15", "0.5",
      "5.25" do not match "5"; "5.0" and "$5" do), and are also looked for
      after removing thousands separators, so "1,150" matches "1150";
    * textual answers must be delimited by non-word characters.

    An empty expected answer never matches.
    """
    target = expected.strip()
    if not target:
        return False

    if re.fullmatch(r"-?\d+(?:\.\d+)?", target):
        pattern = rf"(?<![\d.]){re.escape(target)}(?!\d|\.\d*[1-9])"
        # Check the raw text first, then a copy without thousands separators.
        without_separators = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", response)
        candidates = (response, without_separators)
    else:
        pattern = rf"(?<!\w){re.escape(target)}(?!\w)"
        candidates = (response,)

    return any(
        re.search(pattern, text, flags=re.IGNORECASE) is not None
        for text in candidates
    )


@kbench.task(store_task=False)
def single_reasoning_task(llm, task_type: str, prompt_template: str, expected: str) -> dict:
    """
    Single reasoning task for dataset evaluation.

    Builds a prompt from a task-type-specific instruction plus the problem
    text, sends it to the model, and scores the free-text reply by looking
    for the expected answer as a whole token.

    Args:
        llm: Model under test.
        task_type: Category label; selects the instruction (unknown labels
            fall back to a generic one).
        prompt_template: Problem text to present to the model.
        expected: Expected answer as a string.

    Returns:
        A dict with the task type, the first 100 characters of the prompt
        followed by "...", the expected answer, the first 200 characters of
        the response, and the ``is_correct`` flag.
    """
    system_prompts = {
        'logical_deduction': "You are a logical reasoning expert. Answer with 'yes' or 'no' and explain your reasoning.",
        'math_reasoning': "You are a math expert. Solve step by step and provide the final numerical answer.",
        'causal_reasoning': "You are an expert in causal analysis. Identify cause-and-effect relationships.",
        'analogical_reasoning': "You are an expert in pattern recognition. Complete the analogy by selecting the best option.",
    }

    system_prompt = system_prompts.get(task_type, "Answer the following question.")
    full_prompt = f"{system_prompt}\n\n{prompt_template}\n\nProvide a clear, concise answer."

    response = llm.prompt(full_prompt)

    # Whole-token matching avoids false positives such as "no" inside "not".
    is_correct = _answer_in_response(expected, response)

    return {
        "task_type": task_type,
        "prompt": prompt_template[:100] + "...",
        "expected": expected,
        "response": response[:200],
        "is_correct": is_correct,
    }


@kbench.task(name="comprehensive_reasoning_benchmark")
def comprehensive_reasoning_benchmark(llm, df: pd.DataFrame) -> tuple[float, float]:
    """
    Benchmark evaluating logical, mathematical, causal, and analogical reasoning. Returns (accuracy, std_dev).

    Runs ``single_reasoning_task`` once per row of ``df``, prints an overall
    and per-category accuracy report, and records one assertion that the
    overall accuracy is above zero.

    Args:
        llm: Model under test.
        df: Evaluation rows whose columns supply the arguments of
            ``single_reasoning_task``.

    Returns:
        ``(accuracy, std)``: the mean and the standard deviation of the
        per-row correctness flags.
    """
    with kbench.client.enable_cache():
        runs = single_reasoning_task.evaluate(
            stop_condition=lambda runs: len(runs) == df.shape[0],
            max_attempts=1,
            llm=[llm],
            evaluation_data=df,
            n_jobs=2,
            timeout=120,
            remove_run_files=True,
        )

    eval_df = runs.as_dataframe()

    def _field(result, key, default):
        # A run that produced no dict result counts as incorrect / "unknown"
        # instead of aborting the whole report.
        return result.get(key, default) if isinstance(result, dict) else default

    # Extract each per-row field once and reuse it for every statistic.
    correctness = eval_df['result'].apply(lambda r: bool(_field(r, 'is_correct', False)))
    task_types = eval_df['result'].apply(lambda r: _field(r, 'task_type', 'unknown'))

    # Calculate overall accuracy
    accuracy = float(correctness.mean())
    std = float(correctness.std())

    print("\n" + "="*50)
    print("REASONING BENCHMARK RESULTS")
    print("="*50)
    print(f"\nOverall Accuracy: {accuracy:.2%} (±{std:.2%})")
    print(f"\nPer-Category Breakdown:")

    # sort=False keeps categories in order of first appearance.
    per_type_accuracy = correctness.groupby(task_types, sort=False).mean()
    for task_type, type_accuracy in per_type_accuracy.items():
        print(f"  • {task_type}: {type_accuracy:.2%}")

    print("="*50)

    # Record assertions for visibility
    kbench.assertions.assert_true(
        accuracy > 0,
        expectation=f"Overall reasoning accuracy: {accuracy:.2%}"
    )

    return accuracy, std


# Run the comprehensive benchmark on the combined dataset with kbench.llm;
# the returned run object is kept in `run`.
run = comprehensive_reasoning_benchmark.run(
    llm=kbench.llm,
    df=combined_dataset
)

---
## Task 7: Advanced - Judge LLM Evaluation for Open-Ended Reasoning

Uses a judge LLM to evaluate quality of reasoning explanations.

In [None]:
@kbench.task(name="reasoning_quality_evaluation")
def evaluate_reasoning_quality(llm):
    """
    Evaluates the QUALITY of reasoning, not just correctness.

    Uses a judge LLM to assess:
    - Logical coherence
    - Step-by-step clarity
    - Correct identification of key concepts
    - Absence of logical fallacies

    The model answers one fixed word problem; ``kbench.judge_llm`` then grades
    the answer against six criteria. Each criterion is recorded as its own
    assertion and the number of passed criteria is printed. Nothing is
    returned.

    Args:
        llm: Model under test.
    """
    # Complex reasoning problem
    problem = """
    Consider the following scenario:
    
    A company has 100 employees. 60% are engineers, 30% are managers, and 10% are executives.
    All executives are also managers. 50% of engineers have a master's degree.
    20% of managers (not counting executives) have a PhD.
    All executives have either a master's or PhD.
    
    Question: What percentage of the company has at least a master's degree?
    Show your reasoning step by step.
    """

    response = llm.prompt(problem)

    # Use judge LLM to evaluate the reasoning quality
    assessment = kbench.assertions.assess_response_with_judge(
        criteria=[
            "The response correctly identifies that executives are a subset of managers (not additive).",
            # The multiplication sign is a real "×" so the judge reads the formula as intended.
            "The response correctly calculates engineers with master's: 60% × 50% = 30%.",
            "The response correctly accounts for manager degrees separately from executive degrees.",
            "The response shows clear step-by-step mathematical reasoning.",
            "The response arrives at a plausible final answer with proper justification.",
            "The response avoids double-counting any group.",
        ],
        response_text=response,
        judge_llm=kbench.judge_llm,
    )

    # Record each criterion as an assertion
    for result in assessment.results:
        kbench.assertions.assert_true(
            result.passed,
            expectation=f"Criterion: {result.criterion}. Evaluation: {result.reason}"
        )

    # Calculate pass rate
    passed_count = sum(1 for r in assessment.results if r.passed)
    total_count = len(assessment.results)

    print(f"\nReasoning Quality Score: {passed_count}/{total_count} criteria passed")


# Run the quality evaluation once with kbench.llm as the model under test
# (the task itself uses kbench.judge_llm as the judge)
evaluate_reasoning_quality.run(llm=kbench.llm)

---
## 📊 Final: Select Main Task for Leaderboard

Choose which task to publish to the Kaggle Benchmarks leaderboard.

In [None]:
# Select the comprehensive benchmark as the main task for the leaderboard.
# `%choose` is a notebook magic rather than plain Python; its argument matches the
# name passed to @kbench.task(name="comprehensive_reasoning_benchmark").
%choose comprehensive_reasoning_benchmark

---
## 📝 Summary

This benchmark evaluates LLM reasoning capabilities across:

| Category | # Problems | Description |
|----------|------------|-------------|
| Logical Deduction | 25 | Syllogisms, logical inference |
| Math Word Problems | 25 | Multi-step calculations |
| Causal Reasoning | 25 | Cause-effect, counterfactuals |
| Analogical Reasoning | 16 | Pattern completion |
| **Total** | **91** | Comprehensive reasoning evaluation |

### Making This Benchmark Public

1. Click **"Save Version"** in the notebook
2. Go to **Settings** → **Visibility** → **Public**
3. Add more models via "Evaluate More Models" button
4. Share your benchmark URL!

---

**Happy Benchmarking! 🚀**