In [49]:
# In this notebook, we will evaluate the responses generated by the LLMs.

import pandas as pd
import numpy as np
import os
import json
from tqdm.auto import tqdm
import ast


In [50]:
df_questions = pd.read_csv('data/habermas_machine_questions_with_responses.csv')
df_questions['own_opinion.text'] = df_questions['own_opinion.text'].apply(ast.literal_eval)
df_questions.head()

Unnamed: 0,question.text,own_opinion.text,question_topic,question_id,gpt-4o-mini,gpt-3.5-turbo,gpt-4o
0,Are laws banning private education a good idea?,[I am indifferent regarding this statement for...,70,7,The debate over whether laws banning private e...,Banning private education completely may not b...,The question of whether laws banning private e...
1,"If we could extend human life by 50 years, sho...",[I think we should always advance science and ...,90,128,The question of whether we should extend human...,This question is highly subjective and there a...,Extending human life by 50 years is a complex ...
2,Is it OK to use animal products at all for bea...,[I do not think it is ok to use animal product...,5,157,The use of animal products in beauty care is a...,This is a personal choice and ultimately depen...,The use of animal products in beauty care is a...
3,Is the use of fossil fuels doing more harm tha...,[This question is if you believe in man- made ...,39,343,The impact of fossil fuels on the economy is a...,The use of fossil fuels has both positive and ...,The use of fossil fuels is a complex issue wit...
4,Should parents have a legal right to override ...,[Parents should have a legal right to some deg...,61,511,The question of whether parents should have a ...,There is no simple answer to this question as ...,The question of whether parents should have a ...


In [40]:
# Now we want to see if the opinions are all represented in the generated responses.
from openai import OpenAI
client = OpenAI()

def is_my_opinion_represented(question, response, opinion):
    """
    Determine if the following opinion is represented in the response to a question. This is the simplest approach I can think of.
    """
    prompt = f"""Task: Determine if the following opinion is represented in the response to a question.

Question: {question}
Response: {response}
Opinion to check for: {opinion}

Instructions:
- Answer ONLY with 'yes' or 'no'
- Answer 'yes' if the opinion is clearly represented
- Answer 'no' if the opinion is absent
- Do not explain your reasoning
- Do not add any other text

Answer:"""

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise evaluator that only responds with 'yes' or 'no'."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # Use 0 for maximum consistency
        max_tokens=1, # We only need one token for 'yes' or 'no'
    )
    
    return completion.choices[0].message.content.strip().lower()

In [None]:
def is_my_opinion_represented_with_cot_michiel(question, response, opinion):
    """
    Determine if the following opinion is represented in the response to a question. Using a CoT with XML formatting (task for Michiel Bakker).
    """
    return False

In [123]:
from pydantic import BaseModel

class ThinkingStep(BaseModel):
    step_number: int
    thought: str

class RepresentationChain(BaseModel):
    steps: list[ThinkingStep]
    relevance_score: int

def is_my_opinion_represented_structured_cot(question, response, opinion):
    """
    Determine if the opinion is represented in the response to a question, using structured CoT generation.
    """
    system_prompt = f"""Task: Determine if the opinion is represented in the response to the question. 
You should think step-by-step about the question and response to determine if the opinion is represented and then give a score between 0 and 10 of how well it is represented.
"""
    
    prompt ="""Question: {question}
Response: {response}
Opinion to check for: {opinion}
"""

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a precise evaluator that only responds with 'yes' or 'no'."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # Use 0 for maximum consistency
        response_format={
            'type': 'json_schema',
            'json_schema': 
                {
                "name": "RepresentationChain", 
                "schema": RepresentationChain.model_json_schema()
                }
            } 
    )
    
    result_object = json.loads(completion.choices[0].message.content)
    return result_object

def process_representation_result(result_object):
    return result_object['relevance_score']


In [124]:
from pydantic import BaseModel

class ThinkingStep(BaseModel):
    step_number: int
    thought: str

class EntailmentJSON(BaseModel):
    matches: list[str]

class EntailmentChain(BaseModel):
    steps: list[ThinkingStep]
    final_answer: EntailmentJSON

def entailment_from_gpt_json(question: str, response: str, opinion: str):
    """
    Find exact text matches between rich text and opinion using GPT-4.

    Args:
        question: Context question or query
        response: Source text to find matches in
        opinion: Opinion text to match against

    Returns:
        Dictionary containing matches as an array of strings
    """
    system_prompt = f"""Task: Text Entailment. Find the exact words in the Rich Text question response that specifically represent the content expressed in the Opinion.

Instructions:
1. For each concept in the Opinion, find the sentences in the Rich Text that specifically represent the content expressed in the Opinion. Think through this step-by-step.
2. Return the EXACT text from the Rich Text that represents the content expressed in the Opinion.

Important:
- Do not paraphrase or interpret - use the exact text.
- Each match must be a continuous span of text.
"""

    prompt = f"""Context question: {question}
Rich Text: {response}
Opinion: {opinion}

Find the text that is entailed by the opinion."""

    chat_response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=0,  # Use 0 for maximum consistency
        response_format={
            'type': 'json_schema',
            'json_schema': 
                {
                "name": "EntailmentChain", 
                "schema": EntailmentChain.model_json_schema()
                }
            } 
    )

    result_object = json.loads(chat_response.choices[0].message.content)
    return result_object

def process_entailment_result(result_object, response):
    matches = []
    for match in result_object['final_answer']['matches']:
        start_index = response.lower().find(match.lower())
        end_index = start_index + len(match)
        matches.append((start_index, end_index))
    return matches


In [128]:
models = ['gpt-4o-mini', 'gpt-3.5-turbo','gpt-4o']

overton_results = []
for _, row in tqdm(df_questions.iterrows(), total=df_questions.shape[0]):
    question = row['question.text']
    opinions = row['own_opinion.text']
    question_id = row['question_id']
    for opinion_idx, opinion in enumerate(opinions):
        for model in models:
            response = row[model]
            result_opinion_represented = is_my_opinion_represented(question, response, opinion)
            result_structured_cot = is_my_opinion_represented_structured_cot(question, response, opinion)
            entailment_result = entailment_from_gpt_json(question, response, opinion)
            entailment_matches = process_entailment_result(entailment_result, response)

            overton_results.append({
                'question_id': question_id,
                'opinion_idx': opinion_idx,
                'model': model,
                'is_represented_simple_prompt': result_opinion_represented == 'yes',
                'is_represented_structured_cot': result_structured_cot,
                'is_represented_structured_cot_score': process_representation_result(result_structured_cot),
                'entailment_result': entailment_result,
                'entailment_matches': entailment_matches
            })

df_overton_results = pd.DataFrame(overton_results)

In [126]:
df_overton_results = pd.DataFrame(overton_results)

In [203]:
df_overton_results.to_csv('data/overton_results.csv', index=False)

In [135]:
df_overton_results.groupby(['model', 'question_id'])['is_represented_simple_prompt'].mean().reset_index().groupby('model')['is_represented_simple_prompt'].mean()

model
gpt-3.5-turbo    0.066667
gpt-4o           0.200000
gpt-4o-mini      0.266667
Name: is_represented_simple_prompt, dtype: float64

# Analysis

In [198]:
df_overton_results = pd.read_csv('data/overton_results.csv')

In [199]:
df_overton_results['is_represented_structured_cot_score.bool'] = df_overton_results['is_represented_structured_cot_score'] > 5
df_overton_results['meta_analysis.simpleXstructured'] = (df_overton_results['is_represented_simple_prompt'] == df_overton_results['is_represented_structured_cot_score.bool']).astype(int)
df_overton_results['meta_analysis.entailmentLength'] = df_overton_results['entailment_matches'].apply(lambda x: len(x))
df_overton_results['meta_analysis.entailmentXstructured'] = ((df_overton_results['meta_analysis.entailmentLength'] > 1) == df_overton_results['is_represented_structured_cot_score.bool'])


In [200]:
print(f"""Meta analysis:
- Percent represented (structured cot) > 5: {df_overton_results['is_represented_structured_cot_score.bool'].mean()}
- Percent represented (simple prompt) == Percent represented (structured cot): {df_overton_results['meta_analysis.simpleXstructured'].mean()}
- Percent entailment length > 1 == represented (structured cot): {df_overton_results['meta_analysis.entailmentXstructured'].mean()}
- Mean Entailment Length: {df_overton_results['meta_analysis.entailmentLength'].mean()}
- Max Entailment Length: {df_overton_results['meta_analysis.entailmentLength'].max()}
""")


Meta analysis:
- Percent represented (structured cot) > 5: 0.4
- Percent represented (simple prompt) == Percent represented (structured cot): 0.7142857142857143
- Percent entailment length > 1 == represented (structured cot): 0.4
- Mean Entailment Length: 24.97142857142857
- Max Entailment Length: 44



# Visual example

In [201]:
example =df_overton_results.sample(1).iloc[0]
example_question = df_questions.loc[df_questions['question_id'] == example['question_id']].iloc[0]
opinion = example_question['own_opinion.text'][example['opinion_idx']]
response =example_question[example['model']]


In [202]:
print(f"""Question: {example_question['question.text']}
      
Response: {response}

From model: {example['model']}

Opinion: {opinion}

ANALYSIS:

Is represented (simple prompt): {example['is_represented_simple_prompt']}

Is represented (structured cot): {example['is_represented_structured_cot']}

Entailment result: {example['entailment_matches']}

Entailment Reasoning: {example['entailment_result']}

META ANALYSIS:

Simple prompt == Structured cot: {example['meta_analysis.simpleXstructured'] == 1}

Entailment length > 1 AND simple prompt == structured cot: {example['meta_analysis.entailmentXstructured']}

""")


Question: If we could extend human life by 50 years, should we do it?
      
Response: This question is highly subjective and there are various ethical, social, and scientific considerations to take into account when discussing the idea of extending human life by 50 years. Some arguments in favor of extending human life may include advancements in medical technology and the potential for individuals to experience more of their lives, achieve personal goals, and contribute to society. On the other hand, concerns may be raised about overpopulation, resource depletion, inequality in access to life-extending technologies, and the potential negative impacts on the environment.

Ultimately, the decision to extend human life by 50 years would require careful consideration of these factors and a thorough examination of the possible consequences. It is important to engage in open and transparent discussions with experts, policymakers, ethicists, and members of the public to ensure that any deci