# Running the entailment methods on the full dataset

This notebook takes the lessons from the `comparing_entailment_methods.ipynb` notebook and runs the final entailment methods on the full dataset to generate metrics for the different LLMs.

In [None]:
# Data handling and general utilities used throughout the notebook.
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import ast
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment. This must
# run before the os.getenv('OPENAI_API_KEY') lookup further down.
load_dotenv()

from pydantic import BaseModel
import json  # NOTE(review): duplicate of the `import json` above; harmless but redundant.

import openai
# The API key is read from the OPENAI_API_KEY environment variable; os.getenv
# returns None when the variable is not set.
# NOTE(review): `client` is not referenced by any of the cells visible here;
# presumably the helpers in src.entailment build or use their own client. Confirm.
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [None]:
from src.entailment import entailment_from_gpt_json, process_entailment_result


In [None]:
# Load the questions together with the per-model responses. The
# 'own_opinion.text' column is stored in the CSV as the string form of a Python
# literal, so it is parsed back into a Python object here (it is iterated as a
# sequence of opinions in the entailment loop below).
questions_csv = 'data/habermas_machine_questions_with_responses.csv'
opinion_column = 'own_opinion.text'

df_questions = pd.read_csv(questions_csv)
df_questions[opinion_column] = df_questions[opinion_column].map(ast.literal_eval)
df_questions.head()

In [None]:
# Names of the response-generating models to evaluate. Each name must be a
# column of `df_questions` holding that model's response to the question.
# Keep this list up to date before running the notebook.
response_models = ['gpt-3.5-turbo', 'gpt-4o', 'llama-3.1-8B', 'mistral-7B']

# Fail fast if any model has no matching response column.
assert set(response_models).issubset(df_questions.columns)

In [None]:
# Run the entailment check for every (question, opinion, response model)
# combination and collect one record per combination.
#
# `entailment_model` is the single source of truth for the judge model: it is
# both passed to `entailment_from_gpt_json` and stored in every record, so the
# recorded name always matches the model that was actually called.
entailment_model = 'gpt-4o-mini'
entailment_results = []
for _, row in tqdm(df_questions.iterrows(), total=df_questions.shape[0], desc="Questions", leave=True):
    question = row['question.text']
    question_id = row['question_id']
    opinions = row['own_opinion.text']
    # Inner progress bar over this question's opinions; leave=False removes it
    # once the question is finished so only the outer bar persists.
    for opinion_idx, opinion in enumerate(tqdm(opinions, desc="Opinions", leave=False)):
        for response_model in response_models:
            response = row[response_model]
            entailment_result = entailment_from_gpt_json(
                question, response, opinion, model=entailment_model
            )
            # Post-process the raw entailment output against the response text.
            matches = process_entailment_result(entailment_result, response)
            entailment_results.append({
                'question_id': question_id,
                'opinion_idx': opinion_idx,
                'response_model': response_model,
                'entailment_model': entailment_model,
                'entailment_result': entailment_result,
                'matches': matches
            })


In [None]:
# Collect the per-combination records into a single frame and persist it so the
# entailment run does not have to be repeated.
# Note: to_csv writes non-scalar cells (e.g. the `matches` column) as their
# string representation, so they need re-parsing if the CSV is loaded again.
raw_results_csv = 'data/raw_entailment_results.csv'

raw_entailment_results = pd.DataFrame(entailment_results)
raw_entailment_results.to_csv(raw_results_csv, index=False)
raw_entailment_results.head()

## Generating metrics

Here we're going to generate a sample of metrics over the entailment results for each of the models.

In [None]:
def _total_match_length(spans):
    """Return the summed length of all spans, each computed as span[1] - span[0]."""
    return sum(span[1] - span[0] for span in spans)


# Total matched length per record (0 when a record has no matches).
raw_entailment_results['match_total_length'] = (
    raw_entailment_results['matches'].apply(_total_match_length)
)

In [None]:
# Character length of each question text, keyed by question_id.
question_lengths = df_questions.set_index('question_id')['question.text'].str.len()

# Attach the question length to every record and normalise the matched length by it.
# NOTE(review): the match spans are produced from the *response* text
# (process_entailment_result(entailment_result, response)), so dividing by the
# question length may not be the intended normalisation. Confirm whether the
# response length should be the denominator instead.
raw_entailment_results['question_length'] = (
    raw_entailment_results['question_id'].map(question_lengths)
)
raw_entailment_results['match_length_ratio'] = (
    raw_entailment_results['match_total_length'].div(raw_entailment_results['question_length'])
)

In [None]:
# Average match-length ratio for each response model.
(
    raw_entailment_results
    .groupby('response_model')['match_length_ratio']
    .mean()
)

In [None]:
# Heatmap of the pairwise correlation between response models' match-length ratios.
import seaborn as sns
import matplotlib.pyplot as plt

# One row per (question, opinion) pair and one column per response model;
# pivot_table averages any duplicate entries (its default aggregation).
pivot_df = raw_entailment_results.pivot_table(
    values='match_length_ratio',
    index=['question_id', 'opinion_idx'],
    columns='response_model',
)

# Pairwise correlation between the model columns.
corr_matrix = pivot_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,        # print the correlation value in each cell
    cmap='coolwarm',   # diverging colour scheme
    vmin=-1, vmax=1,   # fix the colour scale to the full correlation range
    center=0,          # neutral colour at zero correlation
    square=True,       # square cells
    ax=ax,
)
ax.set_title('Correlation of Entailment Lengths Between Models')
fig.tight_layout()
plt.show()