## Evaluation + Metrics

- PAPER: [Collective Reasoning Among LLMs: A Framework for Answer Validation Without Ground Truth](https://arxiv.org/pdf/2502.20758) by Davoudi et al., 2025

In [1]:
import os
import sys
import warnings

import numpy as np
import pandas as pd

from tqdm import tqdm

import statsmodels
from statsmodels.stats.inter_rater import fleiss_kappa
from statistics import mean

# Current working directory of the kernel process (assumed to be the folder
# this notebook lives in); the relative paths below are resolved against it.
notebook_dir = os.getcwd()
# Put the parent directory on sys.path so the project-local modules imported
# just below (data_processing, classification_models) can be resolved.
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing
from classification_models import EvaluationMetric

## Load Data

In [2]:
# Root data folder, one level above the notebook's working directory.
base_data_path = os.path.join(notebook_dir, '../data/')
# Sub-folder holding the classifier prediction CSVs that are loaded below.
combine_data_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')

In [3]:
# Predictions of the classical ML classifiers (one column per model).
ml_classifiers_data_path = os.path.join(combine_data_path, 'ml_classifiers-v1.csv')
ml_classifiers_df = DataProcessing.load_from_file(ml_classifiers_data_path, 'csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved;
# the positional column slices used later assume it has been removed here.
ml_classifiers_df = DataProcessing.drop_df_columns(ml_classifiers_df, ['Unnamed: 0'])
# Preview the first rows only.
ml_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier
0,"On August 15, 2024, marketing expert David Lee...",1,[-2.67128736e-01 3.99917126e-01 1.51789542e-...,[-1.92454374e+00 1.91188228e+00 6.52823076e-...,1,1,1,1,1
1,Operating profit for the three-month period in...,0,[-8.39660242e-02 1.80937588e-01 -4.57216762e-...,[ 0.32650337 -1.0380536 -0.91864634 0.511139...,0,0,0,0,0
2,Cybersecurity threats should stay same in 2026...,1,[-9.97022167e-02 2.27282479e-01 8.88563171e-...,[ 1.33107498e-01 -4.13728386e-01 1.25563562e+...,1,1,1,1,1


In [4]:
# Predictions of the LLM classifiers (one column per model).
llm_classifiers_data_path = os.path.join(combine_data_path, 'llm_classifiers-v1.csv')
llm_classifiers_df = DataProcessing.load_from_file(llm_classifiers_data_path, 'csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved;
# the positional column slices used later assume it has been removed here.
llm_classifiers_df = DataProcessing.drop_df_columns(llm_classifiers_df, ['Unnamed: 0'])
# Preview the first rows only.
llm_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"On August 15, 2024, marketing expert David Lee...",1,[-2.67128736e-01 3.99917126e-01 1.51789542e-...,[-1.92454374e+00 1.91188228e+00 6.52823076e-...,1,1,1,1
1,Operating profit for the three-month period in...,0,[-8.39660242e-02 1.80937588e-01 -4.57216762e-...,[ 0.32650337 -1.0380536 -0.91864634 0.511139...,0,0,0,0
2,Cybersecurity threats should stay same in 2026...,1,[-9.97022167e-02 2.27282479e-01 8.88563171e-...,[ 1.33107498e-01 -4.13728386e-01 1.25563562e+...,1,1,1,1


In [5]:
# The column-wise pd.concat further down pairs the two frames row by row, so
# both files must hold the same sentences and labels in the same order.
# np.array_equal returns False for arrays of different length instead of
# failing with an unrelated error, which the element-wise `==` would do.
sentences_match = np.array_equal(
    ml_classifiers_df.loc[:, 'Base Sentence'].values,
    llm_classifiers_df.loc[:, 'Base Sentence'].values,
)
print(sentences_match)

labels_match = np.array_equal(
    ml_classifiers_df.loc[:, 'Sentence Label'].values,
    llm_classifiers_df.loc[:, 'Sentence Label'].values,
)
print(labels_match)

# Stop here rather than silently combining misaligned predictions.
assert sentences_match and labels_match, "ML and LLM prediction files are not row-aligned"

True
True


In [6]:
# The model prediction columns are everything to the right of 'Actual Label'.
# Locating that column by name avoids a hard-coded position that would silently
# shift if a metadata column were added or removed.
first_llm_model_col = llm_classifiers_df.columns.get_loc('Actual Label') + 1
llm_classifiers_cols = llm_classifiers_df.iloc[:, first_llm_model_col:]
llm_classifiers_cols.head(3)

Unnamed: 0,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,1,1,1
1,0,0,0
2,1,1,1


In [7]:
# Append the LLM prediction columns to the ML frame. concat(axis=1) aligns rows
# on the index, so this assumes both frames share the same index and row order
# (presumably the default RangeIndex produced when each CSV was loaded).
combined_df = pd.concat([ml_classifiers_df, llm_classifiers_cols], axis=1)
combined_df

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"On August 15, 2024, marketing expert David Lee...",1,[-2.67128736e-01 3.99917126e-01 1.51789542e-...,[-1.92454374e+00 1.91188228e+00 6.52823076e-...,1,1,1,1,1,1,1,1
1,Operating profit for the three-month period in...,0,[-8.39660242e-02 1.80937588e-01 -4.57216762e-...,[ 0.32650337 -1.0380536 -0.91864634 0.511139...,0,0,0,0,0,0,0,0
2,Cybersecurity threats should stay same in 2026...,1,[-9.97022167e-02 2.27282479e-01 8.88563171e-...,[ 1.33107498e-01 -4.13728386e-01 1.25563562e+...,1,1,1,1,1,1,1,1
3,"According to Gran , the company has no plans t...",1,[ 2.75615864e-02 1.79783881e-01 -1.21603109e-...,[ 1.697164 -1.0535955 -2.1446092 -2.578123...,1,1,1,1,1,0,0,0
4,Policy analyst David Lee predicts on 2025-02-1...,1,[-2.79465199e-01 4.11210299e-01 1.06271811e-...,[-2.07615733e+00 2.06401587e+00 1.53700554e+...,1,1,1,1,1,1,1,1
5,"Dr. Smith predicts on November 20, 2029, the n...",1,[-1.23342037e-01 4.48737800e-01 7.77667686e-...,[-1.57422990e-01 2.56955957e+00 1.07646954e+...,1,1,1,1,1,1,1,1
6,Technopolis plans to develop in stages an area...,1,[ 2.36152317e-02 1.75637215e-01 -4.75921258e-...,[ 1.6486638e+00 -1.1094565e+00 -9.4886589e-01 ...,1,0,0,0,0,0,1,1
7,"According to economist Emily Patel, the unempl...",1,[-1.08750269e-01 2.72238493e-01 3.78524251e-...,[ 2.19080448e-02 1.91886961e-01 4.31601644e-...,1,1,1,1,1,1,1,1
8,The Centers for Disease Control and Prevention...,1,[-5.03036678e-02 1.87489763e-01 -3.05069238e-...,[ 0.7402096 -0.94978744 -0.6728323 -0.639951...,1,1,1,1,1,1,1,1
9,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 9.94810045e-01 -9.19510782e-01 -2.08958673e+...,1,1,1,1,1,1,1,1


## Accuracy

+ [x] ML only
+ [x] LLM only

In [8]:
# Model columns are everything to the right of 'Actual Label' in each frame;
# finding that column by name avoids depending on a fixed number of metadata
# columns.
ml_first_model_col = ml_classifiers_df.columns.get_loc('Actual Label') + 1
llm_first_model_col = llm_classifiers_df.columns.get_loc('Actual Label') + 1

ml_model_names = ml_classifiers_df.columns[ml_first_model_col:].to_list()
llm_model_names = llm_classifiers_df.columns[llm_first_model_col:].to_list()

# ML models first, then LLMs — the order used for every "all models" metric below.
model_names = ml_model_names + llm_model_names
model_names

['Perceptron',
 'SDG Classifier',
 'Logistic Regression',
 'Ridge Classifier',
 'llama-3.1-8b-instant',
 'llama-3.3-70b-versatile',
 'llama-3.3-70b-instruct']

In [9]:
# Show the LLM model column names on their own.
llm_model_names

['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'llama-3.3-70b-instruct']

In [10]:
# EvaluationMetric is a project helper; a single instance is reused for every model.
get_metrics = EvaluationMetric()
# Ground-truth labels that each model's predictions are compared against.
actual_label = combined_df['Actual Label'].values
# Print the labels, the predictions and a report for each ML and LLM model in turn.
for model_name in model_names:
    print(f"=============================={model_name}==============================\n")
    print(f"Actual Label:\t\t{actual_label}")
    model_predictions = combined_df[model_name].values
    print(f"{model_name}:\t\t{model_predictions}")
    # Judging by the captured output, this prints per-class precision/recall/F1
    # and accuracy — confirm against classification_models.EvaluationMetric.
    get_metrics.eval_classification_report(actual_label, model_predictions)
    print("==========================================================================\n")


Actual Label:		[1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0]
Perceptron:		[1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0]
              precision    recall  f1-score   support

           0       0.60      1.00      0.75         3
           1       1.00      0.89      0.94        18

    accuracy                           0.90        21
   macro avg       0.80      0.94      0.85        21
weighted avg       0.94      0.90      0.91        21



Actual Label:		[1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0]
SDG Classifier:		[1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0]
              precision    recall  f1-score   support

           0       0.60      1.00      0.75         3
           1       1.00      0.89      0.94        18

    accuracy                           0.90        21
   macro avg       0.80      0.94      0.85        21
weighted avg       0.94      0.90      0.91        21



Actual Label:		[1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0]
Logistic Regression:		[1 0 1 1 1 1 0 1 1 1 1 0

## Majority Vote

+ [x] ML only
+ [x] LLM only
+ [x] Combined

In [11]:
def _majority_vote(predictions: pd.DataFrame) -> pd.DataFrame:
    """Return the row-wise majority label as a single-column DataFrame.

    ``DataFrame.mode(axis=1)`` produces one column per tied mode, so a row
    whose votes are split evenly (possible with an even number of models)
    widens the result to several columns, which cannot be assigned to a single
    DataFrame column. Only the first mode column is kept; modes come back in
    ascending order, so a tie resolves to the smallest label.

    Args:
        predictions: One column per model, one row per sentence.

    Returns:
        A DataFrame with a single column (labelled 0) holding the winning
        label of each row, indexed like ``predictions``.
    """
    votes = predictions.mode(axis=1).iloc[:, [0]]
    # Tied rows pad the mode frame with NaN, which upcasts integer labels to
    # float; restore the shared label dtype when every row has a vote.
    label_dtypes = predictions.dtypes.unique()
    if len(label_dtypes) == 1 and votes.notna().all().all():
        votes = votes.astype(label_dtypes[0])
    return votes


ml_majority_vote = _majority_vote(combined_df.loc[:, ml_model_names])

llm_majority_vote = _majority_vote(combined_df.loc[:, llm_model_names])

combined_majorty_vote = _majority_vote(combined_df.loc[:, model_names])

# Assign the single vote column explicitly rather than relying on pandas to
# unwrap a one-column DataFrame.
combined_df['ML Majority Vote'] = ml_majority_vote.iloc[:, 0]
combined_df['LLM Majority Vote'] = llm_majority_vote.iloc[:, 0]
combined_df['All Models Majority Vote'] = combined_majorty_vote.iloc[:, 0]
combined_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,ML Majority Vote,LLM Majority Vote,All Models Majority Vote
0,"On August 15, 2024, marketing expert David Lee...",1,[-2.67128736e-01 3.99917126e-01 1.51789542e-...,[-1.92454374e+00 1.91188228e+00 6.52823076e-...,1,1,1,1,1,1,1,1,1,1,1
1,Operating profit for the three-month period in...,0,[-8.39660242e-02 1.80937588e-01 -4.57216762e-...,[ 0.32650337 -1.0380536 -0.91864634 0.511139...,0,0,0,0,0,0,0,0,0,0,0
2,Cybersecurity threats should stay same in 2026...,1,[-9.97022167e-02 2.27282479e-01 8.88563171e-...,[ 1.33107498e-01 -4.13728386e-01 1.25563562e+...,1,1,1,1,1,1,1,1,1,1,1
3,"According to Gran , the company has no plans t...",1,[ 2.75615864e-02 1.79783881e-01 -1.21603109e-...,[ 1.697164 -1.0535955 -2.1446092 -2.578123...,1,1,1,1,1,0,0,0,1,0,1
4,Policy analyst David Lee predicts on 2025-02-1...,1,[-2.79465199e-01 4.11210299e-01 1.06271811e-...,[-2.07615733e+00 2.06401587e+00 1.53700554e+...,1,1,1,1,1,1,1,1,1,1,1
5,"Dr. Smith predicts on November 20, 2029, the n...",1,[-1.23342037e-01 4.48737800e-01 7.77667686e-...,[-1.57422990e-01 2.56955957e+00 1.07646954e+...,1,1,1,1,1,1,1,1,1,1,1
6,Technopolis plans to develop in stages an area...,1,[ 2.36152317e-02 1.75637215e-01 -4.75921258e-...,[ 1.6486638e+00 -1.1094565e+00 -9.4886589e-01 ...,1,0,0,0,0,0,1,1,0,1,0


##  Reliability Metric

##  Confidence Intervals via Bootstrap

## Chi-Square Test of Independence

## Fleiss' Kappa

- How reliable are the raters?
    - Higher: they tend to agree
    - Lower: they tend to disagree
- Does NOT indicate whether the raters are correct: the raters can all say one thing while the true value is another. That is a question of validity, not reliability.

In [12]:
def get_unique_classifier_labels(df: pd.DataFrame, col_names: list) -> np.ndarray:
    """Return the distinct label values found across the given classifier columns.

    Args:
        df: Frame holding one prediction column per classifier.
        col_names: Names of the classifier columns to inspect.

    Returns:
        A sorted 1-D NumPy array of the unique values that occur in any of the
        selected columns (e.g. ``array([0, 1])`` for binary predictions).
    """
    # np.unique flattens the 2-D block of predictions and sorts the distinct values.
    return np.unique(df.loc[:, col_names].to_numpy())

In [13]:
# Sanity check: list the label values the ML classifiers actually emitted.
get_unique_classifier_labels(combined_df, ml_model_names)

array([0, 1])

- Following: https://numiqo.com/tutorial/fleiss-kappa

In [63]:
def reformat_df_with_classifier_labels(df: pd.DataFrame, model_names: list, new_col_name: str) -> pd.DataFrame:
    """Build the rating-count table for Fleiss' kappa and print the resulting score.

    Each model is treated as a rater and each row of ``df`` as a subject. For
    every distinct label that appears in the selected model columns, the
    number of models that chose that label is counted per row. The resulting
    subjects-by-labels count table is passed to ``fleiss_kappa``.

    Works for any number of distinct labels, not only binary predictions.

    Args:
        df: Frame holding one prediction column per model.
        model_names: Columns of ``df`` to treat as raters.
        new_col_name: Suffix for the output columns, which are named
            ``"<label> <new_col_name>"``.

    Returns:
        The count table: one row per row of ``df`` (re-indexed from 0) and one
        column per distinct label.

    Raises:
        ValueError: If there are no rows or no model columns to rate.

    Side effects:
        Prints the per-label counts, the assembled count dictionary and the
        Fleiss' kappa score.
    """
    predictions = df.loc[:, model_names]
    if predictions.empty:
        raise ValueError("Fleiss' kappa needs at least one row and one model column.")

    label_values = get_unique_classifier_labels(df, model_names)

    results_per_value_dict = {}
    for label_value in label_values:
        # Per row, count how many models predicted this label. tolist() yields
        # plain Python ints so the printed lists stay readable.
        results_per_value = (predictions == label_value).sum(axis=1).tolist()
        results_per_value_dict[f"{label_value} {new_col_name}"] = results_per_value

        print(f"Label value {label_value}: {results_per_value} -> summed(results for label value {label_value}): {sum(results_per_value)}")

    print(results_per_value_dict)
    final_df = pd.DataFrame(results_per_value_dict)
    # Each row sums to len(model_names), as long as no prediction is missing.
    fleiss_kappa_score = fleiss_kappa(final_df)
    print(fleiss_kappa_score)
    return final_df

In [64]:
# Agreement among the classical ML classifiers only.
new_col_name = "is the ML Model Class"
reformat_df_with_classifier_labels(combined_df, ml_model_names, new_col_name)

Label value 0: [0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 4] -> summed(results for label value 0): 19
Label value 1: [4, 0, 4, 4, 4, 4, 0, 4, 4, 4, 4, 1, 0, 4, 4, 4, 4, 4, 4, 4, 0] -> summed(results for label value 1): 65
{'0 is the ML Model Class': [0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 4], '1 is the ML Model Class': [4, 0, 4, 4, 4, 4, 0, 4, 4, 4, 4, 1, 0, 4, 4, 4, 4, 4, 4, 4, 0]}
0.9319838056680161


Unnamed: 0,0 is the ML Model Class,1 is the ML Model Class
0,0,4
1,4,0
2,0,4
3,0,4
4,0,4
5,0,4
6,4,0
7,0,4
8,0,4
9,0,4


In [65]:
# Agreement among the LLM classifiers only.
new_col_name = "is the LLM Model Class"
reformat_df_with_classifier_labels(combined_df, llm_model_names, new_col_name)

Label value 0: [0, 3, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3] -> summed(results for label value 0): 13
Label value 1: [3, 0, 3, 0, 3, 3, 2, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0] -> summed(results for label value 1): 50
{'0 is the LLM Model Class': [0, 3, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3], '1 is the LLM Model Class': [3, 0, 3, 0, 3, 3, 2, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0]}
0.9030769230769228


Unnamed: 0,0 is the LLM Model Class,1 is the LLM Model Class
0,0,3
1,3,0
2,0,3
3,3,0
4,0,3
5,0,3
6,1,2
7,0,3
8,0,3
9,0,3


In [66]:
# Agreement across all models, ML and LLM together.
new_col_name = "is the ML x LLM Model Class"
reformat_df_with_classifier_labels(combined_df, model_names, new_col_name)

Label value 0: [0, 7, 0, 3, 0, 0, 5, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 7] -> summed(results for label value 0): 32
Label value 1: [7, 0, 7, 4, 7, 7, 2, 7, 7, 7, 7, 4, 0, 7, 7, 7, 7, 7, 7, 7, 0] -> summed(results for label value 1): 115
{'0 is the ML x LLM Model Class': [0, 7, 0, 3, 0, 0, 5, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 7], '1 is the ML x LLM Model Class': [7, 0, 7, 4, 7, 7, 2, 7, 7, 7, 7, 4, 0, 7, 7, 7, 7, 7, 7, 7, 0]}
0.7736413043478261


Unnamed: 0,0 is the ML x LLM Model Class,1 is the ML x LLM Model Class
0,0,7
1,7,0
2,0,7
3,3,4
4,0,7
5,0,7
6,5,2
7,0,7
8,0,7
9,0,7


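> Agreement (Fleiss' kappa): ML (0.932) > LLM (0.903) > ML x LLM (0.774). Why? Due to the randomness of the LLMs? Note that the combined score also counts disagreement *between* the two groups (e.g. rows 3 and 6, where the ML models and the LLMs each largely agree internally but with opposite labels), not only disagreement within each group.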

## Confusion Matrix