## Evaluation + Metrics

- PAPER: [Collective Reasoning Among LLMs: A Framework for Answer Validation Without Ground Truth](https://arxiv.org/pdf/2502.20758) by Davoudi et al., 2025

In [1]:
import os
import sys
import warnings
import statsmodels

import numpy as np
import pandas as pd

from tqdm import tqdm
from statistics import mean
from json import loads, dumps

from statsmodels.stats.inter_rater import fleiss_kappa

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing
from classification_models import EvaluationMetric

## Load Data

In [2]:
base_data_path = os.path.join(notebook_dir, '../data/')
combine_data_path = os.path.join(base_data_path, 'combined_generated_fin_phrase_bank')

In [3]:
ml_classifiers_data_path = os.path.join(combine_data_path, 'ml_classifiers-v1.csv')
ml_classifiers_df = DataProcessing.load_from_file(ml_classifiers_data_path, 'csv')
ml_classifiers_df = DataProcessing.drop_df_columns(ml_classifiers_df, ['Unnamed: 0'])
# len(ml_classifiers_df), ml_classifiers_df.head(3)
ml_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1


In [4]:
llm_classifiers_data_path = os.path.join(combine_data_path, 'llm_classifiers-v1.csv')
llm_classifiers_df = DataProcessing.load_from_file(llm_classifiers_data_path, 'csv')
llm_classifiers_df = DataProcessing.drop_df_columns(llm_classifiers_df, ['Unnamed: 0'])
# len(llm_classifiers_df), llm_classifiers_df.head(3)
llm_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,1,1,1
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,1,1,1
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1


In [5]:
print(all(ml_classifiers_df.loc[:, 'Base Sentence'].values == llm_classifiers_df.loc[:, 'Base Sentence'].values))

print(all(ml_classifiers_df.loc[:, 'Sentence Label'].values == llm_classifiers_df.loc[:, 'Sentence Label'].values))

True
True


In [6]:
llm_classifiers_cols = llm_classifiers_df.iloc[:, 5:]
llm_classifiers_cols.head(3)

Unnamed: 0,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,1,1,1
1,1,1,1
2,1,1,1


In [7]:
combined_df = pd.concat([ml_classifiers_df, llm_classifiers_cols], axis=1)
combined_df

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0


## Accuracy

+ [x] ML only
+ [x] LLM only

In [8]:
ml_model_names = ml_classifiers_df.columns.to_list()[5:]
llm_model_names = llm_classifiers_df.columns.to_list()[5:]

model_names = ml_model_names + llm_model_names
model_names

['Perceptron',
 'SDG Classifier',
 'Logistic Regression',
 'Ridge Classifier',
 'llama-3.1-8b-instant',
 'llama-3.3-70b-versatile',
 'llama-3.3-70b-instruct']

In [9]:
llm_model_names

['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'llama-3.3-70b-instruct']

In [10]:
get_metrics = EvaluationMetric()
actual_label = combined_df['Actual Label'].values

all_model_metrics = {}
for model_name in model_names:
    print(f"=============================={model_name}==============================\n")
    # ml_model_name = model.__name__()
    print(f"Actual Label:\t\t{actual_label}")
    model_predictions = combined_df[model_name].values
    print(f"{model_name}:\t\t{model_predictions}")
    get_metrics.eval_classification_report(actual_label, model_predictions)
    model_metrics = get_metrics.custom_evaluation_metrics(actual_label, model_predictions)
    all_model_metrics[model_name] = model_metrics
    print("==========================================================================\n")


Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
Perceptron:		[0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 1]
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         5
           1       1.00      0.74      0.85        19

    accuracy                           0.79        24
   macro avg       0.75      0.87      0.76        24
weighted avg       0.90      0.79      0.81        24



Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
SDG Classifier:		[1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1]
              precision    recall  f1-score   support

           0       0.50      0.60      0.55         5
           1       0.89      0.84      0.86        19

    accuracy                           0.79        24
   macro avg       0.69      0.72      0.71        24
weighted avg       0.81      0.79      0.80        24



Actual Label:		[1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 1]
Logistic Regress

In [11]:
all_model_metrics_json = pd.DataFrame(all_model_metrics).to_json()
all_model_metrics_json

'{"Perceptron":{"Accuracy":0.7916666667,"Precision":1.0,"Recall":0.7368421053,"F1 Score":0.8484848485},"SDG Classifier":{"Accuracy":0.7916666667,"Precision":0.8888888889,"Recall":0.8421052632,"F1 Score":0.8648648649},"Logistic Regression":{"Accuracy":0.9166666667,"Precision":0.9473684211,"Recall":0.9473684211,"F1 Score":0.9473684211},"Ridge Classifier":{"Accuracy":0.9583333333,"Precision":1.0,"Recall":0.9473684211,"F1 Score":0.972972973},"llama-3.1-8b-instant":{"Accuracy":0.875,"Precision":1.0,"Recall":0.8421052632,"F1 Score":0.9142857143},"llama-3.3-70b-versatile":{"Accuracy":0.9166666667,"Precision":1.0,"Recall":0.8947368421,"F1 Score":0.9444444444},"llama-3.3-70b-instruct":{"Accuracy":0.9166666667,"Precision":1.0,"Recall":0.8947368421,"F1 Score":0.9444444444}}'

In [12]:
print(type(all_model_metrics))

combined_df['Accuracy Results'] = all_model_metrics_json
combined_df

<class 'dict'>


Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis..."


## Majority Vote

+ [x] ML only
+ [x] LLM only
+ [x] Combined

In [15]:
ml_majority_vote = combined_df.loc[:, ml_model_names].mode(axis=1)
# print(f"ML MAJORITY VOTE: {ml_majority_vote}")

llm_majority_vote = combined_df.loc[:, llm_model_names].mode(axis=1)
# print(f"LLM MAJORITY VOTE: {llm_majority_vote}")

combined_majorty_vote = combined_df.loc[:, model_names].mode(axis=1)
# print(f"ALL MODELS MAJORITY VOTE: {combined_majorty_vote}")

# combined_df['ML Majority Vote'] = ml_majority_vote
combined_df['LLM Majority Vote'] = llm_majority_vote
combined_df['All Models Majority Vote'] = combined_majorty_vote
combined_df.head(7)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1


##  Reliability Metric

##  Confidence Intervals via Bootstrap

## Chi-Square Test of Independence

## Fleissâ€™ Kappa

- How realiable are the raters?
    - Higher, then they tend to agree
    - Lower, the they tend to disagree
- Does NOT state if the raters are correct as in raters can say one thing and real value may be another. This deals with the validity.

In [16]:
def get_unique_classifier_labels(df: pd.DataFrame, col_names: list) -> list:
        # Get unique categories
    label_values = np.unique(df.loc[:, col_names].values)

    return label_values

In [17]:
get_unique_classifier_labels(combined_df, ml_model_names)

array([0, 1])

- Following: https://numiqo.com/tutorial/fleiss-kappa

In [18]:
def reformat_df_with_classifier_labels(df: pd.DataFrame, model_names: list, new_col_name) -> pd.DataFrame:
    copy_df = df.loc[:, model_names]
    # print(copy_df)

    results_per_value_dict = {}
    label_values = get_unique_classifier_labels(df, model_names)
    # print(f"Label values: {label_values}")
    results_for_all_values = []
    for label_value in label_values:
        results_per_value = []

        # per row, get all the models with their value
        for i in range(len(copy_df)):
            # if value in cell == label_value, then True, else False
            # so value in cell = 1 and label_value = 1, then True
            # do so we can count how Trues and Falses
            filt_label_value = (copy_df.iloc[i] == label_value)
            # print(filt_label_value)
            filt_label_value_summed = sum(filt_label_value)
            # print(filt_label_value_summed)
            # print(f"For current label {label_value}, there exists {filt_label_value_summed} in {copy_df.iloc[i].values} with current value.")

            results_per_value.append(filt_label_value_summed)
            col_name = f"{label_value} {new_col_name}"
            results_per_value_dict[col_name] = results_per_value



        print(f"Label value {label_value}: {results_per_value} -> summed(results for label value {label_value}): {sum(results_per_value)}")
        
        
        results_for_all_values.append(sum(results_per_value))
        sum_results_for_all_values = sum(results_for_all_values)        
        # copy_df[col_name] = results_per_value
    
    # print(copy_df)
    # print(sum_results_for_all_values)
        
    per_label_value_0 = results_for_all_values[0] / sum_results_for_all_values
    per_label_value_1 = results_for_all_values[1] / sum_results_for_all_values
    square_label_value_0 = per_label_value_0 ** 2
    square_label_value_1 = per_label_value_1 ** 2
    sum_the_squares = square_label_value_0 + square_label_value_1

    # print(per_label_value_0, per_label_value_1, square_label_value_0, square_label_value_1, sum_the_squares)
    # print(copy_df.iloc[:, [-2, -1]].values)
    # fleiss_kappa_score = fleiss_kappa(copy_df.iloc[:, [-2, -1]].values)
    print(results_per_value_dict)
    final_df = pd.DataFrame(results_per_value_dict)
    # print(final_df)
    fleiss_kappa_score = fleiss_kappa(final_df)
    print(fleiss_kappa_score)
    return final_df, fleiss_kappa_score

In [19]:
new_col_name = "is the ML Model Class"
ml_class_vote_df, ml_fleiss_kappa = reformat_df_with_classifier_labels(combined_df, ml_model_names, new_col_name)
combined_df['ML Fleiss Kappa'] = ml_fleiss_kappa
combined_df

Label value 0: [1, 2, 0, 2, 0, 3, 0, 0, 4, 4, 0, 1, 0, 0, 0, 0, 4, 1, 4, 0, 0, 0, 1, 0] -> summed(results for label value 0): 27
Label value 1: [3, 2, 4, 2, 4, 1, 4, 4, 0, 0, 4, 3, 4, 4, 4, 4, 0, 3, 0, 4, 4, 4, 3, 4] -> summed(results for label value 1): 69
{'0 is the ML Model Class': [1, 2, 0, 2, 0, 3, 0, 0, 4, 4, 0, 1, 0, 0, 0, 0, 4, 1, 4, 0, 0, 0, 1, 0], '1 is the ML Model Class': [3, 2, 4, 2, 4, 1, 4, 4, 0, 0, 4, 3, 4, 4, 4, 4, 0, 3, 0, 4, 4, 4, 3, 4]}
0.6049382716049381


Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote,ML Fleiss Kappa
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,0,0.604938
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938


In [20]:
ml_class_vote_df

Unnamed: 0,0 is the ML Model Class,1 is the ML Model Class
0,1,3
1,2,2
2,0,4
3,2,2
4,0,4
5,3,1
6,0,4
7,0,4
8,4,0
9,4,0


In [21]:
combined_df

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote,ML Fleiss Kappa
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,0,0.604938
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938


In [22]:
new_col_name = "is the LLM Model Class"
_, llm_fleiss_kappa = reformat_df_with_classifier_labels(combined_df, llm_model_names, new_col_name)
combined_df['LLM Fleiss Kappa'] = llm_fleiss_kappa
combined_df

Label value 0: [0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 1] -> summed(results for label value 0): 22
Label value 1: [3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, 3, 0, 3, 3, 3, 3, 2] -> summed(results for label value 1): 50
{'0 is the LLM Model Class': [0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 3, 0, 0, 3, 0, 3, 0, 3, 0, 0, 0, 0, 1], '1 is the LLM Model Class': [3, 3, 3, 0, 3, 0, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, 3, 0, 3, 3, 3, 3, 2]}
0.9345454545454548


Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote,ML Fleiss Kappa,LLM Fleiss Kappa
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,0,0.604938,0.934545
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545


In [23]:
new_col_name = "is the ML x LLM Model Class"
_, models_fleiss_kappa = reformat_df_with_classifier_labels(combined_df, model_names, new_col_name)
combined_df['All Models Fleiss Kappa'] = models_fleiss_kappa
combined_df

Label value 0: [1, 2, 0, 5, 0, 6, 0, 0, 4, 7, 0, 4, 0, 0, 3, 0, 7, 1, 7, 0, 0, 0, 1, 1] -> summed(results for label value 0): 49
Label value 1: [6, 5, 7, 2, 7, 1, 7, 7, 3, 0, 7, 3, 7, 7, 4, 7, 0, 6, 0, 7, 7, 7, 6, 6] -> summed(results for label value 1): 119
{'0 is the ML x LLM Model Class': [1, 2, 0, 5, 0, 6, 0, 0, 4, 7, 0, 4, 0, 0, 3, 0, 7, 1, 7, 0, 0, 0, 1, 1], '1 is the ML x LLM Model Class': [6, 5, 7, 2, 7, 1, 7, 7, 3, 0, 7, 3, 7, 7, 4, 7, 0, 6, 0, 7, 7, 7, 6, 6]}
0.5870348139255703


Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote,ML Fleiss Kappa,LLM Fleiss Kappa,All Models Fleiss Kappa
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,0,0.604938,0.934545,0.587035
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035


> ML > LLM > ML x LLM, why? Due to randomness of LLM?

## Confusion Matrix

## Save Data

In [25]:
combined_df

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results,LLM Majority Vote,All Models Majority Vote,ML Fleiss Kappa,LLM Fleiss Kappa,All Models Fleiss Kappa
0,"In 08/2024, Coach Michael Brown envisions that...",1,[-2.95873899e-02 1.89737260e-01 -1.18197471e-...,[ 0.94400704 -0.8645446 -2.054663 -0.388244...,1,0,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
1,Coach Rachel Thompson forecasts that the point...,1,[-7.31271803e-02 2.57879347e-01 -1.00163361e-...,[ 0.4060981 0.06459576 -0.33290786 -0.185250...,1,0,0,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
2,JPMorgan forecasts that the revenue at Microso...,1,[-2.24677563e-01 2.28302136e-01 -3.20941433e-...,[-1.4662192e+00 -3.3869946e-01 -6.8428683e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
3,The Brazilian unit of Finnish security solutio...,0,[-9.49790627e-02 1.75042719e-01 -4.31647301e-...,[ 0.13613077 -1.0649096 -0.86048055 -0.451764...,0,0,1,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035
4,"On August 15, 2027, Dr. John Lee speculates th...",1,[-1.65867433e-01 3.57482523e-01 7.28247538e-...,[-7.39654064e-01 1.42271864e+00 9.85548079e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
5,Analyst Emma Taylor noted that the home run co...,0,[-2.64777802e-02 1.62194327e-01 -6.83643995e-...,[ 0.9824245 -1.2401017 -0.28229827 1.113998...,0,0,1,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035
6,"According to Goldman Sachs, the research and d...",1,[-1.21063471e-01 2.51935005e-01 -2.94214804e-...,[-1.86127022e-01 -1.64573379e-02 -6.41750097e-...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
7,"According to Goldman Sachs, the research and d...",1,[-1.16282433e-01 2.23263055e-01 -4.77008447e-...,[-1.2706007e-01 -4.0740904e-01 -9.3267500e-01 ...,1,1,1,1,1,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,1,0.604938,0.934545,0.587035
8,According to the company 's updated strategy f...,1,[-2.63320029e-01 3.28540474e-01 4.48558182e-...,[-1.9436246 1.028084 0.54040897 0.117793...,1,0,0,0,0,1,1,1,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",1,0,0.604938,0.934545,0.587035
9,The Lithuanian beer market made up 14.41 milli...,0,[-6.91080689e-02 2.09023327e-01 7.64123956e-...,[ 0.4557519 -0.6015726 -0.05187901 0.517767...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.7916666667,""Precis...",0,0,0.604938,0.934545,0.587035


In [26]:
combine_data_path

'/Users/detraviousjamaribrinkley/Documents/Development/research_labs/uf_ds/predictions/prediction_classification_experiments-v2/../data/combined_generated_fin_phrase_bank'

In [27]:
save_json_path = os.path.join(combine_data_path, 'results_with_generated_non_predictions.json')
combined_df.to_json(save_json_path)
# parsed = loads(df_to_json)
# dumps(parsed, indent=3) 