## Evaluation + Metrics

- PAPER: [Collective Reasoning Among LLMs: A Framework for Answer Validation Without Ground Truth](https://arxiv.org/pdf/2502.20758) by Davoudi et al., 2025

In [1]:
import os
import sys
import warnings
import statsmodels

import numpy as np
import pandas as pd

from tqdm import tqdm
from statistics import mean
from json import loads, dumps

from statsmodels.stats.inter_rater import cohens_kappa, fleiss_kappa

# Get the current working directory of the notebook
notebook_dir = os.getcwd()
# Add the parent directory to the system path
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing
from metrics import EvaluationMetric

In [2]:
# Shared metrics helper; later cells call it for classification reports,
# accuracy-style metrics and Cohen's kappa.
get_metrics = EvaluationMetric()
get_metrics

<metrics.EvaluationMetric at 0x3279d2850>

## Load Data

In [3]:
# Data directory is resolved relative to the notebook's working directory.
base_data_path = os.path.join(notebook_dir, '../data')
# Folder that holds the classifier prediction CSVs loaded in the next cells.
combine_data_path = os.path.join(base_data_path, 'financial_phrase_bank/combined_generated_fin_phrase_bank')

In [4]:
# Load the ML-classifier predictions and drop the 'Unnamed: 0' column
# (presumably a saved index column — confirm against the CSV).
ml_classifiers_data_path = os.path.join(combine_data_path, 'ml_classifiers-v1.csv')
ml_classifiers_df = DataProcessing.load_from_file(ml_classifiers_data_path, 'csv', sep=',')
ml_classifiers_df = DataProcessing.drop_df_columns(ml_classifiers_df, ['Unnamed: 0'])
# Preview only the first rows; the row count is shown in the next cell.
ml_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier
0,"Dr. Maria Rodriguez, a renowned climate scient...",0,[-5.33820353e-02 2.96053916e-01 4.16866988e-...,[ 0.3908352 0.88384986 0.54582196 0.552968...,0,0,0,0,0
1,We succeeded in increasing our market share of...,0,[-4.71338071e-02 2.51612782e-01 2.61100605e-...,[ 0.46643826 0.28396165 0.3074835 -1.456996...,0,0,0,0,0
2,The Centers for Disease Control and Prevention...,0,[-1.51246399e-01 2.89643884e-01 -6.67415932e-...,[-7.9331547e-01 7.9732412e-01 -1.1132418e+00 ...,0,0,0,0,0


In [5]:
# Row count of the ML predictions frame before it is truncated to match the LLM frame.
len(ml_classifiers_df)

565

In [6]:
# Load the LLM predictions. The shortened 238-row file is used here; the
# alternative full file is 'llm_classifiers-v1.csv'.
llm_classifiers_data_path = os.path.join(combine_data_path, 'llm_classifiers-shortened_to_238-v1.csv')
# NOTE(review): the separator is passed positionally here but as sep=',' for the ML file — confirm both reach the same parameter.
llm_classifiers_df = DataProcessing.load_from_file(llm_classifiers_data_path, 'csv', ',')
llm_classifiers_df = DataProcessing.drop_df_columns(llm_classifiers_df, ['Unnamed: 0'])
# Preview only the first rows; the row count is shown in the next cell.
llm_classifiers_df.head(3)

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"Dr. Maria Rodriguez, a renowned climate scient...",0,[-5.33820353e-02 2.96053916e-01 4.16866988e-...,[ 0.3908352 0.88384986 0.54582196 0.552968...,0,0,0,0
1,We succeeded in increasing our market share of...,0,[-4.71338071e-02 2.51612782e-01 2.61100605e-...,[ 0.46643826 0.28396165 0.3074835 -1.456996...,0,0,0,0
2,The Centers for Disease Control and Prevention...,0,[-1.51246399e-01 2.89643884e-01 -6.67415932e-...,[-7.9331547e-01 7.9732412e-01 -1.1132418e+00 ...,0,0,0,0


In [7]:
# Row count of the LLM predictions frame; the ML frame is cut to this length next.
len(llm_classifiers_df)

238

In [8]:
# Keep only the first len(llm_classifiers_df) ML rows so both frames have the
# same length. This relies on the two files sharing row order, which the
# following cell checks on 'Base Sentence' and 'Sentence Label'.
ml_classifiers_df = ml_classifiers_df.iloc[: len(llm_classifiers_df), :]
len(ml_classifiers_df)

238

In [9]:
# Verify that both frames describe the same sentences, in the same order,
# before their prediction columns are placed side by side. The result is still
# printed, but a mismatch now stops the notebook instead of letting "Run All"
# continue with misaligned rows. np.array_equal also returns False (rather than
# raising an unrelated TypeError) when the two columns differ in length.
for shared_col in ('Base Sentence', 'Sentence Label'):
    columns_match = np.array_equal(
        ml_classifiers_df.loc[:, shared_col].values,
        llm_classifiers_df.loc[:, shared_col].values,
    )
    print(columns_match)
    assert columns_match, f"'{shared_col}' differs between the ML and LLM frames"

True
True


In [10]:
# Columns from position 5 onward are the per-model LLM predictions; the first
# five (sentence, sentence label, two embedding columns, actual label) are
# already present in the ML frame.
llm_classifiers_cols = llm_classifiers_df.iloc[:, 5:]
llm_classifiers_cols.head(3)

Unnamed: 0,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,0,0,0
1,0,0,0
2,0,0,0


In [11]:
# Put the LLM prediction columns next to the ML frame. pd.concat(axis=1)
# aligns rows on index labels, so both frames must share the same index.
combined_df = pd.concat([ml_classifiers_df, llm_classifiers_cols], axis=1)
combined_df

Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct
0,"Dr. Maria Rodriguez, a renowned climate scient...",0,[-5.33820353e-02 2.96053916e-01 4.16866988e-...,[ 0.3908352 0.88384986 0.54582196 0.552968...,0,0,0,0,0,0,0,0
1,We succeeded in increasing our market share of...,0,[-4.71338071e-02 2.51612782e-01 2.61100605e-...,[ 0.46643826 0.28396165 0.3074835 -1.456996...,0,0,0,0,0,0,0,0
2,The Centers for Disease Control and Prevention...,0,[-1.51246399e-01 2.89643884e-01 -6.67415932e-...,[-7.9331547e-01 7.9732412e-01 -1.1132418e+00 ...,0,0,0,0,0,0,0,0
3,"On 03/20/2024, the Weather Channel reported th...",0,[ 3.48531380e-02 3.58599305e-01 -5.18614389e-...,[ 1.4584736 1.7281182 -0.8855602 -0.102482...,0,0,0,0,0,0,0,0
4,"In the second quarter of 2010 , the group 's p...",0,[-2.45986860e-02 2.46378332e-01 4.28778417e-...,[ 0.73911136 0.21330449 0.5640477 3.212272...,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
233,The research advisor at the University of Cali...,0,[ 2.55523454e-02 1.30580723e-01 -4.90458868e-...,[ 1.3459346 -1.3497884 -0.84247935 -0.949201...,0,0,0,0,0,0,0,0
234,Cargo volume grew by 7 % .,0,[-2.80697614e-01 2.08972856e-01 9.39094275e-...,[-2.3596644e+00 -2.9161295e-01 1.3448831e+00 ...,0,0,0,0,0,0,0,0
235,Pioneer Library System was one of 127 librarie...,0,[ 8.20496213e-03 8.35401639e-02 7.54775554e-...,[ 1.1360328 -1.984765 1.0628567 -1.001820...,0,0,0,0,0,0,0,0
236,Previously the company has estimated its opera...,0,[-3.71137522e-02 2.36375913e-01 1.08766248e-...,[ 0.5876801 0.07828694 0.07439633 -1.458898...,0,1,0,1,0,0,0,0


## Accuracy

+ [x] ML only
+ [x] LLM only

In [12]:
# Prediction columns start after the five shared metadata columns.
ml_model_names = list(ml_classifiers_df.columns[5:])
llm_model_names = list(llm_classifiers_df.columns[5:])

# ML models first, then LLMs — the order used by the evaluation loop below.
model_names = [*ml_model_names, *llm_model_names]
model_names

['Perceptron',
 'SDG Classifier',
 'Logistic Regression',
 'Ridge Classifier',
 'llama-3.1-8b-instant',
 'llama-3.3-70b-versatile',
 'llama-3.3-70b-instruct']

In [13]:
# Display the LLM subset of the model names.
llm_model_names

['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'llama-3.3-70b-instruct']

In [14]:
# Ground-truth labels that every model's predictions are compared against.
actual_label = combined_df['Actual Label'].values

# Maps model name -> metrics returned by custom_evaluation_metrics
# (the JSON shown further down has Accuracy, Precision, Recall and F1 Score per model).
all_model_metrics = {}
for model_name in model_names:
    print(f"=============================={model_name}==============================\n")
    # Print labels and predictions next to each other for a quick visual comparison.
    print(f"Actual Label:\t\t{actual_label}")
    model_predictions = combined_df[model_name].values
    print(f"{model_name}:\t\t{model_predictions}")
    # The classification report's return value is not used here.
    get_metrics.eval_classification_report(actual_label, model_predictions)
    model_metrics = get_metrics.custom_evaluation_metrics(actual_label, model_predictions)
    all_model_metrics[model_name] = model_metrics
    print("==========================================================================\n")


Actual Label:		[0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0
 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 0
 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0]
Perceptron:		[0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0
 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 1 0 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1
 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 0
 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0]


A

Errors + why errors

In [15]:
# Serialize the per-model metrics to one JSON string (one top-level key per model).
all_model_metrics_json = pd.DataFrame(all_model_metrics).to_json()
all_model_metrics_json

'{"Perceptron":{"Accuracy":0.9789915966,"Precision":0.9682539683,"Recall":0.953125,"F1 Score":0.9606299213},"SDG Classifier":{"Accuracy":0.9453781513,"Precision":0.9636363636,"Recall":0.828125,"F1 Score":0.8907563025},"Logistic Regression":{"Accuracy":0.9789915966,"Precision":0.9682539683,"Recall":0.953125,"F1 Score":0.9606299213},"Ridge Classifier":{"Accuracy":0.9705882353,"Precision":0.9384615385,"Recall":0.953125,"F1 Score":0.9457364341},"llama-3.1-8b-instant":{"Accuracy":0.9327731092,"Precision":0.8157894737,"Recall":0.96875,"F1 Score":0.8857142857},"llama-3.3-70b-versatile":{"Accuracy":0.9033613445,"Precision":0.7469879518,"Recall":0.96875,"F1 Score":0.843537415},"llama-3.3-70b-instruct":{"Accuracy":0.9033613445,"Precision":0.7469879518,"Recall":0.96875,"F1 Score":0.843537415}}'

In [16]:
# Debug check of the metrics container's type.
print(type(all_model_metrics))

# Assigning a single string broadcasts it, so every row receives the same JSON blob.
# NOTE(review): this mutates combined_df in place and repeats the full metrics
# string on each row — confirm this is the intended layout for the saved file.
combined_df['Accuracy Results'] = all_model_metrics_json
combined_df

<class 'dict'>


Unnamed: 0,Base Sentence,Sentence Label,Embedding,Normalized Embeddings,Actual Label,Perceptron,SDG Classifier,Logistic Regression,Ridge Classifier,llama-3.1-8b-instant,llama-3.3-70b-versatile,llama-3.3-70b-instruct,Accuracy Results
0,"Dr. Maria Rodriguez, a renowned climate scient...",0,[-5.33820353e-02 2.96053916e-01 4.16866988e-...,[ 0.3908352 0.88384986 0.54582196 0.552968...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
1,We succeeded in increasing our market share of...,0,[-4.71338071e-02 2.51612782e-01 2.61100605e-...,[ 0.46643826 0.28396165 0.3074835 -1.456996...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
2,The Centers for Disease Control and Prevention...,0,[-1.51246399e-01 2.89643884e-01 -6.67415932e-...,[-7.9331547e-01 7.9732412e-01 -1.1132418e+00 ...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
3,"On 03/20/2024, the Weather Channel reported th...",0,[ 3.48531380e-02 3.58599305e-01 -5.18614389e-...,[ 1.4584736 1.7281182 -0.8855602 -0.102482...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
4,"In the second quarter of 2010 , the group 's p...",0,[-2.45986860e-02 2.46378332e-01 4.28778417e-...,[ 0.73911136 0.21330449 0.5640477 3.212272...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,The research advisor at the University of Cali...,0,[ 2.55523454e-02 1.30580723e-01 -4.90458868e-...,[ 1.3459346 -1.3497884 -0.84247935 -0.949201...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
234,Cargo volume grew by 7 % .,0,[-2.80697614e-01 2.08972856e-01 9.39094275e-...,[-2.3596644e+00 -2.9161295e-01 1.3448831e+00 ...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
235,Pioneer Library System was one of 127 librarie...,0,[ 8.20496213e-03 8.35401639e-02 7.54775554e-...,[ 1.1360328 -1.984765 1.0628567 -1.001820...,0,0,0,0,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."
236,Previously the company has estimated its opera...,0,[-3.71137522e-02 2.36375913e-01 1.08766248e-...,[ 0.5876801 0.07828694 0.07439633 -1.458898...,0,1,0,1,0,0,0,0,"{""Perceptron"":{""Accuracy"":0.9789915966,""Precis..."


### Cohen's Kappa

- How reliable are the raters?
    - Higher: they tend to agree
    - Lower: they tend to disagree
- Does NOT state if the raters are correct as in raters can say one thing and real value may be another. This deals with the validity.

---

1. Intra (within each model category) on the predicted labels
    - [x] ML: ML_i vs ML_j
        - ML_1 vs ML_2
        - ML_1 vs ML_3
        - ML_2 vs ML_3
    - [x] LLM_a vs LLM_b
        - LLM_1 vs LLM_2
        - LLM_1 vs LLM_3
        - LLM_2 vs LLM_3

In [28]:
# Usage
ml_cohen_kappa_df = get_metrics.calculate_pairwise_cohens_kappa(
    combined_df, ml_model_names, 'ML', print_bool = True
)
llm_cohen_kappa_df = get_metrics.calculate_pairwise_cohens_kappa(
    combined_df, llm_model_names, 'LLM'
)

cohen_kappa_df = DataProcessing.concat_dfs([ml_cohen_kappa_df, llm_cohen_kappa_df])
cohen_kappa_df.sort_values("Cohen's Kappa", ascending=False, inplace=True)
cohen_kappa_df


ML Models - Cohen's Kappa Scores
Cohen's Kappa (Perceptron x SDG Classifier): 0.89
Cohen's Kappa (Perceptron x Logistic Regression): 1.0
Cohen's Kappa (Perceptron x Ridge Classifier): 0.94
Cohen's Kappa (SDG Classifier x Logistic Regression): 0.89
Cohen's Kappa (SDG Classifier x Ridge Classifier): 0.89
Cohen's Kappa (Logistic Regression x Ridge Classifier): 0.94


Unnamed: 0,Model 1,Model 2,Cohen's Kappa
1,Perceptron,Logistic Regression,1.0
8,llama-3.3-70b-versatile,llama-3.3-70b-instruct,0.98
2,Perceptron,Ridge Classifier,0.94
5,Logistic Regression,Ridge Classifier,0.94
6,llama-3.1-8b-instant,llama-3.3-70b-versatile,0.92
7,llama-3.1-8b-instant,llama-3.3-70b-instruct,0.92
0,Perceptron,SDG Classifier,0.89
3,SDG Classifier,Logistic Regression,0.89
4,SDG Classifier,Ridge Classifier,0.89


## Majority Vote

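+ Use an odd number of model columns so that `.mode(axis=1)` returns a single majority label per row; with an even number of columns, ties produce more than one mode.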

---

+ [x] ML only
+ [x] LLM only
+ [x] Combined

In [None]:
# Row-wise majority vote for each model group. `.mode(axis=1)` returns a
# DataFrame whose first column holds the (smallest) most frequent label; extra
# columns only appear when a row has a tie.
# Only the first three ML models are used so the vote has an odd number of voters.
ml_majority_vote = combined_df.loc[:, ml_model_names[:3]].mode(axis=1)
llm_majority_vote = combined_df.loc[:, llm_model_names].mode(axis=1)
all_models_majorty_vote = combined_df.loc[:, model_names].mode(axis=1)

# Select the first mode column explicitly and name each vote by key. This does
# not depend on how the concat helper relabels columns, and a tie can no longer
# add extra columns or shift the names onto the wrong group.
majority_votes = {
    'ML Majority Vote': ml_majority_vote.iloc[:, 0],
    'LLM Majority Vote': llm_majority_vote.iloc[:, 0],
    'All Models Majority Vote': all_models_majorty_vote.iloc[:, 0],
}

majority_vote_df = pd.DataFrame(majority_votes)
majority_vote_df.head(3)

### Cohen's Kappa

- How reliable are the raters?
    - Higher: they tend to agree
    - Lower: they tend to disagree
- Does NOT state if the raters are correct as in raters can say one thing and real value may be another. This deals with the validity.

---

- Inter (across groups) for majority vote
- [x] 1. ML vs LLM
- [x] 2. ML vs All
- [x] 3. LLM vs All

In [None]:
def _majority_vote_kappa(first_col, second_col, pair_label):
    """Compute Cohen's kappa between two majority-vote columns and print it.

    Returns the object produced by `get_metrics.get_cohens_kappa` so the
    caller can keep it for later use.
    """
    kappa_result = get_metrics.get_cohens_kappa(majority_vote_df, first_col, second_col)
    print(f"\nCohen's Kappa ({pair_label}):", round(kappa_result.kappa, 2))
    return kappa_result


# Comparing a column with itself gives a baseline to read the other scores against.
ml_ml_ck = _majority_vote_kappa('ML Majority Vote', 'ML Majority Vote', 'ML x ML')

ml_llm_ck = _majority_vote_kappa('ML Majority Vote', 'LLM Majority Vote', 'ML x LLM')

ml_all_ck = _majority_vote_kappa('ML Majority Vote', 'All Models Majority Vote', 'ML x All')

llm_all_ck = _majority_vote_kappa('LLM Majority Vote', 'All Models Majority Vote', 'LLM x All')

## Fleiss’ Kappa

- How reliable are the raters?
    - Higher: they tend to agree
    - Lower: they tend to disagree
- Does NOT state if the raters are correct as in raters can say one thing and real value may be another. This deals with the validity.

---

1. Intra Model: Predicted Label 
    - [ ] 1. ML: ML_1 vs ML_2 vs ... vs ML_N, where N is odd
    - [ ] 2. LLM_1 vs LLM_2 vs ... vs LLM_N, where N is odd

2. Inter Model: Majority Vote
    - [x] 1. ML vs LLM
    - [x] 2. ML vs All
    - [x] 3. LLM vs All

In [None]:
# Show the majority-vote table that is passed to get_fleiss_kappa below.
majority_vote_df

In [None]:
def get_fleiss_kappa(df):
    """Compute Fleiss' kappa for a subjects-by-raters table of labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per rated subject and one column per rater; each cell is the
        label that rater assigned to that subject.

    Returns
    -------
    The value returned by ``fleiss_kappa`` for the subjects-by-category
    count table built from ``df``.

    Raises
    ------
    ValueError
        If ``df`` is empty, or if any subject does not end up with exactly one
        rating per rater (for example because of missing values or duplicated
        index labels). Fleiss' kappa assumes the same number of ratings for
        every subject, so such input would give a misleading score.
    """
    if df.empty:
        raise ValueError("get_fleiss_kappa needs at least one subject and one rater")

    # Long format: one row per (subject, rater) pair.
    stacked = df.stack().reset_index()
    stacked.columns = ['Subject to Rate', 'Rater', 'Majority Vote Value']

    # Crosstab: rows = subject, columns = category, values = number of raters
    # that chose the category for that subject.
    freq_table = pd.crosstab(stacked['Subject to Rate'], stacked['Majority Vote Value'])

    # Missing votes are left out of the counts, so check that every subject
    # still has one rating per rater before computing the statistic.
    ratings_per_subject = freq_table.sum(axis=1)
    if len(freq_table) != len(df) or (ratings_per_subject != df.shape[1]).any():
        raise ValueError(
            "every subject must have exactly one rating from each rater; "
            "check for missing values or duplicated index labels"
        )

    return fleiss_kappa(freq_table)


# Treat the three majority-vote columns (ML, LLM, All Models) as three raters.
fleiss_across_all_models = get_fleiss_kappa(majority_vote_df)
print("\nFleiss's Kappa (ML x LLM x All):", round(fleiss_across_all_models, 2))

##  Reliability Metric

##  Confidence Intervals via Bootstrap

> ML > LLM > ML x LLM, why? Due to randomness of LLM?

## Confusion Matrix

## Save Data

In [None]:
# Final look at the frame that is written to disk below.
combined_df

In [None]:
# combined_df.to_json()

In [None]:
# Directory the results file is written to.
combine_data_path

In [None]:
# Write combined_df (predictions plus the 'Accuracy Results' column) as JSON
# into the same folder the input CSVs were read from.
save_json_path = os.path.join(combine_data_path, 'results_with_generated_non_predictions.json')
combined_df.to_json(save_json_path)

## Prompt LLM to Write Results using Latex

In [None]:
from text_generation_models import TextGenerationModelFactory

# Create the text-generation model instance(s); only the first one is used below.
tgmf = TextGenerationModelFactory()
models = tgmf.create_instances(['openai/gpt-oss-120b'])

# Reference layout for the two LaTeX tables. This must be a raw string: in a
# normal string "\b" (as in \begin, \bottomrule) is a backspace and "\t" (as in
# \textbf, \toprule, \textwidth) is a tab, which silently corrupts the LaTeX,
# and "\c", "\m" and "\e" are invalid escape sequences.
# NOTE(review): example_table is not interpolated into the prompt below —
# confirm whether it should be sent to the model as a formatting example.
example_table = r"""
    \begin{table}[H]
    \centering
    \caption{Performance Metrics Across Models}
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lcccccccc}
    \toprule
    \textbf{Metric} & \textbf{Perceptron} & \textbf{SDG Classifier} & \textbf{Logistic Regression} & \textbf{Ridge Classifier} & \textbf{llama-3.1-8b-instant} & \textbf{llama-3.3-70b-versatile} & \textbf{llama-3.3-70b-instruct} \\
    \midrule
    Accuracy & 0.72 & 0.75 & 0.78 & 0.76 & 0.81 & 0.83 & 0.85 \\
    Precision & 0.70 & 0.73 & 0.76 & 0.74 & 0.79 & 0.81 & 0.83 \\
    Recall & 0.68 & 0.71 & 0.74 & 0.72 & 0.77 & 0.79 & 0.81 \\
    F1-score & 0.69 & 0.72 & 0.75 & 0.73 & 0.78 & 0.80 & 0.82 \\
    \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{table}
    
    \begin{table}[H]
    \centering
    \caption{Agreement Metrics}
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{lcc}
    \toprule
    \textbf{Metric} & \textbf{LLM Majority Vote} & \textbf{All Models Majority Vote} \\
    \midrule
    ML Fleiss Kappa & 0.65 & 0.68 \\
    LLM Fleiss Kappa & 0.72 & 0.75 \\
    All Models Fleiss Kappa & 0.70 & 0.73 \\
    \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{table}

"""

# NOTE(review): {combined_df} inserts the DataFrame's printed representation,
# which pandas may truncate for large frames — confirm the model receives the
# metric values it is asked to tabulate.
# The final sentence refers to the literal characters "\n", so the backslash is
# escaped; unescaped it would put a real line break into the prompt.
prompt = (
    f"Create two separate LaTeX tables from this dataframe: {combined_df}. "
    "Table 1 - Performance Metrics with columns: Perceptron, SDG Classifier, Logistic Regression, "
    "Ridge Classifier, llama-3.1-8b-instant, llama-3.3-70b-versatile, llama-3.3-70b-instruct "
    "and rows: Accuracy, Precision, Recall, F1-score. "
    "Table 2 - Agreement Metrics with columns: LLM Majority Vote, All Models Majority Vote "
    "and rows: ML Fleiss Kappa, LLM Fleiss Kappa, All Models Fleiss Kappa. "
    "Output only the LaTeX code, no explanations. "
    "Properly format both tables using proper spacing instead of \\n or multiple of those consecutively."
)

input_prompt = models[0].user(prompt)
models[0].chat_completion([input_prompt])