In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
from collections import Counter
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

# 1. Results B4.0

### 1. Import Results
Results created under the same fingerprint are saved in a CSV file for each seed. All results are combined into one dataset.

In [2]:
# B4.0 results, read from the STRING_RESULT_TRAIN directory (presumably the training split — confirm)
path_B4 = 'STRING_RESULT_TRAIN/B4.0/all_iterations'

# Open all dataframes: one CSV per seed
# (the number at the end of each file name is presumably the seed — TODO confirm)
S1 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3644.csv')
S2 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3441.csv')
S3 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_280.csv')
S4 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_5991.csv')
S5 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_7917.csv')


In [3]:
# Evaluate if predictions are correctly saved
# Combine all dataframes into one (still unfiltered); ignore_index=True renumbers the rows 0..n-1
B4_full_OG = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
# Last expression of the cell: displays (rows, columns) of the combined frame
B4_full_OG.shape

(5960, 12)

In [4]:
def filter_dataframe(df, principles=None, units=None, shapes=None, topics=None):
    """Keep only the rows whose predicted labels belong to the allowed label lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the prediction columns 'PRINCIPLE_y', 'UNIT_y', 'SHAPE_y'
        and 'TOPIC_y' (a column is only accessed when its filter is given).
    principles, units, shapes, topics : list of str, optional
        Allowed values for the corresponding prediction column. A filter that
        is None or empty is skipped, i.e. that column is left unrestricted.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that pass every supplied filter; `df` itself when no
        filter is supplied.
    """
    allowed_by_column = {
        'PRINCIPLE_y': principles,
        'UNIT_y': units,
        'SHAPE_y': shapes,
        # Use the `topics` argument here: reading a module-level `topic` list
        # instead would ignore the caller's value (or raise NameError when no
        # such global exists).
        'TOPIC_y': topics,
    }
    for column, allowed in allowed_by_column.items():
        if allowed:
            df = df[df[column].isin(allowed)]
    return df

# Allowed label vocabularies for the four predicted columns.
# ('not evaluated' appears twice in `principle`; this is harmless for .isin().)
principle = ['egalitarian', 'general normative statement', 'not evaluated', 'libertarian', 'not evaluated','prioritarian', 'sufficientarian', 'utilitarian']
unit = ['not indicated', 'not evaluated', 'responsibility','financial resources', 'technological resources', 'financial and technological resources', 'support', 'other']
shape = ['not indicated', 'not evaluated', 'equity', 'equality','priority to worst off', 'needs based', 'proportional to contribution', 'proportional to commitment']
topic = ['new UNFCCC policy', 'UNFCCC agreements and principles', 'urgency', 'cooperation', 'financial mechanisms', 'adaptation', 'mitigation', 'adaptation and mitigation', 'other']

# Apply the same vocabulary filter to each per-seed frame, replacing S1..S5.
S1, S2, S3, S4, S5 = [
    filter_dataframe(seed_df, principles=principle, units=unit, shapes=shape, topics=topic)
    for seed_df in (S1, S2, S3, S4, S5)
]


______________________________________


In [5]:
# Combine all dataframes into one
# (when the cells run in order, S1..S5 are the label-filtered frames at this point)
B4_full = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
B4_full.shape

(3708, 12)

In [6]:
# Filter rows to keep only those where the unique_id appears exactly 5 times
# (presumably: the sentence survived the label filter in all five frames,
# assuming each frame holds at most one row per unique_id — TODO confirm)
filtered_B4 = B4_full.groupby('unique_id').filter(lambda x: len(x) == 5)
filtered_B4.shape

(3220, 12)

In [7]:
# List of unique_id's that are annotated 5 times
# (the distinct ids present in filtered_B4, converted to a plain Python list)
unique_ids = filtered_B4['unique_id'].unique().tolist()

In [8]:
# Restrict every per-seed frame to the sentences whose unique_id is in `unique_ids`
FB1S1, FB1S2, FB1S3, FB1S4, FB1S5 = [
    seed_df[seed_df['unique_id'].isin(unique_ids)]
    for seed_df in (S1, S2, S3, S4, S5)
]

In [9]:
# Evaluate performance of individual seeds for the same sentences
# Function to generate classification report for each dataframe
def generate_classification_report(df, true_label_col, pred_label_col):
    """Return the classification report for two label columns of `df` as a dict.

    `true_label_col` names the column passed as the true labels and
    `pred_label_col` the column passed as the predictions; the report is
    requested with output_dict=True.
    """
    y_true = df[true_label_col]
    y_pred = df[pred_label_col]
    return classification_report(y_true, y_pred, output_dict=True)

# Per-seed PRINCIPLE reports (true labels: PRINCIPLE_x, predictions: PRINCIPLE_y)
report_S1, report_S2, report_S3, report_S4, report_S5 = [
    generate_classification_report(seed_df, 'PRINCIPLE_x', 'PRINCIPLE_y')
    for seed_df in (FB1S1, FB1S2, FB1S3, FB1S4, FB1S5)
]

# Function to print the classification reports for easy comparison
def print_classification_report(report, title):
    """Print a titled classification report, one row per label/average.

    `report` is converted to a DataFrame and transposed before printing;
    `title` is interpolated into the heading line. A blank-line separator is
    printed afterwards.
    """
    heading = f"Classification Report for {title}"
    print(heading)
    per_label_table = pd.DataFrame(report).T
    print(per_label_table)
    print("\n")

# Print the classification reports, one per seed, in seed order
for seed_label, seed_report in zip(
    ("S1", "S2", "S3", "S4", "S5"),
    (report_S1, report_S2, report_S3, report_S4, report_S5),
):
    print_classification_report(seed_report, seed_label)

Classification Report for S1
                             precision    recall  f1-score     support
egalitarian                   0.224806  0.630435  0.331429   46.000000
general normative statement   0.063953  0.392857  0.110000   28.000000
libertarian                   0.000000  0.000000  0.000000    1.000000
not evaluated                 1.000000  0.090308  0.165657  454.000000
prioritarian                  0.272727  0.368421  0.313433   57.000000
sufficientarian               0.085714  0.375000  0.139535    8.000000
utilitarian                   0.195767  0.740000  0.309623   50.000000
accuracy                      0.220497  0.220497  0.220497    0.220497
macro avg                     0.263281  0.371003  0.195668  644.000000
weighted avg                  0.764210  0.220497  0.198753  644.000000


Classification Report for S2
                             precision    recall  f1-score     support
egalitarian                   0.229508  0.608696  0.333333   46.000000
general normative

Evaluate the consistency of predictions over the 5 different instances - see which sentences are not consistently predicted.


In [10]:
def columns_not_uniform(group, columns):
    """Tell whether any of `columns` fails to hold exactly one distinct value in `group`.

    Returns True as soon as a column's nunique() differs from 1, and False
    when every listed column has exactly one distinct value (or when
    `columns` is empty).
    """
    return any(group[name].nunique() != 1 for name in columns)

# Columns to check for uniformity (the four predicted-label columns)
columns_to_check = ['PRINCIPLE_y', 'UNIT_y', 'SHAPE_y','TOPIC_y']

# Group by 'unique_id' and filter groups: a group is KEPT when columns_not_uniform(...)
# is True, so B4_consistency holds the sentences whose predictions differ in at
# least one of the checked columns
B4_consistency = filtered_B4.groupby('unique_id').filter(lambda x: columns_not_uniform(x, columns_to_check))

B4_consistency.shape

(1895, 12)


In the combined dataframe, group by unique ID and take the majority label.

In [11]:
def most_frequent_except_principle(x):
    """Aggregate one column of a group, dispatching on the column name.

    For the predicted-label columns ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y',
    'TOPIC_y') the most common value is returned; for any other column the
    first value of the group is returned.
    """
    predicted_label_columns = ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y', 'TOPIC_y')
    if x.name not in predicted_label_columns:
        # Keep the first value for other columns
        return x.iloc[0]
    label_counts = Counter(x)
    top_label, _ = label_counts.most_common(1)[0]
    return top_label

# Dictionary to specify aggregation functions for all columns
# (every column except the grouping key is mapped to the same function)
agg_dict_all = {col: most_frequent_except_principle for col in filtered_B4.columns if col != 'unique_id'}

# Group by 'unique_id' and apply the aggregation functions;
# reset_index() turns 'unique_id' back into a regular column
filtered_B4_grouped = filtered_B4.groupby('unique_id').agg(agg_dict_all).reset_index()

filtered_B4_grouped

Unnamed: 0,unique_id,text,PRINCIPLE_x,TOPIC_x,UNIT_x,SHAPE_x,llm_query,PRINCIPLE_y,TOPIC_y,UNIT_y,SHAPE_y,iteration
0,3,Mr. President: A fair and effective framewor...,utilitarian,new UNFCCC policy,responsibility,equality,3 Mr. President: A fair and effective framew...,egalitarian,UNFCCC agreements and principles,responsibility,equality,1
1,4,"In this regard, Japan firmly supports the est...",not evaluated,not evaluated,not evaluated,not evaluated,"4 In this regard, Japan firmly supports the e...",general normative statement,new UNFCCC policy,not indicated,not indicated,1
2,5,Such a framework must be based on “nationally ...,egalitarian,new UNFCCC policy,responsibility,equity,5 Such a framework must be based on “nationall...,general normative statement,new UNFCCC policy,not indicated,not indicated,1
3,21,We will strategically promote mitigation meas...,not evaluated,not evaluated,not evaluated,not evaluated,21 We will strategically promote mitigation m...,general normative statement,mitigation,not indicated,not indicated,1
4,24,"Using that opportunity, we will launch a new ...",not evaluated,not evaluated,not evaluated,not evaluated,"24 Using that opportunity, we will launch a n...",general normative statement,new UNFCCC policy,not indicated,not indicated,1
...,...,...,...,...,...,...,...,...,...,...,...,...
639,1201,And COP28 is our moment to act.,not evaluated,not evaluated,not evaluated,not evaluated,1201 And COP28 is our moment to act.\n,general normative statement,urgency,not indicated,not indicated,1
640,1202,The first global stocktake of the Paris Agree...,not evaluated,not evaluated,not evaluated,not evaluated,1202 The first global stocktake of the Paris ...,general normative statement,UNFCCC agreements and principles,not indicated,not indicated,1
641,1204,We must deliver on our commitments.,general normative statement,UNFCCC agreements and principles,not indicated,proportional to commitment,1204 We must deliver on our commitments.\n,general normative statement,UNFCCC agreements and principles,responsibility,proportional to commitment,1
642,1205,"We need a course correction, and working toge...",utilitarian,cooperation,not indicated,not indicated,"1205 We need a course correction, and working...",utilitarian,cooperation,not indicated,not indicated,1


In [12]:
# Evaluate performance of this dataframe - principle
# (PRINCIPLE_x is passed as the true labels, PRINCIPLE_y as the predictions)
print(classification_report(filtered_B4_grouped['PRINCIPLE_x'],filtered_B4_grouped['PRINCIPLE_y']))

                             precision    recall  f1-score   support

                egalitarian       0.23      0.65      0.34        46
general normative statement       0.07      0.43      0.12        28
                libertarian       0.00      0.00      0.00         1
              not evaluated       1.00      0.09      0.17       454
               prioritarian       0.28      0.37      0.32        57
            sufficientarian       0.11      0.50      0.19         8
                utilitarian       0.19      0.72      0.30        50

                   accuracy                           0.22       644
                  macro avg       0.27      0.39      0.20       644
               weighted avg       0.76      0.22      0.20       644



In [13]:
# Evaluate performance of this dataframe - topic
# (TOPIC_x is passed as the true labels, TOPIC_y as the predictions)
print(classification_report(filtered_B4_grouped['TOPIC_x'],filtered_B4_grouped['TOPIC_y']))

                                  precision    recall  f1-score   support

UNFCCC agreements and principles       0.15      0.44      0.23        25
                      adaptation       0.03      1.00      0.06         1
       adaptation and mitigation       0.13      0.44      0.21         9
                     cooperation       0.13      0.71      0.22        21
            financial mechanisms       0.33      0.95      0.49        20
                      mitigation       0.06      0.67      0.11         6
               new UNFCCC policy       0.21      0.29      0.24        35
                   not evaluated       0.00      0.00      0.00       454
                           other       0.18      0.24      0.20        42
                         urgency       0.14      0.74      0.24        31

                        accuracy                           0.15       644
                       macro avg       0.14      0.55      0.20       644
                    weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Evaluate performance of this dataframe - unit
# (UNIT_x is passed as the true labels, UNIT_y as the predictions)
print(classification_report(filtered_B4_grouped['UNIT_x'],filtered_B4_grouped['UNIT_y']))

                                       precision    recall  f1-score   support

financial and technological resources       0.30      0.88      0.45         8
                  financial resources       0.37      0.96      0.53        27
                        not evaluated       1.00      0.03      0.06       454
                        not indicated       0.16      0.42      0.23        92
                                other       0.00      0.00      0.00         2
                       responsibility       0.17      0.80      0.28        45
                              support       0.19      0.67      0.29        15
              technological resources       0.06      1.00      0.11         1

                             accuracy                           0.21       644
                            macro avg       0.28      0.59      0.24       644
                         weighted avg       0.76      0.21      0.13       644



In [15]:
# Evaluate performance of this dataframe - shape
# (SHAPE_x is passed as the true labels, SHAPE_y as the predictions)
print(classification_report(filtered_B4_grouped['SHAPE_x'],filtered_B4_grouped['SHAPE_y']))

                              precision    recall  f1-score   support

                    equality       0.03      0.30      0.06        10
                      equity       0.24      0.43      0.31        14
                 needs based       0.05      0.75      0.10         4
               not evaluated       1.00      0.03      0.06       456
               not indicated       0.18      0.56      0.27       113
       priority to worst off       0.15      0.39      0.21        28
  proportional to commitment       0.34      0.65      0.45        17
proportional to contribution       0.17      0.50      0.25         2

                    accuracy                           0.17       644
                   macro avg       0.27      0.45      0.21       644
                weighted avg       0.76      0.17      0.12       644



# TEST SET 4.0

In [16]:
# B4.0 results for the test set, read from the STRING_RESULT directory
path_B4 = 'STRING_RESULT/B4.0/all_iterations'

# Open all dataframes: one CSV per file-name suffix
# (the number at the end of each file name is presumably the seed — TODO confirm)
S1 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3644.csv')
S2 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3441.csv')
S3 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_280.csv')
S4 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_5991.csv')
S5 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_7917.csv')


In [17]:
# Evaluate if predictions are correctly saved
# Combine all dataframes into one (still unfiltered); ignore_index=True renumbers the rows 0..n-1
B4_full_OG = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
# Last expression of the cell: displays (rows, columns) of the combined frame
B4_full_OG.shape

(1615, 12)

In [18]:
def filter_dataframe(df, principles=None, units=None, shapes=None, topics=None):
    """Keep only the rows whose predicted labels belong to the allowed label lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the prediction columns 'PRINCIPLE_y', 'UNIT_y', 'SHAPE_y'
        and 'TOPIC_y' (a column is only accessed when its filter is given).
    principles, units, shapes, topics : list of str, optional
        Allowed values for the corresponding prediction column. A filter that
        is None or empty is skipped, i.e. that column is left unrestricted.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that pass every supplied filter; `df` itself when no
        filter is supplied.
    """
    allowed_by_column = {
        'PRINCIPLE_y': principles,
        'UNIT_y': units,
        'SHAPE_y': shapes,
        # Use the `topics` argument here: reading a module-level `topic` list
        # instead would ignore the caller's value (or raise NameError when no
        # such global exists).
        'TOPIC_y': topics,
    }
    for column, allowed in allowed_by_column.items():
        if allowed:
            df = df[df[column].isin(allowed)]
    return df

# Allowed label vocabularies for the four predicted columns.
# ('not evaluated' appears twice in `principle`; this is harmless for .isin().)
principle = ['egalitarian', 'general normative statement', 'not evaluated', 'libertarian', 'not evaluated','prioritarian', 'sufficientarian', 'utilitarian']
unit = ['not indicated', 'not evaluated', 'responsibility','financial resources', 'technological resources', 'financial and technological resources', 'support', 'other']
shape = ['not indicated', 'not evaluated', 'equity', 'equality','priority to worst off', 'needs based', 'proportional to contribution', 'proportional to commitment']
topic = ['new UNFCCC policy', 'UNFCCC agreements and principles', 'urgency', 'cooperation', 'financial mechanisms', 'adaptation', 'mitigation', 'adaptation and mitigation', 'other']

# Apply the same vocabulary filter to each per-seed frame, replacing S1..S5.
S1, S2, S3, S4, S5 = [
    filter_dataframe(seed_df, principles=principle, units=unit, shapes=shape, topics=topic)
    for seed_df in (S1, S2, S3, S4, S5)
]


______________________________________


In [21]:
# Combine all dataframes into one
# (when the cells run in order, S1..S5 are the label-filtered frames at this point)
B4_full = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
B4_full.shape

(911, 12)

In [22]:
# Filter rows to keep only those where the unique_id appears exactly 5 times
# (presumably: the sentence survived the label filter in all five frames,
# assuming each frame holds at most one row per unique_id — TODO confirm)
filtered_B4 = B4_full.groupby('unique_id').filter(lambda x: len(x) == 5)
filtered_B4.shape

(710, 12)

In [23]:
# List of unique_id's that are annotated 5 times
# (the distinct ids present in filtered_B4, converted to a plain Python list)
unique_ids = filtered_B4['unique_id'].unique().tolist()

In [24]:
# Restrict every per-seed frame to the sentences whose unique_id is in `unique_ids`
FB1S1, FB1S2, FB1S3, FB1S4, FB1S5 = [
    seed_df[seed_df['unique_id'].isin(unique_ids)]
    for seed_df in (S1, S2, S3, S4, S5)
]

In [25]:
# Evaluate performance of individual seeds for the same sentences
# Function to generate classification report for each dataframe
def generate_classification_report(df, true_label_col, pred_label_col):
    """Return the classification report for two label columns of `df` as a dict.

    `true_label_col` names the column passed as the true labels and
    `pred_label_col` the column passed as the predictions; the report is
    requested with output_dict=True.
    """
    y_true = df[true_label_col]
    y_pred = df[pred_label_col]
    return classification_report(y_true, y_pred, output_dict=True)

# Per-seed PRINCIPLE reports (true labels: PRINCIPLE_x, predictions: PRINCIPLE_y)
report_S1, report_S2, report_S3, report_S4, report_S5 = [
    generate_classification_report(seed_df, 'PRINCIPLE_x', 'PRINCIPLE_y')
    for seed_df in (FB1S1, FB1S2, FB1S3, FB1S4, FB1S5)
]

# Function to print the classification reports for easy comparison
def print_classification_report(report, title):
    """Print a titled classification report, one row per label/average.

    `report` is converted to a DataFrame and transposed before printing;
    `title` is interpolated into the heading line. A blank-line separator is
    printed afterwards.
    """
    heading = f"Classification Report for {title}"
    print(heading)
    per_label_table = pd.DataFrame(report).T
    print(per_label_table)
    print("\n")

# Print the classification reports, one per seed, in seed order
for seed_label, seed_report in zip(
    ("S1", "S2", "S3", "S4", "S5"),
    (report_S1, report_S2, report_S3, report_S4, report_S5),
):
    print_classification_report(seed_report, seed_label)

Classification Report for S1
                             precision    recall  f1-score     support
egalitarian                   0.322581  0.666667  0.434783   15.000000
general normative statement   0.065217  0.250000  0.103448   12.000000
libertarian                   0.000000  0.000000  0.000000    2.000000
not evaluated                 1.000000  0.088235  0.162162  102.000000
prioritarian                  0.272727  0.600000  0.375000    5.000000
sufficientarian               0.000000  0.000000  0.000000    1.000000
utilitarian                   0.055556  0.400000  0.097561    5.000000
accuracy                      0.190141  0.190141  0.190141    0.190141
macro avg                     0.245154  0.286415  0.167565  142.000000
weighted avg                  0.769456  0.190141  0.187792  142.000000


Classification Report for S2
                             precision    recall  f1-score     support
egalitarian                   0.354839  0.733333  0.478261   15.000000
general normative

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluate the consistency of predictions over the 5 different instances - see which sentences are not consistently predicted.


In [26]:
def columns_not_uniform(group, columns):
    """Tell whether any of `columns` fails to hold exactly one distinct value in `group`.

    Returns True as soon as a column's nunique() differs from 1, and False
    when every listed column has exactly one distinct value (or when
    `columns` is empty).
    """
    return any(group[name].nunique() != 1 for name in columns)

# Columns to check for uniformity (the four predicted-label columns)
columns_to_check = ['PRINCIPLE_y', 'UNIT_y', 'SHAPE_y','TOPIC_y']

# Group by 'unique_id' and filter groups: a group is KEPT when columns_not_uniform(...)
# is True, so B4_consistency holds the sentences whose predictions differ in at
# least one of the checked columns
B4_consistency = filtered_B4.groupby('unique_id').filter(lambda x: columns_not_uniform(x, columns_to_check))

B4_consistency.shape

(430, 12)


In the combined dataframe, group by unique ID and take the majority label.

In [27]:
def most_frequent_except_principle(x):
    """Aggregate one column of a group, dispatching on the column name.

    For the predicted-label columns ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y',
    'TOPIC_y') the most common value is returned; for any other column the
    first value of the group is returned.
    """
    predicted_label_columns = ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y', 'TOPIC_y')
    if x.name not in predicted_label_columns:
        # Keep the first value for other columns
        return x.iloc[0]
    label_counts = Counter(x)
    top_label, _ = label_counts.most_common(1)[0]
    return top_label

# Dictionary to specify aggregation functions for all columns
# (every column except the grouping key is mapped to the same function)
agg_dict_all = {col: most_frequent_except_principle for col in filtered_B4.columns if col != 'unique_id'}

# Group by 'unique_id' and apply the aggregation functions;
# reset_index() turns 'unique_id' back into a regular column
filtered_B4_grouped = filtered_B4.groupby('unique_id').agg(agg_dict_all).reset_index()

filtered_B4_grouped

Unnamed: 0,unique_id,text,PRINCIPLE_x,TOPIC_x,UNIT_x,SHAPE_x,llm_query,PRINCIPLE_y,TOPIC_y,UNIT_y,SHAPE_y,iteration
0,2," Of course, scientists dont know that clima...",not evaluated,not evaluated,not evaluated,not evaluated,"2  Of course, scientists dont know that cli...",utilitarian,urgency,not indicated,not indicated,1
1,3, It is for this reason that President Obama ...,not evaluated,not evaluated,not evaluated,not evaluated,3  It is for this reason that President Obam...,utilitarian,cooperation,responsibility,not indicated,1
2,12, The U.S. also continues to play an importan...,not evaluated,not evaluated,not evaluated,not evaluated,12  The U.S. also continues to play an impor...,egalitarian,financial mechanisms,financial resources,proportional to commitment,1
3,13,"• Simultaneously, we are fully engaged in cra...",not evaluated,not evaluated,not evaluated,not evaluated,"13 • Simultaneously, we are fully engaged in ...",general normative statement,new UNFCCC policy,not indicated,not indicated,1
4,14,We have advocated a structure for the new agre...,egalitarian,new UNFCCC policy,responsibility,equality,14 We have advocated a structure for the new a...,egalitarian,new UNFCCC policy,not indicated,equality,1
...,...,...,...,...,...,...,...,...,...,...,...,...
137,318,"Especially with Africa , which harnesses 40% ...",not evaluated,not evaluated,not evaluated,not evaluated,"318 Especially with Africa , which harnesses ...",utilitarian,cooperation,technological resources,not indicated,1
138,325,"Mr. President, Your Excellencies, Ladies and...",egalitarian,cooperation,not indicated,not indicated,"325 Mr. President, Your Excellencies, Ladies...",general normative statement,cooperation,not indicated,not indicated,1
139,326,The most important result of COP28 that I wou...,not evaluated,not evaluated,not evaluated,not evaluated,326 The most important result of COP28 that I...,general normative statement,new UNFCCC policy,not indicated,not indicated,1
140,327,"Trust that “WE” , governments together with o...",not evaluated,not evaluated,not evaluated,not evaluated,"327 Trust that “WE” , governments together wi...",egalitarian,UNFCCC agreements and principles,responsibility,equality,1


In [28]:
# Evaluate performance of this dataframe - principle
# (PRINCIPLE_x is passed as the true labels, PRINCIPLE_y as the predictions)
print(classification_report(filtered_B4_grouped['PRINCIPLE_x'],filtered_B4_grouped['PRINCIPLE_y']))

                             precision    recall  f1-score   support

                egalitarian       0.34      0.73      0.47        15
general normative statement       0.09      0.25      0.13        12
                libertarian       0.00      0.00      0.00         2
              not evaluated       1.00      0.14      0.24       102
               prioritarian       0.27      0.60      0.37         5
            sufficientarian       0.00      0.00      0.00         1
                utilitarian       0.05      0.40      0.09         5

                   accuracy                           0.23       142
                  macro avg       0.25      0.30      0.19       142
               weighted avg       0.77      0.23      0.25       142



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Evaluate performance of this dataframe - topic
# (TOPIC_x is passed as the true labels, TOPIC_y as the predictions)
# NOTE(review): this cell's execution count (20) is out of order, and its saved
# output reports a support of 644 while the other reports in this section
# report 142 — the stored output looks stale; re-run this cell after the
# aggregation cell above.
print(classification_report(filtered_B4_grouped['TOPIC_x'],filtered_B4_grouped['TOPIC_y']))

                                  precision    recall  f1-score   support

UNFCCC agreements and principles       0.15      0.44      0.23        25
                      adaptation       0.03      1.00      0.06         1
       adaptation and mitigation       0.13      0.44      0.21         9
                     cooperation       0.13      0.71      0.22        21
            financial mechanisms       0.33      0.95      0.49        20
                      mitigation       0.06      0.67      0.11         6
               new UNFCCC policy       0.21      0.29      0.24        35
                   not evaluated       0.00      0.00      0.00       454
                           other       0.18      0.24      0.20        42
                         urgency       0.14      0.74      0.24        31

                        accuracy                           0.15       644
                       macro avg       0.14      0.55      0.20       644
                    weighted avg    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# Evaluate performance of this dataframe - unit
# (UNIT_x is passed as the true labels, UNIT_y as the predictions)
print(classification_report(filtered_B4_grouped['UNIT_x'],filtered_B4_grouped['UNIT_y']))

                                       precision    recall  f1-score   support

financial and technological resources       0.50      0.50      0.50         2
                  financial resources       0.41      1.00      0.58         9
                        not evaluated       0.00      0.00      0.00       102
                        not indicated       0.10      0.46      0.16        13
                                other       0.00      0.00      0.00         3
                       responsibility       0.18      0.73      0.29        11
                              support       0.20      1.00      0.33         1
              technological resources       0.12      1.00      0.22         1

                             accuracy                           0.18       142
                            macro avg       0.19      0.59      0.26       142
                         weighted avg       0.06      0.18      0.09       142



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
# Evaluate performance of this dataframe - shape
# (SHAPE_x is passed as the true labels, SHAPE_y as the predictions)
print(classification_report(filtered_B4_grouped['SHAPE_x'],filtered_B4_grouped['SHAPE_y']))

                              precision    recall  f1-score   support

                    equality       0.19      0.83      0.30         6
                      equity       0.50      1.00      0.67         3
                 needs based       0.22      0.67      0.33         3
               not evaluated       0.00      0.00      0.00       102
               not indicated       0.11      0.43      0.18        21
       priority to worst off       0.27      0.75      0.40         4
  proportional to commitment       0.00      0.00      0.00         1
proportional to contribution       0.25      0.50      0.33         2

                    accuracy                           0.16       142
                   macro avg       0.19      0.52      0.28       142
                weighted avg       0.05      0.16      0.08       142



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-------------------------------------------------------

# B4.1

In [2]:
# B4.1 results, read from the STRING_RESULT_TRAIN directory (presumably the training split — confirm)
path_B4 = 'STRING_RESULT_TRAIN/B4.1/all_iterations'

# Open all dataframes: one CSV per file-name suffix
# (the number at the end of each file name is presumably the seed — TODO confirm)
S1 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3644.csv')
S2 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_3441.csv')
S3 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_280.csv')
S4 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_5991.csv')
S5 = pd.read_csv(f'{path_B4}/all_iterations_string_T0_7917.csv')


In [3]:
# Display (rows, columns) of the first loaded frame as a quick sanity check
S1.shape

(218, 12)

In [4]:
# Evaluate if predictions are correctly saved
# Combine all dataframes into one (still unfiltered); ignore_index=True renumbers the rows 0..n-1
B4_full_OG = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
# Last expression of the cell: displays (rows, columns) of the combined frame
B4_full_OG.shape

(1070, 12)

In [5]:
def filter_dataframe(df, principles=None, units=None, shapes=None, topics=None):
    """Keep only the rows whose predicted labels belong to the allowed label lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the prediction columns 'PRINCIPLE_y', 'UNIT_y', 'SHAPE_y'
        and 'TOPIC_y' (a column is only accessed when its filter is given).
    principles, units, shapes, topics : list of str, optional
        Allowed values for the corresponding prediction column. A filter that
        is None or empty is skipped, i.e. that column is left unrestricted.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that pass every supplied filter; `df` itself when no
        filter is supplied.
    """
    allowed_by_column = {
        'PRINCIPLE_y': principles,
        'UNIT_y': units,
        'SHAPE_y': shapes,
        # Use the `topics` argument here: reading a module-level `topic` list
        # instead would ignore the caller's value (or raise NameError when no
        # such global exists).
        'TOPIC_y': topics,
    }
    for column, allowed in allowed_by_column.items():
        if allowed:
            df = df[df[column].isin(allowed)]
    return df

# Allowed label vocabularies for the four predicted columns.
# NOTE(review): 'not evaluated' is allowed for PRINCIPLE_y but not for
# UNIT_y / SHAPE_y / TOPIC_y here — confirm this asymmetry is intended.
principle = ['egalitarian', 'general normative statement', 'libertarian', 'not evaluated','prioritarian', 'sufficientarian', 'utilitarian']
unit = ['not indicated', 'responsibility','financial resources', 'technological resources', 'financial and technological resources', 'support', 'other']
shape = ['not indicated', 'equity', 'equality','priority to worst off', 'needs based', 'proportional to contribution', 'proportional to commitment']
topic = ['new UNFCCC policy', 'UNFCCC agreements and principles', 'urgency', 'cooperation', 'financial mechanisms', 'adaptation', 'mitigation', 'adaptation and mitigation', 'other']

# Apply the same vocabulary filter to each per-seed frame, replacing S1..S5.
S1, S2, S3, S4, S5 = [
    filter_dataframe(seed_df, principles=principle, units=unit, shapes=shape, topics=topic)
    for seed_df in (S1, S2, S3, S4, S5)
]


______________________________________


In [6]:
# Combine all dataframes into one
# (when the cells run in order, S1..S5 are the label-filtered frames at this point)
B4_full = pd.concat([S1, S2, S3, S4, S5], ignore_index=True)
B4_full.shape

(1057, 12)

In [7]:
# Filter rows to keep only those where the unique_id appears exactly 5 times
# (presumably: the sentence survived the label filter in all five frames,
# assuming each frame holds at most one row per unique_id — TODO confirm)
filtered_B4 = B4_full.groupby('unique_id').filter(lambda x: len(x) == 5)
filtered_B4.shape

(970, 12)

In [8]:
# List of unique_id's that are annotated 5 times
# (the distinct ids present in filtered_B4, converted to a plain Python list)
unique_ids = filtered_B4['unique_id'].unique().tolist()

In [9]:
# Restrict every per-seed frame to the sentences whose unique_id is in `unique_ids`
FB1S1, FB1S2, FB1S3, FB1S4, FB1S5 = [
    seed_df[seed_df['unique_id'].isin(unique_ids)]
    for seed_df in (S1, S2, S3, S4, S5)
]

In [10]:
# Evaluate performance of individual seeds for the same sentences
# Function to generate classification report for each dataframe
def generate_classification_report(df, true_label_col, pred_label_col):
    """Return the classification report for two label columns of `df` as a dict.

    `true_label_col` names the column passed as the true labels and
    `pred_label_col` the column passed as the predictions; the report is
    requested with output_dict=True.
    """
    y_true = df[true_label_col]
    y_pred = df[pred_label_col]
    return classification_report(y_true, y_pred, output_dict=True)

# Per-seed PRINCIPLE reports (true labels: PRINCIPLE_x, predictions: PRINCIPLE_y)
report_S1, report_S2, report_S3, report_S4, report_S5 = [
    generate_classification_report(seed_df, 'PRINCIPLE_x', 'PRINCIPLE_y')
    for seed_df in (FB1S1, FB1S2, FB1S3, FB1S4, FB1S5)
]

# Function to print the classification reports for easy comparison
def print_classification_report(report, title):
    """Print a titled classification report as a table.

    ``report`` is turned into a DataFrame and transposed before printing;
    the table is followed by two empty lines. Nothing is returned.
    """
    header = f"Classification Report for {title}"
    print(header)
    table = pd.DataFrame(report).T
    # Table, its own newline, then the two blank separator lines
    print(table, end="\n\n\n")

# Print the per-seed reports one after another for comparison
for _seed_title, _seed_report in zip(
    ("S1", "S2", "S3", "S4", "S5"),
    (report_S1, report_S2, report_S3, report_S4, report_S5),
):
    print_classification_report(_seed_report, _seed_title)

Classification Report for S1
                             precision    recall  f1-score     support
egalitarian                   0.543860  0.645833  0.590476   48.000000
general normative statement   0.400000  0.258065  0.313725   31.000000
libertarian                   0.000000  0.000000  0.000000    1.000000
prioritarian                  0.823529  0.500000  0.622222   56.000000
sufficientarian               0.294118  0.714286  0.416667    7.000000
utilitarian                   0.538462  0.686275  0.603448   51.000000
accuracy                      0.551546  0.551546  0.551546    0.551546
macro avg                     0.433328  0.467410  0.424423  194.000000
weighted avg                  0.588367  0.551546  0.549512  194.000000


Classification Report for S2
                             precision    recall  f1-score     support
egalitarian                   0.555556  0.625000  0.588235   48.000000
general normative statement   0.333333  0.193548  0.244898   31.000000
libertarian      

Evaluate the consistency of predictions over the 5 different instances - see which sentences are not consistently predicted.


In [11]:
def columns_not_uniform(group, columns):
    """Return True if any of ``columns`` does not hold exactly one distinct value in ``group``.

    Distinct values are counted with ``nunique()``. Returns False when every
    listed column has exactly one distinct value (or when ``columns`` is empty).
    """
    return any(group[col].nunique() != 1 for col in columns)

# Predicted-label columns that are compared within each unique_id group
columns_to_check = ['PRINCIPLE_y', 'UNIT_y', 'SHAPE_y','TOPIC_y']

# Keep the unique_id groups in which at least one of those columns is not uniform
B4_consistency = (
    filtered_B4
    .groupby('unique_id')
    .filter(lambda seed_rows: columns_not_uniform(seed_rows, columns_to_check))
)

B4_consistency.shape

(430, 12)


In the combined dataframe, group by unique ID and take the majority label.

In [12]:
def most_frequent_except_principle(x):
    """Reduce one column (a Series) to a single value.

    For the predicted-label columns (``PRINCIPLE_y``, ``UNIT_y``, ``SHAPE_y``,
    ``TOPIC_y``) the most common value is returned; with equal counts the
    value encountered first wins. Any other column yields its first value.
    """
    majority_vote_columns = ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y', 'TOPIC_y')
    if x.name not in majority_vote_columns:
        # Not a prediction column: keep the first value
        return x.iloc[0]
    top_entry = Counter(x).most_common(1)
    return top_entry[0][0]

# Use the same aggregation function for every column except the grouping key
agg_dict_all = dict.fromkeys(
    (col for col in filtered_B4.columns if col != 'unique_id'),
    most_frequent_except_principle,
)

# Collapse each unique_id group to one row and turn unique_id back into a column
filtered_B4_grouped = (
    filtered_B4
    .groupby('unique_id')
    .agg(agg_dict_all)
    .reset_index()
)

filtered_B4_grouped

Unnamed: 0,unique_id,text,PRINCIPLE_x,TOPIC_x,UNIT_x,SHAPE_x,llm_query,PRINCIPLE_y,TOPIC_y,UNIT_y,SHAPE_y,iteration
0,3,Mr. President: A fair and effective framewor...,utilitarian,new UNFCCC policy,responsibility,equality,3 Mr. President: A fair and effective framew...,egalitarian,cooperation,responsibility,equality,1
1,5,Such a framework must be based on “nationally ...,egalitarian,new UNFCCC policy,responsibility,equity,5 Such a framework must be based on “nationall...,general normative statement,new UNFCCC policy,not indicated,not indicated,1
2,44,It should not only enable us to discuss global...,utilitarian,new UNFCCC policy,not indicated,not indicated,44 It should not only enable us to discuss glo...,utilitarian,urgency,not indicated,not indicated,1
3,53,Global warming is a catastrophic problem that ...,utilitarian,urgency,not indicated,not indicated,53 Global warming is a catastrophic problem th...,utilitarian,urgency,not indicated,not indicated,1
4,54,"Therefore, the multilateralism approach remain...",general normative statement,new UNFCCC policy,not indicated,not indicated,"54 Therefore, the multilateralism approach rem...",utilitarian,cooperation,not indicated,not indicated,1
...,...,...,...,...,...,...,...,...,...,...,...,...
189,1172,As we work to catch up on lost time and progr...,prioritarian,urgency,not indicated,not indicated,1172 As we work to catch up on lost time and ...,prioritarian,cooperation,not indicated,priority to worst off,1
190,1173,"Conflict -ridden communities, refugees, and d...",prioritarian,new UNFCCC policy,not indicated,not indicated,"1173 Conflict -ridden communities, refugees, ...",prioritarian,cooperation,not indicated,priority to worst off,1
191,1174,"Nor can we stand by , as the massive destructi...",utilitarian,other,not indicated,not indicated,"1174 Nor can we stand by , as the massive dest...",utilitarian,cooperation,not indicated,not indicated,1
192,1198,We recognise that we must deliver on our coll...,prioritarian,financial mechanisms,financial resources,proportional to commitment,1198 We recognise that we must deliver on our...,sufficientarian,financial mechanisms,financial resources,proportional to commitment,1


In [13]:
# Score the aggregated predictions (PRINCIPLE_y) against PRINCIPLE_x - principle
report_principle = classification_report(
    y_true=filtered_B4_grouped['PRINCIPLE_x'],
    y_pred=filtered_B4_grouped['PRINCIPLE_y'],
    output_dict=True,
)
df_principle = pd.DataFrame(report_principle).T
df_principle

Unnamed: 0,precision,recall,f1-score,support
egalitarian,0.534483,0.645833,0.584906,48.0
general normative statement,0.352941,0.193548,0.25,31.0
libertarian,0.0,0.0,0.0,1.0
prioritarian,0.833333,0.535714,0.652174,56.0
sufficientarian,0.3125,0.714286,0.434783,7.0
utilitarian,0.530303,0.686275,0.598291,51.0
accuracy,0.551546,0.551546,0.551546,0.551546
macro avg,0.42726,0.462609,0.420025,194.0
weighted avg,0.579876,0.551546,0.545894,194.0


In [14]:
# Score the aggregated predictions (TOPIC_y) against TOPIC_x - topic
report_topic = classification_report(
    y_true=filtered_B4_grouped['TOPIC_x'],
    y_pred=filtered_B4_grouped['TOPIC_y'],
    output_dict=True,
)
df_topic = pd.DataFrame(report_topic).T
df_topic

Unnamed: 0,precision,recall,f1-score,support
UNFCCC agreements and principles,0.666667,0.357143,0.465116,28.0
adaptation,0.166667,1.0,0.285714,1.0
adaptation and mitigation,0.625,0.555556,0.588235,9.0
cooperation,0.386364,0.85,0.53125,20.0
financial mechanisms,0.617647,1.0,0.763636,21.0
mitigation,0.307692,0.666667,0.421053,6.0
new UNFCCC policy,0.545455,0.387097,0.45283,31.0
other,0.736842,0.325581,0.451613,43.0
urgency,0.575758,0.542857,0.558824,35.0
accuracy,0.530928,0.530928,0.530928,0.530928


In [17]:
# Evaluate performance of this dataframe - unit
# zero_division=0 gives the same scores as the default ("warn") without
# emitting an UndefinedMetricWarning for labels with no predicted or no true samples.
report_unit = classification_report(
    filtered_B4_grouped['UNIT_x'],
    filtered_B4_grouped['UNIT_y'],
    output_dict=True,
    zero_division=0,
)
df_unit = pd.DataFrame(report_unit).transpose()
df_unit

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
financial and technological resources,0.461538,0.857143,0.6,7.0
financial resources,0.966667,0.966667,0.966667,30.0
not indicated,0.848837,0.737374,0.789189,99.0
other,0.0,0.0,0.0,3.0
responsibility,0.591837,0.707317,0.644444,41.0
support,0.5,0.5,0.5,12.0
technological resources,0.5,1.0,0.666667,2.0
accuracy,0.747423,0.747423,0.747423,0.747423
macro avg,0.552697,0.681214,0.595281,194.0
weighted avg,0.760469,0.747423,0.747862,194.0


In [18]:
# Evaluate performance of this dataframe - shape
# zero_division=0 gives the same scores as the default ("warn") without
# emitting an UndefinedMetricWarning for labels with no predicted or no true samples.
report_shape = classification_report(
    filtered_B4_grouped['SHAPE_x'],
    filtered_B4_grouped['SHAPE_y'],
    output_dict=True,
    zero_division=0,
)
df_shape = pd.DataFrame(report_shape).transpose()
df_shape

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
equality,0.176471,0.666667,0.27907,9.0
equity,0.428571,0.5,0.461538,12.0
needs based,0.176471,0.75,0.285714,4.0
not evaluated,0.0,0.0,0.0,2.0
not indicated,0.9,0.6,0.72,120.0
priority to worst off,0.5,0.5,0.5,26.0
proportional to commitment,0.611111,0.6875,0.647059,16.0
proportional to contribution,0.8,0.8,0.8,5.0
accuracy,0.592784,0.592784,0.592784,0.592784
macro avg,0.449078,0.563021,0.461673,194.0


________________________________________________________________________________________________
# B4.1 TEST

In [56]:
# B4.1 (test) results
path_B4 = 'STRING_RESULT/B4.1/all_iterations'

# Read the five result files in this folder into S1..S5
S1, S2, S3, S4, S5 = (
    pd.read_csv(f'{path_B4}/{csv_name}')
    for csv_name in (
        'all_iterations_string_T0_3644.csv',
        'all_iterations_string_T0_3441.csv',
        'all_iterations_string_T0_280.csv',
        'all_iterations_string_T0_5991.csv',
        'all_iterations_string_T0_7917.csv',
    )
)


In [57]:
# Check that the predictions were saved correctly: stack the unfiltered
# per-seed frames into one frame and look at its size
B4_full_OG = pd.concat(
    objs=[S1, S2, S3, S4, S5],
    ignore_index=True,
)
B4_full_OG.shape

(216, 12)

In [58]:
def filter_dataframe(df, principles=None, units=None, shapes=None, topics=None):
    """Keep only the rows whose predicted labels are in the allowed label sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the predicted-label columns ``PRINCIPLE_y``, ``UNIT_y``,
        ``SHAPE_y`` and ``TOPIC_y`` (only the columns of active filters are read).
    principles, units, shapes, topics : list, optional
        Allowed values for the matching ``*_y`` column. A filter that is
        ``None`` or empty is skipped.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass every active filter.
    """
    # (column, allowed values) pairs, applied one after the other
    allowed_by_column = (
        ('PRINCIPLE_y', principles),
        ('UNIT_y', units),
        ('SHAPE_y', shapes),
        # Uses the `topics` argument; the earlier version read the global
        # `topic` list here, which ignored the argument.
        ('TOPIC_y', topics),
    )
    for column, allowed in allowed_by_column:
        if allowed:
            df = df[df[column].isin(allowed)]
    return df

# Allowed label values per annotation dimension, passed to filter_dataframe
# ('not evaluated' was listed twice in `principle`; the duplicate is removed)
principle = ['egalitarian', 'general normative statement', 'not evaluated', 'libertarian', 'prioritarian', 'sufficientarian', 'utilitarian']
unit = ['not indicated', 'not evaluated', 'responsibility','financial resources', 'technological resources', 'financial and technological resources', 'support', 'other']
shape = ['not indicated', 'not evaluated', 'equity', 'equality','priority to worst off', 'needs based', 'proportional to contribution', 'proportional to commitment']
topic = ['new UNFCCC policy', 'UNFCCC agreements and principles', 'urgency', 'cooperation', 'financial mechanisms', 'adaptation', 'mitigation', 'adaptation and mitigation', 'other']

# Drop, for every seed, the rows whose predicted labels are outside the allowed label lists
S1, S2, S3, S4, S5 = (
    filter_dataframe(seed_df, principles=principle, units=unit, shapes=shape, topics=topic)
    for seed_df in (S1, S2, S3, S4, S5)
)


______________________________________


In [59]:
# Stack the five filtered per-seed frames into one frame with a fresh row index
B4_full = pd.concat(
    objs=[S1, S2, S3, S4, S5],
    ignore_index=True,
)
B4_full.shape

(204, 12)

In [60]:
# Keep only the rows whose unique_id occurs exactly 5 times in the combined
# frame (five seed files were concatenated above)
filtered_B4 = (
    B4_full
    .groupby('unique_id')
    .filter(lambda seed_rows: seed_rows.shape[0] == 5)
)
filtered_B4.shape

(90, 12)

In [61]:
# unique_id values (in order of first appearance) that kept all 5 rows
unique_ids = pd.unique(filtered_B4['unique_id']).tolist()

In [62]:
# Restrict each per-seed frame to the sentences that are present 5x in the combined frame
FB1S1, FB1S2, FB1S3, FB1S4, FB1S5 = (
    seed_df[seed_df['unique_id'].isin(unique_ids)]
    for seed_df in (S1, S2, S3, S4, S5)
)

In [63]:
# Evaluate performance of individual seeds for the same sentences
# Function to generate classification report for each dataframe
def generate_classification_report(df, true_label_col, pred_label_col):
    """Build a classification report for one dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both label columns.
    true_label_col : str
        Column passed to ``classification_report`` as the true labels.
    pred_label_col : str
        Column passed to ``classification_report`` as the predicted labels.

    Returns
    -------
    dict
        The report returned by ``classification_report(..., output_dict=True)``.
    """
    # zero_division=0 produces the same scores as the default ("warn") but
    # does not emit an UndefinedMetricWarning for every label that has no
    # predicted or no true samples.
    return classification_report(
        df[true_label_col],
        df[pred_label_col],
        output_dict=True,
        zero_division=0,
    )

# One principle report per seed, all computed on the same set of sentences
report_S1, report_S2, report_S3, report_S4, report_S5 = (
    generate_classification_report(seed_df, 'PRINCIPLE_x', 'PRINCIPLE_y')
    for seed_df in (FB1S1, FB1S2, FB1S3, FB1S4, FB1S5)
)

# Function to print the classification reports for easy comparison
def print_classification_report(report, title):
    """Print a titled classification report as a table.

    ``report`` is turned into a DataFrame and transposed before printing;
    the table is followed by two empty lines. Nothing is returned.
    """
    header = f"Classification Report for {title}"
    print(header)
    table = pd.DataFrame(report).T
    # Table, its own newline, then the two blank separator lines
    print(table, end="\n\n\n")

# Print the per-seed reports one after another for comparison
for _seed_title, _seed_report in zip(
    ("S1", "S2", "S3", "S4", "S5"),
    (report_S1, report_S2, report_S3, report_S4, report_S5),
):
    print_classification_report(_seed_report, _seed_title)

Classification Report for S1
                             precision    recall  f1-score    support
egalitarian                   0.500000  0.800000  0.615385   5.000000
general normative statement   0.500000  0.142857  0.222222   7.000000
libertarian                   0.000000  0.000000  0.000000   1.000000
prioritarian                  0.500000  0.500000  0.500000   2.000000
sufficientarian               0.000000  0.000000  0.000000   0.000000
utilitarian                   0.333333  0.333333  0.333333   3.000000
accuracy                      0.388889  0.388889  0.388889   0.388889
macro avg                     0.305556  0.296032  0.278490  18.000000
weighted avg                  0.444444  0.388889  0.368471  18.000000


Classification Report for S2
                             precision    recall  f1-score    support
egalitarian                   0.444444  0.800000  0.571429   5.000000
general normative statement   0.500000  0.142857  0.222222   7.000000
libertarian                   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Evaluate the consistency of predictions over the 5 different instances - see which sentences are not consistently predicted.


In [64]:
def columns_not_uniform(group, columns):
    """Return True if any of ``columns`` does not hold exactly one distinct value in ``group``.

    Distinct values are counted with ``nunique()``. Returns False when every
    listed column has exactly one distinct value (or when ``columns`` is empty).
    """
    return any(group[col].nunique() != 1 for col in columns)

# Predicted-label columns that are compared within each unique_id group
columns_to_check = ['PRINCIPLE_y', 'UNIT_y', 'SHAPE_y','TOPIC_y']

# Keep the unique_id groups in which at least one of those columns is not uniform
B4_consistency = (
    filtered_B4
    .groupby('unique_id')
    .filter(lambda seed_rows: columns_not_uniform(seed_rows, columns_to_check))
)

B4_consistency.shape

(30, 12)


In the combined dataframe, group by unique ID and take the majority label.

In [65]:
def most_frequent_except_principle(x):
    """Reduce one column (a Series) to a single value.

    For the predicted-label columns (``PRINCIPLE_y``, ``UNIT_y``, ``SHAPE_y``,
    ``TOPIC_y``) the most common value is returned; with equal counts the
    value encountered first wins. Any other column yields its first value.
    """
    majority_vote_columns = ('PRINCIPLE_y', 'UNIT_y', 'SHAPE_y', 'TOPIC_y')
    if x.name not in majority_vote_columns:
        # Not a prediction column: keep the first value
        return x.iloc[0]
    top_entry = Counter(x).most_common(1)
    return top_entry[0][0]

# Use the same aggregation function for every column except the grouping key
agg_dict_all = dict.fromkeys(
    (col for col in filtered_B4.columns if col != 'unique_id'),
    most_frequent_except_principle,
)

# Collapse each unique_id group to one row and turn unique_id back into a column
filtered_B4_grouped = (
    filtered_B4
    .groupby('unique_id')
    .agg(agg_dict_all)
    .reset_index()
)

filtered_B4_grouped

Unnamed: 0,unique_id,text,PRINCIPLE_x,TOPIC_x,UNIT_x,SHAPE_x,llm_query,PRINCIPLE_y,TOPIC_y,UNIT_y,SHAPE_y,iteration
0,14,We have advocated a structure for the new agre...,egalitarian,new UNFCCC policy,responsibility,equality,14 We have advocated a structure for the new a...,egalitarian,new UNFCCC policy,responsibility,equality,1
1,15,"• This kind of structure, based on a spectrum...",utilitarian,UNFCCC agreements and principles,responsibility,equity,"15 • This kind of structure, based on a spect...",egalitarian,mitigation,responsibility,equity,1
2,16,"• By contrast, an agreement based on 1992 cat...",libertarian,UNFCCC agreements and principles,not indicated,not indicated,"16 • By contrast, an agreement based on 1992 ...",general normative statement,new UNFCCC policy,not indicated,not indicated,1
3,18,If those categories are to beoperational in ch...,general normative statement,new UNFCCC policy,responsibility,equity,18 If those categories are to beoperational in...,egalitarian,new UNFCCC policy,responsibility,equity,1
4,25,"• Let us work together, mindful of our mutual...",egalitarian,cooperation,responsibility,not indicated,"25 • Let us work together, mindful of our mut...",egalitarian,cooperation,responsibility,equality,1
5,35,"The major polluters, especially those who are ...",general normative statement,UNFCCC agreements and principles,responsibility,proportional to contribution,"35 The major polluters, especially those who a...",prioritarian,urgency,responsibility,priority to worst off,1
6,38,Pledges to the Green Climate Fund have now pas...,prioritarian,financial mechanisms,financial resources,priority to worst off,38 Pledges to the Green Climate Fund have now ...,sufficientarian,financial mechanisms,financial resources,needs based,1
7,40,We call on our partners to deliver the large s...,general normative statement,financial mechanisms,financial resources,not indicated,40 We call on our partners to deliver the larg...,sufficientarian,financial mechanisms,financial resources,needs based,1
8,41,Loss and damage should also be included as an ...,general normative statement,other,not indicated,not indicated,41 Loss and damage should also be included as ...,general normative statement,new UNFCCC policy,not indicated,not indicated,1
9,44,We also fully support inclusion of gender equa...,egalitarian,other,not indicated,not indicated,44 We also fully support inclusion of gender e...,egalitarian,new UNFCCC policy,responsibility,equality,1


In [66]:
# Evaluate performance of this dataframe - principle
# zero_division=0 gives the same scores as the default ("warn") without
# emitting an UndefinedMetricWarning for labels with no predicted or no true samples.
print(classification_report(
    filtered_B4_grouped['PRINCIPLE_x'],
    filtered_B4_grouped['PRINCIPLE_y'],
    zero_division=0,
))

                             precision    recall  f1-score   support

                egalitarian       0.44      0.80      0.57         5
general normative statement       0.50      0.14      0.22         7
                libertarian       0.00      0.00      0.00         1
               prioritarian       0.50      0.50      0.50         2
            sufficientarian       0.00      0.00      0.00         0
                utilitarian       0.50      0.33      0.40         3

                   accuracy                           0.39        18
                  macro avg       0.32      0.30      0.28        18
               weighted avg       0.46      0.39      0.37        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [67]:
# Evaluate performance of this dataframe - topic
# zero_division=0 gives the same scores as the default ("warn") without
# emitting an UndefinedMetricWarning for labels with no predicted or no true samples.
print(classification_report(
    filtered_B4_grouped['TOPIC_x'],
    filtered_B4_grouped['TOPIC_y'],
    zero_division=0,
))

                                  precision    recall  f1-score   support

UNFCCC agreements and principles       0.00      0.00      0.00         4
                      adaptation       0.00      0.00      0.00         0
                     cooperation       0.67      1.00      0.80         2
            financial mechanisms       0.75      1.00      0.86         3
                      mitigation       0.00      0.00      0.00         0
               new UNFCCC policy       0.33      0.33      0.33         6
                           other       0.00      0.00      0.00         3
                         urgency       0.00      0.00      0.00         0

                        accuracy                           0.39        18
                       macro avg       0.22      0.29      0.25        18
                    weighted avg       0.31      0.39      0.34        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
# Score the aggregated predictions (UNIT_y) against UNIT_x - unit
print(
    classification_report(
        y_true=filtered_B4_grouped['UNIT_x'],
        y_pred=filtered_B4_grouped['UNIT_y'],
    )
)

                     precision    recall  f1-score   support

financial resources       0.75      1.00      0.86         3
      not indicated       1.00      0.29      0.44         7
     responsibility       0.67      1.00      0.80         8

           accuracy                           0.72        18
          macro avg       0.81      0.76      0.70        18
       weighted avg       0.81      0.72      0.67        18



In [69]:
# Evaluate performance of this dataframe - shape
# zero_division=0 gives the same scores as the default ("warn") without
# emitting an UndefinedMetricWarning for labels with no predicted or no true samples.
print(classification_report(
    filtered_B4_grouped['SHAPE_x'],
    filtered_B4_grouped['SHAPE_y'],
    zero_division=0,
))

                              precision    recall  f1-score   support

                    equality       0.14      0.50      0.22         2
                      equity       1.00      0.67      0.80         3
                 needs based       0.00      0.00      0.00         0
               not indicated       0.75      0.30      0.43        10
       priority to worst off       0.50      0.50      0.50         2
proportional to contribution       0.00      0.00      0.00         1

                    accuracy                           0.39        18
                   macro avg       0.40      0.33      0.33        18
                weighted avg       0.65      0.39      0.45        18



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
