In [2]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report

# B4 Principle, Topic, Unit, Shape

## Notebook to evaluate GPT-annotate results

Codebooks:
- B4.0: zero shot

Test for 5 different seeds [3644,3441, 280, 5991, 7917]
> Refer to these as S1-5

Batch of 20 sentences
1 Iteration.

Main outcomes: T0 - I1
For testing purposes: (T 0.6 I1 - T 0 I3)

FINGERPRINT USED: fp_319be4768e


In [3]:
# Training dataset, loaded here so its basic label statistics can be inspected.
HLS_train = pd.read_csv(filepath_or_buffer='data/string/HLS_train_string.csv')

In [4]:
# Frequency of each PRINCIPLE label in the training dataset.
HLS_train.loc[:, 'PRINCIPLE'].value_counts()

not evaluated                  992
prioritarian                    66
utilitarian                     59
egalitarian                     52
general normative statement     34
sufficientarian                  8
libertarian                      1
Name: PRINCIPLE, dtype: int64

# 1. Results B4.0

### 1. Import Results
Results created under the same fingerprint are saved in a CSV file for each seed. All results are combined into one dataset.

In [5]:
# B4.0 (zero shot): one combined result CSV per seed, T0 runs.
path_B4 = 'STRING_RESULT/B4.0/all_iterations'

# Seed values in S1..S5 order.
seed_values_B4 = (3644, 3441, 280, 5991, 7917)

# Read every seed with one shared filename pattern instead of five copy-pasted calls.
S1, S2, S3, S4, S5 = [
    pd.read_csv(f'{path_B4}/all_iterations_string_T0_{seed}.csv')
    for seed in seed_values_B4
]

# NOTE: the name `seeds_B3` is kept because other cells look it up,
# but the frames it holds are the B4.0 results loaded above.
# Each value is a one-element list; the frame itself is read with `[0]`.
seeds_B3 = {
    "S1": [S1],
    "S2": [S2],
    "S3": [S3],
    "S4": [S4],
    "S5": [S5]
}

### 2. Evaluate missed batches

In [6]:
# Overview of the batches recorded as missed for the B4.0 T0 runs.
B4_missed = pd.read_csv(filepath_or_buffer='STRING_RESULT/B4.0/T0_missed_batches.csv')
B4_missed

Unnamed: 0.1,Unnamed: 0,Missed batch
0,0,3441 - I1 - B4
1,1,280 - I1 - B36
2,2,5991 - I1 - B25
3,3,7917 - I1 - B10
4,4,7917 - I1 - B26


In [15]:
# Distribution of predicted PRINCIPLE labels for seed S3.
S3.loc[:, 'PRINCIPLE_y'].value_counts()

not evaluated                  501
general normative statement    216
utilitarian                    195
egalitarian                    150
prioritarian                    84
sufficientarian                 33
cooperation                      7
urgency                          5
libertarian                      1
Name: PRINCIPLE_y, dtype: int64

In [8]:
# Distribution of predicted TOPIC labels for seed S1.
S1.loc[:, 'TOPIC_y'].value_counts()

not evaluated                            448
urgency                                  185
cooperation                              137
UNFCCC agreements and principles          87
other                                     77
mitigation                                77
financial mechanisms                      67
new UNFCCC policy                         50
adaptation                                42
adaptation and mitigation                 34
technological resources                    4
support                                    3
financial and technological resources      1
Name: TOPIC_y, dtype: int64

In [9]:
# Distribution of predicted UNIT labels for seed S1.
S1.loc[:, 'UNIT_y'].value_counts()

not evaluated                            463
not indicated                            327
responsibility                           223
financial resources                       81
support                                   62
technological resources                   28
financial and technological resources     28
Name: UNIT_y, dtype: int64

In [11]:
# Distribution of predicted SHAPE labels for seed S1.
S1.loc[:, 'SHAPE_y'].value_counts()

not evaluated                   460
not indicated                   420
equality                        101
priority to worst off            92
needs based                      53
proportional to commitment       41
equity                           28
proportional to contribution     17
Name: SHAPE_y, dtype: int64

Evaluate missed batches

## 2. Evaluate metrics - S1
Evaluation is performed with SKLEARN - presenting 5 classification reports

Column x is the ground truth label - y is the predicted label
Evaluate per seed all classification reports per category


In [None]:
# PRINCIPLE report for seed S1 (PRINCIPLE_x: ground truth, PRINCIPLE_y: prediction).
# This cell previously read S5, which did not match this S1 section and
# duplicated the separate S5 PRINCIPLE report.
print(classification_report(S1['PRINCIPLE_x'], S1['PRINCIPLE_y']))

In [None]:
# TOPIC report for seed S1 (TOPIC_x: ground truth, TOPIC_y: prediction).
print(
    classification_report(
        y_true=S1['TOPIC_x'],
        y_pred=S1['TOPIC_y'],
    )
)

In [None]:
# UNIT report for seed S1 (UNIT_x: ground truth, UNIT_y: prediction).
print(
    classification_report(
        y_true=S1['UNIT_x'],
        y_pred=S1['UNIT_y'],
    )
)

In [None]:
# SHAPE report for seed S1 (SHAPE_x: ground truth, SHAPE_y: prediction).
print(
    classification_report(
        y_true=S1['SHAPE_x'],
        y_pred=S1['SHAPE_y'],
    )
)

This batch has something wrong with saving results

In [None]:
# PRINCIPLE report for seed S5 (PRINCIPLE_x: ground truth, PRINCIPLE_y: prediction).
print(
    classification_report(
        y_true=S5['PRINCIPLE_x'],
        y_pred=S5['PRINCIPLE_y'],
    )
)

Metrics over 5 seeds are very similar.
Macro avg: average of all classes (aka labels)
Weighted avg: weighted average, taking class balances into account.

## 3. Confusion matrix
Evaluate where misclassifications are found - Again for each seed. - account for wrong saves for seed 4

In [None]:
# Codebook PRINCIPLE labels, in the order they appear on the matrix axes.
labels = ['egalitarian', 'general normative statement', 'libertarian','not evaluated',
          'prioritarian', 'sufficientarian', 'utilitarian']
# Number of plots
num_plots = len(seeds_B3)
# Create plot names
plot_names = list(seeds_B3.keys())

# Create subplots; squeeze=False keeps `axes` indexable even with a single seed.
fig, axes = plt.subplots(num_plots, 1, figsize=(20, 20), squeeze=False)
axes = axes.ravel()

# Visualize the confusion matrix
for i, name in enumerate(plot_names):
    df = seeds_B3[name][0]  # Access the dataframe
    y_true = df['PRINCIPLE_x']
    y_pred = df['PRINCIPLE_y']

    # The predictions can contain labels outside the codebook, and a seed can
    # lack a codebook label. Without an explicit `labels=` the matrix is built
    # from whatever values occur, so its size no longer matches the tick labels.
    # Use one explicit list for both the matrix and the display; unexpected
    # labels are appended so they stay visible.
    observed = set(y_true.dropna()) | set(y_pred.dropna())
    seed_labels = labels + sorted(observed - set(labels))

    cm = confusion_matrix(y_true, y_pred, labels=seed_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=seed_labels)
    disp.plot(cmap=plt.cm.Blues, ax=axes[i])
    axes[i].set_title(name)

# Adjust layout
plt.tight_layout()
plt.show()

## 4. Evaluate misclassifications of sentences
Special interest in relevant sentences that are incorrectly classified. Evaluate if the same sentences are misclassified in each seed

# 2. Results B3.1.1 - zero shot with context
Only seed S1-S3 available. Others are not saved correctly

In [None]:
# B3.1.1 (zero shot with context): one combined result CSV per seed, T0 runs.
path_B311 = 'STRING_RESULT/B3.1.1/all_iterations'

# Seed values in S1..S3 order; only these three seeds are loaded for this codebook.
seed_values_B311 = (3644, 3441, 280)

# Read every seed with one shared filename pattern instead of copy-pasted calls.
S1_311, S2_311, S3_311 = [
    pd.read_csv(f'{path_B311}/all_iterations_string_T0_{seed}.csv')
    for seed in seed_values_B311
]

# Each value is a one-element list holding that seed's dataframe.
seeds_B311 = {
    "S1": [S1_311],
    "S2": [S2_311],
    "S3": [S3_311],
}

### 2. Evaluate missed batches

In [None]:
# Overview of the batches recorded as missed for the B3.1.1 T0 runs.
B311_missed = pd.read_csv(filepath_or_buffer='STRING_RESULT/B3.1.1/T0_missed_batches.csv')
B311_missed

S1: 2 batches (40 sentences) missed
S2: 2 batches (40 sentences) missed
S3: 4 batches (80 sentences) missed
S4: 1 batch (20 sentences) missed
S5: 4 batches (80 sentences) missed

## 2. Evaluate metrics
Evaluation is performed with SKLEARN - presenting one classification report per available seed (S1-S3)

Column PRINCIPLE_x is the ground truth label - PRINCIPLE_y is the predicted label

Save as dataframes - if necessary

In [None]:
# PRINCIPLE report for B3.1.1, seed S1 (first argument: ground truth, second: prediction).
print(
    classification_report(
        y_true=S1_311['PRINCIPLE_x'],
        y_pred=S1_311['PRINCIPLE_y'],
    )
)

In [None]:
# PRINCIPLE report for B3.1.1, seed S2 (first argument: ground truth, second: prediction).
print(
    classification_report(
        y_true=S2_311['PRINCIPLE_x'],
        y_pred=S2_311['PRINCIPLE_y'],
    )
)

In [None]:
# PRINCIPLE report for B3.1.1, seed S3 (first argument: ground truth, second: prediction).
print(
    classification_report(
        y_true=S3_311['PRINCIPLE_x'],
        y_pred=S3_311['PRINCIPLE_y'],
    )
)

There is also an error in this saved file: it contains more than 100 egalitarian labels, which cannot be correct.