In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

## Notebook to annotate HLS speeches for principles
### B: string-based labels

Codebooks:
- B2.0: zero shot
- B2.1: one shot

Only applied to sentences that are labelled as relevant in the original dataframe
Temperature: 0
Iterations: 1

Model selection:
 As of 22-05-2024, gpt-4-turbo-2024-04-09 seems to be the only gpt-model that returns a fingerprint in addition to gpt-4o

  #model= "gpt-4-turbo-2024-04-09"
  #model = "gpt-3.5-turbo-0125"


### 1. Import text to annotate
Select only relevant columns of the full dataframe, in this case:
PRINCIPLE

In this case, evaluation is performed on only the sentences that are deemed relevant in the manual annotation. Only these sentences are labelled for the principle they present.

In [2]:
# Read the string-labelled HLS training set from disk
train_csv_path = 'data/string/HLS_train_string.csv'
HLS_train = pd.read_csv(train_csv_path)

In [3]:
### Restrict to the Japan speech for testing purposes (currently disabled)
#HLS_train_japan = HLS_train[HLS_train['id']=='COP19_japan']
# Keep only the sentences whose manual RELEVANCE label is 'Relevant'
is_relevant = HLS_train['RELEVANCE'] == 'Relevant'
HLS_train_relevant = HLS_train[is_relevant]

In [4]:
# Keep the sentence text together with its PRINCIPLE label
# NOTE(review): this selects from the full HLS_train frame, not from
# HLS_train_relevant - confirm that annotating every sentence is intended.
principle_columns = ['Text', 'PRINCIPLE']
HLS_principle = HLS_train[principle_columns]
HLS_principle.head()

Unnamed: 0,Text,PRINCIPLE
0,"Thank you, Mr. President .",not evaluated
1,"On beha lf of the government of Japan , I wou...",not evaluated
2,I would also like to expr ess my d eepest con...,not evaluated
3,Mr. President: A fair and effective framewor...,utilitarian
4,"In this regard, Japan firmly supports the est...",not evaluated


### 2. Import necessary files
- codebooks
- API key
- import gpt_annotate_string

In [5]:
# Read the zero-shot codebook (B2.0) into a single string
with open('codebooks/B2.0', mode='r', encoding='utf-8') as codebook_file:
    B20 = codebook_file.read()

In [6]:
# Load the OpenAI API key from a local text file and drop surrounding whitespace
with open('gpt_api_key.txt', mode='r') as key_file:
    raw_key = key_file.read()
key = raw_key.strip()

In [7]:
import gpt_annotate_string

### 3. Prepare data for annotation
Compares column names in HLS_principle to the codes identified by GPT-4o in the codebook. Seed for this identification is set to 1234.

In [8]:
# Build the dataframe to annotate from the labelled sentences and the B2.0 codebook
text_to_annotate = gpt_annotate_string.prepare_data(
    HLS_principle,
    B20,
    key,
    prep_codebook=True,
)

ChatCompletion(id='chatcmpl-9TucTLCEnrxTwVha51BcaUMAvMnBr', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='PRINCIPLE', role='assistant', function_call=None, tool_calls=None))], created=1716916185, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_43dfabdef1', usage=CompletionUsage(completion_tokens=3, prompt_tokens=584, total_tokens=587))

Categories to annotate:
1) PRINCIPLE


Data is ready to be annotated using gpt_annotate()!

Glimpse of your data:
Shape of data:  (1212, 4)
   unique_id                                               text  \
0          0                         Thank you, Mr. President .   
1          1   On beha lf of the government of Japan , I wou...   
2          2   I would also like to expr ess my d eepest con...   
3          3   Mr. President:  A fair and effective framewor...   
4          4   In this regard, Japan firmly supports the est...   

       PRINCIPLE                   

Fingerprint used: fp_43dfabdef1

The seed for text preparation is hardcoded into gpt_annotate. This ensures that only results with the same fingerprint are used across all seeds and all iterations: every time GPT-4o is called, only results with this specific fingerprint are saved.

# 4. Run gpt_annotate_string
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 3 outputs:
1. all_iterations_{seed}.csv
2. fingerprints_all.csv
3. missed_batches.csv

## B2.0 principle - zero shot

In [9]:
# System fingerprint reported during data preparation; passed to gpt_annotate below
fingerprint = 'fp_43dfabdef1'

# The seed list is intentionally commented out to prevent an accidental rerun of gpt_annotate
#seeds = [3644,3441, 280, 5991, 7917]

In [10]:
# Annotate the data - T0 - I1
# `seeds` may be deliberately left undefined to block a costly rerun of the API
# calls. In that case skip the annotation instead of failing with a NameError,
# so the notebook still survives "Restart Kernel -> Run All".
seeds_to_run = globals().get('seeds', [])
if not seeds_to_run:
    print("Skipping B2.0 annotation: `seeds` is not defined.")
for seed in seeds_to_run:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate, B20, key, seed, fingerprint,
        experiment="B2.0", num_iterations=1, model="gpt-4o",
        temperature=0, batch_size=20, human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B4 fingerprint does not match
3644 - I1 - B10 fingerprint does not match
3644 - I1 - B27 fingerprint does not match
3644 - I1 - B46 fingerprint does not match
3644 - I1 - B50 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B27 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B33 fingerprint does not match
280 - I1 - B47 fingerprint does not match
280 - I1 - B58 fingerprint does not match
280 - I1 - B60 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B8 fingerprint does not match
5991 - I1 - B25 fingerprint does not match
5991 - I1 - B37 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B25 fingerprint does not match
7917 - I1 - B49 fingerprint does not match
7917 - I1 - B55 fingerprint does not match
iteration:  1 completed


## B2.1 principle - one shot

In [28]:
# Read the one-shot codebook (B2.1) into a single string
with open('codebooks/B2.1', mode='r', encoding='utf-8') as codebook_file:
    B21 = codebook_file.read()

In [29]:
# Annotate the data - T0 - I1
# `seeds` may be deliberately left undefined to block a costly rerun of the API
# calls. In that case skip the annotation instead of failing with a NameError,
# so the notebook still survives "Restart Kernel -> Run All".
seeds_to_run = globals().get('seeds', [])
if not seeds_to_run:
    print("Skipping B2.1 annotation: `seeds` is not defined.")
for seed in seeds_to_run:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate, B21, key, seed, fingerprint,
        experiment="B2.1", num_iterations=1, model="gpt-4o",
        temperature=0, batch_size=20, human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B2 fingerprint does not match
3644 - I1 - B12 fingerprint does not match
3644 - I1 - B25 fingerprint does not match
3644 - I1 - B45 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B1 fingerprint does not match
3441 - I1 - B3 fingerprint does not match
3441 - I1 - B7 fingerprint does not match
3441 - I1 - B38 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B16 fingerprint does not match
280 - I1 - B17 fingerprint does not match
280 - I1 - B18 fingerprint does not match
280 - I1 - B41 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B28 fingerprint does not match
5991 - I1 - B31 fingerprint does not match
5991 - I1 - B57 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B5 fingerprint does not match
7917 - I1 - B19 fingerprint does not match
7917 - I1 - B33 fingerprint does not match
7917 - I1 - B50 fingerprint does not match
79

### 5. Define evaluation functions


In [12]:
# Define similarity score function
def get_similarity_score(Rx, Ry, threshold=0.9):
    """Return the mean thresholded string similarity between two label series.

    Each pair of labels is compared with ``difflib.SequenceMatcher``. A ratio
    below ``threshold`` counts as 0 (no match); a ratio at or above it is kept
    unchanged.

    Parameters
    ----------
    Rx, Ry : pandas.Series
        Label series to compare. Values are cast to ``str`` and stripped of
        surrounding whitespace before comparison, so missing values are
        compared by their string form as well. Pairs are formed with
        ``Series.combine``, i.e. matched on the Series index.
    threshold : float, default 0.9
        Minimum similarity ratio for a pair to count as a match.

    Returns
    -------
    float
        Mean score as a fraction between 0 and 1 (not a percentage);
        NaN when there are no pairs to compare.
    """
    # Normalise both sides to whitespace-stripped strings
    labels_x = Rx.astype(str).str.strip()
    labels_y = Ry.astype(str).str.strip()

    # Pairwise similarity ratio for every index position
    ratios = labels_x.combine(
        labels_y, lambda x, y: difflib.SequenceMatcher(None, x, y).ratio()
    )

    # Nothing to compare (e.g. a filter that selected no rows)
    if ratios.empty:
        return float('nan')

    # Zero out every pair that does not reach the threshold
    ratios = ratios.astype(float)
    ratios = ratios.where(ratios >= threshold, 0.0)

    return ratios.mean()

In [15]:
def similarity(directory):
    """Score label agreement for every CSV result file in ``directory``.

    For each ``.csv`` file, the ``PRINCIPLE_x`` and ``PRINCIPLE_y`` columns are
    compared over all rows with ``get_similarity_score``.
    NOTE(review): presumably ``PRINCIPLE_x`` holds the human label and
    ``PRINCIPLE_y`` the GPT label - confirm against gpt_annotate's output.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``filename`` and ``similarity ALL``,
        ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a CSV result file (e.g. .ipynb_checkpoints)
        if not filename.lower().endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(directory, filename))
        score = get_similarity_score(df['PRINCIPLE_x'], df['PRINCIPLE_y'])
        rows.append((filename, score))

    return pd.DataFrame(rows, columns=['filename', 'similarity ALL'])


In [22]:
def recall(directory, principle_x):
    """Score label agreement on the rows whose ``PRINCIPLE_x`` equals ``principle_x``.

    For each ``.csv`` file in ``directory``, only rows where ``PRINCIPLE_x`` is
    exactly ``principle_x`` are kept; their ``PRINCIPLE_x`` and ``PRINCIPLE_y``
    values are then compared with ``get_similarity_score``.
    NOTE(review): presumably ``PRINCIPLE_x`` holds the human label and
    ``PRINCIPLE_y`` the GPT label - confirm against gpt_annotate's output.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.
    principle_x : str
        Label value used to select rows via the ``PRINCIPLE_x`` column.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``filename`` and
        ``'<principle_x> recall'``, ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a CSV result file (e.g. .ipynb_checkpoints)
        if not filename.lower().endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(directory, filename))
        selected = df[df['PRINCIPLE_x'] == principle_x]
        score = get_similarity_score(selected['PRINCIPLE_x'], selected['PRINCIPLE_y'])
        rows.append((filename, score))

    return pd.DataFrame(rows, columns=['filename', f'{principle_x} recall'])

In [30]:
def precision(directory, principle_y):
    """Score label agreement on the rows whose ``PRINCIPLE_y`` equals ``principle_y``.

    For each ``.csv`` file in ``directory``, only rows where ``PRINCIPLE_y`` is
    exactly ``principle_y`` are kept; their ``PRINCIPLE_x`` and ``PRINCIPLE_y``
    values are then compared with ``get_similarity_score``.
    NOTE(review): presumably ``PRINCIPLE_x`` holds the human label and
    ``PRINCIPLE_y`` the GPT label - confirm against gpt_annotate's output.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.
    principle_y : str
        Label value used to select rows via the ``PRINCIPLE_y`` column.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``filename`` and
        ``'<principle_y> precision'``, ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        # Skip anything that is not a CSV result file (e.g. .ipynb_checkpoints)
        if not filename.lower().endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(directory, filename))
        selected = df[df['PRINCIPLE_y'] == principle_y]
        score = get_similarity_score(selected['PRINCIPLE_x'], selected['PRINCIPLE_y'])
        rows.append((filename, score))

    return pd.DataFrame(rows, columns=['filename', f'{principle_y} precision'])

### 6. Evaluate performance

In [16]:
# Overall label agreement for each B2.0 (zero-shot) result file
directory = 'STRING_RESULT/B2.0/all_iterations'
similarity(directory=directory)

Unnamed: 0,filename,similarity ALL
0,all_iterations_string_T0_280.csv,0.119258
1,all_iterations_string_T0_3441.csv,0.111577
2,all_iterations_string_T0_3644.csv,0.110612
3,all_iterations_string_T0_5991.csv,0.111979
4,all_iterations_string_T0_7917.csv,0.115451


In [23]:
# Recall for the 'egalitarian' principle on the B2.0 result files
principle_x = 'egalitarian'
recall(directory=directory, principle_x=principle_x)

Unnamed: 0,filename,egalitarian recall
0,all_iterations_string_T0_280.csv,0.625
1,all_iterations_string_T0_3441.csv,0.659574
2,all_iterations_string_T0_3644.csv,0.702703
3,all_iterations_string_T0_5991.csv,0.6
4,all_iterations_string_T0_7917.csv,0.647059


In [32]:
# Precision for the 'egalitarian' principle on the B2.0 result files
principle_y = 'egalitarian'
precision(directory=directory, principle_y=principle_y)

Unnamed: 0,filename,egalitarian precision
0,all_iterations_string_T0_280.csv,0.309278
1,all_iterations_string_T0_3441.csv,0.264957
2,all_iterations_string_T0_3644.csv,0.247619
3,all_iterations_string_T0_5991.csv,0.3
4,all_iterations_string_T0_7917.csv,0.33


In [31]:
# Overall label agreement for each B2.1 (one-shot) result file
# NOTE: B21 is rebound here from the codebook text to the results directory path
B21 = 'STRING_RESULT/B2.1/all_iterations'
similarity(directory=B21)

Unnamed: 0,filename,similarity ALL
0,all_iterations_string_T0_280.csv,0.121025
1,all_iterations_string_T0_3441.csv,0.119258
2,all_iterations_string_T0_3644.csv,0.122792
3,all_iterations_string_T0_5991.csv,0.121528
4,all_iterations_string_T0_7917.csv,0.121429


In [33]:
# Recall for the 'egalitarian' principle on the B2.1 result files
principle_x = 'egalitarian'
recall(directory=B21, principle_x=principle_x)

Unnamed: 0,filename,egalitarian recall
0,all_iterations_string_T0_280.csv,0.58
1,all_iterations_string_T0_3441.csv,0.62
2,all_iterations_string_T0_3644.csv,0.591837
3,all_iterations_string_T0_5991.csv,0.674419
4,all_iterations_string_T0_7917.csv,0.673913


In [34]:
# Precision for the 'egalitarian' principle on the B2.1 result files
principle_y = 'egalitarian'
precision(directory=B21, principle_y=principle_y)

Unnamed: 0,filename,egalitarian precision
0,all_iterations_string_T0_280.csv,0.284314
1,all_iterations_string_T0_3441.csv,0.306931
2,all_iterations_string_T0_3644.csv,0.287129
3,all_iterations_string_T0_5991.csv,0.278846
4,all_iterations_string_T0_7917.csv,0.281818


## 7. Evaluate full outcomes - Confusion matrix

In [35]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, matthews_corrcoef, make_scorer, classification_report

In [None]:
# Now only make confusionmatrix for one element

relevance =
cf_matrix = confusion_matrix(y, y_pred