In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

## Notebook to annotate HLS speeches for principles
### B: string-based labels

Codebooks:
- B3.0: zero shot
- B3.1: one shot
- B3.0.1: zero shot with specific inclusion of context
- B3.1.1: one shot with specific inclusion of context

Apply on full string-based dataset
Temperature: 0
Iterations: 1

Test for 5 different seeds
Batch of 20 sentences, Temperature 0

Model selection: GPT-4o


### 1. Import text to annotate
Load the string-based HLS training data.

In [2]:
# Read the string-labelled HLS training set from disk into a DataFrame
HLS_train = pd.read_csv(filepath_or_buffer='data/string/HLS_train_string.csv')

In [3]:
# Keep only the sentence text and its human-assigned PRINCIPLE label.
# .copy() detaches the selection from HLS_train, so any later modification of
# HLS_principle is unambiguous and cannot raise SettingWithCopyWarning.
HLS_principle = HLS_train[['Text', 'PRINCIPLE']].copy()
HLS_principle

Unnamed: 0,Text,PRINCIPLE
0,"Thank you, Mr. President .",not evaluated
1,"On beha lf of the government of Japan , I wou...",not evaluated
2,I would also like to expr ess my d eepest con...,not evaluated
3,Mr. President: A fair and effective framewor...,utilitarian
4,"In this regard, Japan firmly supports the est...",not evaluated
...,...,...
1207,New Zealand is proud to suppor t several impo...,not evaluated
1208,"I am joined by New Zealand’s largest business,...",not evaluated
1209,The commitment o f New Zealanders from across ...,not evaluated
1210,Thank you Mr President.,not evaluated


In [4]:
# Number of sentences per human-assigned principle label
HLS_principle['PRINCIPLE'].value_counts()

not evaluated                  992
prioritarian                    66
utilitarian                     59
egalitarian                     52
general normative statement     34
sufficientarian                  8
libertarian                      1
Name: PRINCIPLE, dtype: int64

Classes are VERY imbalanced


### 2. Import necessary files
- codebooks
- API key
- import gpt_annotate_string

In [10]:
# OpenAI key
# The local key file is still read first, so existing setups behave as before.
# If the file is absent, fall back to the OPENAI_API_KEY environment variable so
# the secret does not have to be stored next to the notebook.
key_path = 'gpt_api_key.txt'
if os.path.exists(key_path):
    with open(key_path, 'r', encoding='utf-8') as f:
        key = f.read().strip()
else:
    key = os.environ.get('OPENAI_API_KEY', '').strip()

# Fail here rather than on the first API call if no usable key was found.
if not key:
    raise ValueError(
        "No OpenAI API key found: provide a non-empty 'gpt_api_key.txt' "
        "or set the OPENAI_API_KEY environment variable."
    )

In [6]:
import gpt_annotate_string

In [8]:
# Zero-shot codebook (B3.0): read the whole file into a single string
with open('codebooks/B3.0', encoding='utf-8') as codebook_file:
    B30 = codebook_file.read()

### 3. Prepare data for annotation
Compares column names in HLS_principle to the codes identified by GPT-4o in the codebook. Seed for this identification is set to 1234.

In [11]:
# Build the frame that gpt_annotate() consumes from the labelled sentences,
# the B3.0 codebook text and the API key.
# NOTE(review): with prep_codebook=True the helper appears to query the model
# for the categories in the codebook (see the ChatCompletion in the output) —
# confirm against gpt_annotate_string.prepare_data.
text_to_annotate = gpt_annotate_string.prepare_data(
    HLS_principle,
    B30,
    key,
    prep_codebook=True,
)

ChatCompletion(id='chatcmpl-9WUGbS9mMagiGLhj9Q77Nh2UMojDM', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='PRINCIPLE', role='assistant', function_call=None, tool_calls=None))], created=1717529869, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_319be4768e', usage=CompletionUsage(completion_tokens=3, prompt_tokens=670, total_tokens=673))

Categories to annotate:
1) PRINCIPLE


Data is ready to be annotated using gpt_annotate()!

Glimpse of your data:
Shape of data:  (1212, 4)
   unique_id                                               text  \
0          0                         Thank you, Mr. President .   
1          1   On beha lf of the government of Japan , I wou...   
2          2   I would also like to expr ess my d eepest con...   
3          3   Mr. President:  A fair and effective framewor...   
4          4   In this regard, Japan firmly supports the est...   

       PRINCIPLE                   

Fingerprint used: fp_319be4768e

The seed used for text preparation is hardcoded into gpt_annotate. The fingerprint above is recorded to ensure that only results produced with the same fingerprint are kept, across all seeds and all iterations: every time GPT-4o is called, only results with this specific fingerprint are saved.

# 4. Run gpt_annotate_string
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 3 outputs:
1. all_iterations_{seed}.csv
2. fingerprints_all.csv
3. missed_batches.csv

## B3.0 principle on all sentences - zero shot

In [12]:
# System fingerprint that results are matched against; it is the value reported
# as "Fingerprint used" by the data-preparation step. Double-check it before
# running: a different fingerprint here means batches will be reported as
# "fingerprint does not match".
fingerprint = 'fp_319be4768e'

# Seeds are fixed in their own cell, separate from the annotation cells
# (original note: block the seeds to prevent an accidental rerun of gpt_annotate).
seeds = [
    3644,
    3441,
    280,
    5991,
    7917,
]

In [10]:
# B3.0 (zero shot): one iteration per seed at temperature 0, batches of 20 sentences.
# Batches whose fingerprint differs from `fingerprint` are reported in the output.
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B30,
        key,
        seed,
        fingerprint,
        experiment="B3.0",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B18 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B22 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B13 fingerprint does not match
7917 - I1 - B55 fingerprint does not match
iteration:  1 completed


## B3.1 principle on all sentences - one shot

In [13]:
# One-shot codebook (B3.1): read the whole file into a single string
with open('codebooks/B3.1', encoding='utf-8') as codebook_file:
    B31 = codebook_file.read()

In [14]:
# B3.1 (one shot): one iteration per seed at temperature 0, batches of 20 sentences.
# Batches whose fingerprint differs from `fingerprint` are reported in the output.
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B31,
        key,
        seed,
        fingerprint,
        experiment="B3.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B5 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
iteration:  1 completed
280 - iteration 1
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B52 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
iteration:  1 completed


## B3.0.1 principle on all sentences - zero shot with specific inclusion of context

In [15]:
# Zero-shot codebook with specific inclusion of context (B3.0.1):
# read the whole file into a single string
with open('codebooks/B3.0.1', encoding='utf-8') as codebook_file:
    B301 = codebook_file.read()

In [16]:
# B3.0.1 (zero shot with context): one iteration per seed at temperature 0,
# batches of 20 sentences. Batches whose fingerprint differs from `fingerprint`
# are reported in the output.
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B301,
        key,
        seed,
        fingerprint,
        experiment="B3.0.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B39 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B33 fingerprint does not match
280 - I1 - B50 fingerprint does not match
280 - I1 - B54 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B19 fingerprint does not match
7917 - I1 - B21 fingerprint does not match
7917 - I1 - B29 fingerprint does not match
iteration:  1 completed


## B3.1.1 principle on all sentences - one shot with specific inclusion of context

In [13]:
# One-shot codebook with specific inclusion of context (B3.1.1):
# read the whole file into a single string
with open('codebooks/B3.1.1', encoding='utf-8') as codebook_file:
    B311 = codebook_file.read()

In [14]:
# B3.1.1 (one shot with context): one iteration per seed at temperature 0,
# batches of 20 sentences. Batches whose fingerprint differs from `fingerprint`
# are reported in the output.
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B311,
        key,
        seed,
        fingerprint,
        experiment="B3.1.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B31 fingerprint does not match
3644 - I1 - B36 fingerprint does not match
3644 - I1 - B51 fingerprint does not match
3644 - I1 - B52 fingerprint does not match
3644 - I1 - B55 fingerprint does not match
3644 - I1 - B60 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B1 fingerprint does not match
3441 - I1 - B8 fingerprint does not match
3441 - I1 - B12 fingerprint does not match
3441 - I1 - B19 fingerprint does not match
3441 - I1 - B29 fingerprint does not match
3441 - I1 - B52 fingerprint does not match
3441 - I1 - B53 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B1 fingerprint does not match
280 - I1 - B8 fingerprint does not match
280 - I1 - B20 fingerprint does not match
280 - I1 - B25 fingerprint does not match
280 - I1 - B46 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B45 fingerprint does not match
5991 - I1 - B58 fingerprint does not match
it