In [2]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

## Notebook to annotate HLS speeches for principles, topic, unit, shape
### B: string-based labels

Codebooks:
- B4.0

Apply on full string-based dataset
Temperature: 0
Iterations: 1

Model selection:
 As of 22-05-2024, gpt-4-turbo-2024-04-09 seems to be the only GPT model besides gpt-4o that returns a system fingerprint

  #model= "gpt-4-turbo-2024-04-09"
  #model = "gpt-3.5-turbo-0125"


### 1. Import text to annotate
Select only the relevant columns of the full dataframe, in this case:
Text, PRINCIPLE, TOPIC, UNIT, SHAPE

In [3]:
# Load the string-labelled HLS training data from CSV
train_csv = 'data/string/HLS_train_string.csv'
HLS_train = pd.read_csv(train_csv)

In [4]:
# Keep the sentence text plus the four string-label columns to annotate.
# .copy() makes HLS an independent frame, so later column assignments on it
# do not trigger SettingWithCopyWarning or depend on HLS_train.
label_columns = ['PRINCIPLE', 'TOPIC', 'UNIT', 'SHAPE']
HLS = HLS_train[['Text'] + label_columns].copy()
HLS

Unnamed: 0,Text,PRINCIPLE,TOPIC,UNIT,SHAPE
0,"Thank you, Mr. President .",not evaluated,not evaluated,not evaluated,not evaluated
1,"On beha lf of the government of Japan , I wou...",not evaluated,not evaluated,not evaluated,not evaluated
2,I would also like to expr ess my d eepest con...,not evaluated,not evaluated,not evaluated,not evaluated
3,Mr. President: A fair and effective framewor...,utilitarian,new UNFCCC policy,responsibility,equality
4,"In this regard, Japan firmly supports the est...",not evaluated,not evaluated,not evaluated,not evaluated
...,...,...,...,...,...
1207,New Zealand is proud to suppor t several impo...,not evaluated,not evaluated,not evaluated,not evaluated
1208,"I am joined by New Zealand’s largest business,...",not evaluated,not evaluated,not evaluated,not evaluated
1209,The commitment o f New Zealanders from across ...,not evaluated,not evaluated,not evaluated,not evaluated
1210,Thank you Mr President.,not evaluated,not evaluated,not evaluated,not evaluated


In [5]:
# Frequency of each PRINCIPLE label
HLS['PRINCIPLE'].value_counts()

not evaluated                  992
prioritarian                    66
utilitarian                     59
egalitarian                     52
general normative statement     34
sufficientarian                  8
libertarian                      1
Name: PRINCIPLE, dtype: int64

In [6]:
# Frequency of each TOPIC label
HLS['TOPIC'].value_counts()

not evaluated                       992
other                                46
new UNFCCC policy                    41
urgency                              36
UNFCCC agreements and principles     32
cooperation                          23
financial mechanisms                 22
adaptation and mitigation            11
mitigation                            7
adaptation                            2
Name: TOPIC, dtype: int64

In [7]:
# Frequency of each UNIT label
HLS['UNIT'].value_counts()

not evaluated                            992
not indicated                            106
responsibility                            49
financial resources                       31
support                                   19
financial and technological resources      8
other                                      4
technological resources                    3
Name: UNIT, dtype: int64

In [8]:
# Frequency of each SHAPE label
HLS['SHAPE'].value_counts()

not evaluated                   994
not indicated                   130
priority to worst off            34
proportional to commitment       19
equity                           14
equality                         11
needs based                       5
proportional to contribution      5
Name: SHAPE, dtype: int64

Classes are VERY imbalanced


### 2. Import necessary files
- codebooks
- API key
- import gpt_annotate_string

In [9]:
# Read the OpenAI API key from a local text file, dropping surrounding whitespace
with open('gpt_api_key.txt', mode='r') as key_file:
    key = key_file.read().strip()

In [10]:
import gpt_annotate_string

In [11]:
# Read the B4.0 codebook (zero-shot) into a single string
with open('codebooks/B4.0', mode='r', encoding='utf-8') as codebook_file:
    B40 = codebook_file.read()

### 3. Prepare data for annotation
Compares the column names in HLS to the codes identified by GPT-4o in the codebook. The seed for this identification is set to 1234.

In [13]:
# Prepare dataframe for annotation.
# Passes the labelled dataframe, the B4.0 codebook text and the API key to
# gpt_annotate_string.prepare_data. With prep_codebook=True, the recorded output
# shows a GPT-4o call that lists the codebook categories
# (PRINCIPLE, TOPIC, UNIT, SHAPE) before the data glimpse is printed.
# NOTE(review): exact prepare_data semantics live in gpt_annotate_string — confirm there.
text_to_annotate = gpt_annotate_string.prepare_data(HLS, B40, key, prep_codebook=True)

ChatCompletion(id='chatcmpl-9VM2BkUjpLsBnVZDaC9pSXNp1P7HS', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='PRINCIPLE, TOPIC, UNIT, SHAPE', role='assistant', function_call=None, tool_calls=None))], created=1717259895, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_319be4768e', usage=CompletionUsage(completion_tokens=11, prompt_tokens=1731, total_tokens=1742))

Categories to annotate:
1) PRINCIPLE
2) TOPIC
3) UNIT
4) SHAPE


Data is ready to be annotated using gpt_annotate()!

Glimpse of your data:
Shape of data:  (1212, 7)
   unique_id                                               text  \
0          0                         Thank you, Mr. President .   
1          1   On beha lf of the government of Japan , I wou...   
2          2   I would also like to expr ess my d eepest con...   
3          3   Mr. President:  A fair and effective framewor...   
4          4   In this regard, Japan firmly supports t

Fingerprint used: fp_319be4768e
Unsure as to why a different fingerprint is used.
Fingerprint used in other tests: fp_43dfabdef1

The seed for text preparation is hardcoded into gpt_annotate. This is to ensure that only results with the same fingerprint are kept across all seeds and all iterations: every time GPT-4o is called, only results with this specific fingerprint are saved.

# 4. Run gpt_annotate_string
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 3 outputs:
1. all_iterations_{seed}.csv
2. fingerprints_all.csv
3. missed_batches.csv

## B4.0 full annotation on all sentences

In [14]:
# System fingerprint returned during data preparation; annotation batches are
# checked against it. Caution: the fingerprint can differ between calls, so
# verify it before running the annotation.
fingerprint = 'fp_319be4768e'

# Seeds are fixed here to prevent an accidental rerun of gpt_annotate
seeds = [
    3644,
    3441,
    280,
    5991,
    7917,
]

In [15]:
# Annotate the data once per seed: temperature 0, a single iteration,
# batches of 20 sentences, using the B4.0 codebook with gpt-4o
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B40,
        key,
        seed,
        fingerprint,
        experiment="B4.0",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B4 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B36 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B25 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B10 fingerprint does not match
7917 - I1 - B26 fingerprint does not match
iteration:  1 completed
