In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

## Notebook to annotate HLS speeches for principles
### B: string-based labels

Codebooks:
- B2.0: zero shot
- B2.1: one shot

Applied only to sentences labelled as relevant in the ground-truth dataframe
Temperature: 0
Iterations: 1

Tested with 5 different seeds, a batch size of 20 sentences, and temperature 0
Model selection: GPT-4o


### 1. Import text to annotate
Select only relevant columns of the full dataframe, in this case:
PRINCIPLE

In this case, evaluation is performed on only the sentences that are deemed relevant in the manual annotation. Only these sentences are labelled for the principle they present.

In [2]:
# Read the string-labelled HLS training set from disk
HLS_train = pd.read_csv(
    'data/string/HLS_train_string.csv'
)

In [3]:
# Keep only the rows whose RELEVANCE label is exactly 'Relevant'
HLS_train_relevant = HLS_train.loc[HLS_train['RELEVANCE'].eq('Relevant')]

In [4]:
# Keep only the sentence text and its manually annotated PRINCIPLE label.
# .copy() detaches the result from HLS_train_relevant (itself a boolean-filtered
# subset of HLS_train), so any later modification of HLS_principle cannot raise
# SettingWithCopyWarning or be silently applied to a temporary object.
HLS_principle = HLS_train_relevant[['Text', 'PRINCIPLE']].copy()
HLS_principle

Unnamed: 0,Text,PRINCIPLE
3,Mr. President: A fair and effective framewor...,utilitarian
5,Such a framework must be based on “nationally ...,egalitarian
44,It should not only enable us to discuss global...,utilitarian
53,Global warming is a catastrophic problem that ...,utilitarian
54,"Therefore, the multilateralism approach remain...",general normative statement
...,...,...
1172,As we work to catch up on lost time and progr...,prioritarian
1173,"Conflict -ridden communities, refugees, and d...",prioritarian
1174,"Nor can we stand by , as the massive destructi...",utilitarian
1198,We recognise that we must deliver on our coll...,prioritarian


In [5]:
# Class balance: number of relevant sentences per PRINCIPLE label
HLS_principle['PRINCIPLE'].value_counts()

prioritarian                   66
utilitarian                    59
egalitarian                    51
general normative statement    33
sufficientarian                 8
libertarian                     1
Name: PRINCIPLE, dtype: int64

### 2. Import necessary files
- codebooks
- API key
- import gpt_annotate_string

In [6]:
# Read the zero-shot codebook (B2.0) into a single string
with open('codebooks/B2.0', mode='r', encoding='utf-8') as codebook_file:
    B20 = codebook_file.read()

In [7]:
# Read the OpenAI API key from gpt_api_key.txt, dropping surrounding whitespace
with open('gpt_api_key.txt', mode='r') as key_file:
    key = key_file.read().strip()

In [8]:
import gpt_annotate_string

### 3. Prepare data for annotation
Compares column names in HLS_principle to the codes identified by GPT-4o in the codebook. Seed for this identification is set to 1234.

In [9]:
# Build the annotation input from the labelled sentences and the B2.0 codebook
text_to_annotate = gpt_annotate_string.prepare_data(
    HLS_principle,
    B20,
    key,
    prep_codebook=True,
)

ChatCompletion(id='chatcmpl-9X4n4012lUNhn8thU5wR7f6y2jIUE', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='PRINCIPLE', role='assistant', function_call=None, tool_calls=None))], created=1717670266, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_319be4768e', usage=CompletionUsage(completion_tokens=3, prompt_tokens=582, total_tokens=585))

Categories to annotate:
1) PRINCIPLE


Data is ready to be annotated using gpt_annotate()!

Glimpse of your data:
Shape of data:  (218, 4)
   unique_id                                               text  \
0          3   Mr. President:  A fair and effective framewor...   
1          5  Such a framework must be based on “nationally ...   
2         44  It should not only enable us to discuss global...   
3         53  Global warming is a catastrophic problem that ...   
4         54  Therefore, the multilateralism approach remain...   

                     PRINCIPLE  \
0 

Fingerprint used: fp_319be4768e

The seed for text preparation is hardcoded into gpt_annotate. This ensures that only results with the same fingerprint are kept across all seeds and all iterations: every time GPT-4o is called, only results with this specific fingerprint are saved.

# 4. Run gpt_annotate_string
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 3 outputs:
1. all_iterations_{seed}.csv
2. fingerprints_all.csv
3. missed_batches.csv

## B2.0 principle - zero shot

In [10]:
# System fingerprint that every annotation run below is called with
fingerprint = 'fp_319be4768e'

# Seeds are pinned here, one annotation run per seed. The original note says
# this is meant to prevent an accidental rerun of gpt_annotate -- TODO confirm intent.
seeds = [
    3644,
    3441,
    280,
    5991,
    7917,
]

In [15]:
# Zero-shot (B2.0) annotation: one pass per seed at temperature 0,
# a single iteration, batches of 20 sentences
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B20,
        key,
        seed,
        fingerprint,
        experiment="B2.0",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B8 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B1 fingerprint does not match
280 - I1 - B11 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
iteration:  1 completed
7917 - iteration 1
iteration:  1 completed


## B2.1 principle - one shot

In [12]:
# Read the one-shot codebook (B2.1) into a single string
with open('codebooks/B2.1', mode='r', encoding='utf-8') as codebook_file:
    B21 = codebook_file.read()

In [13]:
# One-shot (B2.1) annotation: one pass per seed at temperature 0,
# a single iteration, batches of 20 sentences
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B21,
        key,
        seed,
        fingerprint,
        experiment="B2.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
iteration:  1 completed
3441 - iteration 1
iteration:  1 completed
280 - iteration 1
iteration:  1 completed
5991 - iteration 1
iteration:  1 completed
7917 - iteration 1
iteration:  1 completed
