In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import difflib

## Notebook to annotate HLS speeches for relevance
### B: string-based labels

Codebooks:
- B1.0: zero shot
- B1.1: one shot
- (B1.2: two shot)
- B1.0.1: zero shot with specific inclusion of context
- B1.1.1: one shot with specific inclusion of context

Test for 5 different seeds
Batch of 20 sentences
Original: 5 iterations; this takes very long!
Set to 1 iteration; check for class imbalances between labelled elements. Also: the seed should make every iteration identical, but this is not guaranteed, especially when the temperature has been changed. > The remaining variation may be inherent to the model.

Main outcomes: T0 - I1
For testing purposes: T0 - (T 0.2) - T 0.6 : (I3)

Temperature: 0 - 0.6
Temperature only affects how the output probabilities are sampled - it should thus be set to 0 to make the output as deterministic as possible.
> Do a single test to see what the influence of changing temperature is
> Top_p is set to 1; making temperature the primary factor.

Hypothesis: accuracy decreases with temperature

Model selection:
 As of 22-05-2024, gpt-4-turbo-2024-04-09 seems to be the only gpt-model that returns a fingerprint in addition to gpt-4o

  #model= "gpt-4-turbo-2024-04-09"
  #model = "gpt-3.5-turbo-0125"


### 1. Import text to annotate
Select only relevant columns of the full dataframe, in this case:
RELEVANCE

In [2]:
# Import the string-based data file (human labels stored as text) into a DataFrame
HLS_train = pd.read_csv('data/string/HLS_train_string.csv')

In [3]:
# Optional: select only the COP19 Japan speech for testing purposes (currently disabled)
#HLS_train_japan = HLS_train[HLS_train['id']=='COP19_japan']
# Display the loaded training data
HLS_train

Unnamed: 0,id,Text,Relevance,Principle,Topic,Unit,Shape,RELEVANCE,PRINCIPLE,TOPIC,UNIT,SHAPE
0,COP19_japan,"Thank you, Mr. President .",0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
1,COP19_japan,"On beha lf of the government of Japan , I wou...",0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
2,COP19_japan,I would also like to expr ess my d eepest con...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
3,COP19_japan,Mr. President: A fair and effective framewor...,2,3,1,2,2,Relevant,utilitarian,new UNFCCC policy,responsibility,equality
4,COP19_japan,"In this regard, Japan firmly supports the est...",1,0,0,0,0,Statement of intent,not evaluated,not evaluated,not evaluated,not evaluated
...,...,...,...,...,...,...,...,...,...,...,...,...
1207,COP28_newzealand,New Zealand is proud to suppor t several impo...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
1208,COP28_newzealand,"I am joined by New Zealand’s largest business,...",0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
1209,COP28_newzealand,The commitment o f New Zealanders from across ...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
1210,COP28_newzealand,Thank you Mr President.,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated


In [4]:
# Keep only the sentence text and the string-valued relevance label
HLS_relevance = HLS_train[['Text', 'RELEVANCE']]

### 2. Import necessary files
- codebooks
- API key
- import gpt_annotate_string

In [5]:
# Load codebook B1.0 (zero shot) as a single UTF-8 string
with open('codebooks/B1.0', 'r', encoding='utf-8') as file:
    B10 = file.read()

In [6]:
# OpenAI key: read from a local text file and strip surrounding whitespace/newlines
with open('gpt_api_key.txt', 'r') as f:
    key = f.read().strip()

In [7]:
import gpt_annotate_string

### 3. Prepare data for annotation
Compares column names in HLS_relevance to the codes identified by GPT-4o in the codebook. Seed for this identification is set to 1234.

In [8]:
# Prepare dataframe for annotation from the text/label frame, the B1.0 codebook text and the API key.
# NOTE(review): the saved output of this cell is a RateLimitError (insufficient quota),
# so `text_to_annotate` is not produced by the output recorded here.
text_to_annotate = gpt_annotate_string.prepare_data(HLS_relevance, B10, key, prep_codebook=True)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

Fingerprint used: fp_43dfabdef1

The seed for text preparation is hardcoded into gpt_annotate. This ensures that only results with the same fingerprint are used for all seeds and all iterations. Essentially, every time GPT-4o is called, only results with this specific fingerprint are saved.

# 4. Run gpt_annotate_string
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 3 outputs:
1. all_iterations_{seed}.csv
2. fingerprints_all.csv
3. missed_batches.csv

## B1.0 Relevance - zero shot - T0


In [8]:
# Fingerprint string passed to every gpt_annotate call below
fingerprint = 'fp_43dfabdef1'

# Seeds are turned off to prevent an accidental run of GPT annotate.
# The annotation cells below iterate over `seeds`, so uncomment this line before running them.
#seeds = [3644,3441, 280, 5991, 7917]

In [9]:
# Annotate the data - T0 - I1: one gpt_annotate call per seed
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B10,
        key,
        seed,
        fingerprint,
        experiment="B1.0",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B22 fingerprint does not match
3644 - I1 - B30 fingerprint does not match
3644 - I1 - B32 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B28 fingerprint does not match
3441 - I1 - B37 fingerprint does not match
3441 - I1 - B60 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B1 fingerprint does not match
280 - I1 - B4 fingerprint does not match
280 - I1 - B5 fingerprint does not match
280 - I1 - B7 fingerprint does not match
280 - I1 - B16 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B54 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
iteration:  1 completed


## B1.0 Relevance - zero shot - T0.6

In [20]:
# Annotate the data - T0.6 - I1: one gpt_annotate call per seed
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B10,
        key,
        seed,
        fingerprint,
        experiment="B1.0",
        num_iterations=1,
        model="gpt-4o",
        temperature=0.6,
        batch_size=20,
        human_labels=True,
    )

3644 - iteration 1
3644 - I1 - B5 fingerprint does not match
3644 - I1 - B26 fingerprint does not match
3644 - I1 - B39 fingerprint does not match
3644 - I1 - B55 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B22 fingerprint does not match
3441 - I1 - B29 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B1 fingerprint does not match
280 - I1 - B33 fingerprint does not match
280 - I1 - B42 fingerprint does not match
280 - I1 - B54 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B32 fingerprint does not match
5991 - I1 - B43 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B1 fingerprint does not match
7917 - I1 - B12 fingerprint does not match
7917 - I1 - B32 fingerprint does not match
iteration:  1 completed


## B1.0 Relevance - zero shot - T0 - I3

In [None]:
# Annotate the data - T0 - I3: three iterations per seed
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B10,
        key,
        seed,
        fingerprint,
        experiment="B1.0",
        num_iterations=3,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

## B1.0.1 Relevance - zero shot - with context

In [None]:
# Load codebook B1.0.1 (zero shot, with context) as a single UTF-8 string
with open('codebooks/B1.0.1', 'r', encoding='utf-8') as file:
    B101 = file.read()

In [None]:
# Annotate the data - T0 - I1, using codebook B1.0.1
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B101,
        key,
        seed,
        fingerprint,
        experiment="B1.0.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

## B1.1 Relevance - one shot

In [None]:
# Load codebook B1.1 (one shot) as a single UTF-8 string
with open('codebooks/B1.1', 'r', encoding='utf-8') as file:
    B11 = file.read()

In [None]:
# Annotate the data - T0 - I1, using codebook B1.1
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B11,
        key,
        seed,
        fingerprint,
        experiment="B1.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

## B1.1.1 Relevance - one shot - with context

In [None]:
# Load codebook B1.1.1 (one shot, with context) as a single UTF-8 string
with open('codebooks/B1.1.1', 'r', encoding='utf-8') as file:
    B111 = file.read()

In [None]:
# Annotate the data - T0 - I1, using codebook B1.1.1
for seed in seeds:
    gpt_annotate_string.gpt_annotate(
        text_to_annotate,
        B111,
        key,
        seed,
        fingerprint,
        experiment="B1.1.1",
        num_iterations=1,
        model="gpt-4o",
        temperature=0,
        batch_size=20,
        human_labels=True,
    )

## B1.2 - TWO-shot codebook
Codebook is created. Currently not evaluated.


# 5. Define evaluation functions


In [18]:
# Define similarity score function and save to DF
def get_similarity_score(Rx, Ry, threshold=0.9):
    """Return the mean thresholded string similarity between two label Series.

    Both Series are cast to ``str`` and stripped of surrounding whitespace,
    then compared pairwise with ``difflib.SequenceMatcher(...).ratio()``.
    Pairs scoring below ``threshold`` count as 0; pairs at or above it keep
    their ratio (they are not rounded up to 1).

    Parameters
    ----------
    Rx, Ry : pandas.Series
        The two label columns to compare. They are paired via
        ``Series.combine``, so they are expected to share the same index.
    threshold : float, default 0.9
        Minimum ratio for a pair to contribute to the score.

    Returns
    -------
    float
        Mean of the thresholded ratios, as a fraction between 0 and 1
        (not a percentage).
    """
    # Normalise both sides so stray whitespace and non-string dtypes do not count as mismatches
    Rx = Rx.astype(str).str.strip()
    Ry = Ry.astype(str).str.strip()

    # Pairwise similarity ratio in [0, 1]
    similarity_scores = Rx.combine(
        Ry, lambda x, y: difflib.SequenceMatcher(None, x, y).ratio()
    )

    # Zero out every pair that falls below the threshold
    similarity_scores = similarity_scores.apply(
        lambda score: score if score >= threshold else 0
    )

    return similarity_scores.mean()

In [19]:
def similarity(directory):
    """Score every CSV file in ``directory`` on overall label similarity.

    For each ``*.csv`` file, the ``RELEVANCE_x`` and ``RELEVANCE_y`` columns
    are compared with ``get_similarity_score``.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file with columns ``'filename'`` and
        ``'similarity ALL'``, ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories and non-CSV entries (e.g. .ipynb_checkpoints, .DS_Store)
        if not (os.path.isfile(file_path) and filename.lower().endswith('.csv')):
            continue

        df = pd.read_csv(file_path)
        similarity_score = get_similarity_score(df['RELEVANCE_x'], df['RELEVANCE_y'])

        # Collect one (filename, score) record per file
        rows.append((filename, similarity_score))

    return pd.DataFrame(rows, columns=['filename', 'similarity ALL'])


In [20]:
def recall(directory, relevance_x):
    """Score every CSV file in ``directory`` on one ``RELEVANCE_x`` class.

    For each ``*.csv`` file, only rows whose ``RELEVANCE_x`` equals
    ``relevance_x`` are kept; their ``RELEVANCE_x`` and ``RELEVANCE_y`` values
    are then compared with ``get_similarity_score``.
    NOTE(review): presumably ``RELEVANCE_x`` is the human label and
    ``RELEVANCE_y`` the model label - confirm against the code that writes
    these files.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.
    relevance_x : str
        Label value to select in the ``RELEVANCE_x`` column.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file with columns ``'filename'`` and
        ``f'{relevance_x} recall'``, ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories and non-CSV entries (e.g. .ipynb_checkpoints, .DS_Store)
        if not (os.path.isfile(file_path) and filename.lower().endswith('.csv')):
            continue

        df = pd.read_csv(file_path)

        # Restrict to the rows labelled `relevance_x` in RELEVANCE_x
        relevant_df = df[df['RELEVANCE_x'] == relevance_x]

        similarity_score = get_similarity_score(
            relevant_df['RELEVANCE_x'], relevant_df['RELEVANCE_y']
        )

        # Collect one (filename, score) record per file
        rows.append((filename, similarity_score))

    return pd.DataFrame(rows, columns=['filename', f'{relevance_x} recall'])

In [21]:
def precision(directory, relevance_y):
    """Score every CSV file in ``directory`` on one ``RELEVANCE_y`` class.

    For each ``*.csv`` file, only rows whose ``RELEVANCE_y`` equals
    ``relevance_y`` are kept; their ``RELEVANCE_x`` and ``RELEVANCE_y`` values
    are then compared with ``get_similarity_score``.
    NOTE(review): presumably ``RELEVANCE_y`` is the model label and
    ``RELEVANCE_x`` the human label - confirm against the code that writes
    these files.

    Parameters
    ----------
    directory : str
        Folder containing the result CSV files.
    relevance_y : str
        Label value to select in the ``RELEVANCE_y`` column.

    Returns
    -------
    pandas.DataFrame
        One row per CSV file with columns ``'filename'`` and
        ``f'{relevance_y} precision'``, ordered by filename.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible table
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # Skip sub-directories and non-CSV entries (e.g. .ipynb_checkpoints, .DS_Store)
        if not (os.path.isfile(file_path) and filename.lower().endswith('.csv')):
            continue

        df = pd.read_csv(file_path)

        # Restrict to the rows labelled `relevance_y` in RELEVANCE_y
        relevant_df = df[df['RELEVANCE_y'] == relevance_y]

        similarity_score = get_similarity_score(
            relevant_df['RELEVANCE_x'], relevant_df['RELEVANCE_y']
        )

        # Collect one (filename, score) record per file
        rows.append((filename, similarity_score))

    return pd.DataFrame(rows, columns=['filename', f'{relevance_y} precision'])

# 6. Evaluate performance

In [24]:
# Directory holding the B1.0 result files to evaluate.
# NOTE(review): this rebinds `B10`, which earlier in the notebook holds the B1.0 codebook text;
# re-running an annotation cell after this one would pass this path as the codebook.
B10 = 'STRING_RESULT/B1.0/all_iterations'
# Label values used to select a class in recall()/precision()
relevant = 'Relevant'
SOI = 'Statement of intent'
NR = 'Not relevant'

# Overall similarity per file, and recall for the 'Relevant' class per file
B10_similarity = similarity(B10)
B10_recall = recall(B10, relevant)
# Display the per-file recall table
B10_recall


Unnamed: 0,filename,Relevant recall
0,all_iterations_string_T0.6_280.csv,0.796117
1,all_iterations_string_T0.6_3441.csv,0.805556
2,all_iterations_string_T0.6_3644.csv,0.845771
3,all_iterations_string_T0.6_5991.csv,0.759615
4,all_iterations_string_T0.6_7917.csv,0.797101
5,all_iterations_string_T0_280.csv,0.82199
6,all_iterations_string_T0_3441.csv,0.815166
7,all_iterations_string_T0_3644.csv,0.798077
8,all_iterations_string_T0_5991.csv,0.790698
9,all_iterations_string_T0_7917.csv,0.807339
