In [14]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import openai
import difflib

## Notebook to annotate HLS speeches for relevance
### A: one-hot coded labels

Codebooks:
- A1.0: zero shot
- A1.1: one shot
- (A1.2: two shot)
- A1.0.1: zero shot with specific inclusion of context
- A1.1.1: one shot with specific inclusion of context

Test for 5 different seeds
Batch of 20 sentences
1 iteration

Main outcomes: temperature 0 (T0), 1 iteration (I1)
For testing purposes: temperatures 0, 0.2 and 0.6 (T0 - T0.2 - T0.6), 3 iterations (I3)

Model selection:
 As of 22-05-2024, gpt-4-turbo-2024-04-09 and gpt-4o seem to be the only GPT models that return a system fingerprint.

  #model= "gpt-4-turbo-2024-04-09"
  #model = "gpt-3.5-turbo-0125"


### Import text to annotate
Select only relevant columns of the full dataframe, in this case:
relevance_0, relevance_1, relevance_2

In [2]:
# Load the one-hot coded (numerical) training set from its CSV file
train_csv_path = 'data/num/HLS_train_dummies.csv'
HLS_train = pd.read_csv(train_csv_path)

In [3]:
# Optional, currently disabled: restrict the data to the COP19 Japan speech for quick tests
# HLS_train_japan = HLS_train[HLS_train['id']=='COP19_japan']
# Show the loaded training frame as the cell output
HLS_train

Unnamed: 0,id,Text,relevance_0,relevance_1,relevance_2,principle_0,principle_1,principle_2,principle_3,principle_4,...,unit_7,shape_0,shape_1,shape_2,shape_3,shape_4,shape_5,shape_6,shape_7,shape_8
0,COP19_japan,"Thank you, Mr. President .",1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,COP19_japan,"On beha lf of the government of Japan , I wou...",1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,COP19_japan,I would also like to expr ess my d eepest con...,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,COP19_japan,Mr. President: A fair and effective framewor...,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,COP19_japan,"In this regard, Japan firmly supports the est...",0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1207,COP28_newzealand,New Zealand is proud to suppor t several impo...,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1208,COP28_newzealand,"I am joined by New Zealand’s largest business,...",1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1209,COP28_newzealand,The commitment o f New Zealanders from across ...,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1210,COP28_newzealand,Thank you Mr President.,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
# Keep the sentence text together with the three one-hot relevance labels
relevance_columns = ['Text', 'relevance_0', 'relevance_1', 'relevance_2']
HLS_relevance = HLS_train[relevance_columns]

### Import necessary files
- codebooks
- API key
- import gpt_annotate_num

In [5]:
# Read the zero-shot codebook (A1.0) into one string
codebook_path = 'codebooks/A/A1.0'
with open(codebook_path, mode='r', encoding='utf-8') as codebook_file:
    A10 = codebook_file.read()

In [6]:
# Read the OpenAI API key from a local text file, dropping surrounding whitespace
key_path = 'gpt_api_key.txt'
with open(key_path, mode='r') as key_file:
    key = key_file.read().strip()

In [7]:
import gpt_annotate_num

### Prepare data for annotation
Compares column names in HLS_relevance to the codes identified by GPT-4o in the codebook. Seed for this identification is set to 1234.

In [8]:
# Prepare dataframe for annotation.
# Inputs: the text + relevance label columns, the zero-shot codebook text and the API key.
# NOTE(review): prepare_data is defined in the project module gpt_annotate_num; per the
# notebook text it compares the label column names with the codes GPT-4o identifies in the
# codebook (prep_codebook=True presumably enables that step) -- confirm in that module.
text_to_annotate = gpt_annotate_num.prepare_data(HLS_relevance, A10, key, prep_codebook=True)

ChatCompletion(id='chatcmpl-9Trobvxu3ic6yy7h0gtlaZfagr0VC', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='relevance_0, relevance_1, relevance_2', role='assistant', function_call=None, tool_calls=None))], created=1716905405, model='gpt-4o-2024-05-13', object='chat.completion', system_fingerprint='fp_43dfabdef1', usage=CompletionUsage(completion_tokens=12, prompt_tokens=389, total_tokens=401))

Categories to annotate:
1) relevance_0
2) relevance_1
3) relevance_2


Data is ready to be annotated using gpt_annotate()!

Glimpse of your data:
Shape of data:  (1212, 6)
   unique_id                                               text  relevance_0  \
0          0                         Thank you, Mr. President .            1   
1          1   On beha lf of the government of Japan , I wou...            1   
2          2   I would also like to expr ess my d eepest con...            1   
3          3   Mr. President:  A fair and effective frame

Fingerprint used: 'fp_43dfabdef1'

The seed for text preparation is hardcoded into gpt_annotate. This ensures that only results with the same fingerprint are kept across all seeds and all iterations: every time GPT-4o is called, only results with this specific fingerprint are saved.

### Run gpt_annotate_num
Evaluation per seed -
5 different seeds
Batch of 20 sentences
1 iteration

Returns 5 outputs per seed;
1. all_iterations_num_{seed}.csv
2. final_num_{seed}.csv
3. performance_metrics_{seed}
4. incorrect_{seed}.csv
5. fingerprints_all.csv


In [9]:
# System fingerprint that annotation results are required to carry
fingerprint = 'fp_43dfabdef1'

# One annotation run is performed for each of these seeds
seeds = [
    3644,
    3441,
    280,
    5991,
    7917,
]

In [10]:
# Annotate the data - T0
# Settings shared by every run; only the seed changes between calls
annotation_settings = dict(
    experiment='A1.0',
    num_iterations=1,
    model="gpt-4o",
    temperature=0,
    batch_size=20,
    human_labels=True,
)

for seed in seeds:
    gpt_annotate_num.gpt_annotate(
        text_to_annotate,
        A10,
        key,
        seed,
        fingerprint,
        **annotation_settings,
    )

3644 - iteration 1
3644 - I1 - B15 fingerprint does not match
3644 - I1 - B17 fingerprint does not match
3644 - I1 - B61 fingerprint does not match
iteration:  1 completed
3441 - iteration 1
3441 - I1 - B2 fingerprint does not match
3441 - I1 - B16 fingerprint does not match
iteration:  1 completed
280 - iteration 1
280 - I1 - B7 fingerprint does not match
280 - I1 - B15 fingerprint does not match
280 - I1 - B18 fingerprint does not match
280 - I1 - B23 fingerprint does not match
280 - I1 - B53 fingerprint does not match
280 - I1 - B61 fingerprint does not match
iteration:  1 completed
5991 - iteration 1
5991 - I1 - B31 fingerprint does not match
5991 - I1 - B51 fingerprint does not match
iteration:  1 completed
7917 - iteration 1
7917 - I1 - B46 fingerprint does not match
7917 - I1 - B55 fingerprint does not match
iteration:  1 completed


Time to evaluate: 35 min

### Evaluate Performance metrics
Print performance metrics

In [11]:
def extract_numbers(filename):
    """Return every run of consecutive digits found in ``filename``.

    The runs are returned as strings, in the order they appear; the list is
    empty when the name contains no digits.
    """
    digit_runs = re.compile(r'\d+')
    return [match.group(0) for match in digit_runs.finditer(filename)]

# Directory holding the performance-metrics files written by the annotation runs
directory = 'NUM_RESULT/A1.0/performance_metrics_num'

# One dataframe per metrics file; concatenated after the loop
dataframes = []

# os.listdir() returns entries in arbitrary order, so sort them to get a
# reproducible row order. Hidden entries and sub-directories (for example a
# .ipynb_checkpoints folder) are skipped because pd.read_csv cannot read them.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    df = pd.read_csv(file_path)

    # All digit runs of the file name joined with '_' (e.g. '0_280')
    seed = extract_numbers(filename)
    seed_str = '_'.join(seed)

    # Prefix the entries of the first column with that identifier so rows
    # from different files remain distinguishable after concatenation.
    # The column is replaced as a whole (not written through .iloc) so the
    # string result is valid whatever dtype the column was read with.
    if not df.empty and seed_str:
        first_column = df.columns[0]
        df[first_column] = seed_str + df[first_column].astype(str)

    dataframes.append(df)

# Stack all files into one table with a fresh 0..n-1 index
performance_all = pd.concat(dataframes, ignore_index=True)
performance_all

Unnamed: 0,Category,Accuracy,Precision,Recall,F1
0,0_280relevance_0,0.787273,0.88162,0.781768,0.828697
1,0_280relevance_1,0.724545,0.323529,0.601093,0.42065
2,0_280relevance_2,0.825455,0.503448,0.378238,0.431953
3,0_3441relevance_0,0.777304,0.864629,0.779528,0.819876
4,0_3441relevance_1,0.711604,0.303279,0.572165,0.396429
5,0_3441relevance_2,0.819113,0.512346,0.384259,0.439153
6,0_3644relevance_0,0.776724,0.870482,0.76964,0.816961
7,0_3644relevance_1,0.716379,0.331522,0.595122,0.425829
8,0_3644relevance_2,0.818103,0.47929,0.397059,0.434316
9,0_5991relevance_0,0.78413,0.87426,0.778656,0.823693


### Evaluate similarities between the predicted and true outcomes
Also only selecting relevant sentences to see what the outcome would be.

Hypothesis: string-based annotation performs significantly better than one-hot coded annotation.
This could in itself be a limitation of the testing possibilities, because it limits the evaluation metrics that can be used.

In [17]:
def get_similarity_score(Rx, Ry, threshold=0.95):
    """Mean thresholded string similarity of two series, as a percentage.

    Both series are converted to whitespace-stripped strings and compared
    element by element (aligned on their index) with
    ``difflib.SequenceMatcher(None, x, y).ratio()``. Ratios below
    ``threshold`` are counted as 0; ratios at or above it keep their value.

    Parameters
    ----------
    Rx, Ry : pandas.Series
        The two series to compare. They are expected to share the same index.
    threshold : float, default 0.95
        Minimum ratio for a pair to contribute to the score.

    Returns
    -------
    float
        Mean of the thresholded ratios multiplied by 100, or NaN when both
        series are empty.
    """
    # Nothing to compare: make the "no data" result explicit
    if len(Rx) == 0 and len(Ry) == 0:
        return float('nan')

    # Compare string representations so that e.g. ' 1 ' and '1' are equal
    Rx = Rx.astype(str).str.strip()
    Ry = Ry.astype(str).str.strip()

    # Pairwise similarity ratio in [0, 1]
    similarity_scores = Rx.combine(Ry, lambda x, y: difflib.SequenceMatcher(None, x, y).ratio())

    # Keep ratios that reach the threshold, zero out the rest
    similarity_scores = similarity_scores.where(similarity_scores >= threshold, 0)

    # Return the mean similarity score as a percentage
    return similarity_scores.mean() * 100

In [18]:
# Score every file of annotation results on all rows: compare the two
# relevance_2 columns produced by the merge.
# NOTE(review): presumably _x is the human label and _y the GPT label -- confirm.
directory = 'NUM_RESULT/A1.0/all_iterations_num'
similarity_scores_2 = []

# os.listdir() order is arbitrary, so sort for a reproducible row order, and
# skip hidden entries / sub-directories that pd.read_csv cannot read.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    df = pd.read_csv(file_path)
    Rx = df['relevance_2_x']
    Ry = df['relevance_2_y']

    similarity_score = get_similarity_score(Rx, Ry)

    # Collect (file name, score) pairs; the dataframe is built once after the loop
    similarity_scores_2.append((filename, similarity_score))

similarity_2 = pd.DataFrame(similarity_scores_2, columns=['filename', 'similarity 2'])
similarity_2.to_csv("NUM_RESULT/A1.0/T0_similarity_scores_2", index=False)

similarity_2

Unnamed: 0,filename,similarity 2
0,all_iterations_num_T0_280.csv,82.545455
1,all_iterations_num_T0_3441.csv,81.911263
2,all_iterations_num_T0_3644.csv,81.810345
3,all_iterations_num_T0_5991.csv,82.593857
4,all_iterations_num_T0_7917.csv,82.167235


In [20]:
# Same scoring as for all rows, restricted to rows where relevance_2_x == 1.
# `directory` is the all_iterations_num folder set in the previous cell.
similarity_scores_22 = []

# os.listdir() order is arbitrary, so sort for a reproducible row order, and
# skip hidden entries / sub-directories that pd.read_csv cannot read.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    df = pd.read_csv(file_path)

    # Keep only the rows flagged as relevance_2 in the _x column
    relevant_df = df[df['relevance_2_x'] == 1]

    Rx = relevant_df['relevance_2_x']
    Ry = relevant_df['relevance_2_y']

    similarity_score = get_similarity_score(Rx, Ry)

    # Collect (file name, score) pairs; the dataframe is built once after the loop
    similarity_scores_22.append((filename, similarity_score))

similarity_22 = pd.DataFrame(similarity_scores_22, columns=['filename', 'similarity 2'])
similarity_22.to_csv("NUM_RESULT/A1.0/T0_similarity_scores_2_only2", index=False)

similarity_22

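## IS RECALL (rows are filtered on relevance_2_x == 1; the values match the Recall column for relevance_2 in the performance metrics above)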

Unnamed: 0,filename,similarity 2
0,all_iterations_num_T0_280.csv,37.823834
1,all_iterations_num_T0_3441.csv,38.425926
2,all_iterations_num_T0_3644.csv,39.705882
3,all_iterations_num_T0_5991.csv,41.784038
4,all_iterations_num_T0_7917.csv,42.23301


In [21]:
# Same scoring as for all rows, restricted to rows where relevance_2_y == 1.
# `directory` is the all_iterations_num folder set in an earlier cell.
similarity_scores_222 = []

# os.listdir() order is arbitrary, so sort for a reproducible row order, and
# skip hidden entries / sub-directories that pd.read_csv cannot read.
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue
    df = pd.read_csv(file_path)

    # Keep only the rows flagged as relevance_2 in the _y column
    relevant_df = df[df['relevance_2_y'] == 1]

    Rx = relevant_df['relevance_2_x']
    Ry = relevant_df['relevance_2_y']

    similarity_score = get_similarity_score(Rx, Ry)

    # Collect (file name, score) pairs; the dataframe is built once after the loop
    similarity_scores_222.append((filename, similarity_score))

similarity_222 = pd.DataFrame(similarity_scores_222, columns=['filename', 'similarity 2'])
similarity_222.to_csv("NUM_RESULT/A1.0/T0_similarity_scores_2_only2PRED", index=False)

similarity_222

## IS PRECISION

Unnamed: 0,filename,similarity 2
0,all_iterations_num_T0_280.csv,50.344828
1,all_iterations_num_T0_3441.csv,51.234568
2,all_iterations_num_T0_3644.csv,47.928994
3,all_iterations_num_T0_5991.csv,52.662722
4,all_iterations_num_T0_7917.csv,49.152542
