In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import openai

## Notebook to convert numerical based HLS annotations to tekst for non-binary evaluation with GPT

In [2]:
HLS_train = pd.read_csv('HLS_train_string.csv')

In [3]:
### Select only japan for evaluation of string
HLS_train_japan = HLS_train[HLS_train['id']=='COP19_japan']
HLS_train_japan

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Text,Relevance,Principle,Topic,Unit,Shape,RELEVANCE,PRINCIPLE,TOPIC,UNIT,SHAPE
0,0,0,COP19_japan,"Thank you, Mr. President .",0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
1,1,1,COP19_japan,"On beha lf of the government of Japan , I wou...",0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
2,2,2,COP19_japan,I would also like to expr ess my d eepest con...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
3,3,3,COP19_japan,Mr. President: A fair and effective framewor...,2,3,1,2,2,Relevant,utilitarian,new UNFCCC policy,responsibility,equality
4,4,4,COP19_japan,"In this regard, Japan firmly supports the est...",1,0,0,0,0,Statement of intent,not evaluated,not evaluated,not evaluated,not evaluated
5,5,5,COP19_japan,Such a framework must be based on “nationally ...,2,2,1,2,3,Relevant,egalitarian,new UNFCCC policy,responsibility,equity
6,6,6,COP19_japan,I will devote myself toward the s uccessful o...,1,0,0,0,0,Statement of intent,not evaluated,not evaluated,not evaluated,not evaluated
7,7,7,COP19_japan,Mr. President: The Great East Japan Earthqua...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
8,8,8,COP19_japan,Even under such circumstance both the public ...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated
9,9,9,COP19_japan,Our greenhouse gas emissions for the first co...,0,0,0,0,0,Not relevant,not evaluated,not evaluated,not evaluated,not evaluated


## Stringbased annotation with updated GPT_annotate and new codebook
First evaluate only for relevance

In [4]:
## Import the altered version of GPT_annotate
# Should ony present a single file with the outcomes
import gpt_annotate_string

In [5]:
# don't type the key in this file!
# create gpt_api.txt, put the key in that, and save
with open('gpt_api_key.txt', 'r') as f:
    key = f.read().strip()

In [6]:
HLS_relevance_string = HLS_train_japan[['Text', 'RELEVANCE']]
text_to_annotate = HLS_relevance_string

In [7]:
# Load codebook
with open('relevance_V1_string', 'r', encoding='utf-8') as file:
    codebook = file.read()

In [8]:
# Prepare the data for annotation
# Preparation is done with GPT-3.5-turbo - can be altered hardcoded
# The outcome of this analysis is not always the same, seems like something is going wrong here
#text_to_annotate = gpt_annotate_string.prepare_data(text_to_annotate, codebook, key, prep_codebook=True)

In [9]:
# Annotate the data; should return 1 output including all annotation for each batch.
#gpt_out_all =  gpt_annotate_string.gpt_annotate(text_to_annotate, codebook, key, seed=3644, num_iterations = 5, model = "gpt-4o", temperature = 0.6, batch_size = 20, human_labels = True,  data_prep_warning = False, time_cost_warning = True)

In [10]:
# New dataframe for text annotation, is already prepared
text_to_annotate_seedbatch = text_to_annotate.copy()

In [11]:
gpt_out_all = {}

seeds = [3644,3441, 280, 5991, 7917]

#for seed in seeds:
    # Construct a key name using the seed
#    key_name = f'gpt_out_all_{seed}'
#    gpt_out_all[key_name] =  gpt_annotate_string.gpt_annotate(text_to_annotate_seedbatch, codebook, key, seed=seed, num_iterations = 5, model = "gpt-4o", temperature = 0, batch_size = 20, human_labels = True,  data_prep_warning = False, time_cost_warning = True)

# Temperature 0 should be somewhat deterministic, is not the case?
# Test for temperature 0 0.2 0.6 - elements found in literature
# Include seed name in output CSV file

Use of gpt-4o seems to provide somewhat constant outputs, at least a fingerprint that can be evaluated.

Outputfiles:
fingerprints_mainseed - fingerprints of each time the API is called. This should be the same, as seed and model parameters remain the same. This is not the case.
gpt_out_all_iterations_string.csv - output of all annotations by gpt. In theory, these outputs should be the same for all iterations. This is not always the case.

In [12]:
gpt_out_all['gpt_out_all_3644']

KeyError: 'gpt_out_all_3644'

In [13]:
import difflib
import numpy as np

def get_similarity_score(Rx, Ry):
    Rx = Rx.copy()
    Ry = Ry.copy()

    # Remove the white spaces from the strings inside the dataframe
    for i in range(len(Rx)):
        Rx[i] = Rx[i].strip()
        Ry[i] = Ry[i].strip()

    # Calculate the similarity score between the two relevance strings
    similarity_score = []

    for i in range(len(Rx)):
        similarity_score.append(difflib.SequenceMatcher(None, Rx[i], Ry[i]).ratio())

        # if similarity score is less than 0.9, make it 0 - this makes  it the accuracy
        if similarity_score[i] < 0.9:
            similarity_score[i] = 0

    return (np.mean(similarity_score))*100# Compare the similarity of two columns


In [14]:
# Iterate through each file in the directory
directory = 'gpt_out_all'
similarity_scores = []

for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)
    Rx = df['RELEVANCE_x']
    Ry = df['RELEVANCE_y']

    similarity_score = get_similarity_score(Rx,Ry)

    #Save the score in a dataframe
    similarity_scores.append((filename, similarity_score))
    print(filename, similarity_score)

similarity_df = pd.DataFrame(similarity_scores, columns=['filename', 'score'])
similarity_df.to_csv("simliarity_scores_all", index=False)

similarity_df

gpt_out_all_iterations_string_280.csv 76.21621621621621
gpt_out_all_iterations_string_3441.csv 77.7027027027027
gpt_out_all_iterations_string_3644.csv 75.75757575757575
gpt_out_all_iterations_string_5991.csv 76.96969696969697
gpt_out_all_iterations_string_7917.csv 77.83783783783784


Unnamed: 0,filename,score
0,gpt_out_all_iterations_string_280.csv,76.216216
1,gpt_out_all_iterations_string_3441.csv,77.702703
2,gpt_out_all_iterations_string_3644.csv,75.757576
3,gpt_out_all_iterations_string_5991.csv,76.969697
4,gpt_out_all_iterations_string_7917.csv,77.837838


In [15]:
## Similarity score per code
full = gpt_out_all['gpt_out_all_3644']
relevant = full[full['RELEVANCE_x']=='Relevant']
relevant
# TODO GET SIMILARITY SCORE PER CODE - SELECT SPECIFIC ROW - account for variances in spelling


KeyError: 'gpt_out_all_3644'

In [37]:
def get_similarity_score(Rx, Ry):
    Rx = Rx.astype(str).str.strip()
    Ry = Ry.astype(str).str.strip()

    # Calculate the similarity score between the two relevance strings
    similarity_score = []
    for i in range(len(Rx)):
        similarity_score.append(difflib.SequenceMatcher(None, Rx[i], Ry[i]).ratio())

        # if similarity score is less than 0.9, make it 0 - this makes  it the accuracy
        if similarity_score[i] < 0.9:
            similarity_score[i] = 0

    return (np.mean(similarity_score))*100# Compare the similarity of two columns


df = pd.read_csv('gpt_out_all/gpt_out_all_iterations_string_280.csv')

df_relevant = df[df["RELEVANCE_x"]=='Relevant']
Rx = df_relevant['RELEVANCE_x']
Ry = df_relevant['RELEVANCE_y']

similarity_score = get_similarity_score(Rx,Ry)
similarity_score


KeyError: 0

In [40]:

def get_similarity_score(Rx, Ry):
    # Ensure Rx and Ry are pandas Series and convert to strings with stripped whitespace
    Rx = Rx.astype(str).str.strip()
    Ry = Ry.astype(str).str.strip()

    # Calculate similarity scores
    similarity_scores = Rx.combine(Ry, lambda x, y: difflib.SequenceMatcher(None, x, y).ratio())

    # Apply the threshold - maybe put higher?
    similarity_scores = similarity_scores.apply(lambda x: x if x >= 0.9 else 0)

    # Return the mean similarity score as a percentage
    return similarity_scores.mean() * 100

# Example usage
directory = 'gpt_out_all'  # Replace with your directory path

# Initialize an empty list to store filename and similarity score tuples
similarity_scores = []

# Iterate over each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Check if columns exist in DataFrame
    if 'RELEVANCE_x' in df.columns and 'RELEVANCE_y' in df.columns:
        # Filter rows where 'Relevance_x' is 'relevant'
        filtered_df = df[df['RELEVANCE_x'] == 'Relevant']

        if not filtered_df.empty:
            Rx = filtered_df['RELEVANCE_x']
            Ry = filtered_df['RELEVANCE_y']

            similarity_score = get_similarity_score(Rx, Ry)

            # Append the filename and similarity score to the list
            similarity_scores.append((filename, similarity_score))

            print(f'Similarity score for {filename}: {similarity_score}')
        else:
            print(f"No 'relevant' entries in column 'RELEVANCE_x' for file {filename}")
    else:
        print(f"Columns 'RELEVANCE_x' and/or 'RELEVANCE_y' do not exist in {filename}")

# Convert the list to a DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=['filename', 'score'])

# Save the DataFrame to a CSV file (optional)
output_path = 'path/to/save/similarity_scores.csv'  # Replace with your desired output path
similarity_df.to_csv(output_path, index=False)

# Optionally, display the DataFrame
print(similarity_df)

Similarity score for gpt_out_all_iterations_string_280.csv: 80.0
Similarity score for gpt_out_all_iterations_string_3441.csv: 50.0
Similarity score for gpt_out_all_iterations_string_3644.csv: 75.0
Similarity score for gpt_out_all_iterations_string_5991.csv: 100.0
Similarity score for gpt_out_all_iterations_string_7917.csv: 100.0


OSError: Cannot save file into a non-existent directory: 'path\to\save'

In [41]:
# Iterate over each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Check if columns exist in DataFrame
    if 'RELEVANCE_x' in df.columns and 'RELEVANCE_y' in df.columns:
        # Filter rows where 'Relevance_x' is 'relevant'
        filtered_df = df[df['RELEVANCE_x'] == 'Not relevant']

        if not filtered_df.empty:
            Rx = filtered_df['RELEVANCE_x']
            Ry = filtered_df['RELEVANCE_y']

            similarity_score = get_similarity_score(Rx, Ry)

            # Append the filename and similarity score to the list
            similarity_scores.append((filename, similarity_score))

            print(f'NOT RELEVANT: Similarity score for {filename}: {similarity_score}')
        else:
            print(f"No 'relevant' entries in column 'RELEVANCE_x' for file {filename}")
    else:
        print(f"Columns 'RELEVANCE_x' and/or 'RELEVANCE_y' do not exist in {filename}")

# Convert the list to a DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=['filename', 'score'])

# Save the DataFrame to a CSV file (optional)
output_path = 'path/to/save/similarity_scores.csv'  # Replace with your desired output path
similarity_df.to_csv(output_path, index=False)

# Optionally, display the DataFrame
print(similarity_df)

NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_280.csv: 76.47058823529412
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_3441.csv: 76.47058823529412
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_3644.csv: 78.87323943661971
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_5991.csv: 78.87323943661971
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_7917.csv: 77.64705882352942


OSError: Cannot save file into a non-existent directory: 'path\to\save'

In [42]:
# Iterate over each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)

    # Check if columns exist in DataFrame
    if 'RELEVANCE_x' in df.columns and 'RELEVANCE_y' in df.columns:
        # Filter rows where 'Relevance_x' is 'relevant'
        filtered_df = df[df['RELEVANCE_x'] == 'Statement of intent']

        if not filtered_df.empty:
            Rx = filtered_df['RELEVANCE_x']
            Ry = filtered_df['RELEVANCE_y']

            similarity_score = get_similarity_score(Rx, Ry)

            # Append the filename and similarity score to the list
            similarity_scores.append((filename, similarity_score))

            print(f'NOT RELEVANT: Similarity score for {filename}: {similarity_score}')
        else:
            print(f"No 'relevant' entries in column 'RELEVANCE_x' for file {filename}")
    else:
        print(f"Columns 'RELEVANCE_x' and/or 'RELEVANCE_y' do not exist in {filename}")

# Convert the list to a DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=['filename', 'score'])

# Save the DataFrame to a CSV file (optional)
output_path = 'path/to/save/similarity_scores.csv'  # Replace with your desired output path
similarity_df.to_csv(output_path, index=False)

# Optionally, display the DataFrame
print(similarity_df)

NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_280.csv: 75.55555555555556
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_3441.csv: 81.94444444444444
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_3644.csv: 73.25581395348837
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_5991.csv: 73.25581395348837
NOT RELEVANT: Similarity score for gpt_out_all_iterations_string_7917.csv: 75.55555555555556


OSError: Cannot save file into a non-existent directory: 'path\to\save'