In [1]:
import numpy as np
import pandas as pd
import os

Analysis of V2 annotation codebook - (hopefully) final version

Preparation for application of GPT_annotate. Initailly focus on the principles.

In [2]:
#Directory path
path = 'HLS_man/COP19'

#List of items in folder
folder_items = os.listdir(path)

# Empty list to store dataframes
list = []

for filename in folder_items:
    file_path = os.path.join(path, filename)
    df = pd.read_excel(file_path)
    copID = df.columns[0]
    # add copID to dataframe
    df['id'] = copID
    # Rename first column containing ID
    df = df.rename(columns={df.columns[0]: 'Text'})
    # Add dataframe to list
    list.append(df)

HLS19_df = pd.concat(list, ignore_index=True)

HLS19_df

Unnamed: 0,Text,Relevance,Topic,Unit,Principle,30 word explanation,Notes,id
0,"\nThank you, Mr. President .",0,,,,,,COP19_japan
1,"\n \nOn beha lf of the government of Japan , I...",0,,,,,,COP19_japan
2,\n \nI would also like to expr ess my d eepest...,0,,,,,,COP19_japan
3,\n \nMr. President: \n \nA fair and effective ...,2,new UNFCCC policy,responsibility,egalitarian,"participation of all countries, reference to f...",,COP19_japan
4,"\n \nIn this regard, Japan firmly supports the...",1,,,,,,COP19_japan
...,...,...,...,...,...,...,...,...
162,• Our meeting here in Warsaw can help build m...,1,,,,,,COP19_usa
163,"\n• And as we work overtime in the UNFCCC, we...",1,,,,,,COP19_usa
164,"Thats why we have been pushing hard,\ntogethe...",0,,,,,,COP19_usa
165,This alone has the potential to avoid some 90\...,0,,,,,,COP19_usa


In [3]:
# Remove '\n' from the 'Text' column
HLS19_df['Text'] = HLS19_df['Text'].str.replace('\n', '')

# General information on dataframe
# 1. Nan counts
nan_counts = HLS19_df.isna().sum()
print(nan_counts)

Text                     0
Relevance                0
Topic                  124
Unit                   124
Principle              124
30 word explanation    124
Notes                  164
id                       0
dtype: int64


In [4]:
# Determine number of principle occurences
principle_count = HLS19_df['Principle'].value_counts()
principle_count

general normative statement    14
egalitarian                     9
prioritarian                    9
utilitarian                     3
utilitarian                     2
egalitarian                     2
libertarian                     2
sufficientarian                 1
prioritarain                    1
Name: Principle, dtype: int64

In [20]:
HLS19_df.loc[:,'Principle'] = HLS19_df['Principle'].str.replace(' utilitarian', 'utilitarian')
HLS19_df.loc[:,'Principle'] = HLS19_df['Principle'].str.replace('utilitarian ', 'utilitarian')
HLS19_df.loc[:,'Principle'] = HLS19_df['Principle'].str.replace('egalitarian ', 'egalitarian')
HLS19_df.loc[:,'Principle'] = HLS19_df['Principle'].str.replace('prioritarain', 'prioritarian')
HLS19_df.loc[:,'Principle'] = HLS19_df['Principle'].str.replace('libertarian ', 'libertarian')

HLS19 = HLS19_df.copy()
count_HLS19 = HLS19['Principle'].value_counts()
count_HLS19

general normative statement    14
egalitarian                    11
prioritarian                   10
utilitarian                     5
libertarian                     2
sufficientarian                 1
Name: Principle, dtype: int64

For initial GPT application, focus on the labelling for principles, not additional factors.

Requirements presented by gpt_annotate:
text_to_annotate:
A dataframe that includes one column for text samples and, if you are comparing the LLM output against humans, any number of one-hot-encoded category columns. The text column should be the first column in your data. We provide Python code (described below) that will automatically assist with the formatting of text_to_annotate to ensure accurate annotation.

> keep the same lines in order to later add the COPID back to the dataframe
>
>
> Tomorrow: perform test with additional determination of relevance. Now only focus on the principle applied to each sentence


In [21]:
#Create dataframe, only including the principle column
data = HLS19.loc[:,['Text','Principle']]
data

Unnamed: 0,Text,Principle
0,"Thank you, Mr. President .",
1,"On beha lf of the government of Japan , I wou...",
2,I would also like to expr ess my d eepest con...,
3,Mr. President: A fair and effective framewor...,egalitarian
4,"In this regard, Japan firmly supports the est...",
...,...,...
162,• Our meeting here in Warsaw can help build m...,
163,"• And as we work overtime in the UNFCCC, we m...",
164,"Thats why we have been pushing hard,together ...",
165,This alone has the potential to avoid some 90g...,


In [27]:
# One0hot encode principle column
HLS19_gpt = pd.get_dummies(data,prefix='', prefix_sep='', columns=['Principle'])
order = ['Text', 'egalitarian', 'libertarian', 'prioritarian', 'sufficientarian','utilitarian', 'general normative statement']
HLS19_gpt = HLS19_gpt[order]
HLS19_gpt.columns = map(str.upper, HLS19_gpt.columns)
HLS19_gpt

Unnamed: 0,TEXT,EGALITARIAN,LIBERTARIAN,PRIORITARIAN,SUFFICIENTARIAN,UTILITARIAN,GENERAL NORMATIVE STATEMENT
0,"Thank you, Mr. President .",0,0,0,0,0,0
1,"On beha lf of the government of Japan , I wou...",0,0,0,0,0,0
2,I would also like to expr ess my d eepest con...,0,0,0,0,0,0
3,Mr. President: A fair and effective framewor...,1,0,0,0,0,0
4,"In this regard, Japan firmly supports the est...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
162,• Our meeting here in Warsaw can help build m...,0,0,0,0,0,0
163,"• And as we work overtime in the UNFCCC, we m...",0,0,0,0,0,0
164,"Thats why we have been pushing hard,together ...",0,0,0,0,0,0
165,This alone has the potential to avoid some 90g...,0,0,0,0,0,0


In [29]:
HLS19_gpt.sum()

TEXT                            Thank you, Mr. President . On beha lf of the ...
EGALITARIAN                                                                   11
LIBERTARIAN                                                                    2
PRIORITARIAN                                                                  10
SUFFICIENTARIAN                                                                1
UTILITARIAN                                                                    5
GENERAL NORMATIVE STATEMENT                                                   14
dtype: object

In [28]:
#Save dummies as csv file
HLS19_gpt.to_csv('gpt_annotate/COP19_principle_HL.csv', index=False)