In [38]:
import numpy as np
import pandas as pd
import os

### EDA of the manually annotated HLS
Evaluation of the number of sentences deemed relevant, the types of labels present, and other insights.

In [39]:
# Root directory that holds one subfolder per COP (e.g. HLS_man/COP19)
path = 'HLS_man'

# Entries of the root directory, sorted so that the load order (and therefore
# the row order of HLS_df) does not depend on the file system
folder_items = sorted(os.listdir(path))

# Keep only the subdirectories
subfolders = [os.path.join(path, item) for item in folder_items if os.path.isdir(os.path.join(path, item))]
print(subfolders)

# One dataframe per annotated speech.
# NOTE: deliberately not named `list`, which would shadow the Python builtin
# for every later cell in the kernel session.
speech_frames = []

# Iterate over the subdirectories and read every workbook they contain
for subfolder in subfolders:
    for filename in sorted(os.listdir(subfolder)):
        file_path = os.path.join(subfolder, filename)
        # Skip nested folders and hidden/temporary files, e.g. the '~$...'
        # lock files Excel creates while a workbook is open
        if not os.path.isfile(file_path) or filename.startswith(('~$', '.')):
            continue
        df = pd.read_excel(file_path)
        # The header of the first column is used as the speech ID
        copID = df.columns[0]
        # Add the ID as its own column
        df['id'] = copID
        # Rename the first column (whose header was the ID) to 'Text'
        df = df.rename(columns={df.columns[0]: 'Text'})
        speech_frames.append(df)

# Stack all speeches into a single dataframe with a fresh 0..n-1 index
HLS_df = pd.concat(speech_frames, ignore_index=True)

HLS_df


['HLS_man\\COP19', 'HLS_man\\COP20', 'HLS_man\\COP21', 'HLS_man\\COP22', 'HLS_man\\COP23', 'HLS_man\\COP24', 'HLS_man\\COP25', 'HLS_man\\COP26', 'HLS_man\\COP27', 'HLS_man\\COP28']


Unnamed: 0,Text,Relevance,Topic,Unit,Scale,Time,Principle,30 word explanation,Notes,id
0,"\nThank you, Mr. President .",no,,,,,,,,COP19_japan
1,"\n \nOn beha lf of the government of Japan , I...",no,,,,,,,,COP19_japan
2,\n \nI would also like to expr ess my d eepest...,no,,,,,,,,COP19_japan
3,\n \nMr. President: \n \nA fair and effective ...,yes,new UNFCCC policy,measures,global,nearby future,egalitarian,"participation of all countries, reference to f...",,COP19_japan
4,"\n \nIn this regard, Japan firmly supports the...",no,,,,,,,,COP19_japan
...,...,...,...,...,...,...,...,...,...,...
1621,\n \nNew Zealand is proud to suppor t several ...,no,,,,,,,,COP28_newzealand
1622,"I am joined by New Zealand’s largest business,...",no,,,,,,,,COP28_newzealand
1623,The commitment o f New Zealanders from across ...,no,,,,,,,,COP28_newzealand
1624,\n \nThank you Mr President.,no,,,,,,,,COP28_newzealand


In [40]:
# Replace line breaks in the 'Text' column by a single space.
# Substituting an empty string would fuse the words on either side of a break
# ("change\nmust" -> "changemust"). Whitespace adjacent to the break is absorbed
# so that no double spaces are introduced, and leading/trailing whitespace is
# trimmed. Missing values (NaN) stay NaN.
HLS_df['Text'] = (
    HLS_df['Text']
    .str.replace(r'\s*\n\s*', ' ', regex=True)
    .str.strip()
)

# General information on dataframe
# 1. NaN counts per column
nan_counts = HLS_df.isna().sum()
print(nan_counts)

Text                     72
Relevance                90
Topic                  1347
Unit                   1348
Scale                  1347
Time                   1347
Principle              1347
30 word explanation    1348
Notes                  1613
id                        0
dtype: int64


Rows without any textual entry should be removed. Additionally, rows without a relevance indication are reviewed.

In [41]:
# Inspect the rows whose 'Text' entry is missing
text_missing = HLS_df['Text'].isna()
nan_text = HLS_df.loc[text_missing]
print(nan_text)

    Text Relevance Topic Unit Scale Time Principle 30 word explanation Notes  \
310  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
311  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
312  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
313  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
314  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
..   ...       ...   ...  ...   ...  ...       ...                 ...   ...   
377  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
378  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
379  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
380  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   
513  NaN       NaN   NaN  NaN   NaN  NaN       NaN                 NaN   NaN   

              id  
310  COP20_tonga  
3

In [42]:
# Keep only the rows that actually contain text
HLS_df = HLS_df[HLS_df['Text'].notna()]

# Show the remaining rows that have no relevance label
relevance_missing = HLS_df['Relevance'].isna()
nan_relevance = HLS_df.loc[relevance_missing]
print(nan_relevance)

             Text Relevance Topic Unit Scale Time Principle  \
141                     NaN   NaN  NaN   NaN  NaN       NaN   
202                     NaN   NaN  NaN   NaN  NaN       NaN   
381                     NaN   NaN  NaN   NaN  NaN       NaN   
455                     NaN   NaN  NaN   NaN  NaN       NaN   
471                     NaN   NaN  NaN   NaN  NaN       NaN   
514                     NaN   NaN  NaN   NaN  NaN       NaN   
631                     NaN   NaN  NaN   NaN  NaN       NaN   
732                     NaN   NaN  NaN   NaN  NaN       NaN   
800    Thank you.       NaN   NaN  NaN   NaN  NaN       NaN   
842                     NaN   NaN  NaN   NaN  NaN       NaN   
901                     NaN   NaN  NaN   NaN  NaN       NaN   
1012                    NaN   NaN  NaN   NaN  NaN       NaN   
1232                    NaN   NaN  NaN   NaN  NaN       NaN   
1332                    NaN   NaN  NaN   NaN  NaN       NaN   
1364                    NaN   NaN  NaN   NaN  NaN      

Explanation for the NaN counts: due to the formatting of the speeches, the final lines of a speech contain no entries. These rows are removed.

In [43]:
# Discard every row that carries no relevance label
HLS_df = HLS_df[HLS_df['Relevance'].notna()]

# Count the sentences labelled as relevant
relevant_count = HLS_df['Relevance'].eq('yes').sum()

# Frequency of each distinct principle label
principle_count = HLS_df['Principle'].value_counts()

# Summary of the cleaned dataframe
print('The shape of the dataframe is:', HLS_df.shape)
print('The number of relevant sentences is:', relevant_count)
print('The nr. of occurences per principle is:', principle_count)

The shape of the dataframe is: (1536, 10)
The number of relevant sentences is: 266
The nr. of occurences per principle is: egalitarian                    73
prioritarian                   70
general normative statement    61
utilitarian                    38
sufficientarian                 9
prioritarian                    4
egalitarain                     4
libertarian                     3
prioritarain                    2
egalitarian                     2
utilitarian                     2
libertarian                     1
egalitarian, utilitarian        1
prioritarian, egalitarian       1
egalitarian, libertarian        1
egalitarain, utilitarian        1
proioritarian                   1
utilitarian, egalitarian        1
egalitarian, prioritarian       1
proritarian                     1
egalitararin                    1
Egalitarian                     1
Name: Principle, dtype: int64


## Spellcheck and evaluation of principles
In total there are 266 sentences labelled as relevant. This is 17% of all sentences.
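The principle labels contain spelling mistakes and formatting variants, which need to be corrected before the labels can be counted reliably.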

In [47]:
# Work on an independent copy so the assignment below cannot trigger a
# SettingWithCopyWarning on the filtered frame from the previous cells
HLS_df = HLS_df.copy()

# Known misspellings of the principle labels -> canonical spelling.
# Extend this mapping when new typos show up in the counts.
PRINCIPLE_TYPOS = {
    'prioritarain': 'prioritarian',
    'proioritarian': 'prioritarian',
    'proritarian': 'prioritarian',
    'egalitarain': 'egalitarian',
    'egalitararin': 'egalitarian',
}


def normalise_principle(label):
    """Return a canonical form of a 'Principle' annotation.

    The label is split on commas; every part is lower-cased, stripped of
    surrounding/duplicated whitespace and corrected with PRINCIPLE_TYPOS.
    The distinct parts are then sorted alphabetically and joined with ', ',
    so that e.g. 'utilitarian, egalitarian' and 'Egalitarain,utilitarian '
    both become 'egalitarian, utilitarian'.

    Non-string values (missing annotations, NaN) and labels that contain no
    text at all are returned unchanged.
    """
    if not isinstance(label, str):
        return label
    parts = (' '.join(part.split()).lower() for part in label.split(','))
    cleaned = sorted({PRINCIPLE_TYPOS.get(part, part) for part in parts if part})
    if not cleaned:
        return label
    return ', '.join(cleaned)


# Normalise spelling, capitalisation, whitespace and label order in one pass
HLS_df['Principle'] = HLS_df['Principle'].map(normalise_principle)

# Perform new principle count
principle_count2 = HLS_df['Principle'].value_counts()
principle_count2

egalitarian                    81
prioritarian                   78
general normative statement    61
utilitarian                    40
sufficientarian                 9
libertarian                     4
egalitarian, utilitarian        3
egalitarian, prioritarian       2
egalitarian, libertarian        1
Name: Principle, dtype: int64

Print rows with double entries for analysis


In [48]:
# Select every row whose 'Principle' holds more than one label.
# Multiple labels are comma-separated, so matching on the comma picks up any
# combination rather than only a fixed set of pairs; rows without a
# principle (NaN) are excluded via na=False.
has_multiple_labels = HLS_df['Principle'].str.contains(',', regex=False, na=False)
double_principle = HLS_df[has_multiple_labels]
double_principle

Unnamed: 0,Text,Relevance,Topic,Unit,Scale,Time,Principle,30 word explanation,Notes,id
206,Climate change poses a serious global challen...,yes,moral responsibility,measures,global,nearby future,"egalitarian, utilitarian","Urging for global action, referring to egalita...",,COP20_kenya
392,"We must all work collectively, with a sense of...",yes,moral responsibility,measures,global,n.a.,"egalitarian, prioritarian",working collectively and supporting the worst ...,,COP20_tonga_PSIDS
400,The guidance to the financing mechanisms shoul...,yes,new UNFCCC policy,financial resources,n.a.,n.a.,"egalitarian, libertarian",Accounting for the differences between locatio...,,COP20_tonga_PSIDS
465,"It is therefore, of utmost importance that we...",yes,new UNFCCC policy,measures,global,n.a.,"egalitarian, utilitarian","Egalitarian in urging all countries, for the r...",,COP21_afghanistan
684,Let us ensure that the environmental issue is ...,yes,cooperation,measures,global,n.a.,"egalitarian, utilitarian",focus on the need for cooperation focussing on...,,COP22_israel
861,The global response to climate changemust put ...,yes,new UNFCCC policy,measures,global,n.a.,"egalitarian, prioritarian",Prescribing the need for fair policies but als...,,COP23_vanatu


## Evaluating the other statistics of elements like topic, unit, scale and time
Need to filter for multiple occurrences, such as the occurrences of the term 'measures'. Again, spelling mistakes need to be filtered out.

In [49]:
# Frequency of each distinct 'Topic' label - the specific terms found still need to be filtered
topic_counts = HLS_df['Topic'].value_counts()
topic_counts

new UNFCCC policy                                        38
moral responsibility                                     17
action                                                   11
adaptation, mitigation                                    5
moral responsiblity                                       5
                                                         ..
moral responsibilty, UNFCCC agreements and principles     1
cooperation                                               1
support                                                   1
other(transparancy), other(implementation)                1
moral responsibility, global prosperitiy                  1
Name: Topic, Length: 171, dtype: int64

In [51]:
# Frequency of each distinct 'Unit' label - the specific terms found still need to be filtered
unit_counts = HLS_df['Unit'].value_counts()
unit_counts

measures                                                            133
n.a.                                                                 62
financial resources                                                  39
technological resources, financial resources                          4
support                                                               4
financial resources, technological resources                          4
measuers                                                              2
n.a.                                                                  2
other(support)                                                        2
technical resources, financial resources                              2
resources                                                             2
commitments                                                           1
knowledge, technological resources                                    1
support                                                         

In [52]:
# Frequency of each distinct 'Scale' label - the specific terms found still need to be filtered
scale_counts = HLS_df['Scale'].value_counts()
scale_counts

global                                    218
n.a.                                       35
national                                    6
global institutions                         3
multinational(developed countries)          2
multinational(developing countries)         2
sectors                                     1
multilateral(SIDS)                          1
multilateral(europe)                        1
glboal                                      1
multinational(annex 1)                      1
other(countries with higer emissions)       1
gobal                                       1
regional                                    1
other(people vs policimakers)               1
multinationalmajor polluters)               1
Multinational(annex-1 parties)              1
Multinational(developed countries)          1
multinational(refugee hosting nations)      1
Name: Scale, dtype: int64

In [53]:
# Frequency of each distinct 'Time' label - the specific terms found still need to be filtered
time_counts = HLS_df['Time'].value_counts()
time_counts

n.a.                       185
present                     59
nearby future               20
n.a.                         9
distant future               4
present, distant future      1
past                         1
Name: Time, dtype: int64