### Read all_sentences and annotated sentences

In [974]:
import pandas as pd 
import re

annotated = pd.read_csv('annotated_to_retrain.csv')
annotated = annotated.rename(columns = {'Sentences': 'sentences'})
len(annotated)

815

In [975]:
all_sentences = pd.read_csv('../relevant_irrelevant_sentences_labeled_final/all_sentences.csv', index_col = 0)
len(all_sentences)

77277

In [976]:
all_sentences[all_sentences.label.isnull()]

Unnamed: 0,key,sentences,company_label,label


In [977]:
all_sentences

Unnamed: 0,key,sentences,company_label,label
1,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel
2,2,A large portion of this renewable electricity ...,EliLilly,rel
3,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel
4,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel
5,5,This reduction was partially driven by energy ...,EliLilly,rel
...,...,...,...,...
77273,77273,→ FPL’s four nuclear units continue to operate...,NextEraEnergyZeroCarbonBlueprint,irr
77274,77274,Technology We assume that: → FPL’s gas plants ...,NextEraEnergyZeroCarbonBlueprint,irr
77275,77275,→ NextEra Energy Resources would invest in ele...,NextEraEnergyZeroCarbonBlueprint,irr
77276,77276,→ All non-FPL fossil generation assets would r...,NextEraEnergyZeroCarbonBlueprint,irr


In [978]:
all_sentences

Unnamed: 0,key,sentences,company_label,label
1,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel
2,2,A large portion of this renewable electricity ...,EliLilly,rel
3,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel
4,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel
5,5,This reduction was partially driven by energy ...,EliLilly,rel
...,...,...,...,...
77273,77273,→ FPL’s four nuclear units continue to operate...,NextEraEnergyZeroCarbonBlueprint,irr
77274,77274,Technology We assume that: → FPL’s gas plants ...,NextEraEnergyZeroCarbonBlueprint,irr
77275,77275,→ NextEra Energy Resources would invest in ele...,NextEraEnergyZeroCarbonBlueprint,irr
77276,77276,→ All non-FPL fossil generation assets would r...,NextEraEnergyZeroCarbonBlueprint,irr


In [979]:
all_sentences.groupby('label').count()[['key']]

Unnamed: 0_level_0,key
label,Unnamed: 1_level_1
irr,76406
rel,871


### Merge annotated data with all_sentences

In [980]:
annotated_merged = annotated.merge(all_sentences, how = 'left', on = 'sentences')

In [981]:
len(annotated_merged)

815

In [982]:
# check label change status by combining true label and classified by hand column
annotated_merged['label_change'] = annotated_merged['True Label'] + annotated_merged['Classified by Hand']

In [983]:
annotated_merged = annotated_merged.drop(['label', 'company_label'], axis = 1 )

In [984]:
# Separately keep track of sentences that weren't merged but didn't have any status change

In [985]:
# 31 of annotated sentences were not merged properly with all sentences
len(annotated_merged[annotated_merged.key.isnull()])

21

In [986]:
# 13 of these didn't have any label status change (8: relrel and 5: irrirr) focus on 18 with label status change
len(annotated_merged[annotated_merged.key.isnull() & ((annotated_merged.label_change != 'relrel') & (annotated_merged.label_change != 'irrirr'))])

10

In [987]:
len(annotated_merged[annotated_merged.key.isnull() & (annotated_merged.label_change == 'relrel')])

6

In [988]:
len(annotated_merged[annotated_merged.key.isnull() & (annotated_merged.label_change == 'irrirr')])

5

In [989]:
annotated_merged[annotated_merged.key.isnull() & (annotated_merged.label_change == 'irrirr')]

Unnamed: 0,sentences,Naive Bayes,KNN,Decision Tree,Logistic Regression,Perceptron Model,Random Forest,SVM,sum count,Company,True Label,Classified by Hand,Pattern Recognition,Issues / Comments,key,label_change
313,#NAME?,1,1,1,1,1,1,1,7,Microsoft,irr,irr,,incomplete sentence?,,irrirr
497,In This Section Climate Action Strategy 2030 C...,0,1,0,1,0,1,1,4,EliLilly,irr,irr,,table of contents containing relevant info?,,irrirr
761,This assessment considered the following: 1.,0,0,0,0,1,0,0,1,NewmontMining,irr,irr,,,,irrirr
762,Newmont Corporation 2021 Climate Report Strate...,0,0,1,0,0,0,0,1,NewmontMining,irr,irr,,,,irrirr
764,Availability of capital could pose another cha...,0,0,1,0,0,0,0,1,NewmontMining,irr,irr,,,,irrirr


In [990]:
# annotated not merged with changes in status (rel --> rel or irr --> irr are disregarded)
annotated_not_merged = annotated_merged[annotated_merged.key.isnull() & ((annotated_merged.label_change != 'relrel') & (annotated_merged.label_change != 'irrirr'))]

In [991]:
annotated_merged = annotated_merged[annotated_merged.key.notnull()]

In [992]:
annotated_not_merged.head()

Unnamed: 0,sentences,Naive Bayes,KNN,Decision Tree,Logistic Regression,Perceptron Model,Random Forest,SVM,sum count,Company,True Label,Classified by Hand,Pattern Recognition,Issues / Comments,key,label_change
187,TotalEnergies allocated $100 million to CCS re...,0,0,1,0,0,0,0,1,Total,rel,irr,,need more context,,relirr
233,Net zero operations« Our aim 1 is to be net ze...,1,1,0,1,1,1,1,6,BP,irr,rel,,,,irrrel
277,Our current initiatives include: Designing for...,1,1,0,1,1,0,1,5,EliLilly,irr,rel,,,,irrrel
330,Newmont Corporation 2021 Climate Report 46 App...,1,1,1,1,1,1,1,7,NewmontMining,irr,rel,,,,irrrel
398,BASING CALCULATIONS ON THE LARGEST VOLUME IN E...,1,1,0,1,0,1,1,5,Total,irr,rel,,,,irrrel


In [993]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### String comparison to match the remaining not_merged data

In [994]:
# not_merged_all = []
# not_merged_annotated = []

# for i in annotated_not_merged.sentences.to_list():
#     val = process.extractOne(i, all_sentences['sentences'], scorer = fuzz.partial_ratio)
#     not_merged_all.append(val) # from all_sentences
#     not_merged_annotated.append(i) # from annotated_not_merged

In [995]:
match_not_merged = pd.DataFrame([not_merged_all, not_merged_annotated], index = ['not_merged_all', 'not_merged_annotated']).transpose()

In [996]:
match_not_merged.to_csv('match_not_merged.csv', encoding = 'utf-8-sig')

In [997]:
match_not_merged = pd.read_csv('match_not_merged.csv', index_col = 0)

In [998]:
match_not_merged['not_merged_all_sent'] = [i[0] for i in not_merged_all]

In [999]:
match_not_merged = match_not_merged.drop(['not_merged_all'], axis = 1)

In [1000]:
match_not_merged = match_not_merged.rename(columns = {'not_merged_annotated': 'sentences'})

In [1001]:
match_not_merged = match_not_merged.merge(annotated, how = 'left', on = 'sentences')

In [1002]:
match_not_merged = match_not_merged.rename(columns = {'sentences': 'not_merged_annotated', 'not_merged_all_sent': 'sentences'})

In [1003]:
match_not_merged = match_not_merged.drop('not_merged_annotated', axis = 1)

In [1004]:
match_not_merged = match_not_merged.merge(all_sentences, how = 'left', on = 'sentences')

In [1005]:
match_not_merged = match_not_merged.drop(['company_label', 'label'], axis = 1)

In [1006]:
match_not_merged['label_change'] = match_not_merged['True Label'] + match_not_merged['Classified by Hand']

In [1007]:
match_not_merged.head(3)

Unnamed: 0,sentences,Naive Bayes,KNN,Decision Tree,Logistic Regression,Perceptron Model,Random Forest,SVM,sum count,Company,True Label,Classified by Hand,Pattern Recognition,Issues / Comments,key,label_change
0,TotalEnergies allocated $100 million to CCS r...,0,0,1,0,0,0,0,1,Total,rel,irr,,need more context,236.0,relirr
1,Net zero operations Our aim 1 is to be net ze...,1,1,0,1,1,1,1,6,BP,irr,rel,,,23887.0,irrrel
2,Our current initiatives include: Designing f...,1,1,0,1,1,0,1,5,EliLilly,irr,rel,,,904.0,irrrel


In [1008]:
after_annotation = pd.concat([annotated_merged, match_not_merged])

In [1009]:
after_annotation = after_annotation.drop_duplicates(subset = 'key')

In [1010]:
len(annotated_merged) + len(match_not_merged)

812

In [1011]:
len(after_annotation)

804

### Check status of label change

In [1012]:
# 784 + 18 (16 after removing duplicates) + 13 (no status changed: relrel = 8, irrirr = 5)

label_change_stat = pd.DataFrame(after_annotation.groupby('label_change').count()['sentences'])

In [1013]:
label_change_stat

Unnamed: 0_level_0,sentences
label_change,Unnamed: 1_level_1
irrirr,256
irrrel,159
relirr,114
relrel,275


In [1014]:
label_change_stat['sentences']['irrirr'] = label_change_stat['sentences']['irrirr'] + 5

In [1015]:
label_change_stat['sentences']['relrel'] = label_change_stat['sentences']['relrel'] + 8

### Statistics of label_change after adding 8 and 7 

In [1016]:
# Total 813 instead of 815 after removing duplicates 
label_change_stat

Unnamed: 0_level_0,sentences
label_change,Unnamed: 1_level_1
irrirr,261
irrrel,159
relirr,114
relrel,283


In [1017]:
rel_to_irr = after_annotation[(after_annotation.label_change == 'relirr')]

In [1018]:
len(rel_to_irr)

114

In [1019]:
irr_to_rel = after_annotation[(after_annotation.label_change == 'irrrel')]

In [1020]:
len(irr_to_rel)

159

In [1021]:
rel_to_irr = rel_to_irr[['key', 'Classified by Hand']] 

In [1022]:
irr_to_rel = irr_to_rel[['key', 'Classified by Hand']]

In [1023]:
change_label = pd.concat([rel_to_irr, irr_to_rel])

In [1024]:
change_label = change_label.drop_duplicates(subset = 'key')

In [1025]:
change_label = change_label.rename(columns = {'Classified by Hand': 'label_change'}) 

In [1026]:
all_sentences = all_sentences.merge(change_label, how = 'left', on = 'key')

In [1027]:
test_df = all_sentences[all_sentences.label_change.isnull()][:5]

In [1028]:
test_df['label_change'] = test_df['label_change'].fillna(test_df['label'])

In [1029]:
test_df = all_sentences.sample(5)

In [1030]:
test_df['label_change'] = test_df['label_change'].fillna(test_df['label'])

In [1031]:
all_sentences.groupby('label_change').count()[['key']]

Unnamed: 0_level_0,key
label_change,Unnamed: 1_level_1
irr,113
rel,159


In [1032]:
all_sentences.groupby('label').count()[['key']]

Unnamed: 0_level_0,key
label,Unnamed: 1_level_1
irr,76406
rel,871


In [1033]:
test_all = all_sentences[all_sentences.label_change.isnull()]

In [1034]:
# fill labels that haven't been changed
all_sentences['label_change'] = all_sentences['label_change'].fillna(all_sentences['label'])

In [1035]:
all_sentences['label_status'] = all_sentences.label + all_sentences.label_change

In [1036]:
all_sentences.groupby('label_status').count()

Unnamed: 0_level_0,key,sentences,company_label,label,label_change
label_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
irrirr,76248,76248,76248,76248,76248
irrrel,158,158,158,158,158
relirr,111,111,111,111,111
relrel,760,760,760,760,760


In [1045]:
# total number of rel and irr sentences change
d = {'relevant_sentences': [871, 871+158-111], 'irrelevant_sentences': [76406, 76406-158+111]}
df = pd.DataFrame(data=d, index = ['original_data', 'after_annotation'])
df

Unnamed: 0,relevant_sentences,irrelevant_sentences
original_data,871,76406
after_annotation,918,76359


In [1046]:
df.to_csv('annotation_stats.csv')

In [1047]:
label_change_stat.to_csv('label_change_stat.csv')

In [1048]:
all_sentences

Unnamed: 0,key,sentences,company_label,label,label_change,label_status
0,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel,rel,relrel
1,2,A large portion of this renewable electricity ...,EliLilly,rel,rel,relrel
2,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel,rel,relrel
3,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel,rel,relrel
4,5,This reduction was partially driven by energy ...,EliLilly,rel,rel,relrel
...,...,...,...,...,...,...
77272,77273,→ FPL’s four nuclear units continue to operate...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77273,77274,Technology We assume that: → FPL’s gas plants ...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77274,77275,→ NextEra Energy Resources would invest in ele...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77275,77276,→ All non-FPL fossil generation assets would r...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr


In [1049]:
all_sentences.groupby(['company_label', 'label_change']).count()[['sentences']].to_csv('annotated_sentences_stat.csv')

In [1050]:
all_sentences.groupby('label').count()

Unnamed: 0_level_0,key,sentences,company_label,label_change,label_status
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
irr,76406,76406,76406,76406,76406
rel,871,871,871,871,871


In [1051]:
all_sentences.groupby('label_change').count()

Unnamed: 0_level_0,key,sentences,company_label,label,label_status
label_change,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
irr,76359,76359,76359,76359,76359
rel,918,918,918,918,918


In [1052]:
all_sentences

Unnamed: 0,key,sentences,company_label,label,label_change,label_status
0,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel,rel,relrel
1,2,A large portion of this renewable electricity ...,EliLilly,rel,rel,relrel
2,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel,rel,relrel
3,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel,rel,relrel
4,5,This reduction was partially driven by energy ...,EliLilly,rel,rel,relrel
...,...,...,...,...,...,...
77272,77273,→ FPL’s four nuclear units continue to operate...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77273,77274,Technology We assume that: → FPL’s gas plants ...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77274,77275,→ NextEra Energy Resources would invest in ele...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr
77275,77276,→ All non-FPL fossil generation assets would r...,NextEraEnergyZeroCarbonBlueprint,irr,irr,irrirr


In [665]:
# annotated[annotated['transition'] == 'rel_to_irr'].iloc[:,1:9].sum()

In [666]:
# annotated[annotated['transition'] == 'irr_to_rel'].iloc[:,1:9].sum() + annotated[annotated['transition'] == 'rel_to_irr'].iloc[:,1:9].sum()