## A) Read Data Files

### A-1) Pulling in All the Data Across Notebooks

In [203]:
import pandas as pd
import os 
from nltk import tokenize
import json
import re

In [204]:
%store -r HC_reltext
%store -r HC_alltext
%store -r HC_stat

%store -r IND_reltext
%store -r IND_alltext
%store -r IND_stat

%store -r Energy_reltext
%store -r Energy_alltext
%store -r Energy_stat

%store -r CONSTA_reltext
%store -r CONSTA_alltext
%store -r CONSTA_stat

%store -r CONDIS_reltext
%store -r CONDIS_alltext
%store -r CONDIS_stat

%store -r IT_reltext
%store -r IT_alltext
%store -r IT_stat

%store -r Real_Estate_reltext
%store -r Real_Estate_alltext
%store -r Real_Estate_stat

%store -r Materials_reltext
%store -r Materials_alltext
%store -r Materials_stat

%store -r Utilities_reltext
%store -r Utilities_alltext
%store -r Utilities_stat

### Read  <ins>total_relevant</ins>, <ins>total_all</ins>, (both relevant and irrelevant),  <ins>total_stat</ins> (statistics of relevant and all sentences)

In [205]:
total_relevant = pd.concat([HC_reltext, IND_reltext, Energy_reltext,
                 CONSTA_reltext, CONDIS_reltext, IT_reltext,
                 Real_Estate_reltext, Materials_reltext, Utilities_reltext])

In [206]:
total_all = pd.concat([HC_alltext, IND_alltext, Energy_alltext,
                 CONSTA_alltext, CONDIS_alltext, IT_alltext,
                 Real_Estate_alltext, Materials_alltext, Utilities_alltext])

### Removing Duplicates of Sentences

In [207]:
total_all = total_all.drop_duplicates('all_sentences')

In [208]:
total_relevant = total_relevant.drop_duplicates('relevant_sentences')

In [209]:
unique_comp = total_all.company_label.unique()

In [210]:
original_all = total_all
original_relevant = total_relevant

### Company Labels in Dictionary

In [211]:
comp_dict = dict()

count = 1
for i in unique_comp:
    comp_dict[i] = str("%04d" % count)
    count += 1

In [212]:
company_list = list(comp_dict.keys())

In [213]:
comp_dict.items()

dict_items([('EliLilly', '0001'), ('UnitedHealthGroup', '0002'), ('Merck', '0003'), ('BristolMyersSquibb', '0004'), ('Danaher', '0005'), ('johnsonandjohnson', '0006'), ('Pfizer', '0007'), ('Abbott', '0008'), ('ThermoFisherScientifiic', '0009'), ('Amgen', '0010'), ('Caterpillar', '0011'), ('Lockheed', '0012'), ('Boeing', '0013'), ('UPS', '0014'), ('Raytheon', '0015'), ('Delta', '0016'), ('Deere', '0017'), ('Honeywell', '0018'), ('3M', '0019'), ('UnionPacific', '0020'), ('Total', '0021'), ('BP', '0022'), ('Shell', '0023'), ('Mondelez_Intl', '0024'), ('Hershey', '0025'), ('Philip_Morris_Intl', '0026'), ('PepsiCo', '0027'), ('Altria_Environmental', '0028'), ('PandG', '0029'), ('Altria_TCFD', '0030'), ('Costco', '0031'), ('CocaCola', '0032'), ('Altria_2021', '0033'), ('Walmart', '0034'), ('EsteeLauder', '0035'), ('McDonalds', '0036'), ('TJX', '0037'), ('HomeDepot', '0038'), ('Lowes', '0039'), ('Target', '0040'), ('BookingHoldings', '0041'), ('Tesla', '0042'), ('Amazon', '0043'), ('Nike', '0

### Company Indexing

In [214]:
total_all['company_index'] = total_all.apply(lambda x: comp_dict[x.company_label], axis = 1)

### Sentence Indexing

In [215]:
cur_comp_index = ""
sent_index = [] 
for i in total_all.company_index:
    if i != cur_comp_index:
        cur_comp_index = i
        sent_val = 0
        sent_val += 1
        sent_index.append(str("%04d" % sent_val))
    else:
        sent_val += 1
        sent_index.append(str("%04d" % sent_val))

In [216]:
total_all.groupby('company_label', sort = False).count()

Unnamed: 0_level_0,all_sentences,company_index
company_label,Unnamed: 1_level_1,Unnamed: 2_level_1
EliLilly,105,105
UnitedHealthGroup,1052,1052
Merck,2411,2411
BristolMyersSquibb,1197,1197
Danaher,943,943
...,...,...
Dow,2874,2874
Dominion_Energy,672,672
Duke_Energy,1018,1018
AEP,1582,1582


### total_all with Indices

In [217]:
total_all['sent_index'] = sent_index

In [218]:
total_all[total_all.sent_index == '0001']

Unnamed: 0,all_sentences,company_label,company_index,sent_index
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001
105,Our Mission in Action 2021 Sustainability Report,UnitedHealthGroup,0002,0001
1170,"Environmental, Social & Governance (ESG) Progr...",Merck,0003,0001
3634,"Environmental, Social and Governance Report Ou...",BristolMyersSquibb,0004,0001
4845,2021 Sustainability Report,Danaher,0005,0001
...,...,...,...,...
10200,"2021 ENVIRONMENTAL, SOCIAL & GOVERNANCE REPORT...",Dow,0068,0001
0,REPORT 2021 A report based on the recommendati...,Dominion_Energy,0069,0001
677,1 2021 DUKE ENERGY ESG REPORT D UK E E NE ...,Duke_Energy,0070,0001
1752,2022 Corporate Sustainability Report 2022 CORP...,AEP,0071,0001


### total_relevant with indices

In [219]:
total_relevant['company_index'] = total_relevant.apply(lambda x: comp_dict[x.company_label], axis = 1)

In [220]:
total_relevant.head(20)

Unnamed: 0,relevant_sentences,company_label,company_index
0,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,1
1,A large portion of this renewable electricity ...,EliLilly,1
2,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,1
3,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,1
4,This reduction was partially driven by energy ...,EliLilly,1
5,"In 2021, we reduced our energy consumption by ...",EliLilly,1
6,"In 2021, 9.6% of our purchased electricity was...",EliLilly,1
7,We have reduced GHG emissions at key facilitie...,EliLilly,1
8,"These projects include: Kinsale – In July, 202...",EliLilly,1
9,The solar array is expected to provide up to 1...,EliLilly,1


## B) Extracting Irrelevant Sentences from All Sentences

### This method below doesn't correctly extract irrelevant sentences. Another method is needed

In [221]:
rel_test = []
for i in total_relevant['relevant_sentences'].to_list():
    if i in total_all['all_sentences'].to_list():
        rel_test.append(i)

In [222]:
print("This method to extract relevant sentences: ", len(rel_test), "vs.", "original_relevant_sentences: ", len(total_relevant['relevant_sentences'].to_list()))

This method to extract relevant sentences:  579 vs. original_relevant_sentences:  999


### Testing fuzz package to compare strings

In [223]:
# pip install fuzzywuzzy
# pip install python-Levenshtein

In [224]:
from rapidfuzz import process, fuzz

In [225]:
fuzz.partial_ratio('In 2021, we reduced our energy consumption by 2.9%,', 
                   'In 2021, we reduced our energy consumption by 2.9%, and we reduced our absolute GHG emissions by 9% compared to 2020.')

100.0

### B-1) Eliminating short sentences from relevant and all sentences

### Checking the number of letters of sentences to make sure sentences that have less than 5 letters are eliminated

In [226]:
import numpy as np
np.unique([len(i.split(" ")) for i in total_relevant['relevant_sentences']])[0:50]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [227]:
np.unique([len(i.split(" ")) for i in total_all['all_sentences']])[0:50]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [228]:
total_relevant['sent_count'] = total_relevant['relevant_sentences'].str.split().str.len()

In [229]:
total_all['sent_count'] = total_all['all_sentences'].str.split().str.len()

In [230]:
total_relevant = total_relevant[total_relevant['sent_count'] >= 5]

In [231]:
total_all = total_all[total_all['sent_count'] >= 5]

In [232]:
# Check the length of new sentences after dropping duplicates

print("relevant_sentences:", len(original_relevant), "->", len(total_relevant))
print("all_sentences:", len(original_all), "->", len(total_all))

relevant_sentences: 999 -> 961
all_sentences: 77691 -> 76394


### Convert the sentences into lists for future use

In [233]:
total_relevant

Unnamed: 0,relevant_sentences,company_label,company_index,sent_count
0,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,0001,11
1,A large portion of this renewable electricity ...,EliLilly,0001,24
2,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,12
3,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,11
4,This reduction was partially driven by energy ...,EliLilly,0001,27
...,...,...,...,...
67,2018 Retired and demolished 636 MW of coal and...,NextEraEnergyZeroCarbonBlueprint,0072,38
68,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,27
69,2020 Retired 615 MW of nuclear and 330 MW of c...,NextEraEnergyZeroCarbonBlueprint,0072,45
70,"2021 Added 2,008 MW of wind, 1,547 MW of solar...",NextEraEnergyZeroCarbonBlueprint,0072,22


In [234]:
rel_var = total_relevant['relevant_sentences'].to_list()
rel_lab = total_relevant['company_label'].to_list()
rel_comp_index = total_relevant['company_index'].to_list()

all_var = total_all['all_sentences'].to_list()
all_lab = total_all['company_label'].to_list()
all_comp_index = total_all['company_index'].to_list()



#### =============================== PAUSE RUNNING HERE ====================================

### B-2) Conduct string matching to separate relevant and irrelevant sentences from all sentences

In [235]:
total_all

Unnamed: 0,all_sentences,company_label,company_index,sent_index,sent_count
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001,30
1,Making medicines requires the use of valuable ...,EliLilly,0001,0002,14
2,We’re committed to reducing our environmental ...,EliLilly,0001,0003,18
3,"To track our progress, we measure and manage e...",EliLilly,0001,0004,27
4,"Lilly manages health, safety and the environme...",EliLilly,0001,0005,13
...,...,...,...,...,...
3565,→ FPL’s four nuclear units continue to operat...,NextEraEnergyZeroCarbonBlueprint,0072,0217,10
3566,Technology We assume that: → FPL’s gas plants...,NextEraEnergyZeroCarbonBlueprint,0072,0218,28
3567,→ NextEra Energy Resources would invest in el...,NextEraEnergyZeroCarbonBlueprint,0072,0219,26
3568,→ All non-FPL fossil generation assets would ...,NextEraEnergyZeroCarbonBlueprint,0072,0220,14


In [248]:
# string comparison to separate relevant and irrelevant sentences from all sentences
matrel = []
matlabel = []
orgrel = []
track = 0
for comp in company_list:
    total_relevant_comp = total_relevant[total_relevant.company_label == comp]
    for i, j in zip(total_relevant_comp.relevant_sentences, total_relevant_comp.company_index):
        total_all_comp = total_all[total_all.company_label == comp]
        # extract relevant sentences from all
        print(i)
        val = process.extractOne(i, total_all_comp.all_sentences, scorer = fuzz.partial_ratio, score_cutoff = 85)
        matrel.append(val)
        matlabel.append(j)
        orgrel.append(i)
        print(track, end = " ")
        print(val)
        track +=1

In 2021, 9.6% of our purchased electricity came from renewable sources.
0 ('Looking toward the future, we have set climate goals for 2030 as we work toward contributing to a low-carbon economy: Secure 100% of our purchased electricity from renewable sources In 2021, 9.6% of our purchased electricity came from renewable sources.', 100.0, 19)
A large portion of this renewable electricity is delivered through our utility providers to our sites in Alcobendas, Spain; Kinsale, Ireland; and Bracknell, UK.
1 ('A large portion of this renewable electricity is delivered through our utility providers to our sites in Alcobendas, Spain; Kinsale, Ireland; and Bracknell, UK.', 100.0, 20)
From 2012 to 2020, we achieved a 26% reduction in absolute emissions.
2 ('From 2012 to 2020, we achieved a 26% reduction in absolute emissions.', 100.0, 24)
In 2021, we achieved a 9% absolute emissions reduction versus 2020.
3 ('In 2021, we achieved a 9% absolute emissions reduction versus 2020.', 100.0, 25)
This red

26 ('VOC emissions decreased from 2019 to 2020 due to variations in production and because of continuous data collection improvements with the adoption of more accurate emission-tracking methods.', 100.0, 2598)
We have photovoltaic installations at five sites in the U.S., Switzerland and China, providing a combined annual electricity output of 3,632,000 kWh, equivalent to taking
27 ('** We have photovoltaic installations at five sites in the U.S., Switzerland and China, providing a combined annual electricity output of 3,632,000 kWh, equivalent to taking 560 vehicles off the road for one year.', 100.0, 4506)
org to develop a program to offset carbon emissions for corporate travel by supporting efforts to stop deforestation and drive sustainable economic growth in the Amazonian Basin.
28 ('org to develop a program to offset carbon emissions for corporate travel by supporting efforts to stop deforestation and drive sustainable economic growth in the Amazonian Basin.', 100.0, 4530)
Betwee

55 None
North America Region Total 63 Electricity From Renewable Energy Sources by Region 1,000 Gigajoules
56 None
In 2021, we achieved a 12% reduction in absolute Scope 1 and Scope 2 emissions compared to our 2018 baseline.
57 ('The key components of our framework include: Energy & emissions Scope 1 and Scope 2 emissions In 2021, we achieved a 12% reduction in absolute Scope 1 and Scope 2 emissions compared to our 2018 baseline.', 100.0, 10648)
Over the past year of operational growth alone, our Scope 1 and Scope 2 carbon intensity equalled 16 metric tons of carbon dioxide per million USD in revenue, down from 19 in 2020 and 27 prior to the pandemic.
58 ('Over the past year of operational growth alone, our Scope 1 and Scope 2 carbon intensity equalled 16 metric tons of carbon dioxide per million USD in revenue, down from 19 in 2020 and 27 prior to the pandemic.', 100.0, 10650)
Renewable electricity procurement is critical to our near-term strategy.
59 ('Renewable electricity procureme

In 2021, over 35% of our electrical energy was obtained from renewable or alternative sources.
83 ('CATERPILLAR 2021 SUSTAINABILITY REPORT 57 2018 2019 2020 2021 GOAL 2030 PERSPECTIVE ENERGY — OPERATIONS (continued) Grid electricity % N/A 83.8 87.3 86.7 N/A In 2021, over 35% of our electrical energy was obtained from renewable or alternative sources.', 100.0, 564)
In 2021, we reduced our energy intensity by 12% from our 2018 baseline.
84 ('Renewable energy 1, 2 % 17.4 21.2 21.0 24.2 N/A Alternative energy 3 % 15.1 14.3 12.0 13.0 N/A Energy intensity /A/ absolute gigajoules energy use/million dollars of sales and revenues 461 445 463 404 N/A In 2021, we reduced our energy intensity by 12% from our 2018 baseline.', 100.0, 565)
Our absolute GHG emissions decreased 32% from 2018 to 2021.
85 ('Absolute greenhouse gas (GHG) emissions /A/ 5, 12 million metric tons CO2e 2.22 1.78 1.47 1.51 1.55 Our absolute GHG emissions decreased 32% from 2018 to 2021.', 100.0, 566)
In 2021, we reduced our GH

Environmentally Responsible Operations Greenhouse gas emissions were 10% under plan primarily due to reduced production activities and procurement of renewables.
97 ('Performance Area1 2025 Goals Versus 2017 2021 Progress Toward 2025 Goals3 2030 Goals Greenhouse Gas Emissions Reduce emissions by 25%2 25% reduction Greenhouse gas emissions were 10% under plan primarily due to reduced production activities and procurement of renewables.', 88.35616438356165, 2571)
Despite cold northwestern U.S. weather in December, energy continued to be under plan overall for the enterprise, ending the 2021 reporting year at 9.8% under plan.
98 ('• Net-zero emissions4 • 55% GHG reduction from 2017 • 100% renewable electricity Energy Reduce energy5 consumption by 10% 12.2% reduction Despite cold northwestern U.S. weather in December, energy continued to be under plan overall for the enterprise, ending the 2021 reporting year at 9.8% under plan.', 100.0, 2572)
Remote working conditions; reduced production 

129 ('Our ability to achieve our ambitious climate goals is dependent on the actions of governments and third parties and will require, among other things, significant capital investment, including from third parties, research and development from manufacturers and other stakeholders, along with government policies and incentives to reduce the cost, and incent production, of SAF and other technologies that are not presently in existence or available at scale.', 100.0, 4688)
In order to achieve net zero by 2050 and meaningful emissions intensity improvement by 2035, we will need to use all levers commercially and economically available to us now and in the future, including: Fleet renewal Expected to provide the largest impact on advancing our goals in near-term SAF Goal of 10% SAF by the end of 2030 with 5% achieving 85% lower life cycle GHG emissions than conventional jet fuel Operational initiatives New Carbon Council expected to drive continuous operational improvements in fuel effi

157 ('FUEL EFFICIENCY (ASM) GALLONS PER 1000 ASM 17.0 16.5 16.0 15.5 15.0 14.5 14.0 13.5 13.0 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 Fuel Efficiency 39% of 2030 SAF secured through offtake agreements 10% SAF by 2030 Announced intention to set science-based targets to achieve net zero GHG emissions no later than 2050 and improve emissions intensity no later than 2035 compared to a 2019 baseline Achieve Net Zero by 2050 $137M invested to purchase and retire offsets relating to 27M metric tons of 2021 carbon emissions $1B Investment through 2030 19% electric ground support equipment (eGSE) as of April 2022 25% eGSE by 2022, 50% by 2025', 98.65186360031721, 4783)
Currently, fleet renewal is our greatest opportunity to improve our fuel efficiency and emissions intensity, and we have established sustainability considerations as one of the core pillars that guide our fleet strategy and our capital allocation decisions with respect to fleet.
158 ('Contents CEO Letter 

190 ('Since project tracking began in 2010, more than 5,700 efficiency projects, including building automation and controls, lighting, and mechanical upgrades have been implemented at our facilities, resulting in $100 million in annualized savings.', 100.0, 6543)
Efficiency projects installed at our site include site-wide LED lighting, new variable frequency drives, upgrades to HVAC controls, and compressed air leak detection and repair, all estimated to save nearly 20% of the site’s GHG emissions.
191 ('For example: Phoenix, Arizona Efficiency projects installed at our site include site-wide LED lighting, new variable frequency drives, upgrades to HVAC controls, and compressed air leak detection and repair, all estimated to save nearly 20% of the site’s GHG emissions.', 100.0, 6553)
Our site implemented LED lighting in the production area and completed renewal projects for their steam traps and absorption cooling system with an estimated savings of 8% of the site’s GHG emissions.
192 

Two 25 MW photovoltaic power plants, equivalent to the elec- tric power needs of 30,000 people.
233 ('Two 25 MW photovoltaic power plants, equivalent to the elec- tric power needs of 30,000 people.', 100.0, 342)
Several renewable electricity projects, which are compati- ble by nature with these criteria, were approved, such as off- shore wind projects Round 4 and ScotWind (United Kingdom), Yunlin (Taiwan), five onshore wind projects in France with a total gross capacity of nearly 200 MW, and several solar ener- gy projects in France, Spain, Iraq and the US, with approximate- ly 3 GW of gross capacity.
234 ('Several renewable electricity projects, which are compati- ble by nature with these criteria, were approved, such as off- shore wind projects Round 4 and ScotWind (United Kingdom), Yunlin (Taiwan), five onshore wind projects in France with a total gross capacity of nearly 200 MW, and several solar ener- gy projects in France, Spain, Iraq and the US, with approximate- ly 3 GW of gros

263 ('From 2010 to 2013, TotalEnergies developed a pilot project in Lacq, France, involving a complete CCS chain, in which carbon from a steam generator was captured using oxy-combustion technology (a European first) and then transported and stored in a depleted reservoir.', 100.0, 614)
TotalEnergies allocated $100 million to CCS research and projects in 2021, and by 2030 it expects to be expanding storage capacity by around 10 Mt annually.
264 ('TotalEnergies allocated $100 million to CCS research and projects in 2021, and by 2030 it expects to be expanding storage capacity by around 10 Mt annually.', 100.0, 620)
— In the Netherlands, the Company is studying a project to capture 900,000 tons/year of CO2 generated by the Zeeland refinery’s hydrogen plant as of 2026.
265 ('Supplying blue hydrogen to our refineries — • In the Netherlands, the Company is studying a project to capture 900,000 tons/year of CO2 generated by the Zeeland refinery’s hydrogen plant as of 2026.', 100.0, 621)
The 

284 ('• 300 service stations on motorways and major roads and 600 urban service stations with high power chargers (HPC) by 2030 to support e-mobility travel in Europe.', 100.0, 703)
Automotive Cells Company (ACC), a joint venture founded by TotalEnergies and Stellantis in 2020, is set to emerge as a global player in the development and manufacture of auto- motive batteries beginning in 2023.
285 ('Production of affordable, high-performance batteries • Automotive Cells Company (ACC), a joint venture founded by TotalEnergies and Stellantis in 2020, is set to emerge as a global player in the development and manufacture of auto- motive batteries beginning in 2023.', 100.0, 706)
With Saft, TotalEnergies is giving the new company the benefit of its expertise in R&D.
286 ('With Saft, TotalEnergies is giving the new company the benefit of its expertise in R&D.', 100.0, 707)
The batteries produced by ACC will power nearly one million EVs a year, or 10% of the European market.
287 ('The batterie

315 ('• Temporary production-related changes accounted for an increase of 1.1MtCO2e associated with higher activity levels, particularly in refining, and temporary flaring increases in 2021.', 100.0, 1870)
• Total hydrocarbons flared increased from 831kt to 967kt in 2021 due to operational variances including temporary flaring associated with a new production start-up.
316 ('• Total hydrocarbons flared increased from 831kt to 967kt in 2021 due to operational variances including temporary flaring associated with a new production start-up.', 100.0, 1871)
SERs across our business and activities in 2021 included: • Gelsenkirchen refinery and chemicals facility reduced its Scope 2 emissions from purchased electricity by 520ktCO2e through new lower carbon power agreements.
317 ('SERs across our business and activities in 2021 included: • Gelsenkirchen refinery and chemicals facility reduced its Scope 2 emissions from purchased electricity by 520ktCO2e through new lower carbon power agreement

337 ('In the UK, we have made several recent announcements, including our plan to build a hydrogen production facility at Teesside in the North East of England, intended to deliver up to 1GW of blue hydrogen by 2030, and the HyGreen Teesside project targeting 60MWe of green hydrogen production by 2025.', 100.0, 2022)
By 2030 we aim to have developed 50GW of renewable generating capacitya We set a target of 20GW of renewable energy generating capacity to be developed to FID by 2025 with an aim of 50GW by 2030.
338 ('Read more: Aims and objectives summary Developing more renewable energy We set a target of 20GW of renewable energy generating capacity to be developed to FID by 2025 with an aim of 50GW by 2030.', 86.6096866096866, 2160)
We are on track with 4.4GW brought to FID by end of 2021 and 23GW in the pipeline.
339 ('We are on track with 4.4GW brought to FID by end of 2021 and 23GW in the pipeline.', 100.0, 2161)
In the UK in January 2022, we were successful in the ScotWind licensin

359 ('In 2021, we reduced our Scope 1 and 2 (market-based) emissions by 21% versus our 2018 baseline by continuing to improve energy efficiency and increase our use of renewable energy.', 100.0, 773)
In South Africa, our Port Elizabeth site installed four centralized chiller units, saving 380 MWh per year in refrigeration, reducing CO2e by 2240 metric tonnes.
360 ('Here are some examples: • In South Africa, our Port Elizabeth site installed four centralized chiller units, saving 380 MWh per year in refrigeration, reducing CO2e by 2240 metric tonnes.', 100.0, 779)
Meanwhile in Poland, our Skarbimierz plant optimized its refrigeration system with the installation of dry coolers as a free cooling loop, reducing the workhours of the chillers and in so doing cutting out 1,300 metric tonnes of CO2e.
361 ('• Meanwhile in Poland, our Skarbimierz plant optimized its refrigeration system with the installation of dry coolers as a free cooling loop, reducing the workhours of the chillers and in so

388 None
We now aim to achieve carbon neutrality in our operations (scope 1+2) by 2025 and net zero emissions across our entire value chain (scope 1+2+3) by 2040.
389 ('We now aim to achieve carbon neutrality in our operations (scope 1+2) by 2025 and net zero emissions across our entire value chain (scope 1+2+3) by 2040.', 100.0, 5285)
Once we have maximized our emissions reductions, we compensate for the remaining unavoidable emissions.
390 ('Once we have maximized our emissions reductions, we compensate for the remaining unavoidable emissions.', 100.0, 5291)
We prioritize insetting projects in our supply chain when possible and purchase certified carbon credits when needed.
391 ('We prioritize insetting projects in our supply chain when possible and purchase certified carbon credits when needed.', 100.0, 5292)
Overall, in 2021, our emissions decreased in absolute terms by 1.7 percent across our value chain versus 2020, amounting to a total reduction of 84,000 tons of CO2e.
392 ('Over

Environmental Resource and Waste Summary
416 ('Environmental Sustainability Message from Virginie Helias 2020 Environmental Goals Progress Environmental Progress versus 2010 Baseline BRANDS Brand 2030 Ambitious Packaging Goals New Packaging that Changes the Game Reclaiming Plastic and Giving it a New Life Loop Tests Refillable, Reusable Packaging Closing in on Our 2020 Packaging Goals Responsible Consumption for Energy Savings Water-Efficient Products Trust and Transparency at the Core SUPPLY CHAIN Supply Chain Climate Responsible Forestry Palm Oil Water Waste SOCIETY Creating Circular Economies Keeping Plastic Waste out of the Environment Responsible Consumption Protecting Water for People and Nature Reinventing Water for Urban Living EMPLOYEES Employees Tracking Our Progress Awards and Recognitions Environmental Resource and Waste Summary Global Measurement and Additional Operational Data 10 ENVIRONMENTAL SUSTAINABILITY', 100.0, 7004)
We’ve announced both our science-based target to 

435 ('INVESTING IN WIND ENERGY We are party to a Virtual Power Purchase Agreement (VPPA) for 22 megawatts (MW) of wind power from the Ponderosa wind farm in Oklahoma, United States.', 100.0, 11562)
The Ponderosa wind farm generated more than 57,700 megawatt hours (MWh) of power in fiscal 2021.
436 ('The Ponderosa wind farm generated more than 57,700 megawatt hours (MWh) of power in fiscal 2021.', 100.0, 11564)
As of fiscal 2021, our total solar capacity is 5.7 MW.
437 ('As of fiscal 2021, our total solar capacity is 5.7 MW.', 100.0, 11570)
During the fiscal year, we generated more than 5,000 MWh of solar energy, which is equivalent to the amount of carbon sequestered by 4,300 acres of U.S. forests in one year.
438 ('During the fiscal year, we generated more than 5,000 MWh of solar energy, which is equivalent to the amount of carbon sequestered by 4,300 acres of U.S. forests in one year.', 100.0, 11571)
We also purchase renewable energy credits (RECs) to offset emissions from electricit

457 ('Through its regional energy and climate strategy, TJX Canada avoided or offset over 35,500 metric tons of CO2e calculated for its fiscal 2021 GHG inventory.', 100.0, 1058)
Conserving Energy: In fiscal 2021, TJX Canada continued to implement technologies, like LED lighting and HVAC replacements, which reduced our GHG inventory by over 739,000 kilowatt hours.
458 ('Here’s how TJX Canada achieved these results: / Conserving Energy: In fiscal 2021, TJX Canada continued to implement technologies, like LED lighting and HVAC replacements, which reduced our GHG inventory by over 739,000 kilowatt hours.', 100.0, 1060)
Renewable Energy: TJX Canada has purchased wind energy since fiscal 2017, which reduces its annual Scope 2 market-based emissions by about 83% annually and its total market-based emissions by about 47%.
459 ('/ Renewable Energy: TJX Canada has purchased wind energy since fiscal 2017, which reduces its annual Scope 2 market-based emissions by about 83% annually and its total 

494 None
To date, Target has certified 59 stores to the U.S. Environmental Protection Agency (EPA)’s GreenChill standards and installed around 15,000 hydrofluorocarbon (HFC)-free units in our stores, accounting for approximately 57% of our operations’ refrigerants in stand-alone cases.
495 ('50 2021 Target Corporate Responsibility Report Introduction People Planet Overview Environmental Responsibility in the Supply Chain Climate and Energy Waste and Circular Economy Chemicals Materials Water Business Indexes and Glossary Climate and Energy Refrigeration and Efficiency To date, Target has certified 59 stores to the U.S. Environmental Protection Agency (EPA)’s GreenChill standards and installed around 15,000 hydrofluorocarbon (HFC)-free units in our stores, accounting for approximately 57% of our operations’ refrigerants in stand-alone cases.', 100.0, 5033)
What We Achieved Planet (Environment)
496 ('69 2021 Target Corporate Responsibility Report Introduction People Planet Business Index

The 6-MW system generates enough energy to provide up to 45% of the 855,000-square-foot facility’s annual energy needs.
539 ('The 6-MW system generates enough energy to provide up to 45% of the 855,000-square-foot facility’s annual energy needs.', 100.0, 8366)
In 2020, we unveiled Amazon’s largest solar rooftop installation in Europe at a fulfillment center in the UK.
540 ('In 2020, we unveiled Amazon’s largest solar rooftop installation in Europe at a fulfillment center in the UK.', 100.0, 8367)
The facility is outfitted with more than 11,500 solar panels, which generate enough electricity to power the equivalent of 700 homes in the UK for one year.
541 ('The facility is outfitted with more than 11,500 solar panels, which generate enough electricity to power the equivalent of 700 homes in the UK for one year.', 100.0, 8368)
70% absolute reduction of greenhouse gas (GHG) emissions in owned or operated facilities through 100% renewable electricity and fleet electrification14,15 0.5M met

Electricity usage at our facilities accounts for approximately 71% of our total energy consumption and approximately 56% of our overall Scope 1 and 2 (location-based) GHG emissions.
575 ('Electricity usage at our facilities accounts for approximately 71% of our total energy consumption and approximately 56% of our overall Scope 1 and 2 (location-based) GHG emissions.', 100.0, 909)
Our second largest source of GHG emissions are PFCs and process gases, which are used in processing equipment at our manufacturing facilities, and account for approximately 34% of our overall Scope 1 and 2 (location-based) GHG emissions.
576 ('Our second largest source of GHG emissions are PFCs and process gases, which are used in processing equipment at our manufacturing facilities, and account for approximately 34% of our overall Scope 1 and 2 (location-based) GHG emissions.', 100.0, 910)
For U.S. facilities, we reduced total energy consumption by 7.3% and total Scope 1 and Scope 2 (location-based) emission

599 ('In addition, our short-term target calls for reducing Scope 1 and Scope 2 GHG emissions by 38 percent, and Scope 3 GHG emissions by 20 percent, between 2016 and 2025, aligning us with the 2015 Paris Agreement.', 100.0, 1785)
All our owned data centers use 100 percent renewable energy.
600 ('DATA CENTERS All our owned data centers use 100 percent renewable energy.', 100.0, 1804)
Two-thirds of them rely on solar energy produced on-site.
601 ('Two-thirds of them rely on solar energy produced on-site.', 100.0, 1805)
Mastercard has been 100 percent renewable since 2017, initially through the purchase of renewable-energy credits.
602 ('RENEWABLE ENERGY Mastercard has been 100 percent renewable since 2017, initially through the purchase of renewable-energy credits.', 100.0, 1815)
grid, using a four-pronged approach.
603 ('In 2020, we joined RE100, refocusing our efforts to bring renewable energy onto the grid, using a four-pronged approach.', 100.0, 1816)
The first and most effective ef

ACHIEVED: 85% renewable By FY25: Reach net zero greenhouse gas emissions for Scope 1 and 2 (FY19 base year).
629 ('ACHIEVED: 85% renewable By FY25: Reach net zero greenhouse gas emissions for Scope 1 and 2 (FY19 base year).', 100.0, 5296)
The Cisco 8201 consumes 96 percent less energy per year than the NCS 6008, while supplying 35 percent more bandwidth, as well as being five times more power-efficient than its closest competitor.
630 ('The Cisco 8201 consumes 96 percent less energy per year than the NCS 6008, while supplying 35 percent more bandwidth, as well as being five times more power-efficient than its closest competitor.', 100.0, 5339)
These technology improvements, along with the use of 80 Plus Titanium-rated power supplies, can reduce the energy consumption of a fully loaded chassis by 11 percent over the previous design, which equates to 689.8 kWh per year for a single rack server, or around 3000 metric tonnes of CO2e per 10,000 units.
631 ('These technology improvements, al

In addition to these purchases, we’re using our campuses have installed on-site renewables at select campuses as well.
661 None
At our Silicon Valley campus, a solar panel system will offset energy consumption up to 15 percent.
662 ('At our Silicon Valley campus, a solar panel system will offset energy consumption up to 15 percent.', 100.0, 5912)
Our Beijing and Shanghai Zizhu campuses have made similar efforts, installing photovoltaic (PV) solar panels on empty roof space in FY20 which are expected to generate 15,450 MWh of electricity over the next 25 years to power the campus.
663 ('Our Beijing and Shanghai Zizhu campuses have made similar efforts, installing photovoltaic (PV) solar panels on empty roof space in FY20 which are expected to generate 15,450 MWh of electricity over the next 25 years to power the campus.', 100.0, 5913)
LinkedIn’s new campus in Omaha features an on-site solar array over the parking structure.
664 ('LinkedIn’s new campus in Omaha features an on-site solar 

694 ('Cement: Cement accounts for approximately 8 percent of global emissions.', 89.92248062015504, 6013)
We are testing and piloting low- GHG cement innovations like CarbonCure to reduce the carbon footprint of our buildings.
695 ('We are testing and piloting low- GHG cement innovations like CarbonCure to reduce the carbon footprint of our buildings.', 100.0, 6014)
LinkedIn is using low-carbon and carbon-sequestering concrete mixes for our new Silicon Valley headquarters and is now using these materials in our LinkedIn Dublin campus.
696 ('LinkedIn is using low-carbon and carbon-sequestering concrete mixes for our new Silicon Valley headquarters and is now using these materials in our LinkedIn Dublin campus.', 100.0, 6015)
In our datacenters, we are reducing how much cement we use by utilizing longer 56-day cure times in lieu of typical 28-day cure times.
697 ('In our datacenters, we are reducing how much cement we use by utilizing longer 56-day cure times in lieu of typical 28-day cu

This represents a 77 percent decrease year-over-year from 2019.
722 ('This represents a 77 percent decrease year-over-year from 2019.', 100.0, 8005)
The decrease in our Scope 1 and 2 emissions is largely due to the achievement of our goal to transition to 100 percent renewable electricity.
723 ('The decrease in our Scope 1 and 2 emissions is largely due to the achievement of our goal to transition to 100 percent renewable electricity.', 100.0, 8006)
In carbon intensity, our Scope 1 and 2 emissions saw significant improvements in 2020 vs 2019: - Decreased 77 percent per employee - Decreased 78 percent per square foot of space - Decreased 76 percent per dollar revenue - Decreased 77 percent per transaction processed In 2020, we achieved carbon neutrality across our operations for the first time.
724 ('\x83 In carbon intensity, our Scope 1 and 2 emissions saw significant improvements in 2020 vs 2019: - Decreased 77 percent per employee - Decreased 78 percent per square foot of space - Dec

Combined with ongoing energy savings from past years, we are now cumulatively saving over 60,000 metric tons of CO2e in fiscal year 2021.
752 ('Combined with ongoing energy savings from past years, we are now cumulatively saving over 60,000 metric tons of CO2e in fiscal year 2021.', 100.0, 8659)
Working with our suppliers to reduce their energy consumption, we avoided more than 1.15 million metric tons of carbon from manufacturing Apple products in fiscal year 2021.
753 ('Working with our suppliers to reduce their energy consumption, we avoided more than 1.15 million metric tons of carbon from manufacturing Apple products in fiscal year 2021.', 100.0, 8678)
Apple has generated or sourced 100 percent renewable electricity for its corporate operations since 2018 and we are now committed to transitioning our entire supply chain to 100 percent renewable electricity as well.
754 ('Apple has generated or sourced 100 percent renewable electricity for its corporate operations since 2018 and we

In 2021, our scope 1 emissions decreased 6.5% to 636,157 MTCO2e and our scope 2 (market-based) emissions decreased 6.6% to 486,410 MTCO2e, compared to our 2019 baseline.
767 ('Region 2019 2020 2021 Africa 20.7 15.2 14.5 APAC 11.3 10.3 9.6 Europe 0.0 0.0 0.7 Latin America 0.2 0.1 0.1 U.S. and Canada 0.8 0.8 0.9 Average 6.9 6.2 5.1 Scope 1 and 2 GHG Emissions per Tower (MTCO2e) In 2021, our scope 1 emissions decreased 6.5% to 636,157 MTCO2e and our scope 2 (market-based) emissions decreased 6.6% to 486,410 MTCO2e, compared to our 2019 baseline.', 100.0, 636)
Our overall scope 1 and 2 emissions reduction of 6.5%, compared to our 2019 baseline, is driven by several factors, including improving communications site energy efficiency, deploying renewable energy solutions and upgrading on-site energy storage systems.
768 ('Our overall scope 1 and 2 emissions reduction of 6.5%, compared to our 2019 baseline, is driven by several factors, including improving communications site energy efficiency

These offsets included rainforest conservation in Brazil, Canada and Mexico and conservation, habitat management and building insulation alternatives in the U.S.
805 ('These offsets included rainforest conservation in Brazil, Canada and Mexico and conservation, habitat management and building insulation alternatives in the U.S. We are exploring opportunities to leverage our onsite generation of excess solar energy to create additive REC and carbon offset opportunities for our customers.', 100.0, 1724)
Established goal to be Carbon Neutral by 2025 in Scope 1 & 2 emissions We have converted 57% of our 12,000+ lit towers.
806 None
In 2021, we contracted to source ~110,000 MWh of renewable electricity across 13 states beginning in 2022.
807 ('2021 2022 175,397 ~110,000 In 2021, we contracted to source ~110,000 MWh of renewable electricity across 13 states beginning in 2022.', 100.0, 2152)
This amounts to 63% of our 2021 electricity consumption.2
808 ('This amounts to 63% of our 2021 electr

838 ('VOC Emissions VOC emissions decreased between 2019 and 2020 by nearly 19 percent, mainly due to the COVID-19 pandemic resulting in lower production volumes at plants with VOC emissions and decreases in transportation activities.', 100.0, 1288)
Linde is at the forefront in the transition to clean hydrogen and has installed nearly 200 hydrogen fueling stations and 80 hydrogen electrolysis plants worldwide and includes decarbonization investments within its SD 2028 targets.
839 ('Linde is at the forefront in the transition to clean hydrogen and has installed nearly 200 hydrogen fueling stations and 80 hydrogen electrolysis plants worldwide and includes decarbonization investments within its SD 2028 targets.', 100.0, 1971)
By 2030, our ambition is to help customers become carbon neutral by reducing greenhouse gas emissions by 6 million metric tons annually, preventing nearly 10 million pollution-related illnesses.
840 ('2018 2018 0 0 Billion Gallons Water Million MT C02e Target 188 3

3.6% 4% 27% >1.5 million >3 times improvement in ASU energy efficiency improvement in CO2 emissions intensitya of purchased electricity from renewable sources metric tons of CO2e avoided through efficiency improvementsa the ratio of CO2e avoided by our customers to our own emissionsa
868 None
CO2 In 2021, we increased global procurement of renewable electricity by 14 percent compared to 2020.
869 ('In 2021, we increased global procurement of renewable electricity by 14 percent compared to 2020.', 100.0, 8237)
Overall, 27 percent of our electricity was from renewable sources in 2021.
870 ('Overall, 27 percent of our electricity was from renewable sources in 2021.', 100.0, 8238)
We purchase renewable electricity directly through our energy suppliers or by buying Renewable Energy Certificates (RECs) that link our power consumption to a specific asset that generates renewable electricity.
871 ('We purchase renewable electricity directly through our energy suppliers or by buying Renewable E

The following table outlines a sampling of reduction initiatives completed in 2021 that have had a material impact on Dow’s reported Scope 1 and Scope 2 emissions.
889 ('The following table outlines a sampling of reduction initiatives completed in 2021 that have had a material impact on Dow’s reported Scope 1 and Scope 2 emissions.', 100.0, 12109)
Dominion Energy Consolidated Energy Mix (GWh) Renewable Build Scenario
890 ('140 120 100 80 60 40 20 0 2030 2040 2050 GW Dominion Energy Consolidated Nameplate Capacity  (GW) Renewable Build Exhibit 7 Dominion Energy Consolidated Energy Mix (GWh)  Renewable Build Scenario Exhibit 6 Consolidated Nameplate  Capacity (GW)  Renewable Build Scenario 2030 2040 2050 ●\t Hydro  ● Natural Gas/RNG ●\t Hydro  ● Natural Gas/RNG ● Nuclear ● Solar ● Offshore and Onshore Wind ● Coal ● Energy Storage — Peak Demand ● Nuclear ● Solar ● Offshore and  Onshore Wind The Renewable Build Scenario would require almost 1,000 square  miles of land in Virginia by 2050 —

908 ('In Florida, the company’s Park and Plug pilot has  installed more than 600 EV public charging stations  throughout the state.', 98.36065573770492, 1160)
To date, drivers have used the Park & Plug network for almost 130,000 charging sessions, displacing more than 215,000 gallons of gasoline
909 ('To date, drivers have used the  Park & Plug network for almost 130,000 charging  sessions, displacing more than 215,000 gallons of  gasoline and installations are in easily accessible  locations across Florida:    ■ 182 public Level 2 chargers at local businesses.', 97.8102189781022, 1161)
 Regulated and Commercial Businesses Combined: 2021 Electricity Generated and Generation Capacity 2021 Electricity Generated 1 36% Natural gas 35% Nuclear 22% Coal 5% Wind/solar 1% Conv.hydro 0.2% Fuel cell 0.1% Oil 1 Excludes pumped-storage hydro.
910 None
Fuels Consumed For Electric Generation 1 Coal (million tons) 63.1 Oil (million gallons) 231 Natural gas (billion cubic feet) 163 2021 Generation Cap

Our carbon goal is to be emissions-free completely by no later  2045. than  Our plan includes  meaningful  milestones  in five-  year increments   that would   to reach allow us  Real Zero  emissions by no later than 2045.
941 ('Our plan includes  meaningful milestones in five- year increments that would  allow us to reach Real Zero  emissions by no later than 2045.', 89.13043478260869, 3355)
NextEra has been Energy   working
942 None
2005 Continued the transition away from foreign oil and added 2,214 MW of natural gas and 434 MW of wind.
943 ('2005 Continued the transition away from foreign oil and added 2,214 MW of natural gas and 434 MW of wind.', 100.0, 3387)
2006 Acquired 615 MW of nuclear and added 824 MW of wind.
944 ('2006 Acquired 615 MW of nuclear and added 824 MW of wind.', 100.0, 3388)
2007 Acquired 1,024 MW of nuclear, added 1,150 MW natural gas and 824 MW of wind.
945 ('2007  Acquired 1,024 MW of nuclear, added 1,150 MW natural gas and 824 MW of wind.', 98.75, 3389)
2008 

### Check the outcome of string matched sentences and save the files

In [249]:
 string_matched_old = pd.DataFrame([matrel, orgrel, matlabel], index = ['matched_rel', 'original_rel', 'company_label']).transpose()

In [250]:
string_matched = pd.DataFrame([matrel, orgrel, matlabel], index = ['matched_rel', 'original_rel', 'company_label']).transpose()

In [251]:
# extracting sentences portion from the tuple with (sentence, ratio) format
string_matched['matched_rel_only'] = [i[0] if i is not None else '' for i in string_matched.matched_rel]

In [252]:
string_matched.to_csv("string_matched.csv", encoding = 'utf-8-sig')

#### =============================== RESUME RUNNING HERE ====================================

### Keep track of sentences that didn't get matched

In [253]:
string_matched = pd.read_csv('string_matched.csv', index_col = 0)

In [254]:
string_matched

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...
959,"('2021 Added 2,008 MW of wind, 1,547 MW of sol...","2021 Added 2,008 MW of wind, 1,547 MW of solar...",72,"2021 Added 2,008 MW of wind, 1,547 MW of solar..."


In [255]:
still_not_matched = string_matched[string_matched.matched_rel.isnull()]

In [256]:
len(still_not_matched)

44

### To Understand why String Matching Failed for Some Sentences

In [257]:
total_all

Unnamed: 0,all_sentences,company_label,company_index,sent_index,sent_count
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001,30
1,Making medicines requires the use of valuable ...,EliLilly,0001,0002,14
2,We’re committed to reducing our environmental ...,EliLilly,0001,0003,18
3,"To track our progress, we measure and manage e...",EliLilly,0001,0004,27
4,"Lilly manages health, safety and the environme...",EliLilly,0001,0005,13
...,...,...,...,...,...
3565,→ FPL’s four nuclear units continue to operat...,NextEraEnergyZeroCarbonBlueprint,0072,0217,10
3566,Technology We assume that: → FPL’s gas plants...,NextEraEnergyZeroCarbonBlueprint,0072,0218,28
3567,→ NextEra Energy Resources would invest in el...,NextEraEnergyZeroCarbonBlueprint,0072,0219,26
3568,→ All non-FPL fossil generation assets would ...,NextEraEnergyZeroCarbonBlueprint,0072,0220,14


In [258]:
for i in total_all[total_all.company_index == '="0020"'].all_sentences:
    print(i)
    print()

In [259]:
for i,j in zip(still_not_matched.original_rel, still_not_matched.company_label):
    print(i, "-->", j)
    print()

3,696,540 Total Energy Use (intensity) 193.7 Reduction -9.0% y/y Total GHG Emissions, Scope 1 and 2 312,860 Total GHG Emissions, Scope 1 and 2 (intensity) 16.4 Reduction -7.5% y/y --> 5

From renewable sources 3,355 Percent renewable electricity by region North America 67%◊ Europe 79%◊ On-site generated energy use by type (TJ)1 Co-generation 456 Wind 111 Solar PV 86 Geothermal 21 Fuel cell 4 Biomass 3 On-site clean/renewable energy capacity by type4 Solar PV 44% Co-generation 29% Wind 23% Geothermal 2% Biomass 2% Fuel cell 1% On-site clean/renewable energy technology capacity, (MW) 67.2 Electricity generated from renewable sources 52%◊ --> 6

Generated Electricity (Cogeneration) 196 Generated Electricity (Renewables) 7 Generated On-Site from Renewables 7 Purchased Renewables from Third Party 678 Renewable Energy 678 % of Renewable Energy Purchased 15% Energy Consumption by Source(o) 1,000 Gigajoules Electricity Consumption by Type Purchased Energy Consumed (1,000 --> 8

North America R

In [260]:
still_not_matched['sent_count'] = still_not_matched['original_rel'].str.split().str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  still_not_matched['sent_count'] = still_not_matched['original_rel'].str.split().str.len()


In [261]:
still_not_matched.to_csv('sentenences_not_matched.csv', encoding = 'utf-8-sig')

In [262]:
len(still_not_matched)

44

### Keep track of sentences that got matched (only use this version from now on: disregard sentences not matched and short sentences)

In [263]:
sentences_matched = string_matched[string_matched.matched_rel.notnull()]

In [264]:
sentences_matched = sentences_matched.drop_duplicates(subset = ['matched_rel_only', 'company_label'])

In [265]:
sentences_matched.to_csv('sentences_matched.csv', encoding = 'utf-8-sig')

In [266]:
len(sentences_matched)

914

In [None]:
# Acquired approximately 32 more sentences 881 --> 913

### RUN FROM HERE

### B-4) Separate relevant and irrelevant sentences: aligning matched sentences with total_all sentences

In [267]:
sentences_matched = pd.read_csv('sentences_matched.csv', index_col = 0)

In [268]:
sentences_matched

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
955,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re...",72,"2017 Retired and demolished 250 MW of coal, re..."
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...


In [269]:
sentences_matched[sentences_matched.matched_rel_only.duplicated()]

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only


In [270]:
sentences_matched = sentences_matched.drop_duplicates('matched_rel_only')

In [271]:
original_merge_match = sentences_matched.rename(columns = {'original_rel': 'relevant_sentences'})

In [272]:
original_merge_match

Unnamed: 0,matched_rel,relevant_sentences,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
955,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re...",72,"2017 Retired and demolished 250 MW of coal, re..."
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...


In [273]:
merge_rel_matched = total_relevant.merge(original_merge_match, how = 'left', on = 'relevant_sentences')

In [274]:
merge_rel_matched = merge_rel_matched[merge_rel_matched.matched_rel.notnull()].drop(['company_label_y'], axis = 1)

In [275]:
merge_rel_matched

Unnamed: 0,relevant_sentences,company_label_x,company_index,sent_count,matched_rel,matched_rel_only
0,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,0001,11,"('Looking toward the future, we have set clima...","Looking toward the future, we have set climate..."
1,A large portion of this renewable electricity ...,EliLilly,0001,24,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...
2,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,12,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction..."
3,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,11,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r..."
4,This reduction was partially driven by energy ...,EliLilly,0001,27,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...
...,...,...,...,...,...,...
955,"2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,23,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re..."
956,2018 Retired and demolished 636 MW of coal and...,NextEraEnergyZeroCarbonBlueprint,0072,38,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal an...
957,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,27,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ..."
958,2020 Retired 615 MW of nuclear and 330 MW of c...,NextEraEnergyZeroCarbonBlueprint,0072,45,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of ...


In [276]:
merge_rel_matched[merge_rel_matched.relevant_sentences.duplicated()]

Unnamed: 0,relevant_sentences,company_label_x,company_index,sent_count,matched_rel,matched_rel_only


In [277]:
merge_rel_all = merge_rel_matched.rename(columns = {'matched_rel_only': 'all_sentences'})

In [278]:
test_merge = merge_rel_all.merge(total_all, how = 'left', on = 'all_sentences')

In [279]:
test_merge = test_merge.drop(['company_label_x', 'company_index_x'], axis = 1)

In [280]:
test_merge_clean = test_merge.drop(['matched_rel'], axis = 1)
test_merge_clean.rename(columns = {'company_index_y': 'company_index'})

Unnamed: 0,relevant_sentences,sent_count_x,all_sentences,company_label,company_index,sent_index,sent_count_y
0,"In 2021, 9.6% of our purchased electricity cam...",11,"Looking toward the future, we have set climate...",EliLilly,0001,0020,40
1,A large portion of this renewable electricity ...,24,A large portion of this renewable electricity ...,EliLilly,0001,0021,24
2,"From 2012 to 2020, we achieved a 26% reduction...",12,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,0025,12
3,"In 2021, we achieved a 9% absolute emissions r...",11,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,0026,11
4,This reduction was partially driven by energy ...,27,This reduction was partially driven by energy ...,EliLilly,0001,0027,27
...,...,...,...,...,...,...,...
909,"2017 Retired and demolished 250 MW of coal, re...",23,"2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,0052,23
910,2018 Retired and demolished 636 MW of coal and...,38,2018 Retired and demolished 636 MW of coal an...,NextEraEnergyZeroCarbonBlueprint,0072,0053,38
911,"2019 Aquired Gulf Power, which added 1,750 MW ...",27,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,0054,27
912,2020 Retired 615 MW of nuclear and 330 MW of c...,45,2020 Retired 615 MW of nuclear and 330 MW of ...,NextEraEnergyZeroCarbonBlueprint,0072,0055,45


In [281]:
test_merge_clean['label'] = 'rel'
test_rel = test_merge_clean

In [282]:
test_rel = test_rel.rename(columns = {'all_sentences': 'rel_match_all', 'company_index_y': 'company_index'})

In [283]:
test_rel

Unnamed: 0,relevant_sentences,sent_count_x,rel_match_all,company_label,company_index,sent_index,sent_count_y,label
0,"In 2021, 9.6% of our purchased electricity cam...",11,"Looking toward the future, we have set climate...",EliLilly,0001,0020,40,rel
1,A large portion of this renewable electricity ...,24,A large portion of this renewable electricity ...,EliLilly,0001,0021,24,rel
2,"From 2012 to 2020, we achieved a 26% reduction...",12,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,0025,12,rel
3,"In 2021, we achieved a 9% absolute emissions r...",11,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,0026,11,rel
4,This reduction was partially driven by energy ...,27,This reduction was partially driven by energy ...,EliLilly,0001,0027,27,rel
...,...,...,...,...,...,...,...,...
909,"2017 Retired and demolished 250 MW of coal, re...",23,"2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,0052,23,rel
910,2018 Retired and demolished 636 MW of coal and...,38,2018 Retired and demolished 636 MW of coal an...,NextEraEnergyZeroCarbonBlueprint,0072,0053,38,rel
911,"2019 Aquired Gulf Power, which added 1,750 MW ...",27,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,0054,27,rel
912,2020 Retired 615 MW of nuclear and 330 MW of c...,45,2020 Retired 615 MW of nuclear and 330 MW of ...,NextEraEnergyZeroCarbonBlueprint,0072,0055,45,rel


In [284]:
total_all

Unnamed: 0,all_sentences,company_label,company_index,sent_index,sent_count
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001,30
1,Making medicines requires the use of valuable ...,EliLilly,0001,0002,14
2,We’re committed to reducing our environmental ...,EliLilly,0001,0003,18
3,"To track our progress, we measure and manage e...",EliLilly,0001,0004,27
4,"Lilly manages health, safety and the environme...",EliLilly,0001,0005,13
...,...,...,...,...,...
3565,→ FPL’s four nuclear units continue to operat...,NextEraEnergyZeroCarbonBlueprint,0072,0217,10
3566,Technology We assume that: → FPL’s gas plants...,NextEraEnergyZeroCarbonBlueprint,0072,0218,28
3567,→ NextEra Energy Resources would invest in el...,NextEraEnergyZeroCarbonBlueprint,0072,0219,26
3568,→ All non-FPL fossil generation assets would ...,NextEraEnergyZeroCarbonBlueprint,0072,0220,14


In [285]:
total_all = total_all.reset_index()
total_all = total_all.drop(columns = ['index'])

In [286]:
test_irr = total_all.merge(test_merge_clean, how = 'outer', on = 'all_sentences', indicator = True)

In [287]:
test_irr = test_irr[test_irr.relevant_sentences.isnull()]

In [288]:
test_irr['label'] = 'irr'

In [289]:
test_irr = test_irr.drop(['relevant_sentences', 'sent_count_x', 'company_label_y', 'company_index_y', '_merge', 'sent_index_y', 'sent_count_y'], axis = 1)

In [290]:
test_irr = test_irr.reset_index().drop('index', axis = 1 )

In [291]:
test_irr = test_irr.rename(columns = {'company_label_x': 'company_label', 'sent_index_x': 'sent_index'})

In [292]:
# total_all.company_index = total_all.company_index.apply('="{}"'.format)

In [293]:
# total_all.sent_index = total_all.sent_index.apply('="{}"'.format)

In [294]:
total_all.to_csv('all_with_index.csv', encoding = 'utf-8-sig')

In [295]:
# test_rel.company_index = test_rel.company_index.apply('="{}"'.format)

In [296]:
# test_rel.sent_index = test_rel.sent_index.apply('="{}"'.format)

In [297]:
test_rel = test_rel[['relevant_sentences', 'sent_count_x', 'rel_match_all', 'sent_count_y', 'company_label', 'company_index', 'sent_index', 'label']]

In [298]:
test_rel.to_csv('rel_with_index.csv', encoding = 'utf-8-sig')

In [299]:
# test_irr.company_index = test_irr.company_index.apply('="{}"'.format)

In [300]:
# test_irr.sent_index = test_irr.sent_index.apply('="{}"'.format)

In [301]:
test_irr.to_csv('irr_with_index.csv', encoding = 'utf-8-sig')

In [302]:
len(test_irr)

75480

In [303]:
len(test_merge_clean)

914

In [304]:
len(total_all)

76394

In [305]:
# Noted on Jan 25 2023
# NOTE: this version only contains sentences with more than 5 letters

In [306]:
test_rel.head()

Unnamed: 0,relevant_sentences,sent_count_x,rel_match_all,sent_count_y,company_label,company_index,sent_index,label
0,"In 2021, 9.6% of our purchased electricity cam...",11,"Looking toward the future, we have set climate...",40,EliLilly,1,20,rel
1,A large portion of this renewable electricity ...,24,A large portion of this renewable electricity ...,24,EliLilly,1,21,rel
2,"From 2012 to 2020, we achieved a 26% reduction...",12,"From 2012 to 2020, we achieved a 26% reduction...",12,EliLilly,1,25,rel
3,"In 2021, we achieved a 9% absolute emissions r...",11,"In 2021, we achieved a 9% absolute emissions r...",11,EliLilly,1,26,rel
4,This reduction was partially driven by energy ...,27,This reduction was partially driven by energy ...,27,EliLilly,1,27,rel


In [307]:
total_all.head()

Unnamed: 0,all_sentences,company_label,company_index,sent_index,sent_count
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,1,1,30
1,Making medicines requires the use of valuable ...,EliLilly,1,2,14
2,We’re committed to reducing our environmental ...,EliLilly,1,3,18
3,"To track our progress, we measure and manage e...",EliLilly,1,4,27
4,"Lilly manages health, safety and the environme...",EliLilly,1,5,13


In [308]:
test_irr.head()

Unnamed: 0,all_sentences,company_label,company_index,sent_index,sent_count,label
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,1,1,30,irr
1,Making medicines requires the use of valuable ...,EliLilly,1,2,14,irr
2,We’re committed to reducing our environmental ...,EliLilly,1,3,18,irr
3,"To track our progress, we measure and manage e...",EliLilly,1,4,27,irr
4,"Lilly manages health, safety and the environme...",EliLilly,1,5,13,irr


## C) TODO: Update Sentences Statistics

In [321]:
rel_from_all = test_rel.rename(columns = {'relevant_sentences': 'relevant_sentences'})

In [322]:
irr_from_all = test_irr.rename(columns = {'all_sentences': 'irrelevant_sentences'})

In [323]:
total_relevant_stat = rel_from_all.groupby('company_label', sort = False).count()[['relevant_sentences']]

In [324]:
total_irrelevant_stat = irr_from_all.groupby('company_label', sort = False).count()[['irrelevant_sentences']]

In [325]:
total_stat_final = total_irrelevant_stat.merge(total_relevant_stat, how = 'left', on = 'company_label', sort = False)

In [326]:
total_stat_final['relevant_sentences'] = total_stat_final['relevant_sentences'].fillna(0)

In [327]:
total_stat_final['relevant_sentences'] = total_stat_final['relevant_sentences'].astype(int)

In [328]:
total_stat_final['rel/total percentages'] = round((total_stat_final['relevant_sentences'] / (total_stat_final['relevant_sentences'] + total_stat_final['irrelevant_sentences'])) * 100, 2)

In [329]:
total_stat_final.to_csv('total_stat_final.csv')

In [330]:
total_stat_final.sum()

irrelevant_sentences     75480.0
relevant_sentences         914.0
rel/total percentages      125.2
dtype: float64

### TODO: Save the sentence dictionary into json file

In [None]:
# sent_dict = {}

# for key, sent in zip(all_sent.key, all_sent.sentences):
#     sent_dict[key] = sent

In [None]:
# import json

# with open('sentence_dict.json', 'w') as fp:
#     json.dump(sent_dict, fp, sort_keys=True, indent=4)

In [None]:
# with open('sentence_dict.json', 'r') as fp:
#     data = json.load(fp)

In [None]:
# import os
# import shutil
# import glob

In [None]:
# path = '/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks'
# files = glob.glob(path + "/*.csv")

In [None]:
# files.append('/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks/sentence_dict.json')

In [None]:
# for file in files:
    
#     if file == '/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks/string_matched.csv':
#         continue

#     filename = file.split('/')[-1]
    
#     target = (r'/Users/tylerryoo/t3/relevant_irrelevant_sentences_labeled_final/' + filename)

#     shutil.move(file, target)