## A) Read Data Files

### A-1) Pulling in All the Data Across Notebooks

In [145]:
import pandas as pd
import os 
from nltk import tokenize
import json
import re

In [146]:
# Restore variables previously persisted with `%store` (IPython storemagic) so they
# are available in this kernel. Each prefix contributes three objects:
# <prefix>_reltext, <prefix>_alltext and <prefix>_stat.
# NOTE(review): the *_stat objects are restored but not used in the visible cells.
%store -r HC_reltext
%store -r HC_alltext
%store -r HC_stat

%store -r IND_reltext
%store -r IND_alltext
%store -r IND_stat

%store -r Energy_reltext
%store -r Energy_alltext
%store -r Energy_stat

%store -r CONSTA_reltext
%store -r CONSTA_alltext
%store -r CONSTA_stat

%store -r CONDIS_reltext
%store -r CONDIS_alltext
%store -r CONDIS_stat

%store -r IT_reltext
%store -r IT_alltext
%store -r IT_stat

%store -r Real_Estate_reltext
%store -r Real_Estate_alltext
%store -r Real_Estate_stat

%store -r Materials_reltext
%store -r Materials_alltext
%store -r Materials_stat

%store -r Utilities_reltext
%store -r Utilities_alltext
%store -r Utilities_stat

### Build <ins>total_relevant</ins>, <ins>total_all</ins> (both relevant and irrelevant sentences), and <ins>total_stat</ins> (statistics of relevant and all sentences)

In [427]:
# Stack the relevant-sentence frames row-wise; each frame keeps its own
# index labels (ignore_index stays at its default of False).
total_relevant = pd.concat(
    objs=[
        HC_reltext,
        IND_reltext,
        Energy_reltext,
        CONSTA_reltext,
        CONDIS_reltext,
        IT_reltext,
        Real_Estate_reltext,
        Materials_reltext,
        Utilities_reltext,
    ],
    axis=0,
)

In [428]:
# Stack the all-sentence frames row-wise; each frame keeps its own
# index labels (ignore_index stays at its default of False).
total_all = pd.concat(
    objs=[
        HC_alltext,
        IND_alltext,
        Energy_alltext,
        CONSTA_alltext,
        CONDIS_alltext,
        IT_alltext,
        Real_Estate_alltext,
        Materials_alltext,
        Utilities_alltext,
    ],
    axis=0,
)

### Removing Duplicates of Sentences

In [429]:
# Keep only the first row for each distinct sentence text.
total_all = total_all.drop_duplicates(subset='all_sentences', keep='first')

In [430]:
# Keep only the first row for each distinct relevant sentence text.
total_relevant = total_relevant.drop_duplicates(subset='relevant_sentences', keep='first')

In [431]:
# Distinct company labels, in order of first appearance in total_all.
unique_comp = total_all['company_label'].unique()

In [432]:
# Map every company label to a 1-based, zero-padded four-digit id string
# ('0001', '0002', ...) following the order of unique_comp.
comp_dict = {
    label: "%04d" % position
    for position, label in enumerate(unique_comp, start=1)
}

In [433]:
# Inspect the company label -> zero-padded id mapping.
comp_dict

{'EliLilly': '0001',
 'UnitedHealthGroup': '0002',
 'Merck': '0003',
 'BristolMyersSquibb': '0004',
 'Danaher': '0005',
 'johnsonandjohnson': '0006',
 'Pfizer': '0007',
 'Abbott': '0008',
 'ThermoFisherScientifiic': '0009',
 'Amgen': '0010',
 'Caterpillar': '0011',
 'Lockheed': '0012',
 'Boeing': '0013',
 'UPS': '0014',
 'Raytheon': '0015',
 'Delta': '0016',
 'Deere': '0017',
 'Honeywell': '0018',
 '3M': '0019',
 'UnionPacific': '0020',
 'Total': '0021',
 'BP': '0022',
 'Shell': '0023',
 'Mondelez_Intl': '0024',
 'Hershey': '0025',
 'Philip_Morris_Intl': '0026',
 'PepsiCo': '0027',
 'Altria_Environmental': '0028',
 'PandG': '0029',
 'Altria_TCFD': '0030',
 'Costco': '0031',
 'CocaCola': '0032',
 'Altria_2021': '0033',
 'Walmart': '0034',
 'EsteeLauder': '0035',
 'McDonalds': '0036',
 'TJX': '0037',
 'HomeDepot': '0038',
 'Lowes': '0039',
 'Target': '0040',
 'BookingHoldings': '0041',
 'Tesla': '0042',
 'Amazon': '0043',
 'Nike': '0044',
 'Starbucks': '0045',
 'Accenture': '0046',
 'Bro

In [434]:
# Preview the first rows of the combined, de-duplicated sentence frame.
total_all.head()

Unnamed: 0,all_sentences,company_label
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly
1,Making medicines requires the use of valuable ...,EliLilly
2,We’re committed to reducing our environmental ...,EliLilly
3,"To track our progress, we measure and manage e...",EliLilly
4,"Lilly manages health, safety and the environme...",EliLilly


In [435]:
# Attach each row's zero-padded company id.
# comp_dict is built from this frame's own company_label values, so every label
# has an entry; Series.map performs the dictionary lookup column-wise instead of
# invoking a Python lambda once per row.
total_all['company_index'] = total_all['company_label'].map(comp_dict)

In [436]:
# Number sentences 1, 2, 3, ... within each consecutive run of the same
# company_index, formatted as zero-padded four-digit strings.
sent_index = []
cur_comp_index = ""
for row_comp_index in total_all.company_index:
    # A change of company id starts a new run, so the counter restarts.
    if row_comp_index != cur_comp_index:
        cur_comp_index = row_comp_index
        sent_val = 0
    sent_val += 1
    sent_index.append("%04d" % sent_val)

In [437]:
# Non-null row counts per company, listed in first-appearance order (sort=False).
total_all.groupby('company_label', sort = False).count()

Unnamed: 0_level_0,all_sentences,company_index
company_label,Unnamed: 1_level_1,Unnamed: 2_level_1
EliLilly,105,105
UnitedHealthGroup,1052,1052
Merck,2411,2411
BristolMyersSquibb,1197,1197
Danaher,943,943
...,...,...
Dow,2874,2874
Dominion_Energy,672,672
Duke_Energy,1018,1018
AEP,1582,1582


In [438]:
# Store the running sentence numbers as a new column. sent_index was built by
# iterating total_all.company_index, so the list lines up with the rows by position
# (assumes total_all was not filtered or reordered in between — TODO confirm).
total_all['sent_index'] = sent_index

In [439]:
# Show the rows numbered '0001', i.e. the first sentence of each company run.
total_all[total_all.sent_index == '0001']

Unnamed: 0,all_sentences,company_label,company_index,sent_index
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001
105,Our Mission in Action 2021 Sustainability Report,UnitedHealthGroup,0002,0001
1170,"Environmental, Social & Governance (ESG) Progr...",Merck,0003,0001
3634,"Environmental, Social and Governance Report Ou...",BristolMyersSquibb,0004,0001
4845,2021 Sustainability Report,Danaher,0005,0001
...,...,...,...,...
10200,"2021 ENVIRONMENTAL, SOCIAL & GOVERNANCE REPORT...",Dow,0068,0001
0,REPORT 2021 A report based on the recommendati...,Dominion_Energy,0069,0001
677,1 2021 DUKE ENERGY ESG REPORT D UK E E NE ...,Duke_Energy,0070,0001
1752,2022 Corporate Sustainability Report 2022 CORP...,AEP,0071,0001


In [440]:
# Run NLTK's sentence tokenizer over the first five rows to see whether
# each stored entry is already a single sentence.
for sentence_text in total_all['all_sentences'].head(5):
    print(tokenize.sent_tokenize(sentence_text))

['7/7/22, 10:29 AM Environmental | 2021 ESG Report | Eli Lilly and Company https://esg.lilly.com/environmental#tab-control-tab4 1/7 Our purpose, to make life better, includes protecting and preserving the world we live in.']
['Making medicines requires the use of valuable resources including energy, water and raw materials.']
['We’re committed to reducing our environmental footprint across the life cycles of our products and our supply chain.']
['To track our progress, we measure and manage energy and water use, greenhouse gas (GHG) emissions and the generation of waste and wastewater throughout our manufacturing process.']
['Lilly manages health, safety and the environment (HSE) under a uni�ed governance structure.']


In [441]:
# total_all['all_sentences'] = total_all['all_sentences'].apply(lambda x: re.sub(' +', ' ', x))

In [442]:
# total_all['all_sentences'] = total_all['all_sentences'].apply(lambda x: re.sub('\xa0', ' ', x))

In [167]:
# total_all['all_sentences'] = total_all['all_sentences'].apply(lambda x: re.sub('\t', ' ', x))

In [168]:
# total_all['all_sentences'] = total_all['all_sentences'].apply(lambda x: x.strip())

In [169]:
# total_all['all_sentences'] = total_all['all_sentences'].apply(lambda x: re.sub('\r', ' ', x))

In [19]:
# for i,j in zip(total_all['all_sentences'], total_all['company_label']) :
#     if i == '(e) High performer is defined as an individual with two consecutive annual performance ratings of Exceeds/Exceeds, Exceeds/Fully Meets, or Fully Meets/Exceeds (note that “exceeds” is one out of four possible dimension ratings).':
#         print(i,j)

In [443]:
# Attach each row's zero-padded company id by direct dictionary lookup.
# A label that is missing from comp_dict raises KeyError rather than being
# silently filled in.
total_relevant['company_index'] = [
    comp_dict[label] for label in total_relevant['company_label']
]

In [444]:
# Display the relevant-sentence frame with its new company_index column.
total_relevant

Unnamed: 0,relevant_sentences,company_label,company_index
0,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,0001
1,A large portion of this renewable electricity ...,EliLilly,0001
2,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001
3,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001
4,This reduction was partially driven by energy ...,EliLilly,0001
...,...,...,...
67,2018 Retired and demolished 636 MW of coal and...,NextEraEnergyZeroCarbonBlueprint,0072
68,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072
69,2020 Retired 615 MW of nuclear and 330 MW of c...,NextEraEnergyZeroCarbonBlueprint,0072
70,"2021 Added 2,008 MW of wind, 1,547 MW of solar...",NextEraEnergyZeroCarbonBlueprint,0072


In [445]:
# total_relevant['relevant_sentences'] = total_relevant['relevant_sentences'].apply(lambda x: re.sub(' +', ' ', x))

In [446]:
# total_relevant['relevant_sentences'] = total_relevant['relevant_sentences'].apply(lambda x: re.sub('\xa0', ' ', x))

In [447]:
# total_relevant['relevant_sentences'] = total_relevant['relevant_sentences'].apply(lambda x: re.sub('\t', ' ', x))

In [448]:
# total_relevant['relevant_sentences'] = total_relevant['relevant_sentences'].apply(lambda x: x.strip())

In [449]:
# total_relevant['relevant_sentences'] = total_relevant['relevant_sentences'].apply(lambda x: re.sub('\r', ' ', x))

## B) Extracting Irrelevant Sentences from All Sentences

### Exact string matching (below) fails to find many of the relevant sentences within all sentences, so it cannot correctly extract the irrelevant sentences. Another method is needed.

In [288]:
# Exact-match baseline: collect the relevant sentences that occur verbatim
# among all sentences.
# The lookup set is built once, so each membership test is a hash lookup
# instead of re-materialising and scanning the full sentence list per iteration.
all_sentence_lookup = set(total_all['all_sentences'])
rel_test = [
    sentence
    for sentence in total_relevant['relevant_sentences']
    if sentence in all_sentence_lookup
]

In [289]:
# Report how many relevant sentences the exact-match approach recovered
# versus how many relevant sentences there are in total.
print(
    "This method to extract relevant sentences: ",
    len(rel_test),
    "vs.",
    "original_relevant_sentences: ",
    len(total_relevant['relevant_sentences']),
)

This method to extract relevant sentences:  579 vs. original_relevant_sentences:  999


### Testing fuzz package to compare strings

In [178]:
# pip install fuzzywuzzy
# pip install python-Levenshtein

In [179]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [180]:
# Sanity check of fuzz.partial_ratio: the leading fragment of a sentence is
# compared with the full sentence; the recorded score is 100.
fuzz.partial_ratio('In 2021, we reduced our energy consumption by 2.9%,', 
                   'In 2021, we reduced our energy consumption by 2.9%, and we reduced our absolute GHG emissions by 9% compared to 2020.')

100

### B-1) Eliminating short sentences from relevant and all sentences

### Checking the number of words in each sentence to make sure sentences that have fewer than 5 words are eliminated

In [181]:
import numpy as np
# Distinct word counts among the relevant sentences (first 50 values).
# Words are counted with the same whitespace tokenisation (.str.split())
# that the sent_count filter uses, so this check reflects what the filter sees;
# splitting on a single ' ' would also count empty strings between repeated spaces.
np.unique(total_relevant['relevant_sentences'].str.split().str.len())[0:50]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [182]:
# Distinct word counts among all sentences (first 50 values), using the same
# whitespace tokenisation (.str.split()) as the sent_count filter so that
# repeated spaces do not inflate the counts.
np.unique(total_all['all_sentences'].str.split().str.len())[0:50]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [183]:
# Word count per relevant sentence (whitespace-separated tokens).
# total_relevant can be the result of an earlier row filter; taking an explicit
# copy first makes it an independent frame, so adding the column does not
# trigger SettingWithCopyWarning.
total_relevant = total_relevant.copy()
total_relevant['sent_count'] = total_relevant['relevant_sentences'].str.split().str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_relevant['sent_count'] = total_relevant['relevant_sentences'].str.split().str.len()


In [184]:
# Word count per sentence (whitespace-separated tokens).
# total_all can be the result of an earlier row filter; taking an explicit
# copy first makes it an independent frame, so adding the column does not
# trigger SettingWithCopyWarning.
total_all = total_all.copy()
total_all['sent_count'] = total_all['all_sentences'].str.split().str.len()

In [185]:
# Drop relevant sentences with fewer than 5 words. The .copy() makes the result
# an independent frame, so later column assignments on total_relevant do not
# raise SettingWithCopyWarning.
total_relevant = total_relevant[total_relevant['sent_count'] >= 5].copy()

In [186]:
# Drop sentences with fewer than 5 words. The .copy() makes the result an
# independent frame, so later column assignments on total_all do not raise
# SettingWithCopyWarning.
total_all = total_all[total_all['sent_count'] >= 5].copy()

In [187]:
# Compare the number of sentences before and after filtering.
# NOTE(review): this cell runs after the short-sentence filter (sent_count >= 5),
# not only after duplicate removal — confirm which stage the "original" counts refer to.
# NOTE(review): original_relevant and original_all are not assigned in any visible
# cell; on a fresh kernel these prints would presumably raise NameError — TODO
# capture the counts explicitly before the filtering steps.

print("relevant_sentences:", original_relevant, "->", len(total_relevant))
print("all_sentences:", original_all, "->", len(total_all))

relevant_sentences: 1003 -> 961
all_sentences: 81346 -> 79134


### Convert the sentences into lists for future use

In [188]:
# Plain Python lists of sentence text, company label and company id for the
# relevant sentences; the three lists are positionally aligned.
rel_var, rel_lab, rel_comp_index = (
    total_relevant[column].tolist()
    for column in ('relevant_sentences', 'company_label', 'company_index')
)

# The same three lists for all sentences.
all_var, all_lab, all_comp_index = (
    total_all[column].tolist()
    for column in ('all_sentences', 'company_label', 'company_index')
)



#### =============================== PAUSE RUNNING HERE ====================================

### B-2) Conduct string matching to separate relevant and irrelevant sentences from all sentences

In [450]:
# Display the all-sentences frame, including company_index and sent_index.
total_all

Unnamed: 0,all_sentences,company_label,company_index,sent_index
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001
1,Making medicines requires the use of valuable ...,EliLilly,0001,0002
2,We’re committed to reducing our environmental ...,EliLilly,0001,0003
3,"To track our progress, we measure and manage e...",EliLilly,0001,0004
4,"Lilly manages health, safety and the environme...",EliLilly,0001,0005
...,...,...,...,...
3567,→ NextEra Energy Resources would invest in el...,NextEraEnergyZeroCarbonBlueprint,0072,0219
3568,→ All non-FPL fossil generation assets would ...,NextEraEnergyZeroCarbonBlueprint,0072,0220
3569,→ Vehicle fleet conversions are based on the ...,NextEraEnergyZeroCarbonBlueprint,0072,0221
3570,Assumptions 0 0,NextEraEnergyZeroCarbonBlueprint,0072,0222


In [193]:
# Spot-check a single relevant sentence (position 46 of rel_var).
rel_var[46]

'From renewable sources 3,355 Percent renewable electricity by region North America 67%◊ Europe 79%◊ On-site generated energy use by type (TJ)1 Co-generation 456 Wind 111 Solar PV 86 Geothermal 21 Fuel cell 4 Biomass 3 On-site clean/renewable energy capacity by type4 Solar PV 44% Co-generation 29% Wind 23% Geothermal 2% Biomass 2% Fuel cell 1% On-site clean/renewable energy technology capacity, (MW) 67.2 Electricity generated from renewable sources 52%◊'

In [195]:
# Fuzzy string comparison: for every relevant sentence, find its best
# partial-ratio match among all sentences.
#   matrel   - the extractOne result (None when nothing reaches the cutoff of 85)
#   matlabel - the company id of the relevant sentence
#   orgrel   - the relevant sentence itself
matrel, matlabel, orgrel = [], [], []
track = 0
for rel_sentence, rel_company_id in zip(rel_var, rel_comp_index):
    # Best match for this relevant sentence across every sentence in all_var.
    best_match = process.extractOne(
        rel_sentence,
        all_var,
        scorer=fuzz.partial_ratio,
        score_cutoff=85,
    )
    matrel.append(best_match)
    matlabel.append(rel_company_id)
    orgrel.append(rel_sentence)
    # Progress line: running position followed by the match.
    print(track, best_match)
    track += 1

0 ('Looking toward the future, we have set climate goals for 2030 as we work toward contributing to a low-carbon economy: Secure 100% of our purchased electricity from renewable sources In 2021, 9.6% of our purchased electricity came from renewable sources.', 100)
1 ('A large portion of this renewable electricity is delivered through our utility providers to our sites in Alcobendas, Spain; Kinsale, Ireland; and Bracknell, UK.', 100)
2 ('From 2012 to 2020, we achieved a 26% reduction in absolute emissions.', 100)
3 ('In 2021, we achieved a 9% absolute emissions reduction versus 2020.', 100)
4 ('This reduction was partially driven by energy e�ciency improvements and an increase in the use of renewable electricity including the startup of our solar array in Kinsale.', 100)
5 ('In 2021, we reduced our energy consumption by 2.9%, and we reduced our absolute GHG emissions by 9% compared to 2020.', 100)
6 ('Enhancing the Use of Solar In 2021, 9.6% of our purchased electricity was secured from

40 ('New this year was the introduction of on-site PPA structure in the form of multiyear “energy as a service” contracts at two sites, enabling GHG-reduction benefits without the need for capital investment.', 100)
41 ('At one of the largest manufacturing sites in South Africa, the Cape Town installation represented the first renewable electricity initiative for Johnson & Johnson in South Africa.', 100)
42 ('Reducing carbon emissions by land, air and sea: As part of our ongoing efforts to leverage data science across our operations, in 2021, we automated a manual freight load planning process to produce a 3D loading plan that optimizes space utilization of shipping containers, resulting in lower GHG emissions from shipping, a faster loading process and cost savings.', 100)
43 ('In Europe, we identified opportunities to switch deliveries from our Belgium distribution center to Denmark, France and Spain from air to road—significantly reducing the carbon emissions with minimal impact to 

79 ('We also updated our D10 series dozers, which now use 4% less fuel while offering a 3% boost in productivity.', 100)
80 ('REDUCED EMISSIONS LARGE WHEEL LOADERS The Cat 992 Large Wheel Loader has set the standard in its size class for more than 50 years.', 100)
81 ('In 2021, we raised the bar with our new 992, which has been demonstrated to produce up to 32% less emissions than its 992K counterpart.', 100)
82 ('CarbonPoint Solutions provides technology to concentrate and capture CO2 for utilization or sequestration, with applications including engines and turbines at oil and gas sites, distributed power and industrial plants, and waste-to-energy sites.', 100)
83 ('CATERPILLAR 2021 SUSTAINABILITY REPORT 57 2018 2019 2020 2021 GOAL 2030 PERSPECTIVE ENERGY — OPERATIONS (continued) Grid electricity % N/A 83.8 87.3 86.7 N/A In 2021, over 35% of our electrical energy was obtained from renewable or alternative sources.', 100)
84 ('Renewable energy 1, 2 % 17.4 21.2 21.0 24.2 N/A Alternative

95 ('Goal: Sustainable Operations Maintain a net-zero future for Boeing manufacturing and work-site operations through conservation, renewable energy and responsible offsets Partner with the supply chain for responsible business practices 2021 Highlights Manufacturing and other facilities achieved NET-ZERO CO2e emissions again in 2021 Since 2017, Boeing has reduced greenhouse gas emissions from our operations1 by 25% Achieved 28% renewable electricity in 2021 on our path to 100% in 2030 Received EPA ENERGY STAR Partner of the Year Award for Sustained Excellence 72,000 employee actions taken in a month in support of the Battle of the Buildings conservation program Boeing spent $4B+ with small and diverse suppliers Supplier Code of Conduct implemented 1.', 100)
96 ('Net-Zero at Manufacturing and Facilities Boeing achieved net-zero carbon emissions at manufacturing and other facilities and in business travel in 2021 for the second consecutive year, by expanding conservation and renewable 

132 ('Following 2035, we envision availability of next-generation aircraft technology such as the deployment of novel airframe designs and advanced propulsion technologies that could continue to improve fuel efficiency.', 100)
133 ('By 2030, Delta aims to procure more than 400 million gallons of SAF annually to meet our 10% goal, which is almost 40 times the total global SAF production in 2019.', 100)
134 ('Delta is establishing a goal to replace at least 5% of conventional jet fuel consumption with SAF that achieves at least an 85% reduction in life cycle GHG emissions relative to conventional jet fuel by 2030, pending availability and feasibility.', 100)
135 ('Current feedstock and technologies can produce SAF with up to 80% lower life cycle emissions than conventional jet fuel, which can be blended with conventional jet fuel at up to 50%.', 100)
136 ('For Delta to reach net zero, out-of-sector solutions will likely be a necessary complement to fleet, SAF, operational initiatives and

175 ('Delta purchased nearly 12M metric tons of offsets dedicated to preventing deforestation AVOIDANCE: Includes projects that work to avoid the release of emissions, e.g.', 100)
176 ('protecting forests • REDD+: Reducing Emissions from Deforestation and Forest Degradation • LULUCF: Land Use, Land Use Change and Forestry REDUCTION: Includes technologies or projects that increase the availability of renewable energy and convert waste into energy • Energy Capture • Renewable Energy: projects like solar- and wind-generated power installations REMOVAL: Involves projects to remove CO2 from the atmosphere and store it • Carbon Capture and Storage (CCS) • Afforestation OFFSET PROJECT TYPES REDUCTION AVOIDANCE REMOVAL (Carbon Capture and Storage (CCS) & Afforestation ) (REDD+, LULUCF) (Energy Capture & Renewable Energy) 50% 44% 6%', 100)
177 ('2019 100% 90% 53% 67% 71% 29% 0% 50% 2020 2021 2022 Goal 2018 (BASE YEAR) 1.32 4.0 3.2 2.4 1.6 0.8 0.0 # of Recordable Incidents Per 100 Employees 1.65

209 ('At the same time, we have retired about 2,500 older, less fuel-efficient locomotives, which reduced our total fleet by approximately 35%.', 100)
210 (' Union Pacific | Climate Action Plan 9 Nearly 175 high- and low-horsepower locomotives were overhauled in 2020, meaning they were completely rebuilt to meet more stringent emissions standards.', 100)
211 ('Each modernization results in an approximate 53% reduction in emissions and an additional 5% reduction in fuel consumption per engine.', 100)
212 ('Our Energy Management System (EMS) on locomotives helps us identify opportunities to save fuel.', 100)
213 ('EMS has been implemented in approximately two thirds of our active road fleet with a target of full implementation by 2025.', 100)
214 ('We estimate that EMS will reduce our absolute GHG emissions by 4% annually by 2025.', 100)
215 ('Additional management actions have allowed us to further reduce overall locomotive energy consumption by 1.5% from 2018 to 2020.', 100)
216 ('In 2

259 ('In 2021, the decline from the year before linked to reductions in venting came to 6 kt per year (projects in Gabon and the U.K.).', 100)
260 ('• Reductions in flaring: In 2021, the decrease in flaring from 2020 reduced emissions by 1.8 kt per year.', 100)
261 ('• Leak reduction: annual campaigns to identify and repair leaks at all operated sites will be deployed starting in 2022.', 100)
262 ('In 2021, emissions declined by 4 kt as a result of leak reduction efforts, including a significant upgrade to the OML58 facility in Nigeria.', 100)
263 ('From 2010 to 2013, TotalEnergies developed a pilot project in Lacq, France, involving a complete CCS chain, in which carbon from a steam generator was captured using oxy-combustion technology (a European first) and then transported and stored in a depleted reservoir.', 100)
264 ('TotalEnergies allocated $100 million to CCS research and projects in 2021, and by 2030 it expects to be expanding storage capacity by around 10 Mt annually.', 100)

299 ('Divestments are, and continue to be, an important part of our strategy.', 100)
300 ('They enable us to strengthen our balance sheet and high-grade or diversify our portfolio.', 100)
301 ('They will also help bp to create a resilient, lower cost and lower carbon oil, gas and refining portfolio that is smaller but high quality.', 100)
302 ('For aim 1, divestments contribute to reducing our operational emissions.', 100)
303 ('Our combined Scope 1 and Scope 2 emissions, covered by aim 1 were 35.6MtCO2e, a decrease of 35% from our 2019 baseline of 54.4MtCO2e.', 100)
304 ('The total decrease of almost 19MtCO2e includes 14.7MtCO2e in divestments and 2.6MtCO2e in sustainable emission reductions (SERs)b.', 100)
305 ('Compared with 2020 (45.5MtCO2e), Scope 1 and 2 emissions in 2021 decreased by 22%.', 100)
306 ('Scope 1 (direct) emissions, covered by aim 1, were 33.2MtCO2e in 2021, a decrease of 20% from 41.7MtCO2e in 2020.', 100)
307 ('Of those Scope 1 emissions 32.0MtCO2e were from CO2 a

350 ('This new target covers all Scope 1 and 2 emissions under Shell’s operational control and complements our existing carbon-intensity targets.', 96)
351 ('These decreases were partly offset by higher emissions due to the restart of the Prelude floating liquefied natural gas (LNG) facility in Australia (which was shut down for most of 2020) and increased flaring at Shell Nigeria Exploration and Production Company Limited (SNEPCo) in Nigeria.', 100)
352 ('SCOPE 1 AND SCOPE 2 GHG EMISSIONS CHANGES FROM 2020 TO 2021 million tonnes CO2e 55 60 65 70 75 2021 2020 Acquisitions Reduction activities and purchased renewable electricity [C] [D] Emissions [A] [B] a b c Divestments and other reasons Change in output d e 71 0.0 (4.0) 2.7 (2.2) 68 a b c d e [A] Total Scope 1 and Scope 2 emissions, rounded to the closest million tonnes.', 100)
353 ('In 2021, Shell’s total methane emissions were 55 thousand tonnes compared with 67 thousand tonnes in 2020, in part due to reduced methane emissions repo

391 ('We prioritize insetting projects in our supply chain when possible and purchase certified carbon credits when needed.', 100)
392 ('Overall, in 2021, our emissions decreased in absolute terms by 1.7 percent across our value chain versus 2020, amounting to a total reduction of 84,000 tons of CO2e.', 100)
393 ('More specifically, we achieved an absolute reduction in scope 1 emissions of 4.3 percent, in scope 2 of 28.9 percent, and in scope 3 of 1.2 percent versus 2020.', 100)
394 ('Carbon emissions along our value chain in 2021 UPSTREAM VALUE CHAIN SCOPE 3: 85.9% OUR DIRECT OPERATIONS SCOPE 1: 7.5% DOWNSTREAM VALUE CHAIN SCOPE 3: 5.1% 2.6% Use of our products Offices and warehouses 15.8% Tobacco growing 1.6% Indirect emissions from the generation of purchased or acquired electricity, steam, heat, or cooling consumed by PMI SCOPE 2: 1.6% 2025 2040 CARBON-NEUTRAL OPERATIONS BY 2025 NET ZERO GHG EMISSIONS ALONG OUR VALUE CHAIN BY 2040 1.2% Transport and distribution 1.3% Products end-o

428 ('To continue our progress, we are focused on renewable energy procurement, energy efficiency, refrigeration, transportation and stationary fuels.', 100)
429 ('In fiscal 2020, we achieved Net Zero Scope 1 and Scope 2 emissions and sourced 100% renewable electricity globally for our direct operations—commitments that we met again in fiscal 2021* and intend to maintain moving forward.', 100)
430 ('PROGRESS TOWARD OUR GOALS GOALS: Reduce absolute Scope 1 and 2 GHG emissions 50% by 2030 from a 2018 base year.', 100)
431 ('* In fiscal 2021, we continued to make progress toward our Scope 1 and 2 emissions reduction target through a portfolio of climate solutions, including on-site renewables, energy efficiency projects, green utility contracts, and renewable energy credits.', 100)
432 ('Reduce Scope 3 GHG emissions from purchased goods and services, upstream transportation and distribution, and business travel by 60% per unit revenue over the same time frame.', 100)
433 ('To address Scop

464 ('AVOIDING AND OFFSETTING EMISSIONS We source low-carbon and renewable energy to further reduce our GHG emissions from electricity consumption.', 100)
465 ('In certain geographies, we leverage carbon offsets16 12to offset emissions resulting from our direct operations and business travel.', 100)
466 ('Some examples of our current sourcing strategies include: Wholesale power purchase agreements On-site power purchase agreements Electricity supply contracts Renewable energy credits In fiscal 2021, we sourced over 325 million kilowatt hours of renewable energy in total, generated from a variety of technologies.', 100)
467 ('Together, our renewable and low-carbon energy sourcing strategy in fiscal 2021 enabled us to reduce our Scope 2 market-based GHG inventory by more than 146,000 metric tons of CO2e, or by about 25% over the prior year.', 100)
468 ('(Page 54) • We’ve committed to setting Science Based Targets initiative (SBTi) Scope 1, 2 and 3 goals by 2023 to reduce emissions in lin

505 ('It delivers enough energy to meet the demands of over 700,000 Indonesians on average each year.', 100)
506 ('Kariba Forest Protection, Zimbabwe The Kariba project protects almost 785,000 hectares of forests near the Zimbabwe-Zambia border.', 100)
507 ('It is registered with REDD+, a United Nations–backed program that aims to stop the destruction of forests.', 100)
508 ('Our Emissions Offset Strategy We offset emissions that we are unable to reduce through the purchase of credible carbon offset projects from organizations such as Gold Standard and Verified Carbon Standard (VCS).', 100)
509 ('Our primary strategy for offsetting is avoidance and nature-based solutions, and in the near future we will transition to carbon removal projects.', 100)
510 ('3 In 2021, the global fleet of Tesla vehicles, energy storage and solar panels enabled our customers to avoid emitting 8.4 million metric tons of CO2e The 6.8 million metric tons of vehicle CO2e savings estimate is based on the net CO2e

555 ('Quantitative Target 0.5M metric tons of GHG emissions reduced through increasing our use of environmentally preferred materials to 50% of all key materials39 We know that materials account for approximately 70% of NIKE’s product carbon footprint.', 100)
556 ('2030 Planet Goals Carbon 50% absolute reduction in scope 1, 2 and 3 greenhouse (GHG) emissions representing all of Starbucks direct operations and value chain.', 100)
557 ('With performance-based standards that incorporate design and extend throughout the life of a store, Starbucks Greener Stores in North America have reduced energy consumption by 30% compared with the company’s prior store designs.', 100)
558 ('Worldwide, Starbucks purchases enough renewable electricity to power 100% of our company-operated stores in the U.S., Canada and U.K.', 100)
559 ('In FY21, renewable energy powered 66% of company-operated facilities globally compared to 72% in FY20, with market constraints in China and Japan challenging our ability t

593 ('Mastercard Donate technology is also seamlessly integrated into the Carbon Calculator, empowering people to donate to Conservation International and helping restore forests.', 100)
594 ('When presented with the approximate carbon impact of their monthly card spend in their issuer’s mobile banking app, cardholders can choose to donate to forest restoration initiatives that achieve significant impact by counterbalancing CO2.', 100)
595 ('GHG Emissions Scope 3 Emissions', 90)
596 ('We will continue reducing carbon emissions from our operations through energy efficiency, renewable energy and alternative fuel use.', 100)
597 ('We are continuing efforts with our suppliers, who account for more than 70 percent of our Scope 3 emissions, through engagement, interventions and target-setting.', 100)
598 ('We have developed a supplier engagement program that will help us achieve our 2040 net-zero goal.', 100)
599 ('In addition, our short-term target calls for reducing Scope 1 and Scope 2 GHG

638 ('1 We are engaging suppliers and our business groups to cut our Scope 3 emissions by more than 50 percent and we’ll rely on carbon removal to reach carbon negative.', 100)
639 ('Our commitment: carbon negative by 2030 and by 2050 to remove from the atmosphere an equivalent amount of all the carbon dioxide our company has emitted either directly or by our electricity consumption since we were founded in 1975.', 100)
640 ('Our progress Reduced Scope 1 and 2 by 16.9% We reduced our Scope 1 and 2 (market- based) emissions by 58,654 metric tons of carbon dioxide equivalents (mtCO2) in FY21.', 100)
641 ('Scope 3 emissions increased by 22.7 percent.', 100)
642 ('5.8 GW of renewable energy In FY21, we signed new power purchase agreements (PPAs) for approximately 5.8 gigawatts (GW) of renewable energy across 10 countries around the globe, totaling more than 8 GW of renewable energy via PPAs or long-term contracts.', 100)
643 ('6 Carbon Water Waste Ecosystems 2.5M tons In FY21 and FY22, Mic

688 None
689 ('Building on this work, we’re announcing a new around-the-clock pilot in the Netherlands with energy provider Eneco and FlexiDAO, a technology supplier, which will match one of our Amsterdam datacenter’s hourly energy consumption with the Dutch offshore windfarm Borssele.', 100)
690 ('We used the EC3 tool in designing 17 new buildings (3 million square feet) in our Puget Sound campus modernization project, where we are on track to reduce embodied carbon emissions by at least 30 percent.', 100)
691 ('We are now using the EC3 tool around the world in both our campuses and our datacenters to track and reduce embodied carbon and have found opportunities to reduce concrete and steel embodied carbon by 30 to 60 percent in our datacenters.', 100)
692 ('Mass timber: At our Silicon Valley campus, we are using mass timber, resulting in the largest mass timber building in the United States.', 100)
693 ('In addition to using this lower carbon building material, we elected to keep two

733 ('For us, this means taking bold action.', 100)
734 ('Our goal to be carbon neutral by 2030 is both ambitious and necessary.', 100)
735 ('Achieve carbon neutrality for our entire carbon footprint, including products, by 2030.', 100)
736 ('And reduce related emissions by 75% compared with fiscal year 2015 40% emissions reduction since 2015 across our value chain Established the $200M Apple Restore Fund with the aim of removing over 1M metric tons of carbon per year 23M metric tons of emissions avoided in fiscal year 2021 alone due to carbon reduction initiatives across our value chain Achieved since April 2020 by implementing energy efficiency initiatives, sourcing 100% renewable electricity for Apple facilities, and securing carbon offsets for the remaining corporate emissions Become carbon neutral for corporate operations 0 Transition our entire manufacturing supply chain to 100% renewable electricity by 2030 As of March 2022, 213 suppliers have committed to 100% renewable electri

765 ('The GHG emissions reduction targets we have recently set, combined with the significant investments we have made internationally in on-site renewable energy generation and advanced energy storage, not only help address climate change but also help our customers meet their own GHG emissions reduction commitments.” Marek Busfy, SVP and Chief Executive Officer, Africa Our SBTi Approved GHG Emissions Reduction Targets1 Scope 3 American Tower is targeting a 40% reduction in indirect scope 3 value chain GHG emissions by 2035 from a 2019 base year.', 100)
766 ('Scope 1 and 2 American Tower is targeting a 40% reduction in absolute scope 1 and 2 GHG emissions by 2035 from a 2019 base year.', 100)
767 ('Region 2019 2020 2021 Africa 20.7 15.2 14.5 APAC 11.3 10.3 9.6 Europe 0.0 0.0 0.7 Latin America 0.2 0.1 0.1 U.S. and Canada 0.8 0.8 0.9 Average 6.9 6.2 5.1 Scope 1 and 2 GHG Emissions per Tower (MTCO2e) In 2021, our scope 1 emissions decreased 6.5% to 636,157 MTCO2e and our scope 2 (market-

812 ('New technologies will be piloted and scaled commercially.', 100)
813 ('Green fuels and materials will be scaling and the feasibility for hydrogen usage as an agent of decarbonization in multiple applications will be growing.', 100)
814 ('SMRs will continue to be used for most large-scale hydrogen production.', 100)
815 ('Optimize Operational Energy Use and GHG Emissions Linde’s 2020 Scope 1 GHG emissions were 16.2 MM MT CO₂e.', 100)
816 ('This represents a 1 percent absolute reduction, driven by lower hydrogen production due to the pandemic, improved HyCO GHG efficiency.', 100)
817 ('Linde’s Scope 2 GHG emissions were 21 MM MT CO₂e.', 100)
818 ('This represents a 6 percent absolute reduction versus 2019, driven by improvements in plant specific grid emission factors; increased low-carbon (LC) and renewable power purchases; and lower production due to the pandemic, as well as updated methodology for Scope 2 emissions from steam.', 100)
819 None
820 ('In 2020, Linde’s GHG intensity

852 ('To underscore our commitment to supporting the health of the planet and leading in science-based climate action, we have identified the following high-level environmental goals: • Reduce absolute direct and indirect (Scope 1 and 2 under our operational control) GHG emissions by 50% below 2021 levels by 2030 • Achieve net zero (Scope 1 and 2) GHG emissions by 2040; and be net positive across our entire value chain (cradle-to-gate) by 205010 • Verify zero waste to landfill for all major manufacturing facilities while minimizing single-use plastics and championing water stewardship by 2030 As we progress on our integration activities and understand our new 2021 combined company baseline, we are expanding on these goals and adding critical internal key performance indicators.', 100)
853 ('/ TJX purchased 29,000 megawatt hours more renewable and low-carbon energy including renewable energy certificates (RECs), onsite solar power purchase agreements (PPAs), and utility-supplied renewab

878 ('Newmont Corporation\u20032021 Climate Report 36 Performance, Metrics and Targets\u2002/\u20022050 Carbon-Neutral Goal 2050 Carbon-Neutral Goal Newmont’s goal to be carbon neutral by 2050 is supported by a combination of our long-life portfolio of gold and copper projects with anticipated economic, technological and policy improvements.', 100)
879 ('We will also work concurrently to decarbonize our existing operations, develop and adopt low-carbon technologies, and reconceptualize our greenfield and brownfield project pipeline as carbon neutral.', 87)
880 ('Newmont Corporation\u20032021 Climate Report 38 Performance, Metrics and Targets\u2002/\u20022050 Carbon-Neutral Goal CARBON NEUTRALIZATION Achieving our carbon‑neutral goal by 2050 will require carbon removal to neutralize limited “hard to abate” emissions that cannot yet be eliminated.', 100)
881 ('These carbon removals are commonly referred to as carbon offsets and such abatement may consider carbon sequestration or capture,

914 ('Over 40% of the  electricity we generated in 2021 was from  carbon-free sources, including nuclear, wind,  hydro and solar.', 97)
915 ('Thirty six percent was from  lower-carbon natural gas, which emits about  half as much carbon dioxide as coal.', 98)
916 ('And  about 22% was from higher-carbon coal and  oil.', 96)
917 ('Taken together, owned and purchased  renewables are equivalent to almost 11% of  our electricity generation.', 98)
918 ('- 3 4 35 2.', 88)
919 ('Scope 1 Emissions Emissions From Electric Generation 1 2005 2019 2020 2021 CO2 emissions (thousand metric/short tons)  139,000/ 153,000 84,000/  93,000 74,000/  82,000 77,000/  85,000 CO2 emissions intensity (pounds per net kWh) 1.29 0.86 0.78 0.79 SO2 emissions (metric/short tons)  1,004,000/  1,107,000 28,000/  31,000 24,000/  27,000 23,000/  25,000 SO2 emissions intensity (pounds per net MWh) 9.3 0.3 0.3 0.2 NOX emissions (metric/short tons)  221,000/  244,000 45,000/  50,000 39,000/  43,000 38,000/  42,000 NOX emiss

### Check the outcome of string matched sentences and save the files

In [196]:
 # Built from the same three lists as `string_matched`, under a separate name;
 # presumably a backup taken before the frame is modified -- TODO confirm.
 string_matched_old = pd.DataFrame(
     [matrel, orgrel, matlabel],
     index=['matched_rel', 'original_rel', 'company_label'],
 ).T

In [197]:
# One row per sentence: the string-match result, the original relevant
# sentence, and its company label.
string_matched = pd.DataFrame(
    [matrel, orgrel, matlabel],
    index=['matched_rel', 'original_rel', 'company_label'],
).T

In [198]:
# Keep only the sentence from each (sentence, ratio) match tuple;
# rows without a match get an empty string.
string_matched['matched_rel_only'] = [
    '' if match is None else match[0]
    for match in string_matched['matched_rel']
]

In [199]:
# Persist the matching result; the next section reloads it from this file.
string_matched.to_csv(path_or_buf="string_matched.csv", encoding='utf-8-sig')

#### =============================== RESUME RUNNING HERE ====================================

### Keep track of sentences that didn't get matched

In [200]:
# Reload the saved matching result, using the first CSV column as the index.
string_matched = pd.read_csv(filepath_or_buffer='string_matched.csv', index_col=0)

In [201]:
string_matched

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...
959,"('2021 Added 2,008 MW of wind, 1,547 MW of sol...","2021 Added 2,008 MW of wind, 1,547 MW of solar...",72,"2021 Added 2,008 MW of wind, 1,547 MW of solar..."


In [202]:
# Rows whose original sentence has no match.
still_not_matched = string_matched[string_matched['matched_rel'].isna()]

In [229]:
still_not_matched

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only,sent_count
32,,"3,696,540 Total Energy Use (intensity) 193.7 R...",5,,29
46,,"From renewable sources 3,355 Percent renewable...",6,,68
55,,Generated Electricity (Cogeneration) 196 Gener...,8,,42
56,,North America Region Total 63 Electricity From...,8,,14
61,,3MW 3.5MW of solar power installed at our site...,9,,16
63,,"Use of sold products (category 11) 2,228,880 T...",9,,43
123,,One of our largest renewable energy contracts ...,15,,15
187,,"% Renewable Electricity1,2 40.5% % of Waste Re...",17,,9
218,,Progress Rail approved the use of up to 20% bi...,20,,60
237,,Scope 1+2 – Net Zero by 2050 Scope 3 – Net Zer...,21,,33


### To Understand why String Matching Failed for Some Sentences

In [218]:
# Print every sentence of company '0051', each followed by a blank line.
for sentence in total_all.loc[total_all['company_index'] == '0051', 'all_sentences']:
    print(sentence, end="\n\n")

Our Purpose, Our Progress 2021 Cisco Purpose Report

The journey continues Our purpose to Power an Inclusive Future for All is a never-ending journey—one constantly refueled by new opportunities to pursue and new challenges to overcome.

In this year’s Purpose Report, we share our progress over the past year and our work ahead.

This report contains forward-looking statements regarding future events.

All statements other than statements of historical facts are statements that could be deemed forward-looking statements.

These statements are based on current expectations, estimates, forecasts, and projections about the industries in which we operate and the beliefs and assumptions of our management.

Readers are cautioned that these forward- looking statements are only predictions and are subject to risks, uncertainties, and assumptions that are difficult to predict, including those identified in our most recent filings with the Securities and Exchange Commission on Form 10-K and Form 

In [211]:
# Print each unmatched original sentence with its company label.
for original, label in zip(still_not_matched['original_rel'], still_not_matched['company_label']):
    print(original, "-->", label, end="\n\n")

3,696,540 Total Energy Use (intensity) 193.7 Reduction -9.0% y/y Total GHG Emissions, Scope 1 and 2 312,860 Total GHG Emissions, Scope 1 and 2 (intensity) 16.4 Reduction -7.5% y/y --> 5

From renewable sources 3,355 Percent renewable electricity by region North America 67%◊ Europe 79%◊ On-site generated energy use by type (TJ)1 Co-generation 456 Wind 111 Solar PV 86 Geothermal 21 Fuel cell 4 Biomass 3 On-site clean/renewable energy capacity by type4 Solar PV 44% Co-generation 29% Wind 23% Geothermal 2% Biomass 2% Fuel cell 1% On-site clean/renewable energy technology capacity, (MW) 67.2 Electricity generated from renewable sources 52%◊ --> 6

Generated Electricity (Cogeneration) 196 Generated Electricity (Renewables) 7 Generated On-Site from Renewables 7 Purchased Renewables from Third Party 678 Renewable Energy 678 % of Renewable Energy Purchased 15% Energy Consumption by Source(o) 1,000 Gigajoules Electricity Consumption by Type Purchased Energy Consumed (1,000 --> 8

North America R

In [219]:
# Number of whitespace-separated tokens in each unmatched sentence.
# `still_not_matched` is a filtered slice of `string_matched`; building a new
# frame with `assign` avoids the SettingWithCopyWarning that in-place column
# assignment on that slice raises.
still_not_matched = still_not_matched.assign(
    sent_count=still_not_matched['original_rel'].str.split().str.len()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  still_not_matched['sent_count'] = still_not_matched['original_rel'].str.split().str.len()


In [220]:
# Save the unmatched sentences (the file name spelling is kept as it is).
still_not_matched.to_csv(path_or_buf='sentenences_not_matched.csv', encoding='utf-8-sig')

In [230]:
still_not_matched.head()

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only,sent_count
32,,"3,696,540 Total Energy Use (intensity) 193.7 R...",5,,29
46,,"From renewable sources 3,355 Percent renewable...",6,,68
55,,Generated Electricity (Cogeneration) 196 Gener...,8,,42
56,,North America Region Total 63 Electricity From...,8,,14
61,,3MW 3.5MW of solar power installed at our site...,9,,16


### Keep track of sentences that got matched (only use this version from now on: disregard sentences not matched and short sentences)

In [231]:
# Rows whose original sentence did get a match.
sentences_matched = string_matched[string_matched['matched_rel'].notna()]

In [232]:
# Keep the first row for each (matched sentence, company) pair.
sentences_matched = sentences_matched.drop_duplicates(
    subset=['matched_rel_only', 'company_label'],
    keep='first',
)

In [233]:
# Persist the matched sentences; section B-4 reloads them from this file.
sentences_matched.to_csv(path_or_buf='sentences_matched.csv', encoding='utf-8-sig')

In [234]:
len(sentences_matched)

913

In [235]:
# Matched sentences increased from 881 to 913 (32 more)

### RUN FROM HERE

### B-4) Separate relevant and irrelevant sentences: aligning matched sentences with total_all sentences

In [451]:
# Reload the matched sentences, using the first CSV column as the index.
sentences_matched = pd.read_csv(filepath_or_buffer='sentences_matched.csv', index_col=0)

In [452]:
sentences_matched

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
955,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re...",72,"2017 Retired and demolished 250 MW of coal, re..."
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...


In [453]:
sentences_matched[sentences_matched.matched_rel_only.duplicated()]

Unnamed: 0,matched_rel,original_rel,company_label,matched_rel_only
853,"('/ TJX purchased 29,000 megawatt hours more r...",Renewable energy certificates (RECs)† 330,62,"/ TJX purchased 29,000 megawatt hours more ren..."


In [454]:
# Keep only the first row for each matched sentence.
sentences_matched = sentences_matched.drop_duplicates(subset='matched_rel_only', keep='first')

In [455]:
# Rename the join column so it lines up with `total_relevant`.
original_merge_match = sentences_matched.rename(
    columns=dict(original_rel='relevant_sentences')
)

In [456]:
original_merge_match

Unnamed: 0,matched_rel,relevant_sentences,company_label,matched_rel_only
0,"('Looking toward the future, we have set clima...","In 2021, 9.6% of our purchased electricity cam...",1,"Looking toward the future, we have set climate..."
1,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...,1,A large portion of this renewable electricity ...
2,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction...",1,"From 2012 to 2020, we achieved a 26% reduction..."
3,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r...",1,"In 2021, we achieved a 9% absolute emissions r..."
4,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...,1,This reduction was partially driven by energy ...
...,...,...,...,...
955,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re...",72,"2017 Retired and demolished 250 MW of coal, re..."
956,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal and...,72,2018 Retired and demolished 636 MW of coal an...
957,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ...",72,"2019 Aquired Gulf Power, which added 1,750 MW ..."
958,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of c...,72,2020 Retired 615 MW of nuclear and 330 MW of ...


In [457]:
# Attach the match columns to every relevant sentence (unmatched rows get missing values).
merge_rel_matched = pd.merge(
    total_relevant,
    original_merge_match,
    how='left',
    on='relevant_sentences',
)

In [458]:
# Keep only rows that found a match and drop the label column taken from the right-hand frame.
merge_rel_matched = (
    merge_rel_matched
    .loc[merge_rel_matched['matched_rel'].notna()]
    .drop(columns=['company_label_y'])
)

In [459]:
merge_rel_matched

Unnamed: 0,relevant_sentences,company_label_x,company_index,matched_rel,matched_rel_only
0,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,0001,"('Looking toward the future, we have set clima...","Looking toward the future, we have set climate..."
1,A large portion of this renewable electricity ...,EliLilly,0001,('A large portion of this renewable electricit...,A large portion of this renewable electricity ...
2,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,"('From 2012 to 2020, we achieved a 26% reducti...","From 2012 to 2020, we achieved a 26% reduction..."
3,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,"('In 2021, we achieved a 9% absolute emissions...","In 2021, we achieved a 9% absolute emissions r..."
4,This reduction was partially driven by energy ...,EliLilly,0001,('This reduction was partially driven by energ...,This reduction was partially driven by energy ...
...,...,...,...,...,...
993,"2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,"('2017 Retired and demolished 250 MW of coal, ...","2017 Retired and demolished 250 MW of coal, re..."
994,2018 Retired and demolished 636 MW of coal and...,NextEraEnergyZeroCarbonBlueprint,0072,('2018 Retired and demolished 636 MW of coal ...,2018 Retired and demolished 636 MW of coal an...
995,"2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,"('2019 Aquired Gulf Power, which added 1,750 M...","2019 Aquired Gulf Power, which added 1,750 MW ..."
996,2020 Retired 615 MW of nuclear and 330 MW of c...,NextEraEnergyZeroCarbonBlueprint,0072,('2020 Retired 615 MW of nuclear and 330 MW o...,2020 Retired 615 MW of nuclear and 330 MW of ...


In [460]:
merge_rel_matched[merge_rel_matched.relevant_sentences.duplicated()]

Unnamed: 0,relevant_sentences,company_label_x,company_index,matched_rel,matched_rel_only


In [461]:
# Rename the matched sentence column so it can be joined against `total_all`.
merge_rel_all = merge_rel_matched.rename(
    columns=dict(matched_rel_only='all_sentences')
)

In [462]:
# Look up each matched sentence in `total_all` to pick up its company and sentence index.
test_merge = pd.merge(merge_rel_all, total_all, how='left', on='all_sentences')

In [463]:
# Discard the company columns that came from the left-hand frame.
test_merge = test_merge.drop(columns=['company_label_x', 'company_index_x'])

In [464]:
test_merge_clean = test_merge.drop(columns=['matched_rel'])
# Display only: the renamed frame is not assigned, so `test_merge_clean`
# itself keeps the `company_index_y` column.
test_merge_clean.rename(columns=dict(company_index_y='company_index'))

Unnamed: 0,relevant_sentences,all_sentences,company_label,company_index,sent_index
0,"In 2021, 9.6% of our purchased electricity cam...","Looking toward the future, we have set climate...",EliLilly,0001,0020
1,A large portion of this renewable electricity ...,A large portion of this renewable electricity ...,EliLilly,0001,0021
2,"From 2012 to 2020, we achieved a 26% reduction...","From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,0025
3,"In 2021, we achieved a 9% absolute emissions r...","In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,0026
4,This reduction was partially driven by energy ...,This reduction was partially driven by energy ...,EliLilly,0001,0027
...,...,...,...,...,...
907,"2017 Retired and demolished 250 MW of coal, re...","2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,0052
908,2018 Retired and demolished 636 MW of coal and...,2018 Retired and demolished 636 MW of coal an...,NextEraEnergyZeroCarbonBlueprint,0072,0053
909,"2019 Aquired Gulf Power, which added 1,750 MW ...","2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,0054
910,2020 Retired 615 MW of nuclear and 330 MW of c...,2020 Retired 615 MW of nuclear and 330 MW of ...,NextEraEnergyZeroCarbonBlueprint,0072,0055


In [465]:
# Tag every matched row as relevant. `test_rel` is bound to the same DataFrame
# object (no copy is made), so the new column is visible through both names.
test_merge_clean['label'] = 'rel'
test_rel = test_merge_clean

In [466]:
# Final column names for the relevant-sentence table.
test_rel = test_rel.rename(
    columns=dict(all_sentences='rel_match_all', company_index_y='company_index')
)

In [467]:
test_rel

Unnamed: 0,relevant_sentences,rel_match_all,company_label,company_index,sent_index,label
0,"In 2021, 9.6% of our purchased electricity cam...","Looking toward the future, we have set climate...",EliLilly,0001,0020,rel
1,A large portion of this renewable electricity ...,A large portion of this renewable electricity ...,EliLilly,0001,0021,rel
2,"From 2012 to 2020, we achieved a 26% reduction...","From 2012 to 2020, we achieved a 26% reduction...",EliLilly,0001,0025,rel
3,"In 2021, we achieved a 9% absolute emissions r...","In 2021, we achieved a 9% absolute emissions r...",EliLilly,0001,0026,rel
4,This reduction was partially driven by energy ...,This reduction was partially driven by energy ...,EliLilly,0001,0027,rel
...,...,...,...,...,...,...
907,"2017 Retired and demolished 250 MW of coal, re...","2017 Retired and demolished 250 MW of coal, re...",NextEraEnergyZeroCarbonBlueprint,0072,0052,rel
908,2018 Retired and demolished 636 MW of coal and...,2018 Retired and demolished 636 MW of coal an...,NextEraEnergyZeroCarbonBlueprint,0072,0053,rel
909,"2019 Aquired Gulf Power, which added 1,750 MW ...","2019 Aquired Gulf Power, which added 1,750 MW ...",NextEraEnergyZeroCarbonBlueprint,0072,0054,rel
910,2020 Retired 615 MW of nuclear and 330 MW of c...,2020 Retired 615 MW of nuclear and 330 MW of ...,NextEraEnergyZeroCarbonBlueprint,0072,0055,rel


In [468]:
total_all

Unnamed: 0,all_sentences,company_label,company_index,sent_index
0,"7/7/22, 10:29 AM Environmental | 2021 ESG Repo...",EliLilly,0001,0001
1,Making medicines requires the use of valuable ...,EliLilly,0001,0002
2,We’re committed to reducing our environmental ...,EliLilly,0001,0003
3,"To track our progress, we measure and manage e...",EliLilly,0001,0004
4,"Lilly manages health, safety and the environme...",EliLilly,0001,0005
...,...,...,...,...
3567,→ NextEra Energy Resources would invest in el...,NextEraEnergyZeroCarbonBlueprint,0072,0219
3568,→ All non-FPL fossil generation assets would ...,NextEraEnergyZeroCarbonBlueprint,0072,0220
3569,→ Vehicle fleet conversions are based on the ...,NextEraEnergyZeroCarbonBlueprint,0072,0221
3570,Assumptions 0 0,NextEraEnergyZeroCarbonBlueprint,0072,0222


In [469]:
# Renumber the rows 0..n-1 and discard the old index.
total_all = total_all.reset_index(drop=True)

In [470]:
# Outer-join all sentences with the matched ones; `_merge` records which side each row came from.
test_irr = pd.merge(
    total_all,
    test_merge_clean,
    how='outer',
    on='all_sentences',
    indicator=True,
)

In [471]:
# Rows without a relevant sentence attached are the irrelevant ones.
test_irr = test_irr[test_irr['relevant_sentences'].isna()]

In [472]:
# Overwrite the `label` column that came in through the merge with 'irr'
# for every remaining row.
test_irr['label'] = 'irr'

In [473]:
# Remove the columns contributed by the matched frame and the merge indicator.
test_irr = test_irr.drop(
    columns=['relevant_sentences', 'company_label_y', 'company_index_y', 'sent_index_y', '_merge']
)

In [474]:
# Renumber the rows 0..n-1 and discard the old index.
test_irr = test_irr.reset_index(drop=True)

In [475]:
# Strip the merge suffixes from the columns kept from `total_all`.
test_irr = test_irr.rename(
    columns=dict(company_label_x='company_label', sent_index_x='sent_index')
)

In [476]:
def save_with_quoted_index(frame, path, columns=('company_index', 'sent_index')):
    """Rewrite the index columns as ``="<value>"`` text and save ``frame`` as CSV.

    The columns are modified on ``frame`` itself. Values that already start
    with ``="`` are left alone, so running this cell again does not wrap them
    a second time.
    Presumably the ``="..."`` form is there so spreadsheet tools keep the
    leading zeros of the zero-padded indices -- TODO confirm.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing every column named in ``columns``.
    path : str
        Destination CSV file, written with the 'utf-8-sig' encoding.
    columns : iterable of str
        Columns to wrap before saving.
    """
    for column in columns:
        values = frame[column].astype(str)
        already_wrapped = values.str.startswith('="')
        frame[column] = values.where(already_wrapped, '="' + values + '"')
    frame.to_csv(path, encoding = 'utf-8-sig')


save_with_quoted_index(total_all, 'all_with_index.csv')
save_with_quoted_index(test_rel, 'rel_with_index.csv')
save_with_quoted_index(test_irr, 'irr_with_index.csv')

In [363]:
len(test_irr)

77728

In [364]:
len(test_merge_clean)

912

In [365]:
len(total_all)

78640

In [413]:
# Reindexing sentences: reload the matched sentences and renumber the rows
# 0..n-1. `drop = True` discards the old index instead of adding an `index`
# column, so no follow-up `drop` is needed.
sentences_matched = pd.read_csv('sentences_matched.csv', index_col = 0)
sentences_matched = sentences_matched.reset_index(drop = True)

In [414]:
# One row per entry of `all_var`, paired with the matching entry of `all_lab`.
all_var_df = pd.DataFrame(
    [all_var, all_lab],
    index=['matched_rel_only', 'company_label'],
).T

In [415]:
# Right join: keep every row of `all_var_df`; rows without a match get missing match columns.
sentences_matched = pd.merge(
    sentences_matched,
    all_var_df,
    how='right',
    on=['matched_rel_only', 'company_label'],
)

In [416]:
# Rows that carry a match are the relevant sentences.
rel_from_all = sentences_matched[sentences_matched['matched_rel'].notna()]

In [417]:
# Rows without a match are the irrelevant sentences.
irr_from_all = sentences_matched[sentences_matched['matched_rel'].isna()]

In [418]:
# Report the size of both groups and their combined total.
print(
    "Relevant Sentences: ", len(rel_from_all),
    "| Irrelevant Sentences: ", len(irr_from_all),
    "| Total: ", sum(map(len, (rel_from_all, irr_from_all))),
)

Relevant Sentences:  871 | Irrelevant Sentences:  77241 | Total:  78112


In [419]:
# Reduce both frames to the shared layout (sentences, company_label, label)
# and stack them. Relevant rows keep the text of `original_rel`; irrelevant
# rows keep the text of `matched_rel_only`. Each frame is built in one chain,
# so no column is assigned on a filtered slice and no intermediate rename is
# needed.
rel_from_all = (
    rel_from_all[['original_rel', 'company_label']]
    .rename(columns = {'original_rel': 'sentences'})
    .assign(label = 'rel')
)
irr_from_all = (
    irr_from_all[['matched_rel_only', 'company_label']]
    .rename(columns = {'matched_rel_only': 'sentences'})
    .assign(label = 'irr')
)
all_sent = pd.concat([rel_from_all, irr_from_all])

In [426]:
len(all_sent)

78112

### Clean up extra spaces in sentences to further remove duplicates

In [427]:
# Collapse every run of spaces, non-breaking spaces (\xa0), tabs and carriage
# returns into a single space and trim both ends, in one pass. Running the
# replacements one after another left new runs of spaces behind, because the
# space-collapsing step came before \xa0, \t and \r were turned into spaces.
# The `.str` accessor also passes missing values through instead of raising.
all_sent['sentences'] = (
    all_sent['sentences']
    .str.replace(r'[ \xa0\t\r]+', ' ', regex = True)
    .str.strip()
)

In [432]:
all_sent[all_sent.duplicated(subset = 'sentences')]

Unnamed: 0,sentences,company_label,label
609,• Achieve a 60% reduction in scope 1 and scope...,UnitedHealthGroup,irr
1606,This includes overseeing the company’s program...,Merck,irr
1855,This agreement represents another landmark mil...,Merck,irr
1856,"The global stockpile will offer a critical, ra...",Merck,irr
2093,Prescription Drug Marketing Act and all applic...,Merck,irr
...,...,...,...
75840,Then we will prioritize the categories for whi...,Duke_Energy,irr
75856,These releases are expected to decrease signif...,Duke_Energy,irr
75875,"Building a safe, diverse and engaged workforce.",Duke_Energy,irr
75933,"In December, 71% of enrollments in payment ass...",Duke_Energy,irr


In [433]:
# Drop sentences that are identical once all spaces are removed (the first
# occurrence is kept), then any remaining exact duplicates.
all_sent = (
    all_sent[~all_sent['sentences'].str.replace(" ", "").duplicated()]
    .drop_duplicates(subset='sentences')
)

In [437]:
# Number the rows 1..n.
all_sent.index = list(range(1, len(all_sent) + 1))

In [438]:
# Copy the row index into an explicit `key` column.
all_sent['key'] = all_sent.index

In [439]:
# Put `key` first, followed by the sentence, company and label columns.
all_sent = all_sent[['key', 'sentences', 'company_label', 'label']]

In [440]:
all_sent

Unnamed: 0,key,sentences,company_label,label
1,1,"In 2021, 9.6% of our purchased electricity cam...",EliLilly,rel
2,2,A large portion of this renewable electricity ...,EliLilly,rel
3,3,"From 2012 to 2020, we achieved a 26% reduction...",EliLilly,rel
4,4,"In 2021, we achieved a 9% absolute emissions r...",EliLilly,rel
5,5,This reduction was partially driven by energy ...,EliLilly,rel
...,...,...,...,...
77273,77273,→ FPL’s four nuclear units continue to operate...,NextEraEnergyZeroCarbonBlueprint,irr
77274,77274,Technology We assume that: → FPL’s gas plants ...,NextEraEnergyZeroCarbonBlueprint,irr
77275,77275,→ NextEra Energy Resources would invest in ele...,NextEraEnergyZeroCarbonBlueprint,irr
77276,77276,→ All non-FPL fossil generation assets would r...,NextEraEnergyZeroCarbonBlueprint,irr


In [441]:
all_sent.drop_duplicates('sentences').groupby('label').count()[['key']]

Unnamed: 0_level_0,key
label,Unnamed: 1_level_1
irr,76406
rel,871


### Save extracted_relevant_sentences, extracted_irrelevant_sentences, all_sentences with keys

In [442]:
# Split the combined table by label.
rel, irr = (all_sent[all_sent['label'] == tag] for tag in ('rel', 'irr'))

In [443]:
# Write the combined table and the two per-label tables.
for frame, csv_name in (
    (all_sent, 'all_sentences.csv'),
    (rel, "extracted_relevant_sentences.csv"),
    (irr, "extracted_irrelevant_sentences.csv"),
):
    frame.to_csv(csv_name, encoding='utf-8-sig')

In [444]:
all_sent.groupby('label').count()[['key']]

Unnamed: 0_level_0,key
label,Unnamed: 1_level_1
irr,76406
rel,871


## C) Update Sentences Statistics

In [445]:
# rel_from_all = rel_from_all.rename(columns = {'original_rel': 'relevant_sentences'})
# irr_from_all = irr_from_all.rename(columns = {'matched_rel_only': 'irrelevant_sentences'})

In [446]:
# rel_from_all = rel_from_all[['relevant_sentences', 'company_label']]
# irr_from_all = irr_from_all[['irrelevant_sentences', 'company_label']]

In [447]:
# Name the sentence column after its label for the statistics below.
rel_from_all = rel.rename(columns=dict(sentences='relevant_sentences'))

In [448]:
# Name the sentence column after its label for the statistics below.
irr_from_all = irr.rename(columns=dict(sentences='irrelevant_sentences'))

In [449]:
# Relevant sentences per company, in order of first appearance.
total_relevant_stat = (
    rel_from_all
    .groupby('company_label', sort=False)[['relevant_sentences']]
    .count()
)

In [450]:
# Irrelevant sentences per company, in order of first appearance.
total_irrelevant_stat = (
    irr_from_all
    .groupby('company_label', sort=False)[['irrelevant_sentences']]
    .count()
)

In [451]:
# Left join on the irrelevant counts: companies missing from the relevant counts get a missing value.
total_stat_final = pd.merge(
    total_irrelevant_stat,
    total_relevant_stat,
    how='left',
    on='company_label',
    sort=False,
)

In [452]:
# Missing relevant counts become 0, and the column goes back to integers.
total_stat_final['relevant_sentences'] = (
    total_stat_final['relevant_sentences'].fillna(0).astype(int)
)

In [454]:
# Share of relevant sentences among all sentences of each company, in percent.
total_stat_final['rel/total percentages'] = (
    total_stat_final['relevant_sentences']
    .div(total_stat_final['relevant_sentences'] + total_stat_final['irrelevant_sentences'])
    .mul(100)
    .round(2)
)

In [455]:
# Save the per-company statistics.
total_stat_final.to_csv(path_or_buf='total_stat_final.csv')

In [456]:
# Column totals. Only the two sentence counts are meaningful here; the total
# of 'rel/total percentages' merely adds up the per-company percentages.
total_stat_final.sum()

irrelevant_sentences     76406.00
relevant_sentences         871.00
rel/total percentages      119.52
dtype: float64

### Save the sentence dictionary into json file

In [457]:
# Map each sentence key to its sentence text.
sent_dict = dict(zip(all_sent.key, all_sent.sentences))

In [458]:
import json

# Write the key -> sentence mapping as indented JSON with sorted keys.
with open('sentence_dict.json', 'w') as fp:
    fp.write(json.dumps(sent_dict, sort_keys=True, indent=4))

In [459]:
# Read the saved mapping back in.
with open('sentence_dict.json', 'r') as fp:
    data = json.loads(fp.read())

In [460]:
import os
import shutil
import glob

In [461]:
# Collect the full path of every CSV file directly inside `path`.
path = '/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks'
files = glob.glob("{}/*.csv".format(path))

In [462]:
# sentence_dict.json is not matched by the "*.csv" glob, so it is queued explicitly.
# The membership check keeps this cell idempotent: re-running it must not queue the
# file twice, otherwise the move loop would raise on the second entry because the
# file has already been moved away.
sentence_dict_file = '/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks/sentence_dict.json'
if sentence_dict_file not in files:
    files.append(sentence_dict_file)

In [463]:
# Move every queued file into the labelled-output folder, leaving
# string_matched.csv where it is.
excluded_file = '/Users/tylerryoo/t3/extracted_sentences/notebooks/final_extracted_statistics_notebooks/string_matched.csv'
destination_prefix = r'/Users/tylerryoo/t3/relevant_irrelevant_sentences_labeled_final/'

for source in files:
    if source != excluded_file:
        # Keep the original file name; only the containing folder changes.
        shutil.move(source, destination_prefix + source.split('/')[-1])

### Appendix

In [464]:
#### Previous Method to Separate relevant and irrelevant sentences --> also reordering sentences

In [224]:
# test_extract_relevant = []
# test_extract_relevant_label = []
# test_extract_irrelevant = []
# test_extract_irrelevant_label = []

# for sent, lab in zip(all_var, all_lab):
#     if sent in sentences_matched.matched_rel_only.to_list():
#         test_extract_relevant.append(sent)
#         test_extract_relevant_label.append(lab)
#     elif sent not in sentences_matched.matched_rel_only.to_list():
#         test_extract_irrelevant.append(sent)
#         test_extract_irrelevant_label.append(lab)

In [225]:
# len(test_extract_relevant), len(test_extract_relevant_label)

In [226]:
# len(test_extract_irrelevant), len(test_extract_irrelevant_label)

In [227]:
# ACTION: Make sure to match rel_from_all and extract_rel_var --> to preserve the order of relevant sentences 

In [228]:
# rel_from_all = pd.DataFrame(list(zip(test_extract_relevant, test_extract_relevant_label)), columns = ['relevant_sentences', 'company_label'])

In [229]:
# rel_from_all_reorder = rel_from_all.rename(columns = {'relevant_sentences': 'matched_rel_only'}) 

In [230]:
# rel_from_all_reorder = rel_from_all_reorder.merge(sentences_matched, how = 'left', on = ['matched_rel_only', 'company_label'])

In [231]:
# rel_from_all_reorder[rel_from_all_reorder.company_label == 'Target']

In [232]:
# After the merge, some rows have a null `original_rel`; they are discarded because those sentences are considered irrelevant
# for i in rel_from_all_reorder[rel_from_all_reorder.original_rel.isnull()]['matched_rel_only']:
#     print(i)

In [233]:
#  #remove null values after merge --> sentences were irrelevant
# rel_from_all_reorder = rel_from_all_reorder[rel_from_all_reorder.original_rel.notnull()]

In [234]:
# rel_from_all_reorder = rel_from_all_reorder[['original_rel', 'matched_rel_only', 'company_label']] 

In [235]:
# irr_from_all = pd.DataFrame(list(zip(test_extract_irrelevant, test_extract_irrelevant_label)),  columns = ['irrelevant_sentences', 'company_label'])

In [236]:
# org_rel = pd.DataFrame(list(zip(sentences_matched.original_rel, sentences_matched.company_label)), columns = ['relevant_sentences', 'company_label'])

In [237]:
# len(rel_from_all_reorder), len(irr_from_all)

In [238]:
# # save the final version of relevant sentences
# pd.DataFrame(rel_from_all).to_csv("extracted_relevant_sentences.csv", encoding = 'utf-8-sig')

In [239]:
# save the final version of irrelevant sentences
# pd.DataFrame(irr_from_all).to_csv("extracted_irrelevant_sentences.csv", encoding = 'utf-8-sig')

In [240]:
# save the final version of relevant sentences
# pd.DataFrame(org_rel).to_csv("extracted_relevant_sentences.csv", encoding = 'utf-8-sig')

In [241]:
# Remove sentences whose sent_count is below 10 (these are irrelevant sentences)
# still_not_matched = still_not_matched[still_not_matched['sent_count'] >= 10]

In [242]:
# more_match = []
# org_rel = []
# track = 0
# for i, j in zip(still_not_matched.original_rel, still_not_matched.company_label):
#     val = process.extractOne(i.lower(), [i.lower() for i in all_var], scorer = fuzz.partial_ratio)
#     more_match.append(val)
#     org_rel.append(i)
#     print(track, end = " ")
#     track += 1

In [243]:
# more_match_above = []
# for i, j in zip(more_match, org_rel):
#     if i[1] >= 80:
#         print(i)
#         print()
#         print(j)
#         print()
# #         more_match_above.append(i[0])

In [244]:
# extract_rel_var = []
# extract_mat1 = []

# # extracting relevant sentences with ratio greater than 90
# for i, j in zip(mat1, rel_var):
#     if i is None:
#         continue
#     elif i[1] >= 90:
#         extract_rel_var.append(j) # original relevant sentences
#         extract_mat1.append(i) # relevant sentences from all

In [245]:
# extract_mat1 = []
# extract_mat_label = []

# for i, j in zip(mat1, matlabel):
#     if i is None:
#         continue
#     elif i[1] >= 90:
#         extract_mat_label.append(j) # relevant sentences comp labels
#         extract_mat1.append(i) # relevant sentences from all

In [246]:
# len(extract_mat1), len(extract_mat_label), len(extract_rel_var)

In [247]:
# only saving extracted relevant sentences without ratio values  
# rel_from_all = []
# for i in extract_mat1:
#     rel_from_all.append(i[0])