In [1]:
!pip3 install selenium



In [2]:
import pandas as pd
import numpy as np

## 1. Run Selenium and BeautifulSoup to scrape S&P Global ESG Scores page

In [3]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome("/chromedriver", options=chrome_options)

driver.get("https://www.spglobal.com/esg/scores/")
time.sleep(3)
click_accept = driver.find_elements(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
click_accept[0].click()

historical_scores = {}
company_data = {}

# Key: Ticker, Value: Official Company Name (Based on S&P Global website)
companies = {
    "AAPL": "Apple Inc.",
    "ABBV": "AbbVie Inc.",
    "AMZN": "Amazon.com, Inc.",
    "CVX": "Chevron Corporation",
    "GOOGL": "Alphabet Inc.",
    "HD": "The Home Depot, Inc.",
    "JNJ": "Johnson & Johnson",
    "JPM": "JPMorgan Chase & Co.",
    "LLY": "Eli Lilly and Company",
    "MA": "Mastercard Incorporated",
    "META": "Meta Platforms, Inc.",
    "MRK": "Merck & Co., Inc.",
    "MSFT": "Microsoft Corporation",
    "NVDA": "NVIDIA Corporation",
    "PFE": "Pfizer Inc.",
    "PG": "The Procter & Gamble Company",
    "TSLA": "Tesla, Inc.",
    "UNH": "UnitedHealth Group Incorporated",
    "V": "Visa Inc.",
    "XOM": "Exxon Mobil Corporation"
}


i = 1
for ticker, name in companies.items():
    print(f"##### Iteration {i}: SCRAPING {name} #####")
    if i == 1:
        search_box = driver.find_elements(By.XPATH, "/html/body/div[3]/div[11]/div/div[3]/div/div/div[2]/div[1]/input")
        search_box[0].send_keys(name)
    else:
        search_box = driver.find_elements(By.XPATH, "/html/body/div[3]/div[1]/div[3]/div/div/div[2]/div[1]/input")
        search_box[0].send_keys(name)
    time.sleep(1)
    search_box[0].send_keys(Keys.ENTER)
    time.sleep(5)

    try:
        ESG_score = driver.find_elements(By.XPATH, '//*[@id="esg-score"]')[0].text
        industry = driver.find_elements(By.XPATH, '//*[@id="company-industry"]')[0].text
        company_data[ticker] = industry
        time.sleep(3)

        historical_scores[ticker] = {}
        historical_scores[ticker]['ESG Scores By Year'] = []
        historical_scores[ticker]['ESG Scores By Category'] = {}
        historical_scores[ticker]['Company Performance'] = []
        historical_scores[ticker]['Industry Best Performance'] = []
        historical_scores[ticker]['Industry Mean Performance'] = []

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        historical_chart = soup.find('g', {'class': 'highcharts-markers highcharts-series-0 highcharts-line-series highcharts-tracker'})
        points = historical_chart.find_all('path', {'class':'highcharts-point'})
        for point in points:
            label = point.get('aria-label')
            historical_scores[ticker]['ESG Scores By Year'].append(label)

        company_perf = soup.find('g', {'class' : 'highcharts-markers highcharts-series-0 highcharts-area-series highcharts-tracker'})
        company_points = company_perf.find_all('path', {'class': 'highcharts-point'})
        for point in company_points:
            label = point.get('aria-label')
            historical_scores[ticker]['Company Performance'].append(label)

        industry_mean = soup.find('g', {'class' : 'highcharts-markers highcharts-series-2 highcharts-line-series highcharts-tracker'})
        industry_mean_points = industry_mean.find_all('path', {'class': 'highcharts-point'})
        for point in industry_mean_points:
            label = point.get('aria-label')
            historical_scores[ticker]['Industry Mean Performance'].append(label)

        industry_best = soup.find('g', {'class' : 'highcharts-markers highcharts-series-1 highcharts-line-series highcharts-tracker'})
        industry_best_points = industry_best.find_all('path', {'class': 'highcharts-point'})
        for point in industry_best_points:
            label = point.get('aria-label')
            historical_scores[ticker]['Industry Best Performance'].append(label)

        environmental_score_div = soup.find('div', {'class': 'dimention-chart1'})
        environmental_scores = environmental_score_div.find_all('li')
        historical_scores[ticker]['ESG Scores By Category']['Environmental'] = []
        for score in environmental_scores:
            historical_scores[ticker]['ESG Scores By Category']['Environmental'].append(score.text)

        social_score_div = soup.find('div', {'class': 'dimention-chart2'})
        social_scores = social_score_div.find_all('li')
        historical_scores[ticker]['ESG Scores By Category']['Social'] = []
        for score in social_scores:
            historical_scores[ticker]['ESG Scores By Category']['Social'].append(score.text)

        governance_score_div = soup.find('div', {'class': 'dimention-chart3'})
        governance_scores = governance_score_div.find_all('li')
        historical_scores[ticker]['ESG Scores By Category']['Governance'] = []
        for score in governance_scores:
            historical_scores[ticker]['ESG Scores By Category']['Governance'].append(score.text)

    except:
        if i == 1:
            search_box = driver.find_elements(By.XPATH, "/html/body/div[3]/div[11]/div/div[3]/div/div/div[2]/div[1]/input")
            search_box[0].clear()
        else:
            search_box = driver.find_elements(By.XPATH, "/html/body/div[3]/div[1]/div[3]/div/div/div[2]/div[1]/input")
            search_box[0].clear()
        print(f"!!!!!!!!!! unable to scrape {ticker} !!!!!!!!!!")
    i += 1
    print(f"##### SCRAPING DONE FOR {name} #####")

driver.close()
print("##################################################")
print(f"##### SCRAPING COMPLETED #####")

  driver = webdriver.Chrome("/chromedriver", options=chrome_options)


##### Iteration 1: SCRAPING Apple Inc. #####
##### SCRAPING DONE FOR Apple Inc. #####
##### Iteration 2: SCRAPING AbbVie Inc. #####
##### SCRAPING DONE FOR AbbVie Inc. #####
##### Iteration 3: SCRAPING Amazon.com, Inc. #####
##### SCRAPING DONE FOR Amazon.com, Inc. #####
##### Iteration 4: SCRAPING Chevron Corporation #####
##### SCRAPING DONE FOR Chevron Corporation #####
##### Iteration 5: SCRAPING Alphabet Inc. #####
##### SCRAPING DONE FOR Alphabet Inc. #####
##### Iteration 6: SCRAPING The Home Depot, Inc. #####
##### SCRAPING DONE FOR The Home Depot, Inc. #####
##### Iteration 7: SCRAPING Johnson & Johnson #####
!!!!!!!!!! unable to scrape JNJ !!!!!!!!!!
##### SCRAPING DONE FOR Johnson & Johnson #####
##### Iteration 8: SCRAPING JPMorgan Chase & Co. #####
##### SCRAPING DONE FOR JPMorgan Chase & Co. #####
##### Iteration 9: SCRAPING Eli Lilly and Company #####
##### SCRAPING DONE FOR Eli Lilly and Company #####
##### Iteration 10: SCRAPING Mastercard Incorporated #####
##### SCRA

In [4]:
# Helper Function to Process Scrapped Data
def process_historical_scores(scores):
    """
    Takes in scrapped data and extract Historical ESG Scores, Company Performance in various categories,
    and Industry Performances (Mean and Best) in the same categories.

    Returns historical_ESG_scores -> dict, ESG_component_scores -> dict, company_performances -> dict
    """
    historical_ESG_scores = {}
    ESG_component_scores = {}
    company_performances = {}

    for ticker, scrapped_data in scores.items():
        historical_ESG_scores[ticker] = {}
        company_performances[ticker] = {}
        ESG_component_scores[ticker] = {}

        for data in scrapped_data['ESG Scores By Year']:
            split = data.split(", ")
            year = split[0].split(". ")[-1]
            esg_score = split[-1].split(".")[0]
            historical_ESG_scores[ticker][int(year)] = int(esg_score)
        
        for component, data in scrapped_data['ESG Scores By Category'].items():
            ESG_component_scores[ticker][component] = {}
            ESG_component_scores[ticker][component]['Company'] = int(data[0].split(" ")[-1])
            ESG_component_scores[ticker][component]['Industry Mean'] = int(data[1].split(" ")[-1])
            ESG_component_scores[ticker][component]['Industry Best'] = int(data[2].split(" ")[-1])

        for data in scrapped_data['Company Performance']:
            if "/ " in data: # to handle "Information Security/ Cybersecurity & System Availability"
                data = data.replace("/ ", " ")
            split = data.split(", ")
            category = split[0].split(". ")[-1]
            perf_score = split[-1].split(".")[0]
            if category not in company_performances[ticker].keys():
                company_performances[ticker][category] = {}    
            company_performances[ticker][category]['Company'] = int(perf_score)

        for data in scrapped_data['Industry Best Performance']:
            if "/ " in data: # to handle "Information Security/ Cybersecurity & System Availability"
                data = data.replace("/ ", " ")
            split = data.split(", ")
            category = split[0].split(". ")[-1]
            perf_score = split[-1].split(".")[0]
            if category not in company_performances[ticker].keys():
                company_performances[ticker][category] = {}  
            company_performances[ticker][category]['Industry Best'] = int(perf_score)

        for data in scrapped_data['Industry Mean Performance']:
            if "/ " in data: # to handle "Information Security/ Cybersecurity & System Availability"
                data = data.replace("/ ", " ")
            split = data.split(", ")
            category = split[0].split(". ")[-1]
            perf_score = split[-1].split(".")[0]
            if category not in company_performances[ticker].keys():
                company_performances[ticker][category] = {}  
            company_performances[ticker][category]['Industry Mean'] = int(perf_score)

    return historical_ESG_scores, ESG_component_scores, company_performances

In [5]:
historical_ESG_scores, ESG_component_scores, company_performances = process_historical_scores(historical_scores)

In [6]:
component_dict = {}
for ticker, data in ESG_component_scores.items():
    component_dict[ticker] = pd.DataFrame.from_dict(data, orient='index')

component_dict['AAPL']

Unnamed: 0,Company,Industry Mean,Industry Best
Environmental,61,33,95
Social,24,29,89
Governance,29,32,89


In [7]:
performance_dict = {}
for ticker, data in company_performances.items():
    performance_dict[ticker] = pd.DataFrame.from_dict(data, orient='index').sort_index()
performance_dict['AAPL']

Unnamed: 0,Company,Industry Best,Industry Mean
Climate Strategy,78,98,33
Human Capital Development,45,100,42
Human Rights,33,100,27
Information Security Cybersecurity & System Availability,16,93,26
Innovation Management,4,100,19
Operational Eco-Efficiency,74,100,36
Product Stewardship,59,100,25
Supply Chain Management,6,99,27


In [8]:
performance_dict

{'AAPL':                                                     Company  Industry Best  \
 Climate Strategy                                         78             98   
 Human Capital Development                                45            100   
 Human Rights                                             33            100   
 Information Security Cybersecurity & System Ava...       16             93   
 Innovation Management                                     4            100   
 Operational Eco-Efficiency                               74            100   
 Product Stewardship                                      59            100   
 Supply Chain Management                                   6             99   
 
                                                     Industry Mean  
 Climate Strategy                                               33  
 Human Capital Development                                      42  
 Human Rights                                                   27  
 In

In [9]:
historical_esg_scores_df = pd.DataFrame.from_dict(historical_ESG_scores)
historical_esg_scores_df

Unnamed: 0,AAPL,ABBV,AMZN,CVX,GOOGL,HD,JPM,LLY,MA,META,MRK,MSFT,NVDA,PFE,PG,TSLA,UNH,V,XOM
2018,27,80,14,43,31,30,45,30,45,11,39,58,69,27,52,13,76,53,35
2019,29,76,18,40,38,31,37,30,58,15,40,57,72,29,60,14,69,58,37
2020,29,81,21,42,40,26,37,29,62,14,39,58,74,31,60,15,70,63,36
2021,32,82,24,39,44,37,40,33,59,18,38,58,74,30,58,27,74,62,36
2022,37,75,22,43,46,33,36,41,61,25,43,56,72,35,49,37,79,65,37


In [10]:
industries = set(company_data.values())
industries_company_dict = {}
for industry in industries:
    industries_company_dict[industry] = [company for company in company_data.keys() if company_data[company] == industry]
industries_company_dict

{'Industry: SOF Software': ['MSFT'],
 'Industry: AUT Automobiles': ['TSLA'],
 'Industry: HEA Health Care Providers & Services': ['UNH'],
 'Industry: THQ Computers & Peripherals and Office Electronics': ['AAPL'],
 'Industry: DRG Pharmaceuticals': ['LLY', 'MRK', 'PFE'],
 'Industry: BTC Biotechnology': ['ABBV'],
 'Industry: BNK Banks': ['JPM'],
 'Industry: OGX Oil & Gas Upstream & Integrated': ['CVX', 'XOM'],
 'Industry: RTS Retailing': ['AMZN', 'HD'],
 'Industry: TSV IT services': ['MA', 'V'],
 'Industry: HOU Household Products': ['PG'],
 'Industry: SEM Semiconductors & Semiconductor Equipment': ['NVDA'],
 'Industry: IMS Interactive Media, Services & Home Entertainment': ['GOOGL',
  'META']}

In [11]:
lst = ['MA', 'V', 'MSFT','AAPL']
for company in lst:
    print(company)
    print(performance_dict[company])

MA
                                                    Company  Industry Best  \
Climate Strategy                                         77             99   
Environmental Policy & Management Systems                31            100   
Human Capital Development                                85             98   
Information Security Cybersecurity & System Ava...       57            100   
Innovation Management                                    39            100   
Operational Eco-Efficiency                               82            100   
Talent Attraction & Retention                            59             90   

                                                    Industry Mean  
Climate Strategy                                               18  
Environmental Policy & Management Systems                      25  
Human Capital Development                                      35  
Information Security Cybersecurity & System Ava...             26  
Innovation Management           

In [12]:
performance_dict['AAPL']

Unnamed: 0,Company,Industry Best,Industry Mean
Climate Strategy,78,98,33
Human Capital Development,45,100,42
Human Rights,33,100,27
Information Security Cybersecurity & System Availability,16,93,26
Innovation Management,4,100,19
Operational Eco-Efficiency,74,100,36
Product Stewardship,59,100,25
Supply Chain Management,6,99,27


### Exporting Historical ESG Scores

In [26]:
historical_ESG_scores_df = pd.DataFrame.from_dict(historical_ESG_scores)
historical_ESG_scores_df.to_csv("Historical_ESG_Scores/Companies_Historical_ESG_Scores.csv")

### Exporting ESG Component Score and Company Performance Scores

In [31]:
companies = list(ESG_component_scores.keys())
for company in companies:
    company_df = pd.DataFrame.from_dict(ESG_component_scores[company])
    company_perf_df = pd.DataFrame.from_dict(company_performances[company])
    company_df.to_csv(f"ESG_Component_Scores/{company}_ESG_Component.csv")
    company_perf_df.to_csv(f"Company_Performances/{company}_Performance.csv")