# Scraping Sustainalytics ESG Ratings from Yahoo Finance

## Set up

In [1]:
import pandas as pd
import os
from datetime import datetime as dt
from urllib import request
import json

In [5]:
portfolio = pd.read_csv("tickers.csv")
tickers = portfolio[portfolio.type == "Equity"].ticker.tolist()
tickers.append("ANTM")
tickers.append("PEP")
tickers

['TGT', 'PEP', 'NOVN', 'AMT', 'SVT', 'UNH', 'VZ', 'ELV', 'ANTM', 'PEP']

## Scrape a ticker's data

### Reference code

In [11]:
url = "https://query2.finance.yahoo.com/v1/finance/esgChart?symbol=TGT"
connection = request.urlopen(url)

data = connection.read()
data_2 = json.loads(data)
Formatdata = data_2["esgChart"]["result"][0]["symbolSeries"]
Formatdata_2 = pd.DataFrame(Formatdata)
Formatdata_2["timestamp"] = pd.to_datetime(Formatdata_2["timestamp"], unit="s")

In [9]:
pd.set_option('display.max_rows', 50)
Formatdata_2

Unnamed: 0,timestamp,esgScore,governanceScore,environmentScore,socialScore
0,2014-09-01,66.00,74.00,60.00,66.00
1,2014-10-01,66.00,74.00,60.00,66.00
2,2014-11-01,66.00,74.00,60.00,66.00
3,2014-12-01,66.00,74.00,60.00,66.00
4,2015-01-01,66.00,74.00,60.00,66.00
...,...,...,...,...,...
91,2022-04-01,,,,
92,2022-05-01,14.79,5.06,2.17,7.56
93,2022-06-01,,,,
94,2022-07-01,,,,


### Test out reference code

In [32]:
# get a list of urls to scrape
ticker = "ANTM"

base_url = "https://query2.finance.yahoo.com/v1/finance/esgChart?symbol="
url = base_url + ticker

In [33]:
# open url and get json data
connection = request.urlopen(url)
jsondata = connection.read()

# decode json to Python objects
data = json.loads(jsondata)

In [34]:
data

{'esgChart': {'result': [{'peerGroup': 'Healthcare',
    'symbolSeries': {'timestamp': [1409529600,
      1412121600,
      1414800000,
      1417392000,
      1420070400,
      1422748800,
      1425168000,
      1427846400,
      1430438400,
      1433116800,
      1435708800,
      1438387200,
      1441065600,
      1443657600,
      1446336000,
      1448928000,
      1451606400,
      1454284800,
      1456790400,
      1459468800,
      1462060800,
      1464739200,
      1467331200,
      1470009600,
      1472688000,
      1475280000,
      1477958400,
      1480550400,
      1483228800,
      1485907200,
      1488326400,
      1491004800,
      1493596800,
      1496275200,
      1498867200,
      1501545600,
      1504224000,
      1506816000,
      1509494400,
      1512086400,
      1514764800,
      1517443200,
      1519862400,
      1522540800,
      1525132800,
      1527811200,
      1530403200,
      1533081600,
      1535760000,
      1538352000,
      1541030400,


In [35]:
# extract and format data (including the timestamp column)
try:
    peer_group = data["esgChart"]["result"][0]["peerGroup"]
except:
    print("\tno sustainability data!")

peer_series = pd.DataFrame(data["esgChart"]["result"][0]["peerSeries"])
peer_series["timestamp"] = pd.to_datetime(peer_series["timestamp"], unit="s")
peer_series["ticker"] = ticker

symbol_series = pd.DataFrame(data["esgChart"]["result"][0]["symbolSeries"])
symbol_series["timestamp"] = pd.to_datetime(symbol_series["timestamp"], unit="s")
symbol_series["ticker"] = ticker

In [36]:
peer_series

Unnamed: 0,timestamp,esgScore,governanceScore,environmentScore,socialScore,ticker
0,2014-09-01,58.267857,61.821429,60.875000,54.410714,ANTM
1,2014-10-01,58.375000,61.875000,61.017857,54.517857,ANTM
2,2014-11-01,58.357143,61.839286,61.017857,54.517857,ANTM
3,2014-12-01,58.321429,61.839286,61.017857,54.464286,ANTM
4,2015-01-01,58.875000,61.803571,61.678571,55.321429,ANTM
...,...,...,...,...,...,...
88,2022-01-01,,,,,ANTM
89,2022-02-01,22.345690,7.974035,2.282456,12.069474,ANTM
90,2022-03-01,,,,,ANTM
91,2022-04-01,,,,,ANTM


## Scrape list of tickers' data

In [6]:
# define scraper and formatting function

list_peer_series = []
list_symbol_series = []
no_data = []

def get_esgdata(base_url, ticker):
    
    print("getting data for", ticker, "...")
    
    # open url and get json data
    url = base_url + ticker
    connection = request.urlopen(url)
    jsondata = connection.read()

    # decode json to Python objects
    data = json.loads(jsondata)
    
    # extract and format data (including the timestamp column)
    try:
        peer_group = data["esgChart"]["result"][0]["peerGroup"]
    except:
        print("\tno sustainability data!")
        no_data.append(ticker)
        return

    peer_series = pd.DataFrame(data["esgChart"]["result"][0]["peerSeries"])
    peer_series["ticker"] = ticker
    peer_series["peer_group"] = peer_group
    list_peer_series.append(peer_series)

    symbol_series = pd.DataFrame(data["esgChart"]["result"][0]["symbolSeries"])
    symbol_series["ticker"] = ticker
    symbol_series["peer_group"] = peer_group
    list_symbol_series.append(symbol_series)
    
    print("data for", ticker, "retrieved")

    return

In [7]:
# get data for each url in list of urls

portfolio = pd.read_csv("tickers.csv")
tickers = portfolio[portfolio.type == "Equity"].ticker.tolist()
tickers.append("ANTM")
tickers.append("RMS.PA")
tickers.append("PEP")
base_url = "https://query2.finance.yahoo.com/v1/finance/esgChart?symbol="

for ticker in tickers:
    get_esgdata(base_url, ticker)
    
print("\ndata extraction complete!")
    
peer_data = pd.concat(list_peer_series)
symbol_data = pd.concat(list_symbol_series)

peer_data["timestamp"] = pd.to_datetime(peer_data["timestamp"], unit="s")
symbol_data["timestamp"] = pd.to_datetime(symbol_data["timestamp"], unit="s")

getting data for TGT ...
data for TGT retrieved
getting data for PEP ...
data for PEP retrieved
getting data for NOVN ...
	no sustainability data!
getting data for AMT ...
data for AMT retrieved
getting data for SVT ...
	no sustainability data!
getting data for UNH ...
data for UNH retrieved
getting data for VZ ...
data for VZ retrieved
getting data for ELV ...
data for ELV retrieved
getting data for ANTM ...
data for ANTM retrieved
getting data for RMS.PA ...
data for RMS.PA retrieved
getting data for PEP ...
data for PEP retrieved

data extraction complete!


In [8]:
peer_data = peer_data.reset_index(drop=True)
symbol_data = symbol_data.reset_index(drop=True)

In [9]:
peer_data

Unnamed: 0,timestamp,esgScore,governanceScore,environmentScore,socialScore,ticker,peer_group
0,2014-09-01,57.272727,64.236364,51.345455,58.036364,TGT,Retailing
1,2014-10-01,57.357143,64.285714,51.446429,58.125000,TGT,Retailing
2,2014-11-01,57.357143,64.285714,51.446429,58.107143,TGT,Retailing
3,2014-12-01,57.267857,64.285714,51.285714,58.125000,TGT,Retailing
4,2015-01-01,57.250000,64.410714,51.196429,58.000000,TGT,Retailing
...,...,...,...,...,...,...,...
761,2022-04-01,,,,,PEP,Food Products
762,2022-05-01,28.063696,6.393933,10.497079,10.959663,PEP,Food Products
763,2022-06-01,,,,,PEP,Food Products
764,2022-07-01,,,,,PEP,Food Products


In [10]:
symbol_data

Unnamed: 0,timestamp,esgScore,governanceScore,environmentScore,socialScore,ticker,peer_group
0,2014-09-01,66.00,74.00,60.00,66.00,TGT,Retailing
1,2014-10-01,66.00,74.00,60.00,66.00,TGT,Retailing
2,2014-11-01,66.00,74.00,60.00,66.00,TGT,Retailing
3,2014-12-01,66.00,74.00,60.00,66.00,TGT,Retailing
4,2015-01-01,66.00,74.00,60.00,66.00,TGT,Retailing
...,...,...,...,...,...,...,...
761,2022-04-01,,,,,PEP,Food Products
762,2022-05-01,16.01,4.55,4.91,6.55,PEP,Food Products
763,2022-06-01,,,,,PEP,Food Products
764,2022-07-01,,,,,PEP,Food Products


In [11]:
# write to csv
peer_data.to_csv("peer_data.csv", index=False)
symbol_data.to_csv("symbol_data.csv", index=False)