# ETL of Dow 30 Stock Data
#### Fabienne Zumbuehl; James Ye; Tanvir Khan

In [1]:
import os
import time

import pandas as pd
import psycopg2
#from bs4 import BeautifulSoup as bs
from splinter import Browser
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [2]:
# Keyword arguments pointing splinter at the ChromeDriver binary in the working directory.
executable_path = dict(executable_path='chromedriver.exe')
# Open a visible (non-headless) Chrome window that the scraping cells below drive.
browser = Browser('chrome', headless=False, **executable_path)

## Price data from cnbc.com

In [3]:
# CNBC page listing the Dow 30 constituents; source of the price table.
price_url = "https://www.cnbc.com/dow-30/"

In [4]:
# Load the CNBC page in the browser session.
browser.visit(price_url)
# Pause for five seconds before reading the page source
# (presumably to let the page finish rendering its table -- TODO confirm).
time.sleep(5)
# Keep the page's HTML so pandas can parse the table out of it.
price_html = browser.html

In [5]:
# Parse every <table> in the captured page; the first one is used as the price table.
price_data_tables = pd.read_html(price_html)
print("Number of tables: ", len(price_data_tables))
price_data_df = price_data_tables[0]

# Drop ticker 'DOW' because its sustainability score cannot be found on Yahoo Finance.
# The row is matched by symbol instead of a hard-coded row label (29): a positional
# drop removes the wrong company as soon as the source table is re-ordered.
price_data_df = price_data_df[price_data_df["SYMBOL"].astype(str).str.strip().str.upper() != "DOW"]

# Keep only the columns that are loaded into the database and give them snake_case names.
price_data_df = price_data_df[["SYMBOL","NAME", "PRICE", "LOW", "HIGH", "PREVIOUS CLOSE"]].copy()
price_data_df.rename(columns = {"SYMBOL":"symbol","NAME":"company", "PRICE":"price", "LOW":"low", "HIGH":"high", "PREVIOUS CLOSE":"previous_close"}, inplace=True)
price_data_df

Number of tables:  1


Unnamed: 0,symbol,company,price,low,high,previous_close
0,AXP,American Express Co,94.32,92.89,95.57,89.33
1,AAPL,Apple Inc,316.73,316.5,324.24,318.89
2,BA,Boeing Co,144.73,142.61,145.91,137.53
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47
4,CSCO,Cisco Systems Inc,44.84,44.68,45.8,44.9
5,CVX,Chevron Corp,93.3,92.03,94.34,90.28
6,XOM,Exxon Mobil Corp,45.91,45.47,46.3,44.6
7,GS,Goldman Sachs Group Inc,196.06,185.22,197.1,179.93
8,HD,Home Depot Inc,242.48,242.07,248.22,241.88
9,IBM,International Business Machines Corp,121.76,121.01,122.62,118.39


In [6]:
# Ticker symbols of the remaining companies, later iterated to look up ESG scores.
dow29_symbols = price_data_df.loc[:, "symbol"]
type(dow29_symbols)

pandas.core.series.Series

## Dividend data from dividend.com

In [7]:
# dividend.com page listing dividend data for the Dow 30 stocks.
dividend_url = "https://www.dividend.com/dividend-stocks/dow-30-dividend-stocks/"

In [8]:
# pandas fetches the URL itself and parses every <table> found on the page.
dividend_data_tables = pd.read_html(dividend_url)
print("Number of tables: ", len(dividend_data_tables))
# The first parsed table is used as the dividend data.
dividend_data_df = dividend_data_tables[0]
dividend_data_df.head()

Number of tables:  1


Unnamed: 0.1,Unnamed: 0,Stock Symbol,Company Name,DARS™ Rating,Dividend Yield,Closing Price,Annualized Dividend,Ex-Div Date,Pay Date
0,,XOM,Exxon Mobil,locked,7.81%,$41.97,$3.4800,2020-05-12,2020-06-10
1,,BA,Boeing Co.,locked,5.98%,$120.57,$8.2200,2020-02-13,2020-03-06
2,,CVX,Chevron Corp,locked,5.72%,$89.53,$5.1600,2020-05-18,2020-06-10
3,,IBM,IBM Corp,locked,5.51%,$114.33,$6.5200,2020-05-07,2020-06-10
4,,UTX,United Technologies,locked,4.89%,$86.01,$2.9400,2020-02-13,2020-03-10


In [9]:
# Remove the "Unnamed: 0" column that the scraped table carries.
# errors="ignore" makes this cell safe to re-run: once the column is gone a second
# run is a no-op, whereas `del` raised KeyError.
dividend_data_df.drop(columns="Unnamed: 0", errors="ignore", inplace=True)
# "Stock Symbol" stays a regular column (not the index); it is selected by name later.
dividend_data_df.head()

Unnamed: 0,Stock Symbol,Company Name,DARS™ Rating,Dividend Yield,Closing Price,Annualized Dividend,Ex-Div Date,Pay Date
0,XOM,Exxon Mobil,locked,7.81%,$41.97,$3.4800,2020-05-12,2020-06-10
1,BA,Boeing Co.,locked,5.98%,$120.57,$8.2200,2020-02-13,2020-03-06
2,CVX,Chevron Corp,locked,5.72%,$89.53,$5.1600,2020-05-18,2020-06-10
3,IBM,IBM Corp,locked,5.51%,$114.33,$6.5200,2020-05-07,2020-06-10
4,UTX,United Technologies,locked,4.89%,$86.01,$2.9400,2020-02-13,2020-03-10


In [10]:
# Work on an independent copy that holds only the dividend columns needed downstream.
dividend_reduced_df = dividend_data_df.loc[
    :,
    ['Stock Symbol', 'Dividend Yield', 'Annualized Dividend', 'Ex-Div Date', 'Pay Date'],
].copy()
# NOTE(review): the row with index label 28 is removed by position; the notebook does
# not show which ticker that is -- confirm it is still the intended row.
dividend_reduced_df = dividend_reduced_df.drop(index=28)
# dividend_df is a second name for the same frame, not a copy.
dividend_df = dividend_reduced_df
dividend_df

Unnamed: 0,Stock Symbol,Dividend Yield,Annualized Dividend,Ex-Div Date,Pay Date
0,XOM,7.81%,$3.4800,2020-05-12,2020-06-10
1,BA,5.98%,$8.2200,2020-02-13,2020-03-06
2,CVX,5.72%,$5.1600,2020-05-18,2020-06-10
3,IBM,5.51%,$6.5200,2020-05-07,2020-06-10
4,UTX,4.89%,$2.9400,2020-02-13,2020-03-10
5,WBA,4.60%,$1.8300,2020-05-19,2020-06-12
6,PFE,4.08%,$1.5200,2020-05-07,2020-06-05
7,MMM,4.03%,$5.8800,2020-05-21,2020-06-12
8,JPM,3.99%,$3.6000,2020-07-02,2020-07-31
9,CAT,3.66%,$4.1200,2020-04-17,2020-05-20


In [11]:
# Strip the unit symbols and convert both columns to floats.
# regex=False is passed explicitly: as a regular expression '$' is the end-of-string
# anchor rather than a literal dollar sign, and the default of `regex` differs between
# pandas versions. astype(str) first keeps the cell re-runnable after the columns
# have already been converted to float.
dividend_df['Dividend Yield'] = dividend_df['Dividend Yield'].astype(str).str.replace('%', '', regex=False).astype(float)
dividend_df['Annualized Dividend'] = dividend_df['Annualized Dividend'].astype(str).str.replace('$', '', regex=False).astype(float)


In [12]:
# Show the column types as they are before the columns are renamed.
print(dividend_df.dtypes)

# Switch to snake_case column names; done in place so every name bound to this
# frame sees the new columns.
dividend_df.rename(
    columns={
        "Stock Symbol": "symbol",
        "Dividend Yield": "dividend_yield",
        "Annualized Dividend": "annualized_dividend",
        "Ex-Div Date": "ex_div_date",
        "Pay Date": "pay_date",
    },
    inplace=True,
)


dividend_df.head()

Stock Symbol            object
Dividend Yield         float64
Annualized Dividend    float64
Ex-Div Date             object
Pay Date                object
dtype: object


Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,XOM,7.81,3.48,2020-05-12,2020-06-10
1,BA,5.98,8.22,2020-02-13,2020-03-06
2,CVX,5.72,5.16,2020-05-18,2020-06-10
3,IBM,5.51,6.52,2020-05-07,2020-06-10
4,UTX,4.89,2.94,2020-02-13,2020-03-10


## ESG Rating data from finance.yahoo.com

In [13]:
# Scrape the ESG risk score of every remaining Dow ticker from its Yahoo Finance
# sustainability page. The scores are collected in egs_rating_list, one entry per
# symbol and in the same order as dow29_symbols.
print("ticker", "ESG Score")
egs_rating_list = []
# CSS selector of the element holding the score on the sustainability page.
esg_score_selector = 'div[class="Fz(36px) Fw(600) D(ib) Mend(5px)"]'
for position, stock_symbol in enumerate(dow29_symbols):
    sustainability_url = f"https://finance.yahoo.com/quote/{stock_symbol}/sustainability?p={stock_symbol}"
    browser.visit(sustainability_url)
    # Same pauses as before: five seconds on the first page, one second afterwards.
    time.sleep(5 if position == 0 else 1)
    # Wait for the score element instead of assuming the fixed pause was long enough.
    # A page without a score yields None rather than an exception that would abort
    # the loop and leave the list shorter than the symbol list.
    if browser.is_element_present_by_css(esg_score_selector, wait_time=10):
        ESG_Risk_Score = browser.find_by_css(esg_score_selector).value
    else:
        ESG_Risk_Score = None
    print(stock_symbol, ESG_Risk_Score)
    egs_rating_list.append(ESG_Risk_Score)

ticker ESG Score
AXP 22
AAPL 24
BA 39
CAT 38
CSCO 14
CVX 40
XOM 34
GS 32
HD 13
IBM 18
INTC 16
JNJ 35
KO 26
JPM 22
MCD 25
MMM 34
MRK 28
MSFT 16
NKE 17
PFE 33
PG 25
TRV 24
UNH 21
RTX 29
VZ 20
V 18
WBA 17
WMT 29
DIS 15


In [14]:
# Display the collected scores; at this point they are still strings.
egs_rating_list

['22',
 '24',
 '39',
 '38',
 '14',
 '40',
 '34',
 '32',
 '13',
 '18',
 '16',
 '35',
 '26',
 '22',
 '25',
 '34',
 '28',
 '16',
 '17',
 '33',
 '25',
 '24',
 '21',
 '29',
 '20',
 '18',
 '17',
 '29',
 '15']

In [15]:
# Attach the scores as a new column; the list is matched to the rows by position,
# so it must have exactly one entry per row of price_data_df.
price_data_df['esg_rating'] = egs_rating_list


In [16]:
# The scraped scores are strings; convert them so the column holds numbers.
price_data_df['esg_rating'] = pd.to_numeric(price_data_df['esg_rating'])
price_data_df.dtypes

symbol             object
company            object
price             float64
low               float64
high              float64
previous_close    float64
esg_rating          int64
dtype: object

In [17]:
# Display the price table including the new esg_rating column.
price_data_df

Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AXP,American Express Co,94.32,92.89,95.57,89.33,22
1,AAPL,Apple Inc,316.73,316.5,324.24,318.89,24
2,BA,Boeing Co,144.73,142.61,145.91,137.53,39
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47,38
4,CSCO,Cisco Systems Inc,44.84,44.68,45.8,44.9,14
5,CVX,Chevron Corp,93.3,92.03,94.34,90.28,40
6,XOM,Exxon Mobil Corp,45.91,45.47,46.3,44.6,34
7,GS,Goldman Sachs Group Inc,196.06,185.22,197.1,179.93,32
8,HD,Home Depot Inc,242.48,242.07,248.22,241.88,13
9,IBM,International Business Machines Corp,121.76,121.01,122.62,118.39,18


# Load data into Postgres database

In [18]:
# Create the connection to the database.
# stock_db is the database created in PGAdmin.
# The connection details can be overridden through the STOCK_DB_CONNECTION environment
# variable (format: user:password@host:port/database) so that credentials do not have
# to live in the notebook; the local development value remains the default.
connection_string = os.environ.get("STOCK_DB_CONNECTION", "postgres:postgres@localhost:5432/stock_db")
engine = create_engine(f'postgresql://{connection_string}')

In [19]:
# Confirm tables
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names.
inspect(engine).get_table_names()


['price_data_df', 'dividend_df']

In [20]:
# Check the column types and preview the rows before loading the table into Postgres.
print(price_data_df.dtypes)

price_data_df.head()


symbol             object
company            object
price             float64
low               float64
high              float64
previous_close    float64
esg_rating          int64
dtype: object


Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AXP,American Express Co,94.32,92.89,95.57,89.33,22
1,AAPL,Apple Inc,316.73,316.5,324.24,318.89,24
2,BA,Boeing Co,144.73,142.61,145.91,137.53,39
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47,38
4,CSCO,Cisco Systems Inc,44.84,44.68,45.8,44.9,14


In [21]:
# Write the price data to the price_data_df table. if_exists='replace' is used instead
# of 'append' so that a table which already holds data is replaced rather than extended.
price_data_df.to_sql(name='price_data_df', con=engine, if_exists='replace', index=False)


In [22]:
# Check the column types and preview the rows before loading the table into Postgres.
print(dividend_df.dtypes)

dividend_df.head()


symbol                  object
dividend_yield         float64
annualized_dividend    float64
ex_div_date             object
pay_date                object
dtype: object


Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,XOM,7.81,3.48,2020-05-12,2020-06-10
1,BA,5.98,8.22,2020-02-13,2020-03-06
2,CVX,5.72,5.16,2020-05-18,2020-06-10
3,IBM,5.51,6.52,2020-05-07,2020-06-10
4,UTX,4.89,2.94,2020-02-13,2020-03-10


In [23]:
# Write the dividend data to the dividend_df table. if_exists='replace' replaces the
# table if it already holds data.
dividend_df.to_sql(name='dividend_df', con=engine, if_exists='replace', index=False)

### Confirm data has been added by querying the price_data_df table


In [24]:
# Read the table back from Postgres, ordered alphabetically by ticker symbol.
pd.read_sql_query('select * from price_data_df ORDER BY symbol ASC', con=engine)


Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AAPL,Apple Inc,316.73,316.5,324.24,318.89,24
1,AXP,American Express Co,94.32,92.89,95.57,89.33,22
2,BA,Boeing Co,144.73,142.61,145.91,137.53,39
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47,38
4,CSCO,Cisco Systems Inc,44.84,44.68,45.8,44.9,14
5,CVX,Chevron Corp,93.3,92.03,94.34,90.28,40
6,DIS,Walt Disney Co,120.95,120.43,122.505,118.02,15
7,GS,Goldman Sachs Group Inc,196.06,185.22,197.1,179.93,32
8,HD,Home Depot Inc,242.48,242.07,248.22,241.88,13
9,IBM,International Business Machines Corp,121.76,121.01,122.62,118.39,18


### Confirm data has been added by querying the dividend_df table
#### UTX is not in price_data_df, whereas VZ and RTX are not in dividend_df

In [25]:
# Read the table back from Postgres, ordered alphabetically by ticker symbol.
pd.read_sql_query('select * from dividend_df ORDER BY symbol ASC', con=engine)

Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,AAPL,1.03,3.28,2020-05-08,2020-05-14
1,AXP,1.93,1.72,2020-07-01,2020-08-10
2,BA,5.98,8.22,2020-02-13,2020-03-06
3,CAT,3.66,4.12,2020-04-17,2020-05-20
4,CSCO,3.21,1.44,2020-04-02,2020-04-22
5,CVX,5.72,5.16,2020-05-18,2020-06-10
6,DIS,1.49,1.76,2019-12-13,2020-01-16
7,GS,2.78,5.0,2020-05-29,2020-06-29
8,HD,2.48,6.0,2020-06-03,2020-06-18
9,IBM,5.51,6.52,2020-05-07,2020-06-10


### Joining both tables on the "symbol" column to build the final dataset for the CSV file

In [32]:
# Inner join on the ticker symbol: only companies present in both frames are kept.
final_dow_data = price_data_df.merge(dividend_df, how='inner', on='symbol')
final_dow_data

Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,AXP,American Express Co,94.32,92.89,95.57,89.33,22,1.93,1.72,2020-07-01,2020-08-10
1,AAPL,Apple Inc,316.73,316.5,324.24,318.89,24,1.03,3.28,2020-05-08,2020-05-14
2,BA,Boeing Co,144.73,142.61,145.91,137.53,39,5.98,8.22,2020-02-13,2020-03-06
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47,38,3.66,4.12,2020-04-17,2020-05-20
4,CSCO,Cisco Systems Inc,44.84,44.68,45.8,44.9,14,3.21,1.44,2020-04-02,2020-04-22
5,CVX,Chevron Corp,93.3,92.03,94.34,90.28,40,5.72,5.16,2020-05-18,2020-06-10
6,XOM,Exxon Mobil Corp,45.91,45.47,46.3,44.6,34,7.81,3.48,2020-05-12,2020-06-10
7,GS,Goldman Sachs Group Inc,196.06,185.22,197.1,179.93,32,2.78,5.0,2020-05-29,2020-06-29
8,HD,Home Depot Inc,242.48,242.07,248.22,241.88,13,2.48,6.0,2020-06-03,2020-06-18
9,IBM,International Business Machines Corp,121.76,121.01,122.62,118.39,18,5.51,6.52,2020-05-07,2020-06-10


In [33]:
# Order the combined table from the highest ESG score to the lowest; the rows keep
# their original index labels.
final_dow_data = final_dow_data.sort_values('esg_rating', ascending=False)
final_dow_data

Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating,dividend_yield,annualized_dividend,ex_div_date,pay_date
5,CVX,Chevron Corp,93.3,92.03,94.34,90.28,40,5.72,5.16,2020-05-18,2020-06-10
2,BA,Boeing Co,144.73,142.61,145.91,137.53,39,5.98,8.22,2020-02-13,2020-03-06
3,CAT,Caterpillar Inc,117.41,115.98,118.562,112.47,38,3.66,4.12,2020-04-17,2020-05-20
11,JNJ,Johnson & Johnson,144.56,144.3,146.44,144.37,35,2.75,4.04,2020-05-22,2020-06-09
6,XOM,Exxon Mobil Corp,45.91,45.47,46.3,44.6,34,7.81,3.48,2020-05-12,2020-06-10
15,MMM,3M Co,152.08,149.2,152.98,146.44,34,4.03,5.88,2020-05-21,2020-06-12
19,PFE,Pfizer Inc,37.49,37.45,37.87,37.5,33,4.08,1.52,2020-05-07,2020-06-05
7,GS,Goldman Sachs Group Inc,196.06,185.22,197.1,179.93,32,2.78,5.0,2020-05-29,2020-06-29
25,WMT,Walmart Inc,123.86,123.63,125.51,124.33,29,1.73,2.16,2020-08-13,2020-09-08
16,MRK,Merck & Co Inc,77.26,76.91,78.6,76.37,28,3.19,2.44,2020-03-13,2020-04-07
