# ETL of Dow 30 Stock Data
#### Fabienne Zumbuehl; James Ye; Tanvir Khan

In [1]:
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup as bs
from splinter import Browser
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [2]:
# Launch Chrome through splinter, pointing it at the chromedriver.exe driver
# binary (relative path). headless=False requests a visible, non-headless
# browser window.
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)

## price data from cnbc.com

In [3]:
# CNBC page used as the source of the Dow 30 price table.
price_url = "https://www.cnbc.com/dow-30/"

In [4]:
# Open the price page in the browser, pause, then capture the page HTML.
browser.visit(price_url)
# NOTE(review): fixed 5-second pause, presumably to let the page's scripts
# finish rendering the quote table before the HTML is read -- confirm.
time.sleep(5)
price_html = browser.html

In [5]:
# Parse every table in the captured page HTML; the first one is used as the
# price table. The HTML text is wrapped in StringIO because passing a literal
# HTML string to pd.read_html is deprecated in recent pandas releases.
price_data_tables = pd.read_html(StringIO(price_html))
print("Number of tables: ", len(price_data_tables))
price_data_df = price_data_tables[0]

# Drop ticker 'Dow' because its sustainability score cannot be found on Yahoo Finance.
# The row is matched by symbol rather than by a hard-coded row position (29),
# so the filter keeps working if the order of the table changes.
price_data_df = price_data_df[price_data_df["SYMBOL"].str.upper() != "DOW"]

# Keep only the columns that are loaded into the database, under snake_case names.
price_data_df = price_data_df[["SYMBOL","NAME", "PRICE", "LOW", "HIGH", "PREVIOUS CLOSE"]].copy()
price_data_df.rename(columns = {"SYMBOL":"symbol","NAME":"company", "PRICE":"price", "LOW":"low", "HIGH":"high", "PREVIOUS CLOSE":"previous_close"}, inplace=True)
price_data_df

Number of tables:  1


Unnamed: 0,symbol,company,price,low,high,previous_close
0,AXP,American Express Co,89.83,89.125,91.43,89.83
1,AAPL,Apple Inc,316.85,315.87,320.89,316.85
2,BA,Boeing Co,139.0,136.151,144.239,139.0
3,CAT,Caterpillar Inc,114.06,113.82,115.68,114.06
4,CSCO,Cisco Systems Inc,44.64,44.555,45.655,44.64
5,CVX,Chevron Corp,92.04,91.28,93.37,92.04
6,XOM,Exxon Mobil Corp,44.56,44.39,45.79,44.56
7,GS,Goldman Sachs Group Inc,180.1,177.68,181.17,180.1
8,HD,Home Depot Inc,240.88,235.79,241.18,240.88
9,IBM,International Business Machines Corp,119.12,118.97,121.72,119.12


In [6]:
# Ticker symbols of the stocks kept in price_data_df; the ESG scraping loop
# iterates over this Series to build one Yahoo Finance URL per symbol.
dow29_symbols = price_data_df["symbol"]
type(dow29_symbols)

pandas.core.series.Series

## dividend data from dividend.com

In [43]:
# dividend.com page used as the source of the Dow 30 dividend table.
dividend_url = "https://www.dividend.com/dividend-stocks/dow-30-dividend-stocks/"

In [45]:
# Here pd.read_html is given the URL directly (no browser session involved)
# and returns a list of DataFrames; the first one is used as the dividend table.
dividend_data_tables = pd.read_html(dividend_url)
print("Number of tables: ", len(dividend_data_tables))
dividend_data_df = dividend_data_tables[0]
dividend_data_df.head()

Number of tables:  1


Unnamed: 0.1,Unnamed: 0,Stock Symbol,Company Name,DARS™ Rating,Dividend Yield,Closing Price,Annualized Dividend,Ex-Div Date,Pay Date
0,,XOM,Exxon Mobil,locked,7.87%,$45.74,$3.4800,2020-05-12,2020-06-10
1,,BA,Boeing Co.,locked,6.16%,$128.91,$8.2200,2020-02-13,2020-03-06
2,,CVX,Chevron Corp,locked,5.40%,$93.37,$5.1600,2020-05-18,2020-06-10
3,,IBM,IBM Corp,locked,5.30%,$122.59,$6.5200,2020-05-07,2020-06-10
4,,UTX,United Technologies,locked,5.16%,$86.01,$2.9400,2020-02-13,2020-03-10


In [46]:
# Remove the empty "Unnamed: 0" placeholder column from the scraped table.
# errors="ignore" makes this cell safe to re-run: once the column is gone the
# call is a no-op instead of raising KeyError.
dividend_data_df.drop(columns="Unnamed: 0", errors="ignore", inplace=True)
#dividend_data_df.set_index("Stock Symbol", inplace=True)
dividend_data_df.head()

Unnamed: 0,Stock Symbol,Company Name,DARS™ Rating,Dividend Yield,Closing Price,Annualized Dividend,Ex-Div Date,Pay Date
0,XOM,Exxon Mobil,locked,7.87%,$45.74,$3.4800,2020-05-12,2020-06-10
1,BA,Boeing Co.,locked,6.16%,$128.91,$8.2200,2020-02-13,2020-03-06
2,CVX,Chevron Corp,locked,5.40%,$93.37,$5.1600,2020-05-18,2020-06-10
3,IBM,IBM Corp,locked,5.30%,$122.59,$6.5200,2020-05-07,2020-06-10
4,UTX,United Technologies,locked,5.16%,$86.01,$2.9400,2020-02-13,2020-03-10


In [47]:
# Keep only the dividend columns that are loaded into the database.
# .copy() gives an independent frame, so the later column edits do not touch
# dividend_data_df.
dividend_reduced_df = dividend_data_df[['Stock Symbol', 'Dividend Yield', 'Annualized Dividend', 'Ex-Div Date', 'Pay Date']].copy()
#dividend_reduced_df.index.name = "SYMBOL"
#dividend_reduced_df.drop(index=29, inplace=True)
# dividend_df is a second name for the same object, not another copy.
dividend_df = dividend_reduced_df
# dividend_df.index.name = 'id'
dividend_df

Unnamed: 0,Stock Symbol,Dividend Yield,Annualized Dividend,Ex-Div Date,Pay Date
0,XOM,7.87%,$3.4800,2020-05-12,2020-06-10
1,BA,6.16%,$8.2200,2020-02-13,2020-03-06
2,CVX,5.40%,$5.1600,2020-05-18,2020-06-10
3,IBM,5.30%,$6.5200,2020-05-07,2020-06-10
4,UTX,5.16%,$2.9400,2020-02-13,2020-03-10
5,WBA,4.50%,$1.8300,2020-05-19,2020-06-12
6,PFE,4.11%,$1.5200,2020-05-07,2020-06-05
7,MMM,4.03%,$5.8800,2020-02-13,2020-03-12
8,JPM,3.95%,$3.6000,2020-04-03,2020-04-30
9,KO,3.68%,$1.6400,2020-06-12,2020-07-01


In [48]:
# Strip the unit symbols so the text columns can be converted to floats.
# regex=False makes '%' and '$' plain characters regardless of the pandas
# version's default; as a regular expression '$' is the end-of-string anchor,
# not a dollar sign.
dividend_df['Dividend Yield'] = dividend_df['Dividend Yield'].str.replace('%', '', regex=False).astype(float)
dividend_df['Annualized Dividend'] = dividend_df['Annualized Dividend'].str.replace('$', '', regex=False).astype(float)


In [50]:
# The dtypes are printed before the rename below, so on a fresh run the
# printed names are the original column headers.
print(dividend_df.dtypes)

# Switch to snake_case column names for the database load.
dividend_df.rename(columns = {"Stock Symbol":"symbol", "Dividend Yield": "dividend_yield", "Annualized Dividend":"annualized_dividend", "Ex-Div Date":"ex_div_date", "Pay Date":"pay_date"}, inplace=True)


dividend_df.head()

symbol                  object
dividend_yield         float64
Annualized Dividend    float64
ex_div_date             object
pay_date                object
dtype: object


Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,XOM,7.87,3.48,2020-05-12,2020-06-10
1,BA,6.16,8.22,2020-02-13,2020-03-06
2,CVX,5.4,5.16,2020-05-18,2020-06-10
3,IBM,5.3,6.52,2020-05-07,2020-06-10
4,UTX,5.16,2.94,2020-02-13,2020-03-10


## ESG risk scores from finance.yahoo.com

In [13]:
# Visit the Yahoo Finance sustainability page of every symbol and collect the
# text of the ESG risk score element, in the same order as dow29_symbols.
print("ticker", "ESG Score")
egs_rating_list = []
first = True
for stock_symbol in dow29_symbols:
    sustainability_url = f"https://finance.yahoo.com/quote/{stock_symbol}/sustainability?p={stock_symbol}"
    browser.visit(sustainability_url)
    # Longer pause after the very first page load, shorter one afterwards.
    time.sleep(5 if first else 1)
    first = False
    # The score is located through the CSS classes of its <div>.
    score_element = browser.find_by_css('div[class="Fz(36px) Fw(600) D(ib) Mend(5px)"]')
    ESG_Risk_Score = score_element.value
    print(stock_symbol, ESG_Risk_Score)
    egs_rating_list.append(ESG_Risk_Score)

ticker ESG Score
AXP 22
AAPL 24
BA 39
CAT 38
CSCO 14
CVX 40
XOM 34
GS 32
HD 13
IBM 18
INTC 16
JNJ 35
KO 26
JPM 22
MCD 25
MMM 34
MRK 28
MSFT 16
NKE 17
PFE 33
PG 25
TRV 24
UNH 21
RTX 29
VZ 20
V 18
WBA 17
WMT 29
DIS 15


In [14]:
# Inspect the collected scores; they are converted to numbers in a later cell.
egs_rating_list

['22',
 '24',
 '39',
 '38',
 '14',
 '40',
 '34',
 '32',
 '13',
 '18',
 '16',
 '35',
 '26',
 '22',
 '25',
 '34',
 '28',
 '16',
 '17',
 '33',
 '25',
 '24',
 '21',
 '29',
 '20',
 '18',
 '17',
 '29',
 '15']

In [15]:
# Attach the scores as a new column. The list is assigned positionally, so it
# must hold one entry per row of price_data_df, in row order (it was built by
# iterating over price_data_df["symbol"]).
price_data_df['esg_rating'] = egs_rating_list


In [16]:
# Convert the esg_rating column to a numeric dtype, then show the dtypes to
# confirm the conversion.
price_data_df['esg_rating'] = pd.to_numeric(price_data_df['esg_rating'])
#price_data_df.rename(columns={'%CHANGE':'PCT_CHANGE'}, inplace=True)
#price_data_df['NAME'] = price_data_df['NAME'].astype(str)
price_data_df.dtypes

symbol             object
company            object
price             float64
low               float64
high              float64
previous_close    float64
esg_rating          int64
dtype: object

In [17]:
# Show the price table, now including the esg_rating column.
price_data_df

Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AXP,American Express Co,89.83,89.125,91.43,89.83,22
1,AAPL,Apple Inc,316.85,315.87,320.89,316.85,24
2,BA,Boeing Co,139.0,136.151,144.239,139.0,39
3,CAT,Caterpillar Inc,114.06,113.82,115.68,114.06,38
4,CSCO,Cisco Systems Inc,44.64,44.555,45.655,44.64,14
5,CVX,Chevron Corp,92.04,91.28,93.37,92.04,40
6,XOM,Exxon Mobil Corp,44.56,44.39,45.79,44.56,34
7,GS,Goldman Sachs Group Inc,180.1,177.68,181.17,180.1,32
8,HD,Home Depot Inc,240.88,235.79,241.18,240.88,13
9,IBM,International Business Machines Corp,119.12,118.97,121.72,119.12,18


# load data into database

In [18]:
# Build the SQLAlchemy engine for the stock_db PostgreSQL database on
# localhost:5432.
# NOTE(review): the user name and password are hard-coded in the notebook;
# consider reading them from the environment or a config file kept out of
# version control.
connection_string = "postgres:postgres@localhost:5432/stock_db"
engine = create_engine(f'postgresql://{connection_string}')

In [19]:
# Confirm tables
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of table names and is available in
# both older and newer releases.
inspect(engine).get_table_names()


['price_data_df', 'dividend_df']

In [55]:
# Final check of column names and dtypes before the frame is written to the
# database.
#price_data_df.columns = [["symbol", "company", "price", "low", "high", "previous_close","esg_rating" ]]
print(price_data_df.dtypes)

price_data_df.head()


symbol             object
company            object
price             float64
low               float64
high              float64
previous_close    float64
esg_rating          int64
dtype: object


Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AXP,American Express Co,89.83,89.125,91.43,89.83,22
1,AAPL,Apple Inc,316.85,315.87,320.89,316.85,24
2,BA,Boeing Co,139.0,136.151,144.239,139.0,39
3,CAT,Caterpillar Inc,114.06,113.82,115.68,114.06,38
4,CSCO,Cisco Systems Inc,44.64,44.555,45.655,44.64,14


In [21]:
# Write the rows to the price_data_df table without the DataFrame index.
# if_exists='append' adds to an existing table, so re-running this cell
# inserts the same rows again.
price_data_df.to_sql(name='price_data_df', con=engine, if_exists='append', index=False)

In [53]:
# Final check of column names and dtypes before the frame is written to the
# database.
print(dividend_df.dtypes)

dividend_df.head()


symbol                  object
dividend_yield         float64
annualized_dividend    float64
ex_div_date             object
pay_date                object
dtype: object


Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,XOM,7.87,3.48,2020-05-12,2020-06-10
1,BA,6.16,8.22,2020-02-13,2020-03-06
2,CVX,5.4,5.16,2020-05-18,2020-06-10
3,IBM,5.3,6.52,2020-05-07,2020-06-10
4,UTX,5.16,2.94,2020-02-13,2020-03-10


In [52]:
# Write the rows to the dividend_df table without the DataFrame index.
# if_exists='append' adds to an existing table, so re-running this cell
# inserts the same rows again.
dividend_df.to_sql(name='dividend_df', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the price_data_df table


In [56]:
# Read the table back through the same engine and show the first rows.
pd.read_sql_query('select * from price_data_df', con=engine).head()


Unnamed: 0,symbol,company,price,low,high,previous_close,esg_rating
0,AXP,American Express Co,89.83,89.125,91.43,89.83,22
1,AAPL,Apple Inc,316.85,315.87,320.89,316.85,24
2,BA,Boeing Co,139.0,136.151,144.239,139.0,39
3,CAT,Caterpillar Inc,114.06,113.82,115.68,114.06,38
4,CSCO,Cisco Systems Inc,44.64,44.555,45.655,44.64,14


### Confirm data has been added by querying the dividend_df table

In [57]:
# Read the table back through the same engine and show the first rows.
pd.read_sql_query('select * from dividend_df', con=engine).head()

Unnamed: 0,symbol,dividend_yield,annualized_dividend,ex_div_date,pay_date
0,XOM,7.87,3.48,2020-05-12,2020-06-10
1,BA,6.16,8.22,2020-02-13,2020-03-06
2,CVX,5.4,5.16,2020-05-18,2020-06-10
3,IBM,5.3,6.52,2020-05-07,2020-06-10
4,UTX,5.16,2.94,2020-02-13,2020-03-10
