# Web scraping Yahoo Finance stock data

# Packages   

Import all of the packages below.

In [1]:
import urllib.request
from bs4 import BeautifulSoup
import datetime
import pandas as pd

# Symbols   

Load the symbols from the `df_symbols.csv` file; they are used to collect the data from the Yahoo Finance web page. Alternatively, load the `symbols_file.txt` file in the same way as shown below.

In [2]:
# Read the ticker symbols (one per line) that are substituted into the Yahoo Finance URL below.
# Each line is stripped of its trailing newline and any surrounding whitespace, and blank
# lines are dropped, so a stray empty line in the file cannot produce a URL without a ticker.

with open('df_symbols.csv', 'r') as f:
    symbol = [line.strip() for line in f if line.strip()]

# Uncomment to confirm that the symbols loaded:
# print(symbol)

In [3]:
# Scrape the Yahoo Finance quote page of every symbol and collect the fields into allData.
# allData maps a running record number (1, 2, ...) to the list of values scraped for one stock.
allData = {}
data_no = 0

# The date is not scraped; it records the day on which the data are collected.
# It does not change between symbols, so it is computed once, before the loop.
date_today = datetime.date.today().strftime("%b %d %Y")

for s in symbol:
    # Drop the newline/whitespace left over from reading the symbols file and skip blank
    # entries, which would otherwise give a URL with no ticker in it.
    ticker = s.strip()
    if not ticker:
        continue

    # The symbol appears twice in the quote URL: once in the path and once in the query string.
    url = f'https://finance.yahoo.com/quote/{ticker}/?p={ticker}'

    # Parse inside the `with` block so the HTTP response is closed as soon as it has been read.
    with urllib.request.urlopen(url) as response:
        bsObj = BeautifulSoup(response, 'lxml')

    # Locate the HTML tags where the data is placed. The CSS class names and cell positions
    # are tied to the page layout -- presumably they need revisiting if Yahoo changes the page.
    stock_name = bsObj.find('h1', {'class': 'D(ib)'}).text
    quote_spans = bsObj.find_all('span', {'class': 'Trsdu(0.3s)'})
    price = quote_spans[0].text         # first matching span
    stock_change = quote_spans[1].text  # second matching span
    after_close = bsObj.find('span', {'class': 'Fw(b)'}).text

    # First summary table: look the cells up once and read the values by position.
    summary_cells = bsObj.find('table', {'class': 'W(100%)'}).find_all('td')
    prev_close = summary_cells[1].text
    stock_open = summary_cells[3].text
    day_range = summary_cells[9].text
    week_52_range = summary_cells[11].text
    volume = summary_cells[13].text
    avg_volume = summary_cells[15].text

    # Second summary table (market cap, beta, PE ratio, ...), handled the same way.
    cap_cells = bsObj.find('table', {'class': 'M(0)'}).find_all('td')
    cap = cap_cells[1].text
    beta = cap_cells[3].text
    per = cap_cells[5].text
    eps = cap_cells[7].text
    earn_date = cap_cells[9].text
    dividen = cap_cells[11].text
    year_target = cap_cells[15].text

    data_no += 1
    # Store the scraped values for this stock; the order must match the column names
    # used when the DataFrame is built later in the notebook.
    allData[data_no] = [
        date_today, stock_name, price, stock_change, after_close, prev_close, stock_open,
        day_range, week_52_range, volume, avg_volume, cap, beta, per, eps,
        earn_date, dividen, year_target,
    ]


In [4]:
# Print the record number of every stock collected. Handy for checking how many symbols
# were scraped, or where the run stopped -- for example after a stock's symbol has changed.

for record_no in allData.keys():
    print(record_no)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119


In [5]:
# Display the scraped data: a dict keyed by record number, where each value is the list
# of fields collected for one stock.

allData

{1: ['Apr 29 2020',
  'XOM - Exxon Mobil Corporation',
  '44.97',
  '+1.03 (+2.34%)',
  '44.97',
  '43.94',
  '44.37',
  '44.11 - 45.39',
  '30.11 - 80.53',
  '25,289,901',
  '38,672,771',
  '190.242B',
  '1.27',
  '13.39',
  '3.36',
  'May 01, 2020',
  '3.48 (7.92%)',
  '47.13'],
 2: ['Apr 29 2020',
  'OXY - Occidental Petroleum Corporation',
  '15.17',
  '+1.02 (+7.21%)',
  '15.17',
  '14.15',
  '14.46',
  '14.30 - 15.26',
  '9.00 - 60.73',
  '45,496,862',
  '37,022,361',
  '13.908B',
  '1.75',
  'N/A',
  '-1.22',
  'May 05, 2020',
  '0.44 (3.19%)',
  '15.12'],
 3: ['Apr 29 2020',
  'CVX - Chevron Corporation',
  '89.91',
  '+0.20 (+0.22%)',
  '89.91',
  '89.71',
  '89.55',
  '88.83 - 90.96',
  '51.60 - 127.00',
  '10,152,429',
  '14,838,419',
  '167.86B',
  '1.18',
  '58.38',
  '1.54',
  'May 01, 2020',
  '5.16 (5.75%)',
  '90.62'],
 4: ['Apr 29 2020',
  'BRK-A - Berkshire Hathaway Inc.',
  '280,600.00',
  '-664.00 (-0.24%)',
  '280,600.00',
  '281,264.00',
  '285,400.00',
  '280,60

## Columns  

Name the columns so that they match, in order, the variables collected in the script above.

In [6]:
# Column labels, listed in the same order as the values stored for each stock in allData.
column_names = [
    'Date',
    'company_name',
    'price_at_close',
    'price_change',
    'price_after_hours',
    'previous_close',
    'today opened',
    'price range',
    '52 week range',
    'volume',
    'average volume',
    'market cap',
    'beta (3Y Monthly)',
    'PE Ratio',
    'EPS',
    'earnings date',
    'Dividend Yield',
    '1Y target est',
]

# Each allData entry becomes one row, indexed by its record number.
allData2 = pd.DataFrame.from_dict(allData, orient='index', columns=column_names)

## Save

Save the scraped data to a CSV file, using the datetime package to include the current date in the file name.

In [7]:
# Build a date stamp for the output file name with the datetime package.

today = datetime.date.today()  # the day the file is written
td = f'{today:%b_%d_%Y}'  # abbreviated month, day and year joined by underscores, e.g. 'Apr_29_2020'

# Write the DataFrame to a CSV file whose name carries the date stamp.
allData2.to_csv('stock_data_yahoo_' + td + '.csv')