In [1]:
import requests
import bs4
import pandas as pd
import re  # regular expressions - can find patterns in text, which is useful for parsing website responses 
import numpy as np

In [2]:
# Yahoo Finance "financials" page for ticker ARNC.
url = 'https://finance.yahoo.com/quote/ARNC/financials?p=ARNC'
# requests has no default timeout; without one this cell can hang forever
# if the server never answers.
r = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
r.raise_for_status()

In [3]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and emits a warning), so the parsed tree can differ
# between machines. 'html.parser' ships with Python, so no extra dependency.
soup = bs4.BeautifulSoup(r.text, 'html.parser')

In [4]:
# Most of the page's data appears to live in <span> elements, so grab them all.
span_list = soup.find_all('span')
# Most of the table's row labels (its index) are the spans with class="Va(m)".
vam_span_list = soup.find_all('span', class_='Va(m)')

In [5]:
# Peek at the first five label spans to inspect their text and data-reactid values.
vam_span_list[:5]

[<span class="Va(m)" data-reactid="75">Total Revenue</span>,
 <span class="Va(m)" data-reactid="92">Cost of Revenue</span>,
 <span class="Va(m)" data-reactid="109">Gross Profit</span>,
 <span class="Va(m)" data-reactid="129">Operating Expense</span>,
 <span class="Va(m)" data-reactid="146">Operating Income</span>]

In [6]:
# Consecutive row labels are usually (but not always) 17 data-reactid values apart.
# For example, Total Revenue (75) + 17 = Cost of Revenue (92), whereas
# Gross Profit (109) and Operating Expense (129) are 20 apart.

In [7]:
# data-reactid values of the first two row labels ("Total Revenue" and
# "Cost of Revenue"), read off the label spans printed above.
total_revenue_id = 75
cost_of_revenue_id = 92
# Every span whose data-reactid lies strictly between the two labels.
between_total_and_cost = [
    x for x in span_list
    # Skip spans that carry no data-reactid: int(None) would raise TypeError.
    if x.get('data-reactid') is not None
    and total_revenue_id < int(x.get('data-reactid')) < cost_of_revenue_id
]
between_total_and_cost

[<span data-reactid="78">7,277,000</span>,
 <span data-reactid="80">7,277,000</span>,
 <span data-reactid="82">7,442,000</span>,
 <span data-reactid="84">6,824,000</span>,
 <span data-reactid="86">6,661,000</span>]

In [8]:
# A row's values sit in the spans whose data-reactid falls just after the row label's;
# we'll use this to extract the data.

In [9]:
# Pull the MM/DD/YYYY column dates out of the spans: for each span whose
# markup contains a date, keep the first one found.
date_pattern = re.compile(r'\d{2}/\d{2}/\d{4}')
date_hits = (date_pattern.search(str(span)) for span in span_list)
# The first column on the page is TTM, which has no date, so it is prepended by hand.
dates = ['TTM'] + [hit.group(0) for hit in date_hits if hit is not None]
dates

['TTM', '12/31/2019', '12/31/2018', '12/31/2017', '12/31/2016']

In [10]:
# Build one data-reactid -> text lookup for all spans up front, rather than
# re-scanning span_list once per table row. Spans without a data-reactid are
# skipped so int() is never handed None.
text_by_reactid = {
    int(span.get('data-reactid')): span.text
    for span in span_list
    if span.get('data-reactid') is not None
}

# Layout observed for the first row: the label is at id 75 and its values are
# at 78, 80, 82, 84, 86 -- i.e. they start 3 after the label and are 2 apart.
VALUE_OFFSET = 3
VALUE_STEP = 2

df_list = []
for label_span in vam_span_list:
    r_id1 = int(label_span.get('data-reactid'))
    first_value_id = r_id1 + VALUE_OFFSET
    # One candidate id per column in `dates`.
    value_ids = range(first_value_id, first_value_id + len(dates) * VALUE_STEP, VALUE_STEP)
    # Cells are kept as the page's text (e.g. '7,277,000'); ids with no span become NaN.
    df_list.append([text_by_reactid.get(value_id, np.nan) for value_id in value_ids])

index_names = [x.text for x in vam_span_list]
column_names = dates
pd.DataFrame(df_list, index = index_names, columns = column_names)

Unnamed: 0,TTM,12/31/2019,12/31/2018,12/31/2017,12/31/2016
Total Revenue,7277000.0,7277000.0,7442000.0,6824000.0,6661000.0
Cost of Revenue,6270000.0,6270000.0,6549000.0,5866000.0,5602000.0
Gross Profit,1007000.0,1007000.0,893000.0,958000.0,1059000.0
Operating Expense,643000.0,643000.0,623000.0,693000.0,736000.0
Operating Income,364000.0,364000.0,270000.0,265000.0,323000.0
Net Non Operating Interest Income Expense,-102000.0,-102000.0,-116000.0,-158000.0,-91000.0
Other Income Expense,-85000.0,-85000.0,87000.0,144000.0,-64000.0
Pretax Income,177000.0,177000.0,241000.0,251000.0,168000.0
Tax Provision,-48000.0,-48000.0,71000.0,42000.0,13000.0
Net Income Common Stockholders,225000.0,225000.0,210000.0,209000.0,155000.0
