In [30]:
import re
import requests
import json
from bs4 import BeautifulSoup
from tqdm import tqdm

In [210]:
def build_request_url(ticker: str, region_code='usa', report_type='cf'):
    """Build the Morningstar report AJAX URL for a single ticker.

    Args:
        ticker: Ticker symbol, e.g. ``'AMZN'``.
        region_code: Value sent as the ``region`` query parameter.
        report_type: Value sent as the ``reportType`` query parameter.
            Defaults to ``'cf'``, the value that used to be hard-coded.
            NOTE(review): other statement codes (presumably ``'bs'`` and
            ``'is'``) are not exercised in this notebook -- confirm them
            against the endpoint before relying on them.

    Returns:
        The complete request URL as a string. Every query value is
        percent-encoded, so characters such as ``&`` or spaces in the
        arguments cannot break the query string.
    """
    # Local import so this cell stays runnable on its own.
    from urllib.parse import urlencode

    request_api_prefix = 'http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?'
    # Dict order is preserved by urlencode, so the default output matches
    # the original hand-written query string exactly.
    query = {
        't': ticker,
        'region': region_code,
        'culture': 'en-US',
        'reportType': report_type,
        'period': 12,
        'dataType': 'A',
        'order': 'asc',
        'columnYear': 5,
        'curYearPart': '1st5year',
        'rounding': 3,
        'number': 1,
    }
    return request_api_prefix + urlencode(query)

In [211]:
# Build the request URL for AMZN using the default region, then display it.
URL = build_request_url('AMZN')
URL

'http://financials.morningstar.com/ajax/ReportProcess4HtmlAjax.html?t=AMZN&region=usa&culture=en-US&reportType=cf&period=12&dataType=A&order=asc&columnYear=5&curYearPart=1st5year&rounding=3&number=1'

In [212]:
# Fetch the report. The timeout stops a stalled connection from hanging the
# kernel indefinitely, and raise_for_status() reports an HTTP error directly
# instead of letting it surface later as a JSON decoding failure.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Every apostrophe is stripped from the payload before it is decoded.
# NOTE(review): presumably the raw response is not valid JSON otherwise --
# confirm; this also removes apostrophes from any label text.
text = response.text.replace('\'', '')

# The decoded object carries the report markup under the 'result' key.
js = json.loads(text)
html = js['result']
soup = BeautifulSoup(html, 'html.parser')

In [213]:
# Walk every <div> in the report and collect what it carries into `info`:
#   'fye_month' / 'currency'  -> from the 'unitsAndFiscalYear' div
#   '{parent_id}_{tag_id}'    -> period labels and numeric values ('Y_*' divs)
#   '{tag_id}'                -> line-item labels ('lab*' divs)
tags = soup.find_all('div')
info = {}

for tag in tqdm(tags):
    attrs = tag.attrs

    # Divs without an id hold nothing this loop records.
    if 'id' not in attrs:
        continue

    tag_id = tag['id']
    value = tag.text

    # Parse currency and FY End month number
    if tag_id == 'unitsAndFiscalYear':
        info['fye_month'] = int(tag['fyenumber'])
        info['currency'] = tag['currency']

    # Parse Yrly or Qtrly values
    elif tag_id[:2] == 'Y_':
        # The parent's id is the key prefix; a parent without one cannot
        # be keyed, so skip the tag rather than raise KeyError.
        parent_id = tag.parent.get('id')
        if parent_id is None:
            continue
        key = f'{parent_id}_{tag_id}'

        if 'rawvalue' in attrs:
            # parse values i.e data_{statid}_{yearid} -> float value
            # Placeholder cells carry no number and are left out of `info`.
            if tag['rawvalue'] in ['—', 'nbsp']:
                continue
            info[key] = float(tag['rawvalue'].replace(',', ''))
            print('2.1', key)
        else:
            # parse year labels i.e. Y_1 -> 2017-12
            if 'title' in attrs:
                value = tag['title']
            info[key] = value
            print('2.2', key, value)

    # Parse labels i.e. label_i4 -> Inventories
    elif tag_id[:3] == 'lab' and 'padding' not in tag_id:
        lbl_tag = tag.find("div", class_="lbl")
        # find() returns None when there is no nested 'lbl' div; skip the
        # tag instead of failing with AttributeError on None.
        if lbl_tag is None:
            continue
        # Prefer the 'title' attribute when present, else the visible text.
        value = lbl_tag['title'] if 'title' in lbl_tag.attrs else lbl_tag.text
        info[tag_id] = value
        print('3', tag_id, value)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [00:00<00:00, 34711.92it/s]

3 label_s1 Cash Flows From Operating Activities
3 label_i1 Net income
3 label_i2 Depreciation & amortization
3 label_i3 Amortization of debt discount/premium and issuance costs
3 label_i4 Investment/asset impairment charges
3 label_i5 Investments losses (gains)
3 label_i6 Deferred income taxes
3 label_i7 (Gain) Loss from discontinued operations
3 label_i8 Extraordinary items
3 label_i9 Cumulative effect of accounting change
3 label_i10 Stock based compensation
3 label_i15 Change in working capital
3 label_i16 Accounts receivable
3 label_i17 Inventory
3 label_i18 Prepaid expenses
3 label_i19 Accounts payable
3 label_i20 Accrued liabilities
3 label_i21 Interest payable
3 label_i22 Income taxes payable
3 label_i23 Other working capital
3 label_i30 Other non-cash items
3 label_tts1 Net cash provided by operating activities
3 label_s2 Cash Flows From Investing Activities
3 label_i31 Investments in property, plant, and equipment
3 label_i32 Property, plant, and equipment reductions
3 label_i




In [214]:
# Show the fiscal-year-end month and currency read from the 'unitsAndFiscalYear' tag.
info['fye_month'], info['currency']

(12, 'USD')

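Keys stored in `info`: `label_{sid}` (line-item label), `Year_{pid}` (period label), and `data_{sid}_{pid}` (numeric value), where `sid` is the line-item id and `pid` is the period id.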

stmt = {'fye_month':info['fye_month'],
        'currency': info['currency'], 
        'balance_sheet':bs_data, 
        'income_statement': is_data,
        'cashflow_statement': cf_data}

In [215]:
# Patterns that recognise the two kinds of key held in `info`; the captured
# group is the id that follows the prefix.
label_pattern = re.compile(r'label_(.*)')
year_pattern = re.compile(r'Year_(.*)')

# Keep the keys each pattern finds a match in, in `info` iteration order.
stats = list(filter(label_pattern.search, info))
years = list(filter(year_pattern.search, info))

In [216]:
# Resolve each period column once as (period label, period id).
period_columns = [
    (info[year_key], year_pattern.search(year_key).group(1))
    for year_key in years
]

# One record per line item: its label plus one value per period, with NaN
# standing in for any period that has no stored value.
stmt_data = []
for stat_key in stats:
    stat_id = label_pattern.search(stat_key).group(1)
    stmt_data.append({
        'name': info[stat_key],
        'values': [
            {'period': period, 'value': info.get(f'data_{stat_id}_{period_id}', float('nan'))}
            for period, period_id in period_columns
        ],
    })

In [217]:
# Display the assembled list of {'name', 'values'} records.
stmt_data

[{'name': 'Cash Flows From Operating Activities',
  'values': [{'period': '2017-12', 'value': 18434000000.0},
   {'period': '2018-12', 'value': 30723000000.0},
   {'period': '2019-12', 'value': 38514000000.0},
   {'period': '2020-12', 'value': 66064000000.0},
   {'period': '2021-12', 'value': 46327000000.0},
   {'period': 'TTM', 'value': 46327000000.0}]},
 {'name': 'Net income',
  'values': [{'period': '2017-12', 'value': 3033000000.0},
   {'period': '2018-12', 'value': 10073000000.0},
   {'period': '2019-12', 'value': 11588000000.0},
   {'period': '2020-12', 'value': 21331000000.0},
   {'period': '2021-12', 'value': 33364000000.0},
   {'period': 'TTM', 'value': 33364000000.0}]},
 {'name': 'Depreciation & amortization',
  'values': [{'period': '2017-12', 'value': 11478000000.0},
   {'period': '2018-12', 'value': 15341000000.0},
   {'period': '2019-12', 'value': 21789000000.0},
   {'period': '2020-12', 'value': 25251000000.0},
   {'period': '2021-12', 'value': 34296000000.0},
   {'perio

In [218]:
from functools import partial

In [220]:
def get_results(stmt_type, ticker):
    """Print the statement type and the ticker on one line, space-separated.

    Currently a stub: it only echoes its two arguments and returns None.
    """
    print(stmt_type, ticker, sep=' ')

In [221]:
# Pre-bind 'bs' as the stmt_type argument so callers pass only the ticker.
get_bs = partial(get_results, 'bs')

In [222]:
# Equivalent to get_results('bs', 'AMZN'), routed through the partial.
get_bs('AMZN')

bs AMZN
