In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml
from lxml import html
import csv
from decimal import *
import math

In [2]:
class statistics:
    base_url = "https://finance.yahoo.com/"
    def __init__(self, symbol):
        '''

        :param symbol: stock symbol in all caps
        please note that any Canadian TSX stocks are followed with ".TO"     
        check the relevant URL for formatting.
        '''
        self.symbol = symbol.upper()
        self.path = "quote/{0}/key-statistics?p={0}".format(symbol)
        self.url = self.base_url + self.path
        self.methods = ['scrape_page', 'label_stats']
        self.attributes = ['self.symbol', 'self.path', 'self.url',
                          'self.methods', 'self.hdrs','self.valuation',
                          'self.fiscal_year', 'self.profitability',
                          'self.manager_effect','self.income_statement',
                          'self.balance_sheet', 'self.cash_statement',
                          'self.price_history', 'self.share_stats',  
                          'self.dividendSplit']
        self.hdrs = {"authority": "finance.yahoo.com",
                     "method": "GET",
                     "path": self.path,
                     "scheme": "https",
                     "accept": "text/html,application/xml;q=0.9",
                     "accept-encoding": "gzip, deflate, br",
                     "accept-language": "en-US,en;q=0.9",
                     "referer": self.base_url,
                     "sec-fetch-mode": "navigate",
                     "sec-fetch-site": "same-origin",
                     "sec-fetch-user": "?1",
                     "upgrade-insecure-requests": "1",
                     "user-agent": "Mozilla/5.0 (Windows NT 10.0;)"}
    def scrape_page(self):
        '''

        :return: scrapes the content of the class URL,
                   using headers defined in the init function,
                   returning a byte string of html code.
        '''
        page = requests.get(self.url, headers=self.hdrs)
        soup = BeautifulSoup(page.content, 'lxml')
        tables = soup.find_all('table')
        iterator = range(0, len(tables))
        function = lambda x: pd.read_html(str(tables[x]))
        table_list = list(map(function, iterator))[0][0]
        market_cap = table_list.iloc[0][1]
        return market_cap
    def label_stats(self, table_list):
        '''
        :param table_list: uses the output of the scrape_page method
        :return: creates attributes for the statistics class object,
                 uses indexLabel method to label columns and set the dataframes' index
        
        '''
        iterator = [table_list[i][0] for i in range(0, len(table_list))]
        
        table_list = list(map(lambda df: self.__indexLabel__(df), iterator))
        self.valuation, self.fiscal_year, self.profitability, self.manager_effect, \
        self.income_statement, self.balance_sheet, self.cash_statement, \
        self.price_history, self.share_stats, self.dividendSplit = table_list
        
        return table_list
    def __indexLabel__(self, df):
        '''
        
        :param df: Takes a dataframe as input.
        :return: returns a dataframe with column labels and a set index.
        
        '''
        df.columns = ['Measure', 'Value']
        df = df.set_index('Measure')
        return df

In [3]:
def save_in_csv(stock_symbols):
    oneplace = Decimal(10) ** -1
    market_caps = []

    for stock_symbol in stock_symbols:
        market_cap = statistics(stock_symbol).scrape_page()
        print(stock_symbol, ': ', market_cap)
        
        if not isinstance(market_cap, str): #math.isnan(market_cap):
            market_caps.append('N/A')
        else:
            mc_number = float(market_cap[:-1])
            mc_number_to_one_decimal = Decimal(mc_number).quantize(oneplace)
            mc_number_unit = market_cap[-1] # M for million, B for billion, T for trillion
            formatted_mc = 'USD ' + str(mc_number_to_one_decimal) + ' ' + mc_number_unit
            market_caps.append(formatted_mc)

    print('mcs: ', market_caps)
    np.savetxt('soupy_trial.csv', [row for row in zip(['stock symbol', *stock_symbols], ['market cap', *market_caps])], delimiter=',', fmt='%s')

In [4]:
stock_symbols = ['TCEHY', 'PTR','KC','ZNH','ZME','WDH','TSM','TCOM','TAL','SVA','PDD','NIO','MF','LNVGY','LFC','KRKR','JOBS','JKS','JD','IQ','FUTU','EDU','CEA','BILI','BGNE','BABA']
save_in_csv(stock_symbols)

TCEHY :  478.29B
PTR :  150.71B
KC :  1.25B
ZNH :  15.26B
ZME :  25.23M
WDH :  nan
TSM :  500.27B
TCOM :  13.59B
TAL :  2.05B
SVA :  460.25M
PDD :  53.39B
NIO :  33.20B
MF :  308.44M
LNVGY :  nan
LFC :  100.22B
KRKR :  44.34M
JOBS :  4.11B
JKS :  nan
JD :  85.70B
IQ :  3.82B
FUTU :  5.14B
EDU :  2.12B
CEA :  11.88B
BILI :  nan
BGNE :  21.45B
BABA :  0.89
mcs:  ['USD 478.3 B', 'USD 150.7 B', 'USD 1.2 B', 'USD 15.3 B', 'USD 25.2 M', 'N/A', 'USD 500.3 B', 'USD 13.6 B', 'USD 2.0 B', 'USD 460.2 M', 'USD 53.4 B', 'USD 33.2 B', 'USD 308.4 M', 'N/A', 'USD 100.2 B', 'USD 44.3 M', 'USD 4.1 B', 'N/A', 'USD 85.7 B', 'USD 3.8 B', 'USD 5.1 B', 'USD 2.1 B', 'USD 11.9 B', 'N/A', 'USD 21.4 B', 'N/A']
