In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests, lxml
from lxml import html
import csv
from decimal import *
import math
import datetime
# from retry import retry

In [2]:
class statistics:
    '''
    Scraper for the Yahoo Finance "key-statistics" page of one stock symbol.

    Class attributes:
        base_url: root of the site; the per-symbol path is appended to it.
        request_timeout: seconds to wait for the HTTP response before
                         requests raises a timeout error.
    '''
    base_url = "https://finance.yahoo.com/"
    request_timeout = 30

    def __init__(self, symbol):
        '''

        :param symbol: stock symbol (it is upper-cased here).
        please note that any Canadian TSX stocks are followed with ".TO"
        check the relevant URL for formatting.
        '''
        self.symbol = symbol.upper()
        # Build the path from the normalised symbol so that the URL and
        # self.symbol always refer to the same spelling.
        self.path = "quote/{0}/key-statistics?p={0}".format(self.symbol)
        self.url = self.base_url + self.path
        self.methods = ['scrape_page', 'label_stats']
        # NOTE: 'self.valuation' only exists after label_stats() has run.
        self.attributes = ['self.symbol', 'self.path', 'self.url',
                           'self.methods', 'self.hdrs', 'self.valuation']
        # Browser-like request headers sent with every page request.
        self.hdrs = {"authority": "finance.yahoo.com",
                     "method": "GET",
                     "path": self.path,
                     "scheme": "https",
                     "accept": "text/html,application/xml;q=0.9",
                     "accept-encoding": "gzip, deflate, br",
                     "accept-language": "en-US,en;q=0.9",
                     "referer": self.base_url,
                     "sec-fetch-mode": "navigate",
                     "sec-fetch-site": "same-origin",
                     "sec-fetch-user": "?1",
                     "upgrade-insecure-requests": "1",
                     "user-agent": "Mozilla/5.0 (Windows NT 10.0;)"}

    def scrape_page(self):
        '''
        Download the class URL (using the headers defined in __init__) and
        read the first HTML table on the page.

        :return: the value in the first row, second column of the first
                 table (callers treat it as the market cap), or NaN when
                 the page contains no table at all.
        '''
        # Local import: pandas deprecates passing literal HTML to read_html,
        # so the markup is wrapped in a file-like object.
        from io import StringIO

        page = requests.get(self.url, headers=self.hdrs,
                            timeout=self.request_timeout)
        soup = BeautifulSoup(page.content, 'lxml')
        first_table = soup.find('table')
        if first_table is None:
            # Same "not a usable string" signal as an empty table cell.
            return float('nan')
        # Only the first table is needed; the others are not parsed.
        frames = pd.read_html(StringIO(str(first_table)))
        return frames[0].iloc[0, 1]

    def label_stats(self, table_list):
        '''
        :param table_list: a sequence whose items are themselves sequences
                           of dataframes; only the first dataframe of each
                           item is used.
        :return: the list of labelled dataframes (see __indexLabel__);
                 the same list is stored on the object as self.valuation.

        '''
        first_frames = [tables[0] for tables in table_list]
        labelled = [self.__indexLabel__(df) for df in first_frames]

        self.valuation = labelled
        print(self.valuation)
        return labelled

    def __indexLabel__(self, df):
        '''

        :param df: a two-column dataframe; it is left unmodified.
        :return: a copy whose columns are named 'Measure' and 'Value',
                 with 'Measure' set as the index.

        '''
        labelled = df.copy()
        labelled.columns = ['Measure', 'Value']
        return labelled.set_index('Measure')

In [3]:
class RealTimeCurrencyConverter():
    '''
    Currency converter backed by a JSON exchange-rate endpoint.

    The JSON payload must contain a 'rates' mapping (currency code -> rate)
    and a 'date' field; both are stored on the instance.
    '''
    def __init__(self, url):
        '''
        :param url: endpoint returning the exchange-rate JSON document.
        :raises requests.HTTPError: when the endpoint answers with an
                                    error status.
        '''
        # A timeout keeps a stalled endpoint from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        self.data = response.json()
        self.rates = self.data['rates']
        self.date = self.data['date']

    def convert(self, amount, currency):
        '''
        :param amount: amount expressed in `currency`.
        :param currency: currency code used as key into self.rates.
        :return: `amount` unchanged for 'USD', otherwise `amount` divided
                 by the rate stored for `currency`.
        :raises KeyError: when `currency` is not present in self.rates.
        '''
        if currency == 'USD':
            return amount
        conversion_rate = self.rates[currency]
        return amount / conversion_rate

In [4]:
# Exchange-rate endpoint; RealTimeCurrencyConverter fetches it once on
# construction and reads the 'rates' and 'date' fields of the JSON reply.
url = 'https://api.exchangerate-api.com/v4/latest/USD'
# Module-level converter instance; save_in_csv calls converter.convert().
converter = RealTimeCurrencyConverter(url)

In [5]:
# Scale factor for each magnitude suffix: thousand, million, billion, trillion.
unit_multiplier = dict(K=10 ** 3, M=10 ** 6, B=10 ** 9, T=10 ** 12)


def convert_mc_actual_number(mc_number, mc_number_unit):
    '''
    Expand an abbreviated figure into its full value.

    :param mc_number: numeric part of the figure, e.g. 4.1 for "4.1B".
    :param mc_number_unit: magnitude suffix, a key of unit_multiplier.
    :return: mc_number multiplied by the factor of the given suffix.
    :raises KeyError: when the suffix is not a key of unit_multiplier.
    '''
    scale = unit_multiplier[mc_number_unit]
    return mc_number * scale

In [6]:
def convert_formatted_mc_actual_number(usd_amount):
    '''
    Format an amount as a one-decimal figure with a magnitude suffix.

    The largest unit of unit_multiplier that the amount reaches is used
    ('T', then 'B', then 'M'); anything below one million is expressed
    in 'K'. The scaled figure is rounded away from zero (ROUND_UP) to one
    decimal place.

    :param usd_amount: the amount to format.
    :return: a string such as '4.2 B'.
    '''
    oneplace = Decimal(10) ** -1

    # Probe the units from largest to smallest; 'K' is the catch-all.
    for unit in ('T', 'B', 'M'):
        if usd_amount >= unit_multiplier[unit]:
            break
    else:
        unit = 'K'

    scaled = usd_amount / unit_multiplier[unit]
    # Decimal(float) would expose the binary representation error
    # (4.2 -> 4.2000000000000001776...), which ROUND_UP then turns into 4.3.
    # Going through str() keeps the shortest decimal form of the float.
    rounded = Decimal(str(scaled)).quantize(oneplace, rounding=ROUND_UP)
    return str(rounded) + ' ' + unit

In [7]:
def get_currency(stock_symbol):
    '''
    Infer the quote currency from the exchange suffix of a symbol.

    Only the suffix is inspected, so a plain ticker that merely contains
    the letters "HK", "SS" or "SZ" (e.g. "HKD", "SSYS") is not mistaken
    for a Hong Kong / Shanghai / Shenzhen listing.

    :param stock_symbol: symbol such as '0700.HK', '600733.SS' or 'BABA'.
    :return: 'HKD' for '.HK', 'CNY' for '.SS' / '.SZ', otherwise 'USD'.
    '''
    symbol = stock_symbol.strip().upper()
    if symbol.endswith('.HK'):
        return 'HKD'
    if symbol.endswith(('.SS', '.SZ')):
        return 'CNY'
    return 'USD'

In [8]:
def get_note(currency, time):
    '''
    Build the quoted note text stored next to a market-cap figure.

    :param currency: 'USD', 'HKD' or 'CNY'.
    :param time: timestamp text inserted into the note.
    :return: the note wrapped in double quotes, or None for any other
             currency.
    '''
    lead = '"Valuation is reported as market capitalization. '
    converted = lead + 'It was converted from {} using the exchange rate from {}."'

    if currency == 'USD':
        return (lead + 'It was last updated on {}."').format(time)
    if currency == 'HKD':
        return converted.format(currency, time)
    if currency == 'CNY':
        # The note names the currency RMB rather than CNY.
        return converted.format('RMB', time)
    return None


In [9]:
# Previously saved results, keyed by stock symbol: symbol -> [market cap, note].
# save_in_csv falls back to these values when a fresh scrape is unusable.
yfinance_data = {}

try:
    with open('yfinance_data.csv', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; nothing to cache.
                continue
            stock_symbol, market_cap, note = row
            # The reader strips the surrounding quotes; put them back so the
            # note can be written out again unchanged.
            yfinance_data[stock_symbol] = [market_cap, '"{}"'.format(note)]
except FileNotFoundError:
    # First run: the file is only created by save_in_csv, so start empty.
    print('yfinance_data.csv not found; starting with an empty cache')

In [10]:
# @retry(ValueError, tries=3, delay=1, jitter=1)
def save_in_csv(stock_symbols):
    '''
    Scrape the market cap of every symbol, convert it to USD and write
    the results to 'yfinance_data.csv'.

    For each symbol the scraped value is expected to look like '4.10B'
    (number followed by a K/M/B/T suffix). When it does not, the entry
    cached in yfinance_data is reused; when the symbol is not cached
    either, a placeholder row is written instead of aborting the run.

    :param stock_symbols: iterable of symbols accepted by statistics().
    :return: None; the CSV file is (over)written as a side effect.
    '''
    # Materialise once: the symbols are iterated here and again when the
    # CSV rows are built, which would come up empty for a one-shot iterator.
    stock_symbols = list(stock_symbols)
    market_caps = []
    notes = []

    for stock_symbol in stock_symbols:
        timestamp = datetime.datetime.now().strftime("%B %d, %Y %H:%M:%S")

        market_cap = statistics(stock_symbol).scrape_page()
        print(stock_symbol, ': ', market_cap)

        parsed = _parse_market_cap(market_cap)
        if parsed is None:
            # Unusable scrape: fall back to the cached row, if any.
            cached_cap, cached_note = yfinance_data.get(
                stock_symbol,
                ['N/A', '"Market capitalization was not available."'])
            market_caps.append(cached_cap)
            notes.append(cached_note)
        else:
            mc_number, mc_number_unit = parsed
            mc_actual_number = convert_mc_actual_number(mc_number, mc_number_unit)
            currency = get_currency(stock_symbol)
            usd_amount = converter.convert(mc_actual_number, currency)

            market_caps.append('USD ' + convert_formatted_mc_actual_number(usd_amount))
            notes.append(get_note(currency, timestamp))

    print('mcs: ', market_caps)
    rows = list(zip(['stock symbol', *stock_symbols],
                    ['market cap', *market_caps],
                    ['note', *notes]))
    np.savetxt('yfinance_data.csv', rows, delimiter=',', fmt='%s')


def _parse_market_cap(market_cap):
    '''
    Split a scraped value such as '4.10B' into (4.1, 'B').

    :return: (number, unit) or None when the value is not a string, is
             empty, lacks a K/M/B/T suffix or has a non-numeric body.
    '''
    if not isinstance(market_cap, str):
        return None
    text = market_cap.strip()
    if not text or text[-1] not in ('K', 'M', 'B', 'T'):
        return None
    try:
        number = float(text[:-1].replace(',', ''))
    except ValueError:
        return None
    return number, text[-1]

In [11]:
def prioritize(tickers):
    '''
    Pick one listing per company, preferring NYSE, then HK, SS and SZ.

    :param tickers: iterable of strings, each holding the comma-separated
                    listings of one company.
    :return: list with the chosen listing of every company that has at
             least one of the four markers; other entries are skipped.
             For NYSE the text after the first ':' is used; all results
             are stripped of surrounding whitespace.
    '''
    exchange_order = ('NYSE', 'HK', 'SS', 'SZ')
    chosen = []

    for ticker in tickers:
        # First marker (in priority order) that occurs anywhere in the entry.
        marker = next((m for m in exchange_order if m in ticker), None)
        if marker is None:
            continue

        # First comma-separated listing that carries that marker.
        listing = next(part for part in ticker.split(',') if marker in part)
        if marker == 'NYSE':
            # NYSE listings are written as "NYSE: SYMBOL".
            listing = listing.split(':')[1]
        chosen.append(listing.strip())

    return chosen

In [12]:
# Read the raw ticker lines, one company per line.
with open('unparsed_tickers.txt', 'r') as f:
    file = list(f)

# Drop the trailing line terminators only; other whitespace is kept.
unparsed_tickers = list(map(lambda line: line.strip('\n\r'), file))

# Choose one listing per company, then scrape and store the market caps.
stock_symbols = prioritize(unparsed_tickers)
save_in_csv(stock_symbols)

KRKR :  38.55M
JOBS :  4.10B
BABA :  261.93B
2357.HK :  32.00B
BGNE :  19.32B
1211.HK :  698.09B
0031.HK :  1.53B
1062.HK :  348.27M
JD :  90.08B
1288.HK :  1.30T
2020.HK :  252.64B
600733.SS :  30.35B
9888.HK :  363.24B
3988.HK :  1.11T
3328.HK :  446.96B
BILI :  10.19B
300750.SZ :  1.05T
000625.SZ :  71.15B
0939.HK :  0.61
CEA :  0.84
3908.HK :  0.77
LFC :  1.09
600111.SS :  -0.06
1186.HK :  0.26
0390.HK :  0.3
ZNH :  1.53
601888.SS :  1.58
300014.SZ :  1.07
1777.HK :  1.2
2196.HK :  0.62
FUTU :  0.96
2238.HK :  0.44
1772.HK :  0.76
0175.HK :  1.29
002340.SZ :  0.8
603986.SS :  0.66
002074.SZ :  0.6
2333.HK :  1.31
000651.SZ :  0.88
6690.HK :  1.15
600276.SS :  0.41
HNP :  0.58
1611.HK :  0.29
1398.HK :  0.53
IQ :  0.8
6618.HK :  nan
2618.HK :  nan
JKS :  1.08
KC :  nan
1024.HK :  nan
LNVGY :  1.13
300433.SZ :  1.18
2331.HK :  0.66
601012.SS :  1.41
002475.SZ :  0.91
3690.HK :  1.0
000333.SZ :  1.03
MF :  nan
1107.HK :  0.72
002714.SZ :  0.8
2150.HK :  nan
EDU :  1.0
NIO :  2.47
0327