In [1]:
import bar_chart_race as bcr
import pandas as pd
import numpy as np
import yfinance as yf

import warnings
warnings.filterwarnings('ignore')
import os
import pickle

# Column names used to initialise the combined stock-info DataFrame built in
# get_all_stock_info(). Presumably these mirror the keys of yfinance's
# ``Ticker.info`` dict -- TODO confirm against the yfinance version in use.
COLUMNS=['zip', 'sector', 'fullTimeEmployees', 'longBusinessSummary', 'city', 'phone', 'state', 'country', 'companyOfficers', 
         'website', 'maxAge', 'address1', 'fax', 'industry', 'previousClose', 'regularMarketOpen', 'twoHundredDayAverage', 
         'trailingAnnualDividendYield', 'payoutRatio', 'volume24Hr', 'regularMarketDayHigh', 'navPrice', 'averageDailyVolume10Day', 
         'totalAssets', 'regularMarketPreviousClose', 'fiftyDayAverage', 'trailingAnnualDividendRate', 'open', 'averageVolume10days', 
         'expireDate', 'yield', 'algorithm', 'dividendRate', 'exDividendDate', 'beta', 'circulatingSupply', 'startDate', 
         'regularMarketDayLow', 'priceHint', 'currency', 'trailingPE', 'regularMarketVolume', 'lastMarket', 'maxSupply', 'openInterest', 
         'marketCap', 'volumeAllCurrencies', 'strikePrice', 'averageVolume', 'priceToSalesTrailing12Months', 'dayLow', 'ask', 'ytdReturn', 
         'askSize', 'volume', 'fiftyTwoWeekHigh', 'forwardPE', 'fromCurrency', 'fiveYearAvgDividendYield', 'fiftyTwoWeekLow', 'bid', 
         'tradeable', 'dividendYield', 'bidSize', 'dayHigh', 'exchange', 'shortName', 'longName', 'exchangeTimezoneName', 
         'exchangeTimezoneShortName', 'isEsgPopulated', 'gmtOffSetMilliseconds', 'underlyingSymbol', 'quoteType', 'symbol', 
         'underlyingExchangeSymbol', 'headSymbol', 'messageBoardId', 'uuid', 'market', 'annualHoldingsTurnover', 'enterpriseToRevenue', 
         'beta3Year', 'profitMargins', 'enterpriseToEbitda', '52WeekChange', 'morningStarRiskRating', 'forwardEps', 'revenueQuarterlyGrowth', 
         'sharesOutstanding', 'fundInceptionDate', 'annualReportExpenseRatio', 'bookValue', 'sharesShort', 'sharesPercentSharesOut', 
         'fundFamily', 'lastFiscalYearEnd', 'heldPercentInstitutions', 'netIncomeToCommon', 'trailingEps', 'lastDividendValue', 
         'SandP52WeekChange', 'priceToBook', 'heldPercentInsiders', 'nextFiscalYearEnd', 'mostRecentQuarter', 'shortRatio', 
         'sharesShortPreviousMonthDate', 'floatShares', 'enterpriseValue', 'threeYearAverageReturn', 'lastSplitDate', 'lastSplitFactor', 
         'legalType', 'morningStarOverallRating', 'earningsQuarterlyGrowth', 'dateShortInterest', 'pegRatio', 'lastCapGain', 
         'shortPercentOfFloat', 'sharesShortPriorMonth', 'category', 'fiveYearAverageReturn', 'regularMarketPrice', 'logo_url']

In [8]:
def get_all_stock_info(name='all_stock_info'):
    """Return a table with one row of yfinance ``info`` metadata per ticker.

    The combined table is cached at ``fixtures/<name>.parquet.gzip``. If that
    file exists it is loaded and returned unchanged. Otherwise the ticker list
    is read from ``fixtures/tickers.pickle``; each ticker's ``info`` dict is
    fetched through ``yf.Ticker`` (and cached per ticker as
    ``fixtures/stocks/<ticker>.pickle``), the rows are assembled into a frame
    whose columns are exactly ``COLUMNS``, and the result is written to the
    parquet cache.

    Args:
        name: Base name (without extension) of the combined parquet cache.

    Returns:
        pandas.DataFrame indexed by ticker symbol (with '.' replaced by '-').
        Tickers whose info is missing, empty, or not a dict are reported on
        stdout and left out.
    """
    filename = 'fixtures/{}.parquet.gzip'.format(name)
    if os.path.exists(filename):
        # Cache hit: nothing else needs to be built or touched.
        all_df = pd.read_parquet(filename)
        print('All stock data loaded.')
        return all_df

    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # at fixture files you created yourself.
    with open('fixtures/tickers.pickle', 'rb') as f:
        tickers = pickle.load(f)

    # The per-ticker cache directory may not exist on a fresh checkout.
    os.makedirs('fixtures/stocks', exist_ok=True)

    labels = []
    rows = []
    for ticker in sorted(set(tickers)):
        # The ticker list uses '.', the lookup symbol uses '-'.
        yahoo_ticker = ticker.replace('.', '-')
        info_file = 'fixtures/stocks/{}.pickle'.format(yahoo_ticker)
        if not os.path.exists(info_file):
            print('Processing {}'.format(yahoo_ticker))
            t = yf.Ticker(yahoo_ticker)
            try:
                info = dict(t.info)
            except Exception as exc:  # keep Ctrl-C / SystemExit working
                print('Failed to fetch {}: {}'.format(yahoo_ticker, exc))
                info = None
            # A failed lookup is cached as None on purpose so it is not
            # re-requested on every run; delete the pickle to retry.
            with open(info_file, 'wb') as f:
                pickle.dump(info, f)
        else:
            with open(info_file, 'rb') as f:
                info = pickle.load(f)

        if isinstance(info, dict) and info:
            labels.append(yahoo_ticker)
            rows.append(info)
        else:
            print('Skipped ticker: {}'.format(yahoo_ticker))

    # Build the frame once instead of enlarging it row by row (each .loc
    # enlargement copies the whole frame). reindex() drops keys that are not
    # in COLUMNS and adds missing ones as NaN.
    all_df = pd.DataFrame(rows, index=labels).reindex(columns=COLUMNS)
    # Normalise None placeholders to NaN before writing.
    all_df = all_df.fillna(value=np.nan)
    all_df.to_parquet(filename, compression='gzip')
    print('All stock data processed.')
    return all_df

def get_all_stocks(start='', end=''):
    """Return the price history of every ticker, optionally cut to a range.

    Tickers are read from ``fixtures/tickers.pickle``. Each ticker's history
    is loaded from ``fixtures/stocks/<ticker>.parquet.gzip`` when that file
    exists; otherwise it is fetched with ``yf.Ticker(...).history(period='max')``
    and written to that file first.

    Args:
        start: Lower bound passed to the frame's slice; '' (default) or None
            leaves the range open at the beginning.
        end: Upper bound passed to the frame's slice; '' (default) or None
            leaves the range open at the end.

    Returns:
        dict mapping ticker symbol (with '.' replaced by '-') to its
        DataFrame, sliced as ``df[start:end]``.
    """
    # NOTE(security): pickle.load can execute arbitrary code. Only point this
    # at fixture files you created yourself.
    with open('fixtures/tickers.pickle', 'rb') as f:
        tickers = pickle.load(f)

    # An empty string means "unbounded", which is what None means to a slice.
    lower = None if start == '' else start
    upper = None if end == '' else end

    dfs = {}
    for ticker in sorted(set(tickers)):
        # The ticker list uses '.', the lookup symbol uses '-'.
        yahoo_ticker = ticker.replace('.', '-')
        stock_file = 'fixtures/stocks/{}.parquet.gzip'.format(yahoo_ticker)
        if not os.path.exists(stock_file):
            print('Processing {}'.format(yahoo_ticker))
            # The cache directory may not exist on a fresh checkout.
            os.makedirs(os.path.dirname(stock_file), exist_ok=True)
            t = yf.Ticker(yahoo_ticker)
            df = t.history(period='max')
            df.to_parquet(stock_file, compression='gzip')
        else:
            df = pd.read_parquet(stock_file)

        if lower is None and upper is None:
            # No range requested: hand back the frame itself.
            dfs[yahoo_ticker] = df
        else:
            dfs[yahoo_ticker] = df[lower:upper]

    return dfs

In [18]:
# Load the combined per-ticker info table (read from the parquet cache when it exists).
df = get_all_stock_info()

All stock data loaded.


In [24]:
# Sum of marketCap, scaled by 1e9, over rows with country 'China' and city 'Beijing'.
df.loc[(df['country'] == 'China') & (df['city'] == 'Beijing'), 'marketCap'].sum() / 1000000000

813.854236286

In [25]:
# Sum of marketCap, scaled by 1e9, over rows with country 'China' and city 'Shanghai'.
df.loc[(df['country'] == 'China') & (df['city'] == 'Shanghai'), 'marketCap'].sum() / 1000000000

515.582304124

In [29]:
# Sum of marketCap, scaled by 1e9, over rows with country 'China' and city 'Shenzhen'.
df.loc[(df['country'] == 'China') & (df['city'] == 'Shenzhen'), 'marketCap'].sum() / 1000000000

45.301750076

In [30]:
# Sum of marketCap, scaled by 1e9, over rows with country 'China' and city 'Guangzhou'.
df.loc[(df['country'] == 'China') & (df['city'] == 'Guangzhou'), 'marketCap'].sum() / 1000000000

96.679661232

In [31]:
# Sum of marketCap, scaled by 1e9, over every row with country 'China'.
df.loc[df['country'] == 'China', 'marketCap'].sum() / 1000000000

2185.122036258

In [60]:
# Per-city totals of marketCap (scaled by 1e9) and fullTimeEmployees, showing
# the 30 cities with the largest summed marketCap.
# Only the two columns used below are aggregated: summing every column of the
# info table is wasteful, and with the pandas >= 2.0 default
# (numeric_only=False) the non-numeric columns are concatenated or raise.
df1 = df.groupby('city')[['marketCap', 'fullTimeEmployees']].sum()
s1 = df1['marketCap'] / 1000000000
s2 = df1['fullTimeEmployees']
df2 = pd.DataFrame({'marketCap': s1, 'Employees': s2})
df2.sort_values(by=['marketCap'], ascending=False)[:30]

Unnamed: 0_level_0,marketCap,Employees
city,Unnamed: 1_level_1,Unnamed: 2_level_1
New York,3265.58727,2319086.0
Mountain View,2524.454072,300160.0
Cupertino,2290.706467,147263.0
Seattle,1977.528622,1649369.0
San Francisco,1974.021796,709966.0
Redmond,1682.012494,163119.0
San Jose,1320.75094,346725.0
Atlanta,1083.729498,1208661.0
London,974.906014,1303487.0
Santa Clara,967.712177,220765.0


In [50]:
# The info table has no 'E' column, so df['E'] raises KeyError on a fresh run.
# The recorded output is the fullTimeEmployees series, so select that column
# by its real name.
df['fullTimeEmployees']

index
A       16400.0
AA      13800.0
AACG      768.0
AACQ        NaN
AAIC       11.0
         ...   
ZUMZ     2575.0
ZUO      1249.0
ZVO      1830.0
ZYME      325.0
ZYNE       28.0
Name: fullTimeEmployees, Length: 6358, dtype: float64