In [1]:
from bs4 import BeautifulSoup
import requests
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import datetime as datetime
import json
from dateutil import relativedelta
import webbrowser
import glob
import os
import time
#us-gaap items:'http://www.xbrlsite.com/LinkedData/BrowseObjectsByType_HTML.aspx?Type=%5BConcept%5D&Submit=Submit'
#other python code: https://github.com/lukerosiak/pysec
#FASB US GAAP Taxonomy:https://www.fasb.org/cs/ContentServer?c=Page&cid=1176169699514&d=&pagename=FASB%2FPage%2FSectionPage

## Analysis

In [2]:
# Connection to the local SQLite file that stores the extracted tables.
engine = create_engine('sqlite:///Corp_Financials_Cash.db')

# To list the tables available in the database, run:
#   pd.read_sql('SELECT * FROM sqlite_master', engine)

# Load every row of sp_500_tables and discard exact duplicate rows.
df = pd.read_sql('SELECT * FROM sp_500_tables', engine).drop_duplicates()

In [3]:
# Load the sp_500 table and keep a single row per CIK (the first one encountered).
sp_500 = pd.read_sql('SELECT * FROM sp_500', engine).drop_duplicates(subset = ['CIK'])

In [4]:
# Store CIK as an integer column (presumably to match sp_500['CIK'] for the merge that follows -- TODO confirm).
df['CIK'] = df['CIK'].astype(int)

In [5]:
# Attach sector and ticker to every row of df; how='right' keeps all df rows
# even when their CIK has no match in sp_500.
df = pd.merge(sp_500[['GICS Sector', 'CIK', 'Ticker symbol']], df, on = ['CIK'], how = 'right')

In [6]:
#################
# Change CutDate dtype to datetime
#################
def _parse_cut_date(raw):
    """Parse a 'YYYY-MM-DD' string into a datetime; empty strings are passed through unchanged."""
    if raw != '':
        return datetime.datetime.strptime(raw, '%Y-%m-%d')
    return raw

# NOTE: rows with an empty CutDate keep '' so the column may end up with mixed types.
df['CutDate'] = df.CutDate.apply(_parse_cut_date)

In [7]:
#################
# Wrangle reporting_period: values stored as byte strings are converted to integers.
#################

# Decoded 8-character byte string -> period length as text.
# NOTE(review): this is not a plain integer decode -- e.g. '\x07...' maps to '6'
# and ')...' maps to '100'; confirm these mappings are intended.
convert_date = {
    '\x00\x00\x00\x00\x00\x00\x00\x00': '0',
    '\x03\x00\x00\x00\x00\x00\x00\x00': '3',
    '\x06\x00\x00\x00\x00\x00\x00\x00': '6',
    '\x0c\x00\x00\x00\x00\x00\x00\x00': '12',
    '\x07\x00\x00\x00\x00\x00\x00\x00': '6',
    '\t\x00\x00\x00\x00\x00\x00\x00': '9',
    '\x04\x00\x00\x00\x00\x00\x00\x00': '3',
    '\x01\x00\x00\x00\x00\x00\x00\x00': '1',
    ')\x00\x00\x00\x00\x00\x00\x00': '100',
    '\n\x00\x00\x00\x00\x00\x00\x00': '9',
    '\x05\x00\x00\x00\x00\x00\x00\x00': '5',
}
# Off-by-one period lengths folded onto the neighbouring standard length.
normalized = {7: 6, 4: 3, 10: 9}


def _decode_period(raw):
    """Translate a bytes value through convert_date; anything else is returned as is."""
    if type(raw) == bytes:
        return convert_date[raw.decode()]
    return raw


def _period_to_int(raw):
    """Cast to int, leaving empty strings untouched."""
    if raw != '':
        return int(raw)
    return raw


# Three passes, in this order: decode bytes, cast to int, normalise.
df['reporting_period'] = df.reporting_period.apply(_decode_period)
df['reporting_period'] = df.reporting_period.apply(_period_to_int)
df['reporting_period'] = df.reporting_period.apply(lambda months: normalized.get(months, months))

In [8]:
def period_comparison(df, table_name_list,item, member = False, period = 'min', convert_float = True):
    """Compare the value of one us-gaap item between the latest and the earliest cut date.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'table_name', 'us-gaap', 'contextref', 'CIK',
        'CutDate', 'reporting_period', 'member', 'depth', 'decimals', 'value'
        and 'GICS Sector'. It is not modified.
    table_name_list : list of str
        Rows are kept when 'table_name' matches any entry. The entries are
        joined with '|' and used as a regular expression. Rows whose
        'table_name' is missing never match.
    item : str
        Concept to extract; compared in lower case against the 'us-gaap' column.
    member : bool
        False keeps only rows whose member is 'Date_Only'; True keeps all members.
    period : str
        'min' / 'max' keep, per (CIK, member), only the smallest / largest
        reporting_period (e.g. single quarter vs. YTD). Any other value keeps
        every reporting period.
    convert_float : bool
        When true, both values are cast to float and 'value_diff'
        (this period minus last period) is added. When false, values are left
        as stored and no 'value_diff' column is produced.

    Returns
    -------
    DataFrame
        One row per (CIK, member, reporting_period) with 'this_period' (latest
        CutDate), 'last_period' (earliest CutDate), 'GICS Sector',
        'value_this_period', 'value_last_period' and, optionally, 'value_diff'.
        Exact duplicate rows are removed.
    """
    company_keys = ['CIK','reporting_period','member']

    # na = False: a missing table_name would otherwise yield NaN in the mask,
    # which cannot be used for boolean indexing and aborts the whole extraction.
    in_tables = df['table_name'].str.contains('|'.join(table_name_list), na = False)
    df_table = df.loc[in_tables,:]
    # Independent copy so the helper column below never touches the caller's frame.
    df_table = df_table[df_table['us-gaap'] == item.lower()].copy()
    df_table['contextref_len'] = df_table['contextref'].apply(len)

    # Tie-break between duplicated facts: shortest contextref, smallest depth,
    # largest decimals. The three extremes are computed independently and merged
    # back, so only rows matching all three survive; if no single row matches,
    # the left merge leaves that fact with NaN in the remaining columns.
    best = df_table.groupby(by = ['CIK','CutDate','us-gaap','reporting_period','member'])\
        .agg({'contextref_len':'min','depth':'min','decimals':'max'}).reset_index()
    df_table = best.merge(df_table, on = ['CIK','CutDate','reporting_period','us-gaap','contextref_len','depth','decimals','member'], how = 'left')

    if not member:
        df_table = df_table[df_table.member == 'Date_Only']

    # Latest and earliest cut date per company / reporting period / member.
    df_main = df_table.groupby(company_keys).agg({'CutDate':['max','min']}).reset_index()
    df_main.columns = ['CIK','reporting_period','member','this_period','last_period']

    if period == 'min':
        chosen = df_main.groupby(by = ['CIK','member'])['reporting_period'].min().reset_index()
        df_main = chosen.merge(df_main, on = company_keys, how = 'left')
    elif period == 'max':
        chosen = df_main.groupby(by = ['CIK','member'])['reporting_period'].max().reset_index()
        df_main = chosen.merge(df_main, on = company_keys, how = 'left')

    # Look up the value at this_period (together with the sector) and at last_period.
    this_values = df_table[['CIK','GICS Sector','value','reporting_period','CutDate','member']]
    last_values = df_table[['CIK','value','reporting_period','CutDate','member']]
    df_main = df_main.merge(this_values, left_on = company_keys + ['this_period'],
                            right_on = company_keys + ['CutDate'], how = 'left')\
        .merge(last_values, left_on = company_keys + ['last_period'],
               right_on = company_keys + ['CutDate'], how = 'left')\
        .rename(columns = {'value_x': 'value_this_period','value_y':'value_last_period'})\
        .drop(['CutDate_x','CutDate_y'], axis = 1)

    if convert_float:
        df_main['value_this_period'] = df_main['value_this_period'].astype('float')
        df_main['value_last_period'] = df_main['value_last_period'].astype('float')
        df_main['value_diff'] = df_main['value_this_period'] - df_main['value_last_period']

    return df_main.drop_duplicates()

## Cash & Investment

In [None]:
#############
#Extract cash items from 330000 - investment disclosure & 333000 Equity method investment
############
result_columns = ['CIK', 'member', 'reporting_period', 'this_period', 'last_period',
                  'GICS Sector', 'value_this_period', 'value_last_period', 'value_diff','us-gaap']

# 330000 table
# read_excel opens and closes the workbook itself (ExcelFile(...).parse left it open).
mapping_cash = pd.read_excel('Mapping.xls', sheet_name = '330000')
mapping_item = mapping_cash['us-gaap']

# Collect the per-item frames and concatenate once instead of growing `mapped`
# inside the loop. The empty template fixes the column order.
frames_330000 = [pd.DataFrame(columns = result_columns)]
for i in mapping_item:
    mapped_i = period_comparison(df,['330000'],i, member = True)
    mapped_i['us-gaap'] = i
    frames_330000.append(mapped_i)
mapped = pd.concat(frames_330000, ignore_index = True)

# 333000 table -- only for the companies already found in 330000
new_df = pd.DataFrame(columns = ['CIK'], data = list(set(mapped['CIK']))).merge(df, on = ['CIK'], how = 'inner')
mapping_cash = pd.read_excel('Mapping.xls', sheet_name = '333000')
mapping_item = mapping_cash['us-gaap']

frames_333000 = []
skipped_333000 = []   # items whose extraction failed; inspect instead of silently losing them
for i in mapping_item:
    try:
        mapped_i = period_comparison(new_df,['333000'],i, member = True)
    except Exception:
        # Best effort: an item that cannot be extracted is skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        skipped_333000.append(i)
        continue
    mapped_i['us-gaap'] = i
    frames_333000.append(mapped_i)
mapped = pd.concat([mapped] + frames_333000, ignore_index = True)

# Drop members that mention 'fair value' (case-insensitive); missing members are kept.
mapped = mapped.loc[~mapped['member'].str.contains('fair value', case = False, na = False),:]
# Values are left in raw units here; scaling to millions happens after the 815000 pass.

In [10]:
##################
# Find companies that are not yet present in `mapped`
##################
# Build the lookup set once; the membership test used to rebuild a list on every iteration.
covered_ciks = set(mapped['CIK'])
no_investment = [cik for cik in set(df['CIK']) if cik not in covered_ciks]
# All rows of df that belong to the uncovered companies.
c = pd.DataFrame(columns = ['CIK'], data = no_investment).merge(df, on = ['CIK'], how = 'inner')

In [None]:
##################
# Extract cash items from 815000 - Fair value disclosure for the remaining companies
##################
# NOTE: this cell rescales every row of `mapped`; running it twice divides the
# values by 1,000,000 twice. Re-run the preceding extraction cell first.
mapping_cash = pd.read_excel('Mapping.xls', sheet_name = '815000')

mapping_item = mapping_cash['us-gaap']
frames_815000 = []
skipped_815000 = []   # items whose extraction failed; inspect instead of silently losing them
for i in mapping_item:
    try:
        mapped_i = period_comparison(c,['815000'],i, member = True)
    except Exception:
        # Best effort: skip the item, but do not swallow KeyboardInterrupt / SystemExit.
        skipped_815000.append(i)
        continue
    mapped_i['us-gaap'] = i
    frames_815000.append(mapped_i)
# Single concatenation instead of growing `mapped` inside the loop.
mapped = pd.concat([mapped] + frames_815000, ignore_index = True)

# Drop members that mention 'fair value' (case-insensitive); missing members are kept.
mapped = mapped.loc[~mapped['member'].str.contains('fair value', case = False, na = False),:]

# Express all amounts in millions.
for value_col in ['value_this_period', 'value_last_period', 'value_diff']:
    mapped[value_col] = mapped[value_col].astype(float) / 1000000

In [12]:
##################
# Find companies left out of both 330000 & 815000
##################
# Build the lookup set once; the membership test used to rebuild a list on every iteration.
covered_ciks = set(mapped['CIK'])
no_investment = [cik for cik in set(df['CIK']) if cik not in covered_ciks]
# All rows of df that belong to the still-uncovered companies.
c = pd.DataFrame(columns = ['CIK'], data = no_investment).merge(df, on = ['CIK'], how = 'inner')

In [13]:
############
#Aggregate balance sheet cash items of the rest of the companies without investment or fair value disclosure
############

# read_excel opens and closes the workbook itself (ExcelFile(...).parse left it open).
cash_item = pd.read_excel('Mapping.xls', sheet_name = '104000')['us-gaap']

# Collect the per-item frames and concatenate once; the empty template fixes the column order.
cash_frames = [pd.DataFrame(columns = ['CIK', 'member', 'reporting_period', 'this_period', 'last_period',
                                       'GICS Sector', 'value_this_period', 'value_last_period', 'value_diff','us-gaap'])]
for i in cash_item:
    # member = False: only the 'Date_Only' figures are used for balance-sheet cash.
    cash = period_comparison(c,['104000'],i, member = False)
    cash['us-gaap'] = i
    cash_frames.append(cash)
total_cash = pd.concat(cash_frames, ignore_index = True)

# Express all amounts in millions.
for value_col in ['value_this_period', 'value_last_period', 'value_diff']:
    total_cash[value_col] = total_cash[value_col].astype(float) / 1000000

  stride //= shape[i]


In [14]:
# Append the balance-sheet cash rows to the investment rows.
# NOTE: not idempotent -- re-running this cell appends total_cash again.
mapped = pd.concat((mapped, total_cash), ignore_index = True)

In [None]:
###########
# Companies left out of cash & investment analysis due to failure to extract cash and investment items from xbrl
##########
# Build the lookup set once; the membership test used to rebuild a list on every iteration.
covered_ciks = set(mapped['CIK'])
no_investment = [cik for cik in set(df['CIK']) if cik not in covered_ciks]
c = pd.DataFrame(columns = ['CIK'], data = no_investment).merge(df, on = ['CIK'], how = 'inner')
# Display the CIKs that could not be covered.
set(c['CIK'])

### Total Cash

In [15]:
# us-gaap concepts treated as strategic investments, grouped by accounting method.
# Any concept not listed here is categorised as a plain 'investment' downstream.
_equity_method_concepts = [
    'us-gaap:equitymethodinvestments',
    'us-gaap:investmentsinaffiliatessubsidiariesassociatesandjointventures',
]
_cost_method_concepts = [
    'us-gaap:equitysecuritieswithoutreadilydeterminablefairvalueamount',
    'us-gaap:costmethodinvestmentsaggregatecarryingamountnotevaluatedforimpairment',
    'us-gaap:costmethodinvestments',
    'us-gaap:costmethodinvestmentsfairvaluedisclosure',
]
strategic_invest = {
    **dict.fromkeys(_equity_method_concepts, 'Equity Method'),
    **dict.fromkeys(_cost_method_concepts, 'Cost Method'),
}


In [16]:
#############
#Aggregate total investment
##320193/1018724/1652044/789019/50863/1326801/858877
############
#https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.GroupBy.apply.html
def _date_only_or_total(group):
    """Sum value_this_period over the group's 'Date_Only' rows; if it has none, sum every row.

    The rows are selected rather than multiplied by a 0/1 mask: with the mask a
    NaN value in an excluded member row became NaN * 0 = NaN and turned the
    whole group total into NaN.
    """
    is_date_only = group['member'].map(lambda name: 'Date_Only' in name)
    values = group['value_this_period']
    if is_date_only.any():
        values = values[is_date_only.values]
    # Built-in sum: a NaN among the rows that ARE included still propagates.
    return sum(values)


total_cash_agg = mapped.groupby(by = ['CIK','GICS Sector','us-gaap'])[['member','value_this_period']]\
    .apply(_date_only_or_total).reset_index()

# NOTE: from here on `total_cash` is the per-company inclusion table from
# Mapping.xls (rows flagged 'V'), no longer the 104000 balance-sheet cash frame.
total_cash = pd.read_excel('Mapping.xls', sheet_name = 'total_cash')
total_cash = total_cash[total_cash.inclusion == 'V']
# The aggregated amount arrives in the unnamed column 0 and is renamed to 'value'.
total_cash = total_cash.merge(total_cash_agg,on = ['CIK','us-gaap'], how = 'left').rename(columns = {0:'value'})
# Strategic (equity / cost method) concepts get their own category; everything else is 'investment'.
total_cash['Cat'] = total_cash['us-gaap'].map(lambda concept: strategic_invest.get(concept, 'investment'))

In [27]:
# Total value per investment category.
total_cash.groupby('Cat')['value'].sum().reset_index()

Unnamed: 0,Cat,value
0,Cost Method,12160.712
1,Equity Method,157542.893
2,investment,1482870.614


In [17]:
############
#Rank of total investment
###########
# Total value per company, largest first, with the security name attached from sp_500.
agg = (
    total_cash
    .groupby(by = ['CIK','GICS Sector'])['value']
    .sum()
    .reset_index()
    .sort_values(by = ['value'], ascending = False)
    .merge(sp_500[['CIK','Security']], on = ['CIK'], how = 'left')
)
agg.head(20)


Unnamed: 0,CIK,GICS Sector,value,Security
0,320193,Information Technology,243743.0,Apple Inc.
1,789019,Information Technology,135630.0,Microsoft Corp.
2,1652044,Information Technology,100611.0,Alphabet Inc Class A
3,1341439,Information Technology,45641.0,Oracle Corp.
4,882095,Health Care,45067.0,Gilead Sciences
5,64803,Consumer Staples,43911.0,CVS Health
6,1109357,Utilities,39990.0,Exelon Corp.
7,858877,Information Technology,38710.0,Cisco Systems
8,40545,Industrials,37205.0,General Electric
9,731766,Health Care,34216.0,United Health Group Inc.


In [None]:
##########
#Share of the top 20 companies in the total corporate investment of this study
##########
# agg is already sorted by value in descending order, so head(20) is the top 20.
agg['value'].head(20).sum() / agg['value'].sum()

### Investment Composition

In [18]:
#################
#Investment Compositions
#################

# Keep member-level rows (not 'Date_Only') that have two distinct cut dates.
# Both conditions are evaluated on the same frame and combined, instead of
# applying a mask built on the unfiltered frame to an already filtered one.
is_member_row = mapped['member'] != 'Date_Only'
has_two_periods = mapped['this_period'] != mapped['last_period']
member_only = mapped[is_member_row & has_two_periods]

# read_excel opens and closes the workbook itself (ExcelFile(...).parse left it open).
member_list = pd.read_excel('Mapping.xls', sheet_name = 'member')
cash_item = pd.read_excel('Mapping.xls', sheet_name = 'total_cash')
cash_item = cash_item[cash_item['inclusion'] == 'V']
##############
#import cash mapping table, merge with member only and total cash(to gauge significance)
##############
investment = (
    cash_item
    .merge(member_only, on = ['CIK','us-gaap'], how = 'left')
    .merge(member_list, on = ['member'], how = 'left')
)

# Keep only rows that received a category
# ('Cat' presumably comes from the 'member' sheet -- TODO confirm).
investment = investment[investment['Cat'].notnull()]

In [24]:
# This-period, last-period and change totals per category, largest increase first.
(
    investment
    .groupby(['Cat'])
    .agg({'value_this_period':'sum','value_last_period':'sum','value_diff':'sum'})
    .reset_index()
    .sort_values(by = ['value_diff'], ascending = False)
)

Unnamed: 0,Cat,value_this_period,value_last_period,value_diff
4,Cash,61140.532,53743.889,7396.643
3,CP,14381.61,8770.52,5611.09
14,MMF,42597.733,37591.735,5005.998
15,Other,58849.355,54729.85,4119.505
12,MBS,15269.454,14885.212,384.242
2,CMBS,1604.689,1610.57,-5.881
11,GOV & Corp,104.498,184.452,-79.954
1,Agency,418.155,524.698,-106.543
7,Foerign Securities,145.0,277.0,-132.0
8,Foreign Corp,3211.1,3541.9,-330.8


## Cash Flow Item

### Purchases and Sales of Securities

In [19]:
# Cash-flow concepts for purchases ("PaymentsToAcquire...") and sales / maturities
# ("ProceedsFrom...") of securities. Each concept is listed once.
_ps_concepts = [
    'PaymentsToAcquireAvailableForSaleSecuritiesDebt',
    'PaymentsToAcquireHeldToMaturitySecurities',
    'PaymentsToAcquireMarketableSecurities',
    'ProceedsFromSaleOfAvailableForSaleSecuritiesDebt',
    'ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailableForSaleSecurities',
    'ProceedsFromSaleAndMaturityOfAvailableForSaleSecurities',
    'ProceedsFromSaleOfHeldToMaturitySecurities',
    'ProceedsFromMaturitiesPrepaymentsAndCallsOfHeldToMaturitySecurities',
    'ProceedsFromSaleAndMaturityOfHeldToMaturitySecurities',
    'ProceedsFromSaleAndMaturityOfMarketableSecurities',
    'ProceedsFromSaleAndMaturityOfOtherInvestments',
    'ProceedsFromSaleMaturityAndCollectionsOfInvestments',
    'PaymentsToAcquireAvailableForSaleSecuritiesEquity',
    'PaymentsToAcquireAvailableForSaleSecurities',
    'ProceedsFromSaleOfAvailableForSaleSecuritiesEquity',
    'ProceedsFromSaleOfAvailableForSaleSecurities',
    'PaymentsToAcquireOtherInvestments',
    'PaymentsToAcquireInvestments',
    'PaymentsToAcquireTradingSecuritiesHeldforinvestment',
    'ProceedsFromSaleAndMaturityOfTradingSecuritiesHeldforinvestment',
]
# Lower-cased, 'us-gaap:'-prefixed names. dict.fromkeys removes any duplicate
# while keeping the order above; list(set(...)) produced an order that changed
# from one interpreter run to the next.
ps = list(dict.fromkeys('us-gaap:' + concept.lower() for concept in _ps_concepts))

In [20]:
# Purchases / sales of securities per company, taken from tables 152200 and 330000
# at the largest reporting period available for each company.
ps_frames = []
ps_skipped = []   # concepts whose extraction failed; inspect instead of silently losing them
for i in ps:
    try:
        df_c = period_comparison(df,['152200','330000'],i,period = 'max')
    except Exception:
        # Best effort: skip the concept, but do not swallow KeyboardInterrupt / SystemExit.
        ps_skipped.append(i)
        continue
    df_c['us-gaap'] = i
    # i starts with 'us-gaap:'; the first 20 characters are enough to recognise
    # the 'paymentsto...' concepts. Payments are flipped to negative amounts.
    if 'payment' in i[:20]:
        df_c['value_this_period'] = -df_c['value_this_period']
        df_c['value_last_period'] = -df_c['value_last_period']
        df_c['value_diff'] = df_c['value_this_period'] - df_c['value_last_period']
    ps_frames.append(df_c)

# Single concatenation instead of growing df_ps inside the loop. Each new frame
# used to be placed in front of the accumulated ones, hence the reversed order;
# the empty template guarantees the columns exist even if nothing was extracted.
df_ps = pd.concat(ps_frames[::-1] + [pd.DataFrame(columns = ['CIK', 'member', 'reporting_period', 'this_period', 'last_period',
       'GICS Sector', 'value_this_period', 'value_last_period', 'value_diff','us-gaap'])], ignore_index = True)

# Express all amounts in millions.
for value_col in ['value_this_period', 'value_last_period', 'value_diff']:
    df_ps[value_col] = df_ps[value_col].astype(float) / 1000000

# read_excel opens and closes the workbook itself (ExcelFile(...).parse left it open).
df_ps = df_ps.merge(pd.read_excel('Mapping.xls', sheet_name = 'cf'), on = ['us-gaap'], how = 'left')

In [None]:

# Investment balance per company (category 'investment' only).
only_investments = total_cash[total_cash.Cat == 'investment'].groupby(['CIK'])['value'].sum().reset_index()

# Net securities cash flow per company and period ...
df_ps_agg = df_ps.groupby(['CIK','reporting_period','this_period','GICS Sector'])['value_this_period'].sum().reset_index()
# ... keeping, for each company, the row that sorts first when ordered by
# CIK, reporting_period and this_period in descending order.
df_ps_agg = df_ps_agg.sort_values(by = ['CIK','reporting_period','this_period'], ascending = False)\
.drop_duplicates(subset = ['CIK'],keep = 'first')

df_ps_agg = df_ps_agg.merge(only_investments,on = ['CIK'], how = 'left').rename(columns = {'value_this_period':'cash_flow','value':'investment_bal'})

# Balance before the period's cash flow, and the cash flow as a percentage of it.
# NOTE(review): a zero starting balance yields an infinite / NaN 'change %'.
balance_last_period = df_ps_agg['cash_flow']+df_ps_agg['investment_bal']
df_ps_agg['change %'] = -df_ps_agg['cash_flow']*100/balance_last_period
df_ps_agg['investment_bal_last_period'] = balance_last_period

# Sort first, then filter with a mask computed on the sorted frame; the mask
# used to be computed before sorting and had to be re-aligned by pandas.
# Companies without an investment balance (NaN) fail the comparison and are dropped.
df_ps_agg = df_ps_agg.sort_values(by = ['change %'])
df_ps_agg = df_ps_agg[df_ps_agg['investment_bal_last_period'] >= 0]
df_ps_agg['valid_cash_flow'] = 'v'
df_ps_agg.groupby(by = ['reporting_period'])['change %'].describe()
############
#The corporate investments exhibit net outflows of 6~14% of original investment balance regardless of reporting periods of the entities.
############
#df_ps_agg.groupby(by = ['reporting_period']).apply(lambda x: np.average(x['change %'], weights = x['investment_bal'])).reset_index().rename(columns = {0:'weighted_change %'})

In [111]:
###################
#Prominent Consumer Staples companies - Pepsico & ESTEE LAUDER, all reported net investment decrease
#Material: DowDuPont
#Healthcare: Amgen, Anthem, GILEAD
###################
# 'change %' averaged per sector, weighted by each company's investment balance.
(
    df_ps_agg
    .groupby(by = ['GICS Sector'])
    .apply(lambda sector: np.average(sector['change %'], weights = sector['investment_bal']))
    .reset_index()
    .rename(columns = {0:'weighted_change %'})
)

Unnamed: 0,GICS Sector,weighted_change %
0,Consumer Discretionary,-7.62835
1,Consumer Staples,-35.268713
2,Energy,4.111967
3,Health Care,-8.482201
4,Industrials,-5.717812
5,Information Technology,-7.363309
6,Materials,-16.657373
7,Utilities,-2.558277


In [35]:
#####################
#Data for Visualization
#NOTE(review): the original header says Main is sorted by last period investment
#balance and reporting period, but no sort is applied in this cell -- confirm.
#NOTE(review): writing '.xls' needs an Excel writer engine that recent pandas
#releases may no longer provide -- confirm against the installed version.
#####################
df_main = (
    only_investments
    .rename(columns = {'value':'investment_bal'})
    .merge(agg, on = ['CIK'], how = 'outer')
    .merge(df_ps_agg, on = ['CIK','investment_bal','GICS Sector'], how = 'left')
    .rename(columns = {'value':'total_cash'})
)
# Companies without a valid cash-flow row fall back to their current investment balance.
missing_last_bal = df_main['investment_bal_last_period'].isnull()
df_main.loc[missing_last_bal,'investment_bal_last_period'] = df_main.loc[missing_last_bal,'investment_bal']
df_main.to_excel('Main.xls', index = False)

# Detailed cash-flow rows for the company / period combinations kept in df_ps_agg.
cash_flow_keys = ['CIK', 'reporting_period', 'this_period', 'GICS Sector']
(
    df_ps_agg[cash_flow_keys]
    .merge(df_ps, on = cash_flow_keys, how = 'left')
    .to_excel('Cash Flow.xls',index = False)
)

investment.to_excel('investment_composition.xls', index = False)