In [1]:
from bs4 import BeautifulSoup
import requests
import sys
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import datetime as datetime
import json
from dateutil import relativedelta
import webbrowser
import glob
import os
import time
#FASB US GAAP Taxonomy:https://www.fasb.org/cs/ContentServer?c=Page&cid=1176169699514&d=&pagename=FASB%2FPage%2FSectionPage

In [2]:
def get_xml_link(CIK, accession_number):
    """Return the URL of the XBRL instance document of one EDGAR filing.

    The filing's ``index.json`` directory listing is scanned in order and the
    first entry that qualifies is used. An entry qualifies when its name has
    the form ``<prefix>-<YYYYMMDD>.xml`` (the part between the first '-' and
    '.xml' parses as a date) or, failing that, when it contains ``htm.xml``.

    Args:
        CIK: company Central Index Key used in the EDGAR archive path.
        accession_number: accession number of the filing as it appears in the
            archive path.

    Returns:
        URL of the selected file. If no entry qualifies the file-name part is
        empty, so the URL ends with '/' (unchanged legacy behaviour).
    """
    base_url = 'https://www.sec.gov/Archives/edgar/data/{}/{}'.format(CIK, accession_number)
    # A timeout keeps one stalled connection from hanging a long scraping run.
    r = requests.get(base_url + '/index.json', timeout=60).text
    j = json.loads(r)

    file_name = ''
    for i in j['directory']['item']:
        name = i['name']
        try:
            date = name.split('-')[1].split('.xml')[0]
            datetime.datetime.strptime(date, '%Y%m%d')
        except (IndexError, ValueError):
            # No '-' in the name (IndexError) or the token is not a date
            # (ValueError): only accept the entry if it is an '...htm.xml' file.
            if 'htm.xml' not in name:
                continue
        file_name = name
        break

    return base_url + '/' + file_name

In [3]:
def get_company_document_list(company_CIK):
    """List a company's 10-Q and 10-K filings from EDGAR, newest first.

    The last HTML table of each EDGAR "browse" result page (up to 40 rows per
    form type) is read, the two tables are stacked (10-Q rows first) and the
    repeated header rows are dropped.

    Args:
        company_CIK: company Central Index Key passed to the EDGAR query.

    Returns:
        DataFrame sorted by ``release_date`` descending with a fresh 0..n-1
        index and the columns ``index`` (row label before sorting),
        ``document_type``, ``Acc_no`` (accession number with dashes removed)
        and ``release_date`` (datetime).
    """
    url_template = ('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'
                    '&CIK={}&type={}&dateb=&owner=include&count=40')

    listings = []
    for form_type in ('10-Q', '10-K'):
        response = requests.get(url_template.format(company_CIK, form_type), timeout=60)
        listings.append(pd.read_html(response.text)[-1])

    #Concat 10-K and 10-Q document list
    df_filing = pd.concat(listings, ignore_index = True)
    # .copy() so the column assignments below act on an independent frame and
    # not on a slice of the concatenated one (avoids SettingWithCopyWarning).
    df_filing = df_filing[df_filing[0] != 'Filings'].copy()

    #Extract acct_no: the digits-and-dashes token after 'Acc-no:', dashes removed.
    # A description without such a token yields NaN instead of raising.
    df_filing[2] = (df_filing[2]
                    .str.extract(r'Acc-no:\s*([\d-]+)', expand = False)
                    .str.replace('-', '', regex = False))

    df_filing[3] = pd.to_datetime(df_filing[3], format = '%Y-%m-%d')

    df_filing = df_filing.loc[:,[0,2,3]].rename(
        columns = {0:'document_type',2:'Acc_no',3:'release_date'})

    #order the list to get the latest filings; reset_index() keeps the old row
    #labels in an 'index' column, as before
    return df_filing.sort_values(by = 'release_date',ascending = False).reset_index()

In [4]:
def reporting_period(s, df_date):
    """Look up the number of months covered by a fact's context.

    Args:
        s: context reference of the fact.
        df_date: table with at least the columns ``context`` and ``months``.

    Returns:
        * ``'<months>Date_Only'`` (a string) when exactly one row of
          ``df_date`` has ``context == s``;
        * otherwise the ``months`` value of the last row whose context, split
          on '_', consists only of pieces that occur in ``s``;
        * ``''`` when no row matches, and ``'na'`` when the fallback scan
          itself fails.
    """
    try:
        # .item() raises unless exactly one row matches, which sends us to the
        # fuzzy scan below.
        return str(df_date[df_date.context == s]['months'].item()) + 'Date_Only'
    except Exception:
        # Best-effort lookup: any ordinary error means "no exact match".
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        pass

    months = ''
    try:
        for i in df_date.index:
            element = df_date.loc[i,'context'].split('_')
            if all(x in s for x in element):
                # No break: a later matching row overrides an earlier one.
                months = df_date.loc[i,'months']
    except Exception:
        months = 'na'
    return months

In [5]:
def dateto(s, df_date):
    """Look up the end date of a fact's context.

    Args:
        s: context reference of the fact.
        df_date: table with at least the columns ``context`` and ``to``.

    Returns:
        * the ``to`` value of the single row with ``context == s``;
        * otherwise the ``to`` value of the last row whose context, split on
          '_', consists only of pieces that occur in ``s``;
        * ``''`` when no row matches, and ``'na'`` when the fallback scan
          itself fails.
    """
    try:
        # .item() raises unless exactly one row matches, which sends us to the
        # fuzzy scan below.
        return df_date[df_date.context == s]['to'].item()
    except Exception:
        # Best-effort lookup: any ordinary error means "no exact match".
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        pass

    end_date = ''
    try:
        for i in df_date.index:
            element = df_date.loc[i,'context'].split('_')
            if all(x in s for x in element):
                # No break: a later matching row overrides an earlier one.
                end_date = df_date.loc[i,'to']
    except Exception:
        end_date = 'na'
    return end_date

In [6]:
def parse_xbrl(xbrl_link):
    """Download an XBRL instance document and flatten it into two DataFrames.

    Args:
        xbrl_link: URL of the XBRL instance (.xml) file of a filing.

    Returns:
        A tuple ``(df, df_explicitmember)``:

        * ``df`` has one row per tag found in the document, with the columns
          ``us-gaap`` (lower-cased tag name), ``value`` (tag text),
          ``contextref``, ``decimals``, ``unitref`` (attribute values, '' when
          the attribute is absent), ``CIK`` (left as ''), ``Year`` (taxonomy
          version), ``reporting_period`` and ``CutDate``.
        * ``df_explicitmember`` has one row per explicit-member tag whose text
          does not mention 'us-gaap', with the columns ``tag``, ``name``,
          ``us-gaap`` (the tag's ``dimension`` attribute, lower-cased),
          ``label`` (same as ``name``) and ``Year``.

    Raises:
        ValueError: if no tag carries an ``xmlns:us-gaap`` attribute, so the
            taxonomy version cannot be determined.
    """
    # A timeout keeps one stalled download from hanging a long scraping run.
    xbrl_resp = requests.get(xbrl_link, timeout=60)
    xbrl_str = xbrl_resp.text

    #soup = BeautifulSoup(xbrl_str, 'lxml') --> this will truncate tag.name
    #https://stackoverflow.com/questions/28616558/python-64-bit-not-storing-as-long-of-string-as-32-bit-python
    soup = BeautifulSoup(xbrl_str, 'html.parser')

    facts = {'us-gaap':[],'value':[],'contextref':[],'decimals':[], 'unitref':[]}
    periods = {'tag':[],'value':[],'context':[]}
    members = {'tag':[],'name':[],'us-gaap':[]}
    xbrl_version = None

    for tag in soup.find_all():
        attrs = tag.attrs
        facts['us-gaap'].append(tag.name)
        facts['value'].append(tag.text)
        # Every list must grow by one per tag, so absent attributes become ''.
        for key in ('contextref', 'decimals', 'unitref'):
            facts[key].append(attrs.get(key, ''))

        if 'xmlns:us-gaap' in attrs:
            # Text after the last '/' of the namespace URI, up to the first '-'
            # (e.g. '.../us-gaap/2018-01-31' -> '2018').
            xbrl_version = attrs['xmlns:us-gaap'].rsplit('/',1)[1].split('-')[0]

        ########################
        #Get explicit member
        ########################
        if 'explicitmember' in tag.name:
            members['tag'].append(tag.name)
            members['name'].append(tag.text)
            members['us-gaap'].append(attrs['dimension'])

        ########################
        #Get period
        ########################
        if 'id' in attrs:
            for subtag in tag.find_all():
                if 'period' in subtag.name:
                    periods['tag'].append(subtag.name)
                    periods['value'].append(subtag.text)
                    periods['context'].append(attrs['id'])

    if xbrl_version is None:
        # Previously this surfaced later as an unrelated NameError.
        raise ValueError('no xmlns:us-gaap namespace declaration found in {}'.format(xbrl_link))

    #########################
    #Wrangle df_explicitmember
    #########################
    df_explicitmember = pd.DataFrame(members)
    mentions_us_gaap = df_explicitmember['name'].str.contains('us-gaap',na = False, case = False)
    # .copy(): the assignments below must not target a slice of the raw frame.
    df_explicitmember = df_explicitmember.loc[mentions_us_gaap == False,:].copy()
    df_explicitmember['us-gaap'] = df_explicitmember['us-gaap'].apply(lambda x: x.lower())
    # Keep only the part after the 'prefix:' of the member name.
    df_explicitmember['name'] = df_explicitmember['name'].apply(lambda x: x.split(':')[1].lower())
    df_explicitmember['label'] = df_explicitmember['name']
    df_explicitmember['Year'] = xbrl_version

    #########################
    #Wrangle df_date
    #########################
    df_date = pd.DataFrame(periods)
    has_dimension = df_date.context.str.contains(
        '|'.join(['us-gaap','srt','dei','axis','member']), na = False, case = False)
    df_date = df_date.loc[has_dimension == False,:].copy()
    df_date['context'] = df_date['context'].apply(lambda x: x.lower())

    # One column per date found in the period text: 0 = first date, 1 = second.
    dates = df_date['value'].str.extractall(r'[\n+]?(\d+\-\d+\-\d+)[\n+]?').unstack()[0]
    df_date['from'] = dates[0]
    # When no context has a second date, column 1 does not exist at all;
    # use NaN so the fallback below copies 'from' into 'to'.
    df_date['to'] = dates[1] if 1 in dates.columns else np.nan

    ########################
    #if there is only one date, then to and from should be the same
    ########################
    to = []
    for i in df_date.index:
        try:
            datetime.datetime.strptime(df_date.loc[i,'to'],'%Y-%m-%d')
            to.append(df_date.loc[i,'to'])
        except (TypeError, ValueError):
            # TypeError: 'to' is NaN (single-date period); ValueError: not a
            # valid calendar date.
            to.append(df_date.loc[i,'from'])
    df_date['to'] = to

    ########################
    #calculate months between from and to date
    ########################
    months = []
    for i in df_date.index:
        f = datetime.datetime.strptime(df_date.loc[i,'from'],'%Y-%m-%d')
        t = datetime.datetime.strptime(df_date.loc[i,'to'],'%Y-%m-%d')

        r = relativedelta.relativedelta(t, f)
        if df_date.loc[i,'from'] == df_date.loc[i,'to']:
            months.append(r.months)
        elif r.years >= 1:
            # NOTE(review): unlike the branch below, no +1 is added here, so a
            # 2017-01-01..2018-03-31 period counts 14 months - confirm intended.
            months.append(r.years*12+r.months)
        else:
            months.append(r.months+1)
    df_date['months'] = months

    # Longest periods first; reset_index() keeps the old labels in 'index'.
    df_date = df_date.sort_values(by = ['months'], ascending = False).reset_index()

    #########################
    #Wrangle df
    #########################
    df = pd.DataFrame(facts)
    #CIK = df.loc[df['us-gaap'].str.contains('dei:entitycentralindexkey', case = False, na = False),'value'].item()
    df['CIK'] = ''
    df['Year'] = xbrl_version
    df['us-gaap'] = df['us-gaap'].apply(lambda x: x.lower())
    df['contextref'] = df['contextref'].apply(lambda x: x.lower())
    df['reporting_period'] = df['contextref'].apply(lambda x: reporting_period(x, df_date))
    df['CutDate'] = df['contextref'].apply(lambda x: dateto(x, df_date))

    return df, df_explicitmember

## us-gaap Taxonomy

In [7]:
# Taxonomy workbooks to load, newest first (one workbook per taxonomy year).
taxonomy_f = ['US_GAAP_Taxonomy_{}.xlsx'.format(taxonomy_year) for taxonomy_year in (2018, 2017, 2016)]
#taxonomy_f = ['US_GAAP_Taxonomy_2018.xlsx']

# Empty accumulator with the expected columns; the yearly taxonomies are
# concatenated onto it.
fasb = pd.DataFrame(
    columns = [
        'table_name', 'prefix', 'name', 'label', 'depth', 'order', 'priority',
        'parent', 'arcrole', 'preferredLabel', 'systemid', 'us-gaap', 'Year',
    ]
)

In [8]:
# Presentation tables to keep. The names are joined into one case-insensitive
# regular expression, so a table is kept when its name contains any of them.
table_of_interest = ['104000 - Statement - Statement of Financial Position, Classified',
                     '124100 - Statement - Statement of Income',
                     '152200 - Statement - Statement of Cash Flows','460000 - Disclosure - Debt',
                     '770000 - Disclosure - Income Taxes','300000 - Disclosure - Cash and Cash Equivalents',
                     '330000 - Disclosure - Investments, Debt and Equity Securities',
                     '336000 - Disclosure - Investments, All Other Investments',
                     '333000 - Disclosure - Investments, Equity Method and Joint Ventures',
                     '320000 - Disclosure - Receivables, Loans, Notes Receivable, and Others',
                     '790000 - Disclosure - Segment Reporting',
                     '800000 - Disclosure - Business Combinations',
                     '815000 - Disclosure - Fair Value Measures and Disclosures',
                     '993500 - Disclosure - Investment Holdings',
                     '993510 - Disclosure - Other than Securities Investment Holdings',
                     '993520 - Disclosure - Summary of Investment Holdings',
                     '148400 - Statement - Statement of Comprehensive Income',
                     '148410 - Statement - Statement of Other Comprehensive Income',
                     '500000 - Disclosure - Equity']
table_pattern = '|'.join(table_of_interest)

# Collect the yearly frames and concatenate once, instead of growing `fasb`
# inside the loop.
yearly_frames = [fasb]
for i in taxonomy_f:
    # 'US_GAAP_Taxonomy_2018.xlsx' -> '2018'
    year = i.rsplit('_',1)[1].split('.xlsx')[0]

    # read_excel closes the workbook itself (pd.ExcelFile(i) was never closed).
    us_gaap = pd.read_excel(i, sheet_name = 'Presentation')
    us_gaap = us_gaap.reset_index()

    # Rows whose prefix mentions 'definition' start a new table; their 'name'
    # is the table title.
    table_head = us_gaap.loc[us_gaap['prefix'].str.contains('definition', case = False, na = False),:].index
    table_name = list(us_gaap.loc[table_head,'name'])
    df_tables = pd.DataFrame(data = table_name, index = table_head, columns = ['table_name'])
    df_tables = df_tables.reset_index()

    us_gaap_with_table = df_tables.merge(us_gaap, how = 'outer', on = ['index']).sort_values(by = ['index']).set_index('index')
    # Carry each table title down to the rows that follow it. Assigning the
    # result back replaces fillna(method='ffill', inplace=True) on a column
    # selection, which is deprecated and does not update the frame under
    # pandas copy-on-write.
    us_gaap_with_table['table_name'] = us_gaap_with_table['table_name'].ffill()

    us_gaap_with_table = us_gaap_with_table.loc[us_gaap_with_table['table_name'].str.contains(table_pattern, na = False, case = False),:]
    # .copy(): the column assignments below must not target a slice.
    us_gaap_with_table = us_gaap_with_table[us_gaap_with_table['prefix'] == 'us-gaap'].copy()
    us_gaap_with_table['us-gaap'] = (us_gaap_with_table['prefix']+':'+us_gaap_with_table['name']).str.lower()
    us_gaap_with_table['name'] = us_gaap_with_table['name'].str.lower()
    us_gaap_with_table['Year'] = year

    yearly_frames.append(us_gaap_with_table)

fasb = pd.concat(yearly_frames, ignore_index = True)
fasb = fasb[['table_name','label','depth','order','us-gaap','Year','name']]

In [9]:
#############
# This is to calculate the distance of every item to its respective member item, later for explicitmember
#############
def _distance_to_member_level(depths, names):
    """For each row, depth gap to the deepest 'member' item nested below it.

    Rows are given in presentation order. The rows nested under row ``pos`` are
    those that follow it until the first row whose depth is <= its own depth.

    Args:
        depths: depth of every row, in order.
        names: name of every row, in the same order.

    Returns:
        List with one number per row: the largest depth among the nested rows
        whose name contains 'member' (case-insensitive) minus the row's own
        depth, or 0 when no nested member is deeper.
    """
    distances = []
    total = len(depths)
    for pos in range(total):
        own_depth = depths[pos]
        deepest_member = own_depth
        for nxt in range(pos + 1, total):
            if depths[nxt] <= own_depth:
                break
            if 'MEMBER' in names[nxt].upper() and depths[nxt] > deepest_member:
                deepest_member = depths[nxt]
        # Appending after the scan, however it ended, gives exactly one value
        # per row. The old loop appended nothing when the scan ran off the end
        # of the table on a member row, so the column assignment then failed
        # with a length mismatch.
        distances.append(deepest_member - own_depth)
    return distances


result = []  # not used in this cell; kept in case another cell refers to it
distance_to_member_level = _distance_to_member_level(fasb['depth'].tolist(), fasb['name'].tolist())

fasb['distance_to_member_level'] = distance_to_member_level

## Parsing

In [31]:
# Connect to the SQLite database file that holds the parsed statements.
engine = create_engine('sqlite:///Corp_Financials_Cash.db')

# Pull every row of the sp_500_tables table into memory.
df = pd.read_sql(sql = 'SELECT * FROM sp_500_tables', con = engine)

In [14]:
#############
# S&P 500 company list --> source: Wikipedia (first table on the page),
# without the rows whose GICS Sector mentions Financials or Real Estate
############
#sp500 = pd.read_excel('S&P500CIK.xls')
sp_500 = (
    pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', header = 0)[0]
    .loc[lambda table: table['GICS Sector'].str.contains('|'.join(['Financials','Real Estate'])).eq(False), :]
)

In [10]:
# Empty accumulators with identical columns: one for regular rows, one for
# text-block rows.
df_all = pd.DataFrame(
    columns = [
        'CIK', 'Year', 'CutDate', 'reporting_period', 'table_name', 'decimals',
        'unitref', 'value', 'label', 'member', 'us-gaap', 'name', 'depth', 'contextref',
    ]
)
df_all_text = pd.DataFrame(columns = list(df_all.columns))

#CIKs = ['1513761']
# Distinct CIKs come from the sp_500 table of the '(2)' database file ...
CIKs = list(set(pd.read_sql('SELECT * FROM sp_500', create_engine('sqlite:///Corp_Financials_Cash(2).db'))['CIK']))
# ... while `engine` ends up pointing at the main database file.
engine = create_engine('sqlite:///Corp_Financials_Cash.db')

In [22]:
#######################
# loop for sp_500 XBRL
#
# For every CIK: take the newest 10-Q/10-K (only if released after 2018-04-01),
# parse its XBRL instance, keep the facts that belong to the selected taxonomy
# tables, attach a "member" label derived from the context reference, and
# append the result to the SQLite tables in batches of two companies.
# Companies that raise any ordinary error are recorded in `failed` together
# with the error, and the loop continues.
#######################
output_columns = ['CIK', 'Year', 'CutDate', 'reporting_period', 'table_name', 'decimals',
                  'unitref', 'value', 'label', 'member', 'us-gaap', 'name', 'depth','contextref']
member_columns = ['name', 'us-gaap', 'Year', 'table_name','depth','label']

count = 0
failed = {'CIK':[], 'error':[]}
for cik in CIKs:
    try:
        CIK = cik
        filings = get_company_document_list(CIK)
        # Row 0 is the newest filing; .loc[0] raises (-> `failed`) when that
        # filing is not newer than the cut-off date.
        Acc_no = filings[filings.release_date > datetime.datetime(2018,4,1)].loc[0,'Acc_no']

        link = get_xml_link(CIK,Acc_no)
        df_comp, df_member = parse_xbrl(link)
        df_comp['CIK'] = CIK
        df_comp = df_comp.merge(fasb, how = 'inner', on = ['Year','us-gaap'])

#######################
# get member item from contextref
#######################
        df_member = df_member.merge(fasb[['table_name','us-gaap','Year','distance_to_member_level','depth']],how = 'inner',on = ['Year','us-gaap'])
        df_member['depth'] = df_member['depth'] + df_member['distance_to_member_level']

        fasb_by_comp = fasb[fasb['Year'] == df_comp['Year'][0]]
        fasb_with_explicitmember = pd.concat([df_member[member_columns], fasb_by_comp[member_columns]],
                                             ignore_index = True).drop_duplicates()
        fasb_with_explicitmember['depth'] = fasb_with_explicitmember['depth'].astype('int')

        # Loop-invariant preparation, done once per company instead of once per
        # fact: drop axis items and pre-compute the name length used as a
        # tie-breaker.
        is_axis = fasb_with_explicitmember['name'].str.contains('axis', case = False, na = False)
        member_candidates = fasb_with_explicitmember.loc[~is_axis,:].copy()
        member_candidates['len_name'] = member_candidates['name'].str.len()

        member = []
        for period, s, table in zip(df_comp['reporting_period'], df_comp['contextref'], df_comp['table_name']):
            # Exact context matches are tagged '<months>Date_Only' by
            # reporting_period(): those facts carry no member.
            if 'Date_Only' in str(period):
                member.append('Date_Only')
                continue

            # Candidates of the same table whose name occurs in the contextref.
            candidates = member_candidates[member_candidates['table_name'] == table]
            candidates = candidates.loc[[x in s for x in candidates['name']],:]
            if candidates.empty:
                member.append('')
            else:
                # Prefer the deepest item, then the longest name.
                candidates = candidates.sort_values(by = ['depth','len_name'], ascending = [False, False])
                member.append(candidates['label'].iloc[0])

        df_comp['member'] = member
        df_comp['reporting_period'] = df_comp['reporting_period'].apply(lambda x: int(str(x).split('Date_Only')[0]) if 'Date_Only' in str(x) else x)
        df_comp = df_comp[df_comp['member'] != '']
        df_comp = df_comp[output_columns].drop_duplicates()
#######################
# Wrangle Text Block
#######################
        is_text_block = df_comp['label'].str.contains('Text Block', na = False, case = False)
        if is_text_block.any():
            # Text-block values hold markup: keep only the text nodes.
            text = []
            for i in df_comp.loc[is_text_block,'value']:
                soup = BeautifulSoup(i,"lxml")
                pageText = soup.findAll(text=True)
                text.append(' '.join(pageText))
            df_comp.loc[is_text_block,'value'] = text

        df_all = pd.concat([df_all,df_comp.loc[~is_text_block,:]], ignore_index = True)
        df_all_text = pd.concat([df_all_text,df_comp.loc[is_text_block,:]], ignore_index = True)
        count += 1
        print(count)
        #time.sleep(20)
        if count%2 == 0:
            # Flush every second company and start over with empty accumulators.
            df_all.to_sql('sp_500_tables',engine, if_exists = 'append')
            df_all_text.to_sql('sp_500_tables_text',engine, if_exists = 'append')
            df_all = pd.DataFrame(columns = output_columns)
            df_all_text = pd.DataFrame(columns = output_columns)

    except Exception as exc:
        # Best effort: remember which company failed and why, then carry on.
        # KeyboardInterrupt is not caught, so the run can be stopped manually.
        failed['CIK'].append(cik)
        failed['error'].append(repr(exc))
        #time.sleep(20)
#############
# To prevent leaving out the last one: flush BOTH accumulators (the text
# blocks of the last company used to be dropped when the count was odd)
#############
df_all.to_sql('sp_500_tables',engine, if_exists = 'append')
df_all_text.to_sql('sp_500_tables_text',engine, if_exists = 'append')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
