# Explore the SEC API & download Income Statements and Balance Sheets

In [66]:
import requests
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # not to get SettingWithCopyWarning

## Set variables - Ticker, CIK etc.

In [67]:
# SEC Central Index Key of the company; used in the data.sec.gov URLs, as the key of the
# US-GAAP dictionaries below, and in the CSV file name
cik = '0001144519'
# ticker symbol; written to the `ticker` column and used in the CSV file name
ticker = 'BG'
# contact address sent as the User-Agent header of every SEC request
e_mail = 'zs.nagy.1989@gmail.com'

## Helper functions

In [68]:
def update_us_gaap_jsons(path, cik, dictionary_new_key):
    """Register a company's US-GAAP tag mapping in a JSON file on disk.

    The JSON object stored at ``path`` is loaded, ``dictionary_new_key[cik]``
    is stored under ``cik`` unless that key already exists, a status message
    is printed, and the whole object is written back to ``path``.

    Parameters
    ----------
    path : path of an existing JSON file holding a ``{cik: mapping}`` object.
    cik : key to look up / add.
    dictionary_new_key : dict that contains the new mapping under ``cik``.
    """
    import json

    # load what is stored so far
    with open(path) as source:
        stored = json.load(source)

    already_known = cik in stored.keys()
    if not already_known:
        stored[cik] = dictionary_new_key[cik]
    print('CIK is already in JSON file.' if already_known else 'New CIK has been added to JSON file.')

    # the file is rewritten in both cases
    with open(path, 'w') as target:
        json.dump(stored, target, sort_keys=False, indent=2)

In [69]:
def get_company_reports(cik=str, report_list=['10-K', '10-Q'], e_mail=str):
    """Download a company's filing list from the SEC submissions API.

    Requests ``https://data.sec.gov/submissions/CIK<cik>.json``, keeps the
    rows of ``filings.recent`` whose ``form`` is in ``report_list``, reverses
    their order and returns the report date, the form and the CIK of each.

    Parameters
    ----------
    cik : str
        Company CIK. It is left-padded with zeros to 10 characters for the
        URL only; the ``cik`` column of the result holds the value as passed.
    report_list : list of str
        Form types to keep. The list is only read, never modified.
    e_mail : str
        Sent as the ``User-Agent`` request header.

    Returns
    -------
    pandas.DataFrame
        Columns ``end`` (the API's ``reportDate``), ``form`` and ``cik`` with
        a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the SEC answers with an error status.
    requests.Timeout
        If no answer arrives within 30 seconds.

    Notes
    -----
    The ``=str`` defaults in the signature are type placeholders kept for
    backward compatibility; always pass real values.
    """
    # the endpoint expects the 10-digit, zero-padded form of the CIK
    url = f"https://data.sec.gov/submissions/CIK{str(cik).zfill(10)}.json"
    header = {"User-Agent": e_mail}

    # fail on an error status here instead of on a confusing JSON/KeyError below
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()
    company_filings = response.json()

    # keep only the requested document types and the two columns that are used
    filings = pd.DataFrame(company_filings["filings"]["recent"])
    selected = filings.loc[filings['form'].isin(report_list), ['reportDate', 'form']]
    # turn the API's row order around and renumber the rows
    selected = selected.iloc[::-1].reset_index(drop=True).rename(columns={'reportDate': 'end'})
    # same CIK on every row
    selected['cik'] = cik

    return selected

In [70]:
def insert_fourth_quaterly_report(input_df=pd.DataFrame):
    """Insert a synthetic '10-Q' row in front of qualifying '10-K' rows.

    Walks ``input_df`` row by row. A '10-K' row gets an extra '10-Q' row with
    the same ``end`` and ``cik`` placed directly before it when it is either
    among the first three rows or directly preceded by exactly three
    consecutive '10-Q' rows. All other rows are copied unchanged.

    Parameters
    ----------
    input_df : pandas.DataFrame with the columns ``end``, ``form`` and ``cik``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``end``, ``form`` and ``cik``.
    """
    ends, forms, ciks = [], [], []
    # number of '10-Q' rows seen since the last non-'10-Q' row
    quarters_in_a_row = 0

    for position, form in enumerate(input_df.form):
        end = input_df.end.iloc[position]
        company = input_df.cik.iloc[position]

        if form == '10-K' and (position < 3 or quarters_in_a_row == 3):
            # the synthetic fourth quarter goes in front of the annual report
            ends.append(end)
            forms.append('10-Q')
            ciks.append(company)

        # the original row is always kept
        ends.append(end)
        forms.append(form)
        ciks.append(company)

        quarters_in_a_row = quarters_in_a_row + 1 if form == '10-Q' else 0

    return pd.DataFrame({'end': ends, 'form': forms, 'cik': ciks})

In [71]:
def get_income_statements(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    """Attach period-based (income-statement style) facts to a filing template.

    Downloads ``https://data.sec.gov/api/xbrl/companyfacts/CIK<cik>.json`` and,
    for every key of ``us_gaap_dict``, adds one column named after the key to
    a copy of ``input_df``.

    For each us-gaap tag listed under a key the facts are read from the unit
    ``'shares'`` (key ``'shares'``), ``'USD/shares'`` (key ``'eps_dil'``) or
    ``'USD'`` (every other key). Only facts whose ``start``..``end`` span is
    81-99 days (labelled '10-Q') or 346-389 days (labelled '10-K') are kept;
    the label replaces the ``form`` delivered by the API. The facts of all
    tags of a key are combined, reduced to one record per (``end``, ``form``)
    and joined to the template with a backward ``merge_asof`` on ``end``
    within the same ``form``: every template row receives the value of the
    latest fact whose ``end`` is not after the row's ``end``.

    Parameters
    ----------
    cik : str
        Company CIK, left-padded with zeros to 10 characters for the URL.
    e_mail : str
        Sent as the ``User-Agent`` request header.
    input_df : pandas.DataFrame
        Template with at least the columns ``end`` and ``form``. Not modified.
    us_gaap_dict : dict
        ``{column name: [us-gaap tag, ...]}``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` sorted by ``end`` (as datetime) with one extra
        column per key. Missing values of a column called ``'inpairm'`` are
        replaced by 0.0; other columns keep NaN where no fact matched.

    Raises
    ------
    requests.HTTPError, requests.Timeout
        On an SEC error status or no answer within 30 seconds.
    KeyError
        If a tag or unit is not present in the downloaded facts.

    Notes
    -----
    The ``=str`` / ``=pd.DataFrame`` / ``=dict`` defaults in the signature are
    type placeholders kept for backward compatibility; always pass real values.
    """
    # the endpoint expects the 10-digit, zero-padded form of the CIK
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json"
    header = {"User-Agent": e_mail}

    # fail on an error status here instead of on a confusing JSON/KeyError below
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()
    company_facts = response.json()

    # output template built from the filing table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])

    for key, tags in us_gaap_dict.items():
        # unit under which the facts of this key are stored
        if key == 'shares':
            unit = 'shares'
        elif key == 'eps_dil':
            unit = 'USD/shares'
        else:
            unit = 'USD'

        frames = []
        for tag in tags:
            facts = pd.DataFrame(company_facts["facts"]["us-gaap"][tag]["units"][unit])
            facts['end'] = pd.to_datetime(facts['end'])
            facts['start'] = pd.to_datetime(facts['start'])
            # length of the covered period in days
            facts['time_diff'] = (facts['end'] - facts['start']).dt.days
            annual = (facts['time_diff'] > 345) & (facts['time_diff'] < 390)
            quarterly = (facts['time_diff'] > 80) & (facts['time_diff'] < 100)
            # .copy() so the new column below is written to an independent frame
            facts = facts.loc[annual | quarterly].copy()
            # the period length, not the filing the fact came from, decides the label
            facts['form'] = ['10-K' if days > 120 else '10-Q' for days in facts['time_diff']]
            frames.append(facts)

        data = pd.concat(frames, ignore_index=True)
        # Stable sort: records with the same date keep their source order, so
        # keep='last' selects the last one delivered (and the last tag listed).
        data = data.sort_values('end', kind='mergesort')
        # De-duplicate the COMBINED facts, one record per period and label.
        # Distinct periods that merely share a value must both survive, so
        # `val` is deliberately not part of the subset.
        data = data.drop_duplicates(subset=['end', 'form'], keep='last', ignore_index=True)

        result = pd.merge_asof(
            result.sort_values('end', kind='mergesort'),
            data[['val', 'end', 'form']],
            on="end",
            by="form",
        )
        result = result.rename(columns={'val': key})

    if 'inpairm' in result.columns:
        # plain assignment: an in-place fillna on a selected column is lost
        # when pandas copy-on-write is active
        result['inpairm'] = result['inpairm'].fillna(0.0)

    return result

In [72]:
def fill_missing_annual_depr(input_df=pd.DataFrame):
    """Overwrite the ``depr_amort`` value of every '10-K' row with 4x a neighbour.

    Rows are processed top to bottom. A '10-K' row receives four times the
    ``depr_amort`` value of the row directly above it; a '10-K' in the very
    first row uses the row below instead. Every '10-K' row is overwritten,
    whether or not it already holds a value. Because rows are handled in
    order, a '10-K' that directly follows another '10-K' is based on the
    already replaced value.

    Parameters
    ----------
    input_df : pandas.DataFrame with the columns ``form`` and ``depr_amort``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place. A frame that
        consists of a single '10-K' row is returned unchanged because it has
        no neighbour to derive a value from.
    """
    # Positional (row, column) writes on the frame itself; writing through
    # input_df['depr_amort'].iloc[...] is lost under pandas copy-on-write.
    value_col = input_df.columns.get_loc('depr_amort')
    forms = input_df['form'].tolist()

    for i, form in enumerate(forms):
        if form != '10-K':
            continue
        if i > 0:
            source_row = i - 1
        elif len(forms) > 1:
            source_row = i + 1
        else:
            # single-row frame: nothing to derive the annual value from
            continue
        input_df.iloc[i, value_col] = 4 * input_df.iloc[source_row, value_col]

    return input_df

In [73]:
def get_balance_sheet(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    """Attach point-in-time (balance-sheet style) facts to a filing template.

    Downloads ``https://data.sec.gov/api/xbrl/companyfacts/CIK<cik>.json`` and,
    for every key of ``us_gaap_dict``, adds one column named after the key to
    a copy of ``input_df``.

    For each us-gaap tag listed under a key the ``'USD'`` facts whose ``form``
    is '10-Q' or '10-K' are collected. The facts of all tags of a key are
    combined, reduced to one record per ``end`` date and joined to the
    template with a backward ``merge_asof`` on ``end``: every template row
    receives the value of the latest fact whose ``end`` is not after the
    row's ``end``, regardless of the row's own ``form``.

    Parameters
    ----------
    cik : str
        Company CIK, left-padded with zeros to 10 characters for the URL.
    e_mail : str
        Sent as the ``User-Agent`` request header.
    input_df : pandas.DataFrame
        Template with at least the column ``end``. Not modified.
    us_gaap_dict : dict
        ``{column name: [us-gaap tag, ...]}``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_df`` sorted by ``end`` (as datetime) with one extra
        column per key; NaN where no fact matched.

    Raises
    ------
    requests.HTTPError, requests.Timeout
        On an SEC error status or no answer within 30 seconds.
    KeyError
        If a tag or the 'USD' unit is not present in the downloaded facts.

    Notes
    -----
    The ``=str`` / ``=pd.DataFrame`` / ``=dict`` defaults in the signature are
    type placeholders kept for backward compatibility; always pass real values.
    """
    # the endpoint expects the 10-digit, zero-padded form of the CIK
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{str(cik).zfill(10)}.json"
    header = {"User-Agent": e_mail}

    # fail on an error status here instead of on a confusing JSON/KeyError below
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()
    company_facts = response.json()

    # output template built from the filing table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])

    for key, tags in us_gaap_dict.items():
        frames = []
        for tag in tags:
            facts = pd.DataFrame(company_facts["facts"]["us-gaap"][tag]["units"]["USD"])
            # the form filter applies to every tag, not only to the first one
            frames.append(facts.loc[facts['form'].isin(['10-Q', '10-K'])])

        data = pd.concat(frames, ignore_index=True)
        data['end'] = pd.to_datetime(data['end'])
        # Stable sort: records with the same date keep their source order, so
        # keep='last' selects the last one delivered (and the last tag listed).
        data = data.sort_values('end', kind='mergesort')
        # de-duplicate the COMBINED facts, one record per date
        data = data.drop_duplicates(subset=['end'], keep='last', ignore_index=True)

        result = pd.merge_asof(
            result.sort_values('end', kind='mergesort'),
            data[['val', 'end']],
            on="end",
        )
        result = result.rename(columns={'val': key})

    return result

## 0. US-GAAP dictionaries

In [74]:
# Income-statement mapping, keyed by CIK.
# Each inner key becomes a column name in get_income_statements(); its list holds the
# us-gaap tag names whose facts are looked up for that column.
us_gaap_is = {
    cik:{
        'rev': ['Revenues'],
        'cost_sale': ['CostOfGoodsAndServicesSold', 'CostOfGoodsSold'],
        'sale_gen_adm': ['SellingGeneralAndAdministrativeExpense'],
        # this exact key spelling is checked by get_income_statements() to fill gaps with 0.0
        'inpairm': ['GoodwillImpairmentLoss'],
        'op_income': ['IncomeLossFromContinuingOperationsBeforeIncomeTaxesExtraordinaryItemsNoncontrollingInterest'],
        'int_income': ['InterestAndOtherIncome'],
        'int_exp': ['InterestExpense'],
        'inc_tax_exp': ['IncomeTaxExpenseBenefit'],
        'net_inc': ['NetIncomeLoss'],
        'shares': ['WeightedAverageNumberOfDilutedSharesOutstanding'],              # read from the 'shares' unit, not 'USD'
        'eps_dil': ['EarningsPerShareDiluted']                                      # read from the 'USD/shares' unit, not 'USD'
        }
    }

In [75]:
# Depreciation & amortization mapping, keyed by CIK.
# Kept separate from the income-statement mapping because its result is post-processed
# by fill_missing_annual_depr(), which reads the 'depr_amort' column.
us_gaap_depr = {
    cik:{
        'depr_amort': ['DepreciationDepletionAndAmortization']
        }
    }

In [76]:
# Balance-sheet mapping, keyed by CIK.
# Each inner key becomes a column name in get_balance_sheet(); its list holds the
# us-gaap tag names whose 'USD' facts are looked up for that column.
us_gaap_bs = {
    cik:{
        'cash_cash_eq': ['CashAndCashEquivalentsAtCarryingValue'],
        'acc_rec': ['AccountsReceivableNetCurrent'],
        'invent': ['InventoryNet'],
        'other_curr_ass': ['OtherAssetsCurrent'],
        'tot_curr_ass': ['AssetsCurrent'],
        'ppe_net': ['PropertyPlantAndEquipmentNet'],
        'goodwill': ['Goodwill'],
        'intang_ass': ['IntangibleAssetsNetExcludingGoodwill'],
        'other_ass': ['OtherAssetsNoncurrent'],
        'tot_ass': ['Assets'],
        'short_debt': ['LongTermDebtCurrent'],
        'acc_pay': ['AccountsPayableCurrent'],
        'other_curr_liab': ['OtherLiabilitiesCurrent'],
        'tot_curr_liab': ['LiabilitiesCurrent'],
        'tot_long_debt': ['LongTermDebtNoncurrent'],
        'other_liab': ['OtherLiabilitiesNoncurrent'],
        'tot_equity': ['StockholdersEquity']
        }
    }

### Useful LINK
#### https://www.kaggle.com/code/svendaj/extracting-data-from-sec-edgar-restful-apis

## 1. Get the company's report list

In [77]:
# filing list (10-K and 10-Q only) that serves as the row template for the later steps
input_df = get_company_reports(cik=cik, report_list=['10-K', '10-Q'], e_mail=e_mail)
input_df.head()

Unnamed: 0,end,form,cik
0,2016-09-30,10-Q,1144519
1,2016-12-31,10-K,1144519
2,2017-03-31,10-Q,1144519
3,2017-06-30,10-Q,1144519
4,2017-09-30,10-Q,1144519


In [78]:
# add a synthetic fourth-quarter '10-Q' row in front of the annual reports
input_df_inserted = insert_fourth_quaterly_report(input_df=input_df)
input_df_inserted.head()

Unnamed: 0,end,form,cik
0,2016-09-30,10-Q,1144519
1,2016-12-31,10-Q,1144519
2,2016-12-31,10-K,1144519
3,2017-03-31,10-Q,1144519
4,2017-06-30,10-Q,1144519


## 2. Filling Income Statement

In [79]:
# one column per income-statement key of this company, aligned to the filing template
df_income_statement = get_income_statements(cik=cik, e_mail=e_mail, input_df=input_df_inserted, us_gaap_dict=us_gaap_is[cik])
df_income_statement.head()

Unnamed: 0,end,form,cik,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,inc_tax_exp,net_inc,shares,eps_dil
0,2016-09-30,10-Q,1144519,15616000000,10867000000,324000000,0.0,170000000,13000000,73000000,45000000,118000000,139927845,0.83
1,2016-12-31,10-Q,1144519,15616000000,10867000000,324000000,0.0,170000000,13000000,73000000,45000000,271000000,148078492,1.82
2,2016-12-31,10-K,1144519,42679000000,40269000000,1284000000,76000000.0,996000000,51000000,234000000,220000000,745000000,148226475,5.01
3,2017-03-31,10-Q,1144519,11121000000,10661000000,378000000,0.0,82000000,12000000,65000000,28000000,47000000,140897156,0.27
4,2017-06-30,10-Q,1144519,11645000000,11290000000,328000000,0.0,26000000,8000000,62000000,-55000000,81000000,141398804,0.51


## 2.b Filling Depletion, Amortization, Depreciation

In [80]:
# D&A is read with the income-statement reader ...
df_income_amortization = get_income_statements(cik=cik, e_mail=e_mail, input_df=input_df_inserted, us_gaap_dict=us_gaap_depr[cik])
# ... and every 10-K row is then replaced by 4x its neighbouring row
df_income_amortization = fill_missing_annual_depr(input_df=df_income_amortization)
df_income_amortization.head()

Unnamed: 0,end,form,cik,depr_amort
0,2016-09-30,10-Q,1144519,149000000
1,2016-12-31,10-Q,1144519,149000000
2,2016-12-31,10-K,1144519,596000000
3,2017-03-31,10-Q,1144519,130000000
4,2017-06-30,10-Q,1144519,152000000


## 3. Filling Balance Sheet

In [81]:
# one column per balance-sheet key of this company, aligned to the filing template
df_balance_sheet = get_balance_sheet(cik=cik, e_mail=e_mail, input_df=input_df_inserted, us_gaap_dict=us_gaap_bs[cik])
df_balance_sheet.head()

Unnamed: 0,end,form,cik,cash_cash_eq,acc_rec,invent,other_curr_ass,tot_curr_ass,ppe_net,goodwill,intang_ass,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity
0,2016-09-30,10-Q,1144519,297000000,1680000000,5173000000,4612000000,11923000000,5169000000,371000000,318000000,899000000,20045000000,863000000,3205000000,3121000000,8281000000,3447000000,826000000,7083000000
1,2016-12-31,10-Q,1144519,934000000,1676000000,4773000000,3645000000,11092000000,5099000000,373000000,336000000,927000000,19188000000,938000000,3485000000,2476000000,7684000000,3069000000,853000000,7144000000
2,2016-12-31,10-K,1144519,934000000,1676000000,4773000000,3645000000,11092000000,5099000000,373000000,336000000,927000000,19188000000,938000000,3485000000,2476000000,7684000000,3069000000,853000000,7144000000
3,2017-03-31,10-Q,1144519,676000000,1671000000,5188000000,4447000000,12008000000,5351000000,497000000,365000000,1001000000,20620000000,938000000,3898000000,2787000000,8616000000,3266000000,889000000,7428000000
4,2017-06-30,10-Q,1144519,575000000,1747000000,5454000000,4138000000,11914000000,5331000000,504000000,362000000,942000000,20433000000,206000000,3513000000,2529000000,7933000000,3918000000,879000000,7227000000


## 4. Merge & Check every statement

In [82]:
# join the three statements on the shared template columns
merged_temp = df_income_statement.merge(df_income_amortization, on=['end', 'form', 'cik'])
merged_final = merged_temp.merge(df_balance_sheet, on=['end', 'form', 'cik'])
# same ticker on every row
merged_final['ticker'] = ticker
merged_final.head()

Unnamed: 0,end,form,cik,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,...,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity,ticker
0,2016-09-30,10-Q,1144519,15616000000,10867000000,324000000,0.0,170000000,13000000,73000000,...,899000000,20045000000,863000000,3205000000,3121000000,8281000000,3447000000,826000000,7083000000,BG
1,2016-12-31,10-Q,1144519,15616000000,10867000000,324000000,0.0,170000000,13000000,73000000,...,927000000,19188000000,938000000,3485000000,2476000000,7684000000,3069000000,853000000,7144000000,BG
2,2016-12-31,10-K,1144519,42679000000,40269000000,1284000000,76000000.0,996000000,51000000,234000000,...,927000000,19188000000,938000000,3485000000,2476000000,7684000000,3069000000,853000000,7144000000,BG
3,2017-03-31,10-Q,1144519,11121000000,10661000000,378000000,0.0,82000000,12000000,65000000,...,1001000000,20620000000,938000000,3898000000,2787000000,8616000000,3266000000,889000000,7428000000,BG
4,2017-06-30,10-Q,1144519,11645000000,11290000000,328000000,0.0,26000000,8000000,62000000,...,942000000,20433000000,206000000,3513000000,2529000000,7933000000,3918000000,879000000,7227000000,BG


In [83]:
# write the merged statements to ../sec_report_csv/<cik>_<ticker>_sec_reports.csv without the row index
# NOTE(review): presumably the target directory has to exist already - confirm
merged_final.to_csv('../sec_report_csv/{}_{}_sec_reports.csv'.format(cik, ticker), index=False)

## 5. Update JSON files with US-GAAP codes

You should manually cross-check the downloaded data and, only once it is verified, set `downloaded_data_correct` to `True`.

In [84]:
# Manual safety switch: leave False until the downloaded data has been cross-checked by hand;
# the JSON update in the next cell only runs when this is True.
downloaded_data_correct = False

In [85]:
if downloaded_data_correct:
    # update the US-GAAP JSON files in this order: Income Statement, D&A, Balance Sheet
    for json_path, gaap_mapping in (
        ('../JSON/us_gaap_is.json', us_gaap_is),
        ('../JSON/us_gaap_depr.json', us_gaap_depr),
        ('../JSON/us_gaap_bs.json', us_gaap_bs),
    ):
        update_us_gaap_jsons(path=json_path, cik=cik, dictionary_new_key=gaap_mapping)

New CIK has been added to JSON file.
New CIK has been added to JSON file.
New CIK has been added to JSON file.
