# Explore SEC API & download Income Statement and Balance Sheets

In [84]:
import requests
import pandas as pd

## Function to use

In [85]:
def update_us_gaap_jsons(path, cik, dictionary_new_key):
    import json
    # open existing JSON file
    with open(path) as infile:
        data = json.load(infile)
    # add new key to the dictionary
    if cik in data.keys():
        print('CIK is already in JSON file.')
    else:
        data[cik] = dictionary_new_key[cik]
        print('New CIK has been added to JSON file.')
    # write & update old JSON file
    with open(path, 'w') as outfile:
        json.dump(data, outfile, sort_keys=False, indent=2)

In [86]:
def get_company_reports(cik=str, report_list=['10-K', '10-Q'], e_mail=str):
    # create link
    cik = cik
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    # create header
    header = {
    "User-Agent": e_mail#, # remaining fields are optional
    #    "Accept-Encoding": "gzip, deflate",
    #    "Host": "data.sec.gov"
    }
    # get company fillings
    company_filings = requests.get(url, headers=header).json()
    # create dataframe & filter based on documentum type
    company_filings_df = pd.DataFrame(company_filings["filings"]["recent"])
    company_filings_df_filt = company_filings_df.loc[company_filings_df['form'].isin(report_list)]
    # reorder the documents
    company_filings_df_filt = company_filings_df_filt.reindex(index=company_filings_df_filt.index[::-1])
    # drop original index
    company_filings_df_filt.reset_index(inplace=True, drop=True)
    company_filings_df_filt = company_filings_df_filt[['reportDate', 'form']].copy().rename(columns={'reportDate':'end'})

    return company_filings_df_filt

In [87]:
def insert_fourth_quaterly_report(input_df=pd.DataFrame):
    counter = 0
    data = {
        'end': [],
        'form': []    
        }

    for index, value in enumerate(input_df.form):
        if counter == 3 and value == '10-K':
            data['form'].append('10-Q')
            data['form'].append(value)
            data['end'].append(input_df.end.iloc[index])
            data['end'].append(input_df.end.iloc[index])
        else:
            data['form'].append(value)
            data['end'].append(input_df.end.iloc[index])
        # calculate 10-Q rows
        if value == '10-Q':
            counter += 1
        else:
            counter = 0

    result = pd.DataFrame(data)
    return result

In [88]:
def get_income_statements(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    # create link
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    
    # create header
    header = {
    "User-Agent": e_mail
    }
    
    # get company fillings
    company_facts = requests.get(url, headers=header).json()
    # create output template from company fillings table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])
    # iterate us_gaap keys related to specific cik (company)
    for key in us_gaap_dict.keys():
        # set key unit parameters based on filling type
        if key == 'shares':
            word = 'shares'
        elif key == 'eps_dil':
            word = 'USD/shares'
        else:
            word = 'USD'

        # empty DataFrame that will be filled and after merged with result
        data = pd.DataFrame()
        # iterate us-gaap lists
        for index in range(len(us_gaap_dict[key])):
            # get data & create dataframe
            temporary = pd.DataFrame(company_facts["facts"]["us-gaap"][us_gaap_dict[key][index]]["units"][word])
            # set date columns datatype
            temporary['end'] = pd.to_datetime(temporary['end'])
            temporary['start'] = pd.to_datetime(temporary['start'])
            # calculate the covered timeperiods of unique reports
            temporary['time_diff'] = temporary['end'] - temporary['start']
            temporary['time_diff'] = [x.days for x in temporary['time_diff']]
            # filter based on report lenght
            temporary = temporary.loc[((temporary['time_diff'] > 345) & (temporary['time_diff'] < 390)) | ((temporary['time_diff'] > 80) & (temporary['time_diff'] < 100))]
            # set form based on report lenght
            temporary['form'] = ['10-K' if elem > 120 else '10-Q' for elem in temporary['time_diff']]
            temporary.sort_values(by=['end'])
            # drop duplicates
            data.drop_duplicates(subset=['val', 'form'], keep='last', inplace=True, ignore_index=True)
            data.drop_duplicates(subset=['end', 'form'], keep='last', inplace=True, ignore_index=True)
            # merge temporary data before add to result
            if index == 0:
                data = temporary.loc[temporary['form'].isin(['10-Q', '10-K'])]
            else:
                data = pd.concat([data, temporary], ignore_index=True)
        
        # add data to result
        data['end'] = pd.to_datetime(data['end'])
        result = pd.merge_asof(result.sort_values('end'), data[['val', 'end', 'form']].sort_values('end'), on="end", by="form")

        result.rename(columns={'val': key}, inplace=True)
    
    if 'inpairm' in result.columns:
        result['inpairm'].fillna(0.0, inplace=True)

    return result

In [89]:
def fill_missing_annual_depr(input_df=pd.DataFrame):
    # fill missing annual D&A
    for i, val in enumerate(input_df['depr_amort']):
        if input_df['form'].iloc[i] == '10-K' and i > 0:
            input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i-1]
        elif input_df['form'].iloc[i] == '10-K' and i == 0:
            input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i+1]

    return input_df

In [90]:
def get_balance_sheet(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    # create link
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    
    # create header
    header = {
    "User-Agent": e_mail
    }
    
    # get company fillings
    company_facts = requests.get(url, headers=header).json()
    # create output template from company fillings table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])
    # iterate us_gaap keys related to specific cik (company)
    for key in us_gaap_dict.keys():
        # empty DataFrame that will be filled and after merged with result
        data = pd.DataFrame()
        # iterate us-gaap lists
        for index in range(len(us_gaap_dict[key])):
            # get data & create dataframe
            temporary = pd.DataFrame(company_facts["facts"]["us-gaap"][us_gaap_dict[key][index]]["units"]["USD"])
            # set date columns datatype
            # drop duplicates
            data.drop_duplicates(subset=['val', 'form'], keep='last', inplace=True, ignore_index=True)
            data.drop_duplicates(subset=['end', 'val'], keep='last', inplace=True, ignore_index=True)
            # merge temporary data before add to result
            if index == 0:
                data = temporary.loc[temporary['form'].isin(['10-Q', '10-K'])]
            else:
                data = pd.concat([data, temporary], ignore_index=True)
        
        # add data to result
        data['end'] = pd.to_datetime(data['end'])
        result = pd.merge_asof(result.sort_values('end'), data[['val', 'end']].sort_values('end'), on="end")

        result.rename(columns={'val': key}, inplace=True)

    return result

## 0. US-GAAP dictionaries

In [91]:
us_gaap_is = {
    '0000100493':{
        'rev': ['SalesRevenueNet', 'Revenues'],
        'cost_sale': ['CostOfGoodsSold', 'CostOfGoodsAndServicesSold'],
        'sale_gen_adm': ['SellingGeneralAndAdministrativeExpense'],
        'inpairm': ['GoodwillImpairmentLoss'],
        'op_income': ['OperatingIncomeLoss'],
        'int_income': ['InvestmentIncomeInterest'],
        'int_exp': ['InterestAndDebtExpense'],
        'inc_tax_exp': ['IncomeTaxExpenseBenefit'],
        'net_inc': ['ProfitLoss'],
        'shares': ['WeightedAverageNumberOfDilutedSharesOutstanding'],              # not ['USD'] but ['shares']
        'eps_dil': ['EarningsPerShareDiluted']                                      # not ['USD'] but ['USD/shares']
        }
    }

In [92]:
#update_us_gaap_jsons(
#    path='../JSON/us_gaap_is.json',
#    cik='0000100493',
#    dictionary_new_key=us_gaap_is
#)

In [93]:
us_gaap_depr = {
    '0000100493':{
        'depr_amort': ['DepreciationDepletionAndAmortization']
        }
    }

In [94]:
#update_us_gaap_jsons(
#    path='../JSON/us_gaap_depr.json',
#    cik='0000100493',
#    dictionary_new_key=us_gaap_depr
#)

In [95]:
us_gaap_bs = {
    '0000100493':{
        'cash_cash_eq': ['CashAndCashEquivalentsAtCarryingValue'],
        'acc_rec': ['AccountsReceivableNetCurrent'],
        'invent': ['InventoryNet'],
        'other_curr_ass': ['OtherAssetsCurrent'],
        'tot_curr_ass': ['AssetsCurrent'],
        'ppe_net': ['PropertyPlantAndEquipmentNet'],
        'goodwill': ['Goodwill'],
        'intang_ass': ['IntangibleAssetsNetExcludingGoodwill'],
        'other_ass': ['OtherAssetsNoncurrent'],
        'tot_ass': ['Assets'],
        'short_debt': ['DebtCurrent'],
        'acc_pay': ['AccountsPayableCurrent'],
        'other_curr_liab': ['OtherLiabilitiesNoncurrent'],
        'tot_curr_liab': ['LiabilitiesCurrent'],
        'tot_long_debt': ['LongTermDebtAndCapitalLeaseObligations'],
        'other_liab': ['OtherLiabilitiesNoncurrent'],
        'tot_equity': ['StockholdersEquity']
        }
    }

In [96]:
#update_us_gaap_jsons(
#    path='../JSON/us_gaap_bs.json',
#    cik='0000100493',
#    dictionary_new_key=us_gaap_bs
#)

### Useful LINK
#### https://www.kaggle.com/code/svendaj/extracting-data-from-sec-edgar-restful-apis

## 1. Get Comnpany related report list

In [97]:
cik = '0000100493'
e_mail = 'zs.nagy.1989@gmail.com'

In [98]:
input_df = get_company_reports(
    cik=cik,
    report_list=['10-K', '10-Q'],
    e_mail=e_mail
    )

In [99]:
input_df_inserted = insert_fourth_quaterly_report(input_df)
input_df_inserted.head()

Unnamed: 0,end,form
0,2010-10-02,10-K
1,2011-01-01,10-Q
2,2011-04-02,10-Q
3,2011-07-02,10-Q
4,2011-10-01,10-Q


## 2. Filling Income Sattement

In [100]:
test_is = get_income_statements(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_is[cik]
    )

test_is.head()

Unnamed: 0,end,form,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,inc_tax_exp,net_inc,shares,eps_dil
0,2010-10-02,10-K,28430000000,25916000000,929000000,29000000.0,1556000000,14000000,347000000,438000000,765000000,379000000,2.06
1,2011-01-01,10-Q,7615000000,6871000000,246000000,0.0,498000000,3000000,66000000,151000000,294000000,379000000,0.78
2,2011-04-02,10-Q,8000000000,7467000000,230000000,0.0,303000000,3000000,63000000,85000000,156000000,383000000,0.42
3,2011-07-02,10-Q,8247000000,7716000000,219000000,0.0,312000000,2000000,58000000,75000000,188000000,383000000,0.51
4,2011-10-01,10-Q,8404000000,7716000000,219000000,0.0,172000000,2000000,58000000,75000000,95000000,383000000,0.26


In [101]:
#test_is.to_csv('/Users/zsolt.nagy/Desktop/projects/Economy/notebooks/test_is.csv')

## 2.b Filling Deplation, Amortzation, Depratiation

In [102]:
test_amor = get_income_statements(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_depr[cik]
    )

# fill missing annual D&A
test_amor = fill_missing_annual_depr(test_amor)

test_amor.head()
#test_amor.to_csv('../test_amor.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i+1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/index

Unnamed: 0,end,form,depr_amort
0,2010-10-02,10-K,512000000.0
1,2011-01-01,10-Q,128000000.0
2,2011-04-02,10-Q,128000000.0
3,2011-07-02,10-Q,128000000.0
4,2011-10-01,10-Q,128000000.0


## 3. Filling Balance Sheet

In [103]:
test_bs = get_balance_sheet(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_bs[cik]
    )

test_bs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['end'] = pd.to_datetime(data['end'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['end'] = pd.to_datetime(data['end'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['end'] = pd.to_datetime(data['end'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

Unnamed: 0,end,form,cash_cash_eq,acc_rec,invent,other_curr_ass,tot_curr_ass,ppe_net,goodwill,intang_ass,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity
0,2010-10-02,10-K,978000000,1198000000,2274000000,168000000,4618000000,3674000000,1893000000,166000000,401000000,10752000000,401000000,1110000000,486000000,2545000000,2135000000,486000000,5166000000
1,2011-01-01,10-Q,1122000000,1180000000,2489000000,162000000,4953000000,3714000000,1894000000,163000000,466000000,11190000000,424000000,1212000000,520000000,2645000000,2124000000,520000000,5445000000
2,2011-04-02,10-Q,794000000,1256000000,2730000000,157000000,4937000000,3762000000,1895000000,161000000,471000000,11226000000,390000000,1126000000,500000000,2479000000,2105000000,500000000,5649000000
3,2011-07-02,10-Q,981000000,1334000000,2711000000,146000000,5172000000,3802000000,1895000000,158000000,461000000,11488000000,362000000,1193000000,457000000,2761000000,2094000000,457000000,5752000000
4,2011-10-01,10-Q,716000000,1321000000,2587000000,156000000,4780000000,3823000000,1892000000,149000000,427000000,11071000000,70000000,1264000000,476000000,2374000000,2112000000,476000000,5657000000


In [104]:
#test_bs.to_csv('../test_bs.csv')

In [105]:
merged_temp = pd.merge(test_is, test_amor, on=['end', 'form'])
merged_final = pd.merge(merged_temp, test_bs, on=['end', 'form'])
merged_final.head()

Unnamed: 0,end,form,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,inc_tax_exp,...,intang_ass,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity
0,2010-10-02,10-K,28430000000,25916000000,929000000,29000000.0,1556000000,14000000,347000000,438000000,...,166000000,401000000,10752000000,401000000,1110000000,486000000,2545000000,2135000000,486000000,5166000000
1,2011-01-01,10-Q,7615000000,6871000000,246000000,0.0,498000000,3000000,66000000,151000000,...,163000000,466000000,11190000000,424000000,1212000000,520000000,2645000000,2124000000,520000000,5445000000
2,2011-04-02,10-Q,8000000000,7467000000,230000000,0.0,303000000,3000000,63000000,85000000,...,161000000,471000000,11226000000,390000000,1126000000,500000000,2479000000,2105000000,500000000,5649000000
3,2011-07-02,10-Q,8247000000,7716000000,219000000,0.0,312000000,2000000,58000000,75000000,...,158000000,461000000,11488000000,362000000,1193000000,457000000,2761000000,2094000000,457000000,5752000000
4,2011-10-01,10-Q,8404000000,7716000000,219000000,0.0,172000000,2000000,58000000,75000000,...,149000000,427000000,11071000000,70000000,1264000000,476000000,2374000000,2112000000,476000000,5657000000


In [106]:
# write file
#merged_final.to_csv('../input/TSN_Q_exp_test.csv')