# Explore SEC API & download Income Statement and Balance Sheets

In [19]:
import requests
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # not to get SettingWithCopyWarning

### How to use this file?

## Set variables - Ticker, CIK etc.

In [20]:
cik = '0000100493'
ticker = 'TSN'
e_mail = 'zs.nagy.1989@gmail.com'

## Function to use

In [21]:
def update_us_gaap_jsons(path, cik, dictionary_new_key):
    import json
    # open existing JSON file
    with open(path) as infile:
        data = json.load(infile)
    # add new key to the dictionary
    if cik in data.keys():
        print('CIK is already in JSON file.')
    else:
        data[cik] = dictionary_new_key[cik]
        print('New CIK has been added to JSON file.')
    # write & update old JSON file
    with open(path, 'w') as outfile:
        json.dump(data, outfile, sort_keys=False, indent=2)

In [22]:
def get_company_reports(cik=str, report_list=['10-K', '10-Q'], e_mail=str):
    # create link
    cik = cik
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    # create header
    header = {
    "User-Agent": e_mail#, # remaining fields are optional
    #    "Accept-Encoding": "gzip, deflate",
    #    "Host": "data.sec.gov"
    }
    # get company fillings
    company_filings = requests.get(url, headers=header).json()
    # create dataframe & filter based on documentum type
    company_filings_df = pd.DataFrame(company_filings["filings"]["recent"])
    company_filings_df_filt = company_filings_df.loc[company_filings_df['form'].isin(report_list)]
    # reorder the documents
    company_filings_df_filt = company_filings_df_filt.reindex(index=company_filings_df_filt.index[::-1])
    # drop original index
    company_filings_df_filt.reset_index(inplace=True, drop=True)
    company_filings_df_filt = company_filings_df_filt[['reportDate', 'form']].copy().rename(columns={'reportDate':'end'})
    # add cik column
    company_filings_df_filt['cik'] = [cik for x in range(len(company_filings_df_filt))]

    return company_filings_df_filt

In [23]:
def insert_fourth_quaterly_report(input_df=pd.DataFrame):
    counter = 0
    data = {
        'end': [],
        'form': [],
        'cik': []   
        }

    for index, value in enumerate(input_df.form):
        if (counter == 3 and value == '10-K') or (index < 3 and value == '10-K'):
            data['form'].append('10-Q')
            data['form'].append(value)
            data['end'].append(input_df.end.iloc[index])
            data['end'].append(input_df.end.iloc[index])
            data['cik'].append(input_df.cik.iloc[index])
            data['cik'].append(input_df.cik.iloc[index])
        else:
            data['form'].append(value)
            data['end'].append(input_df.end.iloc[index])
            data['cik'].append(input_df.cik.iloc[index])
        # calculate 10-Q rows
        if value == '10-Q':
            counter += 1
        else:
            counter = 0

    result = pd.DataFrame(data)
    return result

A köv functionba kellene belenyulni, h mi van ha utcso sor 10-K....

In [24]:
def get_income_statements(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    # create link
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    
    # create header
    header = {
    "User-Agent": e_mail
    }
    
    # get company fillings
    company_facts = requests.get(url, headers=header).json()
    # create output template from company fillings table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])
    # iterate us_gaap keys related to specific cik (company)
    for key in us_gaap_dict.keys():
        # set key unit parameters based on filling type
        if key == 'shares':
            word = 'shares'
        elif key == 'eps_dil':
            word = 'USD/shares'
        else:
            word = 'USD'

        # empty DataFrame that will be filled and after merged with result
        data = pd.DataFrame()
        # iterate us-gaap lists
        for index in range(len(us_gaap_dict[key])):
            # get data & create dataframe
            temporary = pd.DataFrame(company_facts["facts"]["us-gaap"][us_gaap_dict[key][index]]["units"][word])
            # set date columns datatype
            temporary['end'] = pd.to_datetime(temporary['end'])
            temporary['start'] = pd.to_datetime(temporary['start'])
            # calculate the covered timeperiods of unique reports
            temporary['time_diff'] = temporary['end'] - temporary['start']
            temporary['time_diff'] = [x.days for x in temporary['time_diff']]
            # filter based on report lenght
            temporary = temporary.loc[((temporary['time_diff'] > 345) & (temporary['time_diff'] < 390)) | ((temporary['time_diff'] > 80) & (temporary['time_diff'] < 100))]
            # set form based on report lenght
            temporary['form'] = ['10-K' if elem > 120 else '10-Q' for elem in temporary['time_diff']]
            temporary.sort_values(by=['end'])
            # drop duplicates
            data.drop_duplicates(subset=['val', 'form'], keep='last', inplace=True, ignore_index=True)
            data.drop_duplicates(subset=['end', 'form'], keep='last', inplace=True, ignore_index=True)
            # merge temporary data before add to result
            if index == 0:
                data = temporary.loc[temporary['form'].isin(['10-Q', '10-K'])]
            else:
                data = pd.concat([data, temporary], ignore_index=True)
        
        # add data to result
        data['end'] = pd.to_datetime(data['end'])
        result = pd.merge_asof(result.sort_values('end'), data[['val', 'end', 'form']].sort_values('end'), on="end" , by="form")

        result.rename(columns={'val': key}, inplace=True)
    
    if 'inpairm' in result.columns:
        result['inpairm'].fillna(0.0, inplace=True)

    return result

In [25]:
def fill_missing_annual_depr(input_df=pd.DataFrame):
    # fill missing annual D&A
    for i, val in enumerate(input_df['depr_amort']):
        if input_df['form'].iloc[i] == '10-K' and i > 0:
            input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i-1]
        elif input_df['form'].iloc[i] == '10-K' and i == 0:
            input_df['depr_amort'].iloc[i] = 4 * input_df['depr_amort'].iloc[i+1]

    return input_df

In [26]:
def get_balance_sheet(cik=str, e_mail=str, input_df=pd.DataFrame, us_gaap_dict=dict):
    # create link
    url = f"https://data.sec.gov/api/xbrl/companyfacts/CIK{cik}.json"
    
    # create header
    header = {
    "User-Agent": e_mail
    }
    
    # get company fillings
    company_facts = requests.get(url, headers=header).json()
    # create output template from company fillings table
    result = input_df.copy()
    result['end'] = pd.to_datetime(result['end'])
    # iterate us_gaap keys related to specific cik (company)
    for key in us_gaap_dict.keys():
        # empty DataFrame that will be filled and after merged with result
        data = pd.DataFrame()
        # iterate us-gaap lists
        for index in range(len(us_gaap_dict[key])):
            # get data & create dataframe
            temporary = pd.DataFrame(company_facts["facts"]["us-gaap"][us_gaap_dict[key][index]]["units"]["USD"])
            # set date columns datatype
            # drop duplicates
            data.drop_duplicates(subset=['val', 'form'], keep='last', inplace=True, ignore_index=True)
            data.drop_duplicates(subset=['end', 'val'], keep='last', inplace=True, ignore_index=True)
            # merge temporary data before add to result
            if index == 0:
                data = temporary.loc[temporary['form'].isin(['10-Q', '10-K'])]
            else:
                data = pd.concat([data, temporary], ignore_index=True)
        
        # add data to result
        data['end'] = pd.to_datetime(data['end'])
        result = pd.merge_asof(result.sort_values('end'), data[['val', 'end']].sort_values('end'), on="end")

        result.rename(columns={'val': key}, inplace=True)

    return result

## 0. US-GAAP dictionaries

COMMENT : Gondold at nm e egyszerubb lenne megnyitni a JSONokat?!?!

In [27]:
us_gaap_is = {
    cik:{
        "rev": [
      "SalesRevenueNet",
      "Revenues"
    ],
    "cost_sale": [
      "CostOfGoodsSold",
      "CostOfGoodsAndServicesSold"
    ],
    "sale_gen_adm": [
      "SellingGeneralAndAdministrativeExpense"
    ],
    "inpairm": [
      "GoodwillImpairmentLoss"
    ],
    "op_income": [
      "OperatingIncomeLoss"
    ],
    "int_income": [
      "InvestmentIncomeInterest"
    ],
    "int_exp": [
      "InterestAndDebtExpense"
    ],
    "inc_tax_exp": [
      "IncomeTaxExpenseBenefit"
    ],
    "net_inc": [
      "ProfitLoss"
    ],
    "shares": [
      "WeightedAverageNumberOfDilutedSharesOutstanding"
    ],
    "eps_dil": [
      "EarningsPerShareDiluted"
    ]
        }
    }

In [28]:
us_gaap_depr = {
    cik:{
        "depr_amort": [
      "DepreciationDepletionAndAmortization"
    ]
        }
    }

In [29]:
us_gaap_bs = {
    cik:{
        "cash_cash_eq": [
      "CashAndCashEquivalentsAtCarryingValue"
    ],
    "acc_rec": [
      "AccountsReceivableNetCurrent"
    ],
    "invent": [
      "InventoryNet"
    ],
    "other_curr_ass": [
      "OtherAssetsCurrent"
    ],
    "tot_curr_ass": [
      "AssetsCurrent"
    ],
    "ppe_net": [
      "PropertyPlantAndEquipmentNet"
    ],
    "goodwill": [
      "Goodwill"
    ],
    "intang_ass": [
      "IntangibleAssetsNetExcludingGoodwill"
    ],
    "other_ass": [
      "OtherAssetsNoncurrent"
    ],
    "tot_ass": [
      "Assets"
    ],
    "short_debt": [
      "DebtCurrent"
    ],
    "acc_pay": [
      "AccountsPayableCurrent"
    ],
    "other_curr_liab": [
      "OtherLiabilitiesNoncurrent"
    ],
    "tot_curr_liab": [
      "LiabilitiesCurrent"
    ],
    "tot_long_debt": [
      "LongTermDebtAndCapitalLeaseObligations"
    ],
    "other_liab": [
      "OtherLiabilitiesNoncurrent"
    ],
    "tot_equity": [
      "StockholdersEquity"
    ]
        }
    }

### Useful LINK
#### https://www.kaggle.com/code/svendaj/extracting-data-from-sec-edgar-restful-apis

## 1. Get Comnpany related report list

In [30]:
input_df = get_company_reports(
    cik=cik,
    report_list=['10-K', '10-Q'],
    e_mail=e_mail
    )
input_df.tail()

Unnamed: 0,end,form,cik
47,2022-10-01,10-K,100493
48,2022-12-31,10-Q,100493
49,2023-04-01,10-Q,100493
50,2023-07-01,10-Q,100493
51,2023-09-30,10-K,100493


In [31]:
input_df_inserted = insert_fourth_quaterly_report(input_df)
input_df_inserted.tail()

Unnamed: 0,end,form,cik
60,2022-12-31,10-Q,100493
61,2023-04-01,10-Q,100493
62,2023-07-01,10-Q,100493
63,2023-09-30,10-Q,100493
64,2023-09-30,10-K,100493


## 2. Filling Income Sattement

In [32]:
df_income_statement = get_income_statements(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_is[cik]
    )

df_income_statement.tail()

Unnamed: 0,end,form,cik,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,inc_tax_exp,net_inc,shares,eps_dil
60,2022-12-31,10-Q,100493,13260000000,12292000000,501000000,0.0,467000000,9000000,84000000,114000000,320000000,358000000,0.88
61,2023-04-01,10-Q,100493,13133000000,12606000000,576000000,0.0,-49000000,7000000,89000000,-39000000,-91000000,354000000,-0.28
62,2023-07-01,10-Q,100493,13140000000,12463000000,579000000,448000000.0,-350000000,6000000,89000000,9000000,-435000000,354000000,-1.18
63,2023-09-30,10-Q,100493,13140000000,12463000000,579000000,448000000.0,-350000000,6000000,89000000,9000000,-435000000,354000000,-1.18
64,2023-09-30,10-K,100493,52881000000,50250000000,2245000000,781000000.0,-395000000,30000000,355000000,-29000000,-649000000,284000000,-1.87


In [43]:
if 'K' in df_income_statement['form'].iloc[-1]:
    for column in df_income_statement.columns[3:]:
        df_income_statement['form'].iloc[-1]


OK


## 2.b Filling Deplation, Amortzation, Depratiation

In [33]:
df_income_amortization = get_income_statements(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_depr[cik]
    )

# fill missing annual D&A
df_income_amortization = fill_missing_annual_depr(df_income_amortization)

df_income_amortization.tail()

Unnamed: 0,end,form,cik,depr_amort
60,2022-12-31,10-Q,100493,303000000.0
61,2023-04-01,10-Q,100493,303000000.0
62,2023-07-01,10-Q,100493,303000000.0
63,2023-09-30,10-Q,100493,303000000.0
64,2023-09-30,10-K,100493,1212000000.0


## 3. Filling Balance Sheet

In [34]:
df_balance_sheet = get_balance_sheet(
    cik=cik,
    e_mail=e_mail,
    input_df=input_df_inserted,
    us_gaap_dict=us_gaap_bs[cik]
    )

df_balance_sheet.tail()

Unnamed: 0,end,form,cik,cash_cash_eq,acc_rec,invent,other_curr_ass,tot_curr_ass,ppe_net,goodwill,intang_ass,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity
60,2022-12-31,10-Q,100493,654000000,2295000000,5596000000,408000000,8953000000,9120000000,10550000000,6213000000,1842000000,36678000000,490000000,2530000000,1445000000,5114000000,7859000000,1445000000,19635000000
61,2023-04-01,10-Q,100493,543000000,2433000000,5504000000,412000000,8892000000,9351000000,10550000000,6157000000,1846000000,36796000000,1065000000,2387000000,1589000000,5346000000,7865000000,1589000000,19399000000
62,2023-07-01,10-Q,100493,699000000,2451000000,5391000000,342000000,8883000000,9612000000,10211000000,6155000000,1900000000,36761000000,457000000,2421000000,1599000000,4948000000,8863000000,1599000000,18779000000
63,2023-09-30,10-Q,100493,573000000,2476000000,5328000000,345000000,8722000000,9634000000,9878000000,6098000000,1919000000,36251000000,1895000000,2594000000,1578000000,6499000000,7611000000,1578000000,18133000000
64,2023-09-30,10-K,100493,573000000,2476000000,5328000000,345000000,8722000000,9634000000,9878000000,6098000000,1919000000,36251000000,1895000000,2594000000,1578000000,6499000000,7611000000,1578000000,18133000000


## 4. Merge & Check every statement

In [35]:
merged_temp = pd.merge(df_income_statement, df_income_amortization, on=['end', 'form', 'cik'])
merged_final = pd.merge(merged_temp, df_balance_sheet, on=['end', 'form', 'cik'])
merged_final['ticker'] = [ticker for x in range(len(merged_final.end))]
merged_final.tail()

Unnamed: 0,end,form,cik,rev,cost_sale,sale_gen_adm,inpairm,op_income,int_income,int_exp,...,other_ass,tot_ass,short_debt,acc_pay,other_curr_liab,tot_curr_liab,tot_long_debt,other_liab,tot_equity,ticker
60,2022-12-31,10-Q,100493,13260000000,12292000000,501000000,0.0,467000000,9000000,84000000,...,1842000000,36678000000,490000000,2530000000,1445000000,5114000000,7859000000,1445000000,19635000000,TSN
61,2023-04-01,10-Q,100493,13133000000,12606000000,576000000,0.0,-49000000,7000000,89000000,...,1846000000,36796000000,1065000000,2387000000,1589000000,5346000000,7865000000,1589000000,19399000000,TSN
62,2023-07-01,10-Q,100493,13140000000,12463000000,579000000,448000000.0,-350000000,6000000,89000000,...,1900000000,36761000000,457000000,2421000000,1599000000,4948000000,8863000000,1599000000,18779000000,TSN
63,2023-09-30,10-Q,100493,13140000000,12463000000,579000000,448000000.0,-350000000,6000000,89000000,...,1919000000,36251000000,1895000000,2594000000,1578000000,6499000000,7611000000,1578000000,18133000000,TSN
64,2023-09-30,10-K,100493,52881000000,50250000000,2245000000,781000000.0,-395000000,30000000,355000000,...,1919000000,36251000000,1895000000,2594000000,1578000000,6499000000,7611000000,1578000000,18133000000,TSN


In [36]:
# write file
merged_final.to_csv('../sec_report_csv/{}_{}_sec_reports.csv'.format(cik, ticker), index=False)

## 5. Update JSON files with US-GAAP codes

You should manually cross check the downloaded data, and after that change the variable

In [37]:
downloaded_data_correct = False

In [38]:
if downloaded_data_correct:
    # unpdate Income Statement - US-GAAP 
    update_us_gaap_jsons(
        path='../JSON/us_gaap_is.json',
        cik=cik,
        dictionary_new_key=us_gaap_is
    )
    # unpdate D&A - US-GAAP 
    update_us_gaap_jsons(
        path='../JSON/us_gaap_depr.json',
        cik=cik,
        dictionary_new_key=us_gaap_depr
    )
    # unpdate Balance Sheet - US-GAAP
    update_us_gaap_jsons(
        path='../JSON/us_gaap_bs.json',
        cik=cik,
        dictionary_new_key=us_gaap_bs
    )