In [2]:
import requests
import edgar_functions
from bs4 import BeautifulSoup
import utility_belt


# Request headers sent with every SEC EDGAR query. The address below is a
# placeholder: put a real contact e-mail here before running the notebook.
headers = {"User-agent": "email@email.com"}
# Other tickers of interest. Every entry must be a real, non-empty ticker symbol,
# because the helpers resolve a ticker to its CIK before querying EDGAR.
ticker_list = ["aapl", "pypl"]
# Ticker analysed by the cells in this notebook.
ticker = "nvda"
# Mapping of statement names to their possible keys; passed to
# edgar_functions.get_statement_soup to locate each statement's report file.
statement_keys_map = utility_belt.import_json_file("statement_key_mapping.json")


#### Return the accession number

Accession number parameters.
- ticker
- 10-K or 10-Q
- return accession number only

In [3]:
# Request only the accession numbers of the ticker's filings; ten_k=False
# presumably selects the quarterly 10-Q reports instead of the annual 10-K.
acc = edgar_functions.get_filter_filing(
    ticker,
    headers=headers,
    ten_k=False,
    accession_number_only=True,
)
display(acc)

# Take the first entry and drop its dashes, which is the form used in the
# EDGAR archive URLs built by the later cells.
acc_num = "".join(acc.iloc[0].split("-"))
print(acc_num)

reportDate
2023-10-29    0001045810-23-000227
2023-07-30    0001045810-23-000175
2023-04-30    0001045810-23-000093
2022-10-30    0001045810-22-000166
2022-07-31    0001045810-22-000147
2022-05-01    0001045810-22-000079
2021-10-31    0001045810-21-000163
2021-08-01    0001045810-21-000131
2021-05-02    0001045810-21-000064
2020-10-25    0001045810-20-000189
2020-07-26    0001045810-20-000147
2020-04-26    0001045810-20-000065
2019-10-27    0001045810-19-000170
2019-07-28    0001045810-19-000144
2019-04-28    0001045810-19-000079
2018-10-28    0001045810-18-000150
2018-07-29    0001045810-18-000114
2018-04-29    0001045810-18-000080
2017-10-29    0001045810-17-000172
2017-07-30    0001045810-17-000123
2017-04-30    0001045810-17-000075
2016-10-30    0001045810-16-000353
2016-07-31    0001045810-16-000300
2016-05-01    0001045810-16-000275
2015-10-25    0001045810-15-000173
2015-07-26    0001045810-15-000143
2015-04-26    0001045810-15-000097
2014-10-26    0001045810-14-000188
2014-07-2

000104581023000227


#### Returns the statement table name

`get_statement_file_names_in_filling_summary` (result stored in `statement_file_name_dict`)

- Inputs:
    - ticker [str]: ticker symbol
    - acc_num [str]: accession number
    - headers [dict]: headers for the requests.get() function

- Returns:
    - statement_file_names_dict [dict]: dictionary of statement names and statement_ID

- Description:
    - Gets the cik number from the ticker symbol
    - Creates baselink `https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}`
    - Gets the filing summary xml from baselink and return the XML as string
    - Parses the filing summary XML string into a BeautifulSoup object
    - Loops through the BeautifulSoup object to find the file names of the statements
        - calls is_file_statement() to check if the report is a statement
    - Returns statement_file_names_dict which is a dictionary of statement names and file names

<br>

- Baselink: `https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}`
    - The baselink is the directory containing all the data for the filing number.
        - Includes balance sheet, cash flow and income statements.
    - example: `https://www.sec.gov/Archives/edgar/data/0001633917/000163391723000033/`


In [4]:
# Look up which report file (e.g. R2.htm) holds each statement of this filing.
statement_file_name_dict = (
    edgar_functions.get_statement_file_names_in_filling_summary(
        ticker,
        acc_num,
        headers,
    )
)
display(statement_file_name_dict)

{'condensed consolidated statements of income': 'R2.htm',
 'condensed consolidated statement of comprehensive income': 'R3.htm',
 'condensed consolidated balance sheets': 'R4.htm',
 "condensed consolidated statements of shareholders' equity": 'R5.htm',
 'condensed consolidated statements of shareholders??? equity (parenthetical)': 'R6.htm',
 'condensed consolidated statements of cash flows': 'R7.htm'}

#### Return the beautiful soup of statements

`get_statement_soup` performs the following:


- Args:
    - ticker [str]: ticker symbol
    - acc_num [str]: accession number
    - statement_name [str]: name of the statement, e.g. "balance_sheet"
    - headers [dict]: headers for the requests.get() function
    - statement_keys_map [dict]: dictionary of statement names and possible keys

- Returns:
    - BeautifulSoup object of the html or xml of the statement from baselink.
    - Baselink: https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number}/{statement_ID}

- Description:
    - execute `cik_matching_ticker`, returns the cik for the ticker.
    - Gets the filing summary dictionary from baselink, cik, and acc_num and return FilingSummary.XML file
        - execute `get_statement_file_names_in_filling_summary`
    - Loops through the possible statement keys to find the file name of the statement
    - Create the statement link from the base link and file name
    - Query the statement link and return the BeautifulSoup object
    - returns the BeautifulSoup object of the html or xml of the statement


In [8]:
# Statement names resolved through statement_keys_map; later cells reuse them.
statement_balance_sheet, statement_income_statement, statement_cash_flow_statement = (
    "balance_sheet",
    "income_statement",
    "cash_flow_statement",
)

# Fetch the parsed page of each statement, in balance sheet / income statement /
# cash flow order.
soup_balance_sheet, soup_income_statement, soup_cash_flow_statement = [
    edgar_functions.get_statement_soup(
        ticker, acc_num, statement_name, headers, statement_keys_map
    )
    for statement_name in (
        statement_balance_sheet,
        statement_income_statement,
        statement_cash_flow_statement,
    )
]

# The links of the queries made above are logged in:
edgar_functions.links_logged

{'nvda-balance_sheet-R4.htm-000104581023000227': 'https://www.sec.gov/Archives/edgar/data/0001045810/000104581023000227/R4.htm',
 'nvda-income_statement-R2.htm-000104581023000227': 'https://www.sec.gov/Archives/edgar/data/0001045810/000104581023000227/R2.htm',
 'nvda-cash_flow_statement-R7.htm-000104581023000227': 'https://www.sec.gov/Archives/edgar/data/0001045810/000104581023000227/R7.htm'}

#### Columns, Date-Times, and Values of the Statements

`extract_columns_values_and_dates_from_statement`

- Args:
    - soup (BeautifulSoup): BeautifulSoup object of the statement.

- Returns:
    - list: A list of statement column names.
    - list: A list of statement values.
    - pd.DatetimeIndex: A Pandas DatetimeIndex object containing the extracted dates.

- Description:
    - call get_datetime_index_dates_from_statement to get the date_time_index
    - find all tables in soup and iterate over them
    - check table header (th) for unit multiplier and special case scenario
        - special case scenario: check for values like EPS
    - search each row of table (tr) for "onclick" attribute
        - if "onclick" is not found, skip the row
        - if "onclick" is found
            - append column title to columns list from "onclick" attribute
            - for each cell in the row (td)
                - find all elements with class "text", "nump", or "num"
                    - nump: positive values
                    - num: negative values
                    - text: skip
    - return columns, values, and date_time_index


In [9]:
# Extract (columns, values, dates) from each statement's soup. The nested targets
# receive the results in balance sheet / income statement / cash flow order.
(
    (
        columns_balance_sheet,
        values_set_balance_sheet,
        date_time_index_balance_sheet,
    ),
    (
        columns_income_statement,
        values_set_income_statement,
        date_time_index_income_statement,
    ),
    (
        columns_cash_flow_statement,
        values_set_cash_flow_statement,
        date_time_index_cash_flow_statement,
    ),
) = [
    edgar_functions.extract_columns_values_and_dates_from_statement(
        soup=statement_soup
    )
    for statement_soup in (
        soup_balance_sheet,
        soup_income_statement,
        soup_cash_flow_statement,
    )
]

# Show the three cash-flow results, one per line.
print(
    columns_cash_flow_statement,
    values_set_cash_flow_statement,
    date_time_index_cash_flow_statement,
    sep="\n",
)


['us-gaap_NetCashProvidedByUsedInOperatingActivitiesAbstract', 'us-gaap_NetIncomeLoss', 'us-gaap_AdjustmentsToReconcileNetIncomeLossToCashProvidedByUsedInOperatingActivitiesAbstract', 'us-gaap_ShareBasedCompensation', 'us-gaap_DepreciationDepletionAndAmortization', 'us-gaap_GainLossOnInvestments', 'us-gaap_DeferredIncomeTaxExpenseBenefit', 'nvda_BusinessCombinationAdvancedConsiderationWrittenOff', 'us-gaap_OtherNoncashIncomeExpense', 'us-gaap_IncreaseDecreaseInOperatingCapitalAbstract', 'us-gaap_IncreaseDecreaseInAccountsReceivable', 'us-gaap_IncreaseDecreaseInInventories', 'us-gaap_IncreaseDecreaseInPrepaidDeferredExpenseAndOtherAssets', 'us-gaap_IncreaseDecreaseInAccountsPayable', 'us-gaap_IncreaseDecreaseInAccruedLiabilitiesAndOtherOperatingLiabilities', 'us-gaap_IncreaseDecreaseInOtherNoncurrentLiabilities', 'us-gaap_NetCashProvidedByUsedInOperatingActivities', 'us-gaap_NetCashProvidedByUsedInInvestingActivitiesAbstract', 'us-gaap_ProceedsFromMaturitiesPrepaymentsAndCallsOfAvailabl

#### Construct the DataFrame from Columns, Date-Times, and Values of the Statements
- balance sheet
- income statement
- cash flow statement

In [None]:
# Build one DataFrame per statement from its (values, columns, dates) triple,
# in balance sheet / income statement / cash flow order.
df_balance_sheet, df_income, df_cash_flow = [
    edgar_functions.create_dataframe_of_statement_values_columns_dates(
        statement_values, statement_columns, statement_dates
    )
    for statement_values, statement_columns, statement_dates in (
        (
            values_set_balance_sheet,
            columns_balance_sheet,
            date_time_index_balance_sheet,
        ),
        (
            values_set_income_statement,
            columns_income_statement,
            date_time_index_income_statement,
        ),
        (
            values_set_cash_flow_statement,
            columns_cash_flow_statement,
            date_time_index_cash_flow_statement,
        ),
    )
]

# Show each statement transposed, with every value divided by 1000 for display.
display(df_balance_sheet.T.div(1000))
display(df_income.T.div(1000))
display(df_cash_flow.T.div(1000))

Unnamed: 0,2022-12-31,2021-12-31
us-gaap_AssetsCurrentAbstract,,
us-gaap_CashAndCashEquivalentsAtCarryingValue,7776.0,5197.0
us-gaap_ShortTermInvestments,3092.0,4303.0
us-gaap_AccountsReceivableNetCurrent,963.0,800.0
us-gaap_NotesAndLoansReceivableNetCurrent,7431.0,4846.0
pypl_FundsReceivableAndCustomerAccounts,36357.0,36141.0
us-gaap_PrepaidExpenseAndOtherAssetsCurrent,1898.0,1287.0
us-gaap_AssetsCurrent,57517.0,52574.0
us-gaap_LongTermInvestments,5018.0,6797.0
us-gaap_PropertyPlantAndEquipmentNet,1730.0,1909.0


Unnamed: 0,2022-12-31,2021-12-31,2020-12-31
us-gaap_IncomeStatementAbstract,,,
us-gaap_Revenues,27518.0,25371.0,21454.0
us-gaap_OperatingExpensesAbstract,,,
pypl_TransactionExpense,12173.0,10315.0,7934.0
pypl_TransactionAndCreditLosses,1572.0,1060.0,1741.0
pypl_CustomerSupportAndOperationsExpense,2120.0,2075.0,1778.0
us-gaap_SellingAndMarketingExpense,2257.0,2445.0,1861.0
pypl_TechnologyAndDevelopmentExpense,3253.0,3038.0,2642.0
us-gaap_GeneralAndAdministrativeExpense,2099.0,2114.0,2070.0
us-gaap_RestructuringCostsAndAssetImpairmentCharges,207.0,62.0,139.0


Unnamed: 0,2022-12-31,2021-12-31,2020-12-31
us-gaap_NetCashProvidedByUsedInOperatingActivitiesAbstract,,,
us-gaap_NetIncomeLoss,2419.0,4169.0,4202.0
us-gaap_AdjustmentsNoncashItemsToReconcileNetIncomeLossToCashProvidedByUsedInOperatingActivitiesAbstract,,,
pypl_TransactionAndCreditLosses,1572.0,1060.0,1741.0
us-gaap_DepreciationDepletionAndAmortization,1317.0,1265.0,1189.0
us-gaap_ShareBasedCompensation,1261.0,1376.0,1376.0
us-gaap_DeferredIncomeTaxExpenseBenefit,-811.0,-482.0,165.0
us-gaap_GainLossOnInvestments,304.0,-46.0,-1914.0
us-gaap_OtherNoncashIncomeExpense,205.0,100.0,47.0
us-gaap_IncreaseDecreaseInOperatingCapitalAbstract,,,


#### Process the entire statement

- Args:
    - ticker (str): Ticker of the company.
    - acc_num (str): Accession number of the filing.
    - statement_name (str): Name of the statement.
    - headers (dict): Headers for the request.

- Returns:
    - pd.DataFrame: DataFrame containing the statement data.


- Description:
    - Get the BeautifulSoup object of the statement from get_statement_soup
    - Extract the columns, values, and date_time_index from extract_columns_values_and_dates_from_statement
    - Create a DataFrame from create_dataframe_of_statement_values_columns_dates
    - Transpose the DataFrame and drop duplicates
    - Return the DataFrame


- Process
    - Executes: get_statement_soup
    - Executes: extract_columns_values_and_dates_from_statement
    - Executes: create_dataframe_of_statement_values_columns_dates

In [None]:
# Run the whole pipeline (soup -> columns/values/dates -> DataFrame) once per
# statement, in balance sheet / income statement / cash flow order.
df_balance_sheet0, df_income0, df_cash_flow0 = [
    edgar_functions.process_one_statement(ticker, acc_num, statement_name, headers)
    for statement_name in (
        statement_balance_sheet,
        statement_income_statement,
        statement_cash_flow_statement,
    )
]

# Show each resulting frame with every value divided by 1000 for display.
display(df_balance_sheet0.div(1000))
display(df_income0.div(1000))
display(df_cash_flow0.div(1000))


Unnamed: 0,2022-12-31,2021-12-31
us-gaap_AssetsCurrentAbstract,,
us-gaap_CashAndCashEquivalentsAtCarryingValue,7776.0,5197.0
us-gaap_ShortTermInvestments,3092.0,4303.0
us-gaap_AccountsReceivableNetCurrent,963.0,800.0
us-gaap_NotesAndLoansReceivableNetCurrent,7431.0,4846.0
pypl_FundsReceivableAndCustomerAccounts,36357.0,36141.0
us-gaap_PrepaidExpenseAndOtherAssetsCurrent,1898.0,1287.0
us-gaap_AssetsCurrent,57517.0,52574.0
us-gaap_LongTermInvestments,5018.0,6797.0
us-gaap_PropertyPlantAndEquipmentNet,1730.0,1909.0


Unnamed: 0,2022-12-31,2021-12-31,2020-12-31
us-gaap_IncomeStatementAbstract,,,
us-gaap_Revenues,27518.0,25371.0,21454.0
pypl_TransactionExpense,12173.0,10315.0,7934.0
pypl_TransactionAndCreditLosses,1572.0,1060.0,1741.0
pypl_CustomerSupportAndOperationsExpense,2120.0,2075.0,1778.0
us-gaap_SellingAndMarketingExpense,2257.0,2445.0,1861.0
pypl_TechnologyAndDevelopmentExpense,3253.0,3038.0,2642.0
us-gaap_GeneralAndAdministrativeExpense,2099.0,2114.0,2070.0
us-gaap_RestructuringCostsAndAssetImpairmentCharges,207.0,62.0,139.0
us-gaap_OperatingExpenses,23681.0,21109.0,18165.0


Unnamed: 0,2022-12-31,2021-12-31,2020-12-31
us-gaap_NetCashProvidedByUsedInOperatingActivitiesAbstract,,,
us-gaap_NetIncomeLoss,2419.0,4169.0,4202.0
pypl_TransactionAndCreditLosses,1572.0,1060.0,1741.0
us-gaap_DepreciationDepletionAndAmortization,1317.0,1265.0,1189.0
us-gaap_ShareBasedCompensation,1261.0,1376.0,1376.0
us-gaap_DeferredIncomeTaxExpenseBenefit,-811.0,-482.0,165.0
us-gaap_GainLossOnInvestments,304.0,-46.0,-1914.0
us-gaap_OtherNoncashIncomeExpense,205.0,100.0,47.0
us-gaap_IncreaseDecreaseInAccountsReceivable,-163.0,-222.0,-100.0
pypl_IncreaseDecreaseInTransactionLossAllowanceForCashLossesNet,-1230.0,-1178.0,-1120.0
