## Retrieve the individual statements from SEC-EDGAR


In [52]:
import requests
import edgar_functions
from bs4 import BeautifulSoup
from edgar_functions import statement_keys_map

# HTTP headers passed to every edgar_functions call and session.get() request below.
# NOTE(review): "email@email.com" looks like a placeholder — presumably it should be a
# real contact address; confirm against SEC EDGAR's access guidelines.
headers = {"User-agent": "email@email.com"}
# Ticker symbol whose filings are looked up in the following cells.
ticker = "pypl"

## Gets the accession numbers

In [53]:
# Accession numbers for the ticker (ten_k=True, accession numbers only).
acc = edgar_functions.get_filter_filing(
    ticker,
    headers=headers,
    ten_k=True,
    accession_number_only=True,
)

# Use the first entry and strip the dashes; this dash-free form is what the
# archive URL in the next section is built from.
acc_num = acc.iloc[0].replace("-", "")

print(
    "----- accession numbers -----",
    acc,
    "----- accession numbers -----\n",
    sep="\n",
)

print(
    "----- accession number -----",
    acc_num,
    "----- accession number -----",
    sep="\n",
)

----- accession numbers -----
reportDate
2022-12-31    0001633917-23-000033
2021-12-31    0001633917-22-000027
2020-12-31    0001633917-21-000018
2019-12-31    0001633917-20-000028
2018-12-31    0001633917-19-000043
2017-12-31    0001633917-18-000029
2016-12-31    0001633917-17-000027
2015-12-31    0001633917-16-000113
Name: accessionNumber, dtype: object
----- accession numbers -----

----- accession number -----
000163391723000033
----- accession number -----


## Returns the statement file

Returns the statement file names, using the ticker, CIK, and accession number to build the `baselink`.

`base_link = "https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num}/"`

In [54]:
session = requests.Session()
cik = edgar_functions.cik_matching_ticker(ticker, headers=headers)
baselink = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num}/"
# baselink already ends with "/", so the file name is appended directly;
# adding another "/" here would put a "//" into the URL.
filing_summary_link = f"{baselink}FilingSummary.xml"
filing_summary_response = session.get(
    filing_summary_link, headers=headers
).content.decode("utf-8")


# lxml-xml preserves the capitalization of the tags.
# html.parser does not preserve the capitalization of the tags.
filing_summary_soup = BeautifulSoup(filing_summary_response, "lxml-xml")
# filing_summary_soup = BeautifulSoup(filing_summary_response, 'html.parser')

- `lxml-xml`: preserves the capitalization of the tags.
- `html.parser`: does not preserve the capitalization of the tags.


In [55]:
def get_file_name(report):
    """
    Return the file name recorded on a report entry.

    The HtmlFileName tag is preferred; XmlFileName is the fallback.
    Returns the text of the first tag that is found, or None when the
    report has neither tag.
    """
    # Order matters: the HTML file name wins when both tags are present.
    for tag_name in ("HtmlFileName", "XmlFileName"):
        tag = report.find(tag_name)
        if tag:
            return tag.text
    return None


def is_file_statement(long_name, short_name, file_name):
    """
    Tell whether a report entry should be treated as a statement.

    Returns True only when long_name, short_name and file_name are all
    present (not None) and the text of long_name contains "Statement";
    returns False otherwise.

    Note: this is a plain substring check on the long name, so a report
    whose title merely mentions "Statements" also matches.
    """
    # Any missing piece disqualifies the report before long_name.text is read.
    if any(part is None for part in (long_name, short_name, file_name)):
        return False
    return "Statement" in long_name.text

In [56]:
# Walk through the helpers on the first <Report> entry of the filing summary.
report = next(iter(filing_summary_soup.find_all("Report")))
short_name = report.find("ShortName")
long_name = report.find("LongName")
file_name = get_file_name(report)

file_statement_bool = is_file_statement(long_name, short_name, file_name)


print("----- report -----", type(report), report, "----- report -----\n", sep="\n")

print("----- file_name -----", file_name, "----- file_name -----\n", sep="\n")

print("----- short_name -----", short_name, "----- short_name -----\n", sep="\n")

print("----- long_name -----", long_name, "----- long_name -----\n", sep="\n")

print(
    "----- file_statement_bool -----",
    file_statement_bool,
    "----- file_statement_bool -----\n",
    sep="\n",
)

----- report -----
<class 'bs4.element.Tag'>
<Report instance="pypl-20221231.htm">
<IsDefault>false</IsDefault>
<HasEmbeddedReports>false</HasEmbeddedReports>
<HtmlFileName>R1.htm</HtmlFileName>
<LongName>0000001 - Document - Cover Page</LongName>
<ReportType>Sheet</ReportType>
<Role>http://www.paypal.com/role/CoverPage</Role>
<ShortName>Cover Page</ShortName>
<MenuCategory>Cover</MenuCategory>
<Position>1</Position>
</Report>
----- report -----

----- file_name -----
R1.htm
----- file_name -----

----- short_name -----
<ShortName>Cover Page</ShortName>
----- short_name -----

----- long_name -----
<LongName>0000001 - Document - Cover Page</LongName>
----- long_name -----

----- file_statement_bool -----
False
----- file_statement_bool -----



In [57]:
# Maps each statement's short name to the file that holds it.
statement_file_names_dict = {}

for report in filing_summary_soup.find_all("Report"):
    short_name = report.find("ShortName")
    long_name = report.find("LongName")
    file_name = get_file_name(report)
    # Skip every report that is not recognised as a statement.
    if not is_file_statement(long_name, short_name, file_name):
        continue
    statement_file_names_dict[short_name.text] = file_name

display(statement_file_names_dict)

{'CONSOLIDATED BALANCE SHEETS': 'R3.htm',
 'CONSOLIDATED BALANCE SHEETS (PARENTHETICAL)': 'R4.htm',
 'CONSOLIDATED STATEMENTS OF INCOME (LOSS)': 'R5.htm',
 'CONSOLIDATED STATEMENTS OF COMPREHENSIVE INCOME (LOSS)': 'R6.htm',
 "CONSOLIDATED STATEMENTS OF STOCKHOLDERS' EQUITY": 'R7.htm',
 'CONSOLIDATED STATEMENTS OF CASH FLOWS': 'R8.htm',
 'DERIVATIVE INSTRUMENTS - Location in the Condensed Consolidated Statements of Income and Amount of Recognized Gains or Losses Related to Derivative Instruments (Details)': 'R91.htm'}

In [62]:
def get_statement_file_names_in_filling_summary(ticker, acc_num, headers):
    """
    Inputs:
        ticker [str]: ticker symbol
        acc_num [str]: accession number (without dashes, as used in the archive URL)
        headers [dict]: headers for the requests.get() function

    Returns:
        statement_file_names_dict [dict]: dictionary mapping lower-cased statement
        short names to file names; an empty dict if the request fails

    Description:
        - Gets the cik number from the ticker symbol
        - Gets the filing summary xml from baselink and return the XML as string
        - Parses the filing summary XML string into a BeautifulSoup object
        - Loops through the BeautifulSoup object to find the file names of the statements
            - calls is_file_statement() to check if the report is a statement
        - Returns statement_file_names_dict which is a dictionary of statement names and file names

    Errors:
        Any requests.RequestException (including HTTP error statuses) is printed
        and an empty dict is returned instead of raising.
    """
    try:
        cik = edgar_functions.cik_matching_ticker(ticker, headers=headers)
        baselink = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num}/"
        # baselink already ends with "/", so no extra separator is added here
        # (otherwise the URL would contain "//").
        filing_summary_link = f"{baselink}FilingSummary.xml"

        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            response = session.get(filing_summary_link, headers=headers)
            # Surface HTTP errors (4xx/5xx) instead of parsing an error page;
            # HTTPError is a RequestException, so it is handled below.
            response.raise_for_status()
            filing_summary_response = response.content.decode("utf-8")

        # lxml-xml keeps the capitalization of tags such as HtmlFileName.
        filing_summary_soup = BeautifulSoup(filing_summary_response, "lxml-xml")
        statement_file_name_dict = {}

        for report in filing_summary_soup.find_all("Report"):
            file_name = get_file_name(report)
            short_name, long_name = report.find("ShortName"), report.find("LongName")
            if is_file_statement(long_name, short_name, file_name):
                # Keys are lower-cased so later lookups can be case-insensitive.
                statement_file_name_dict[short_name.text.lower()] = file_name

        return statement_file_name_dict

    except requests.RequestException as e:
        print(f"An error occured: {e}")
        return {}


statement_file_name_dict = get_statement_file_names_in_filling_summary(ticker, acc_num, headers)
display(statement_file_name_dict)

{'consolidated balance sheets': 'R3.htm',
 'consolidated balance sheets (parenthetical)': 'R4.htm',
 'consolidated statements of income (loss)': 'R5.htm',
 'consolidated statements of comprehensive income (loss)': 'R6.htm',
 "consolidated statements of stockholders' equity": 'R7.htm',
 'consolidated statements of cash flows': 'R8.htm',
 'derivative instruments - location in the condensed consolidated statements of income and amount of recognized gains or losses related to derivative instruments (details)': 'R91.htm'}

In [63]:
def get_statement_soup(ticker, acc_num, statement_name, headers, statement_keys_map):
    """
    Inputs:
        ticker [str]: ticker symbol
        acc_num [str]: accession number
        statement_name [str]: name of the statement, e.g. "balance_sheet"
        headers [dict]: headers for the requests.get() function
        statement_keys_map [dict]: dictionary of statement names and possible keys

    Returns:
        BeautifulSoup object of the html or xml of the statement

    Raises:
        ValueError: if none of the possible keys for statement_name matches a
            file name in the filing summary, or if fetching the statement fails

    Description:
        - Gets the cik number from the ticker symbol
        - Gets the statement-name -> file-name dictionary from the filing summary
        - Loops through the possible statement keys to find the file name of the statement
        - Create the statement link from the base link and file name
        - Query the statement link and return the BeautifulSoup object
        - returns the BeautifulSoup object of the html or xml of the statement

    """

    cik = edgar_functions.cik_matching_ticker(ticker, headers=headers)
    base_link = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num}"

    statement_file_name_dict = get_statement_file_names_in_filling_summary(
        ticker, acc_num, headers
    )

    statement_link = None

    # The first possible key found in the filing summary wins; both sides are
    # lower-cased so the match is case-insensitive.
    for possible_key in statement_keys_map.get(statement_name.lower(), []):
        file_name = statement_file_name_dict.get(possible_key.lower())
        if file_name:
            statement_link = f"{base_link}/{file_name}"
            break

    if not statement_link:
        raise ValueError(f"Could not find statement file name for {statement_name}")

    try:
        # The context manager closes the session's connections when done.
        with requests.Session() as session:
            statement_response = session.get(statement_link, headers=headers)
            statement_response.raise_for_status()  # Check if the request was successful

            # XML files are parsed with the XML parser, everything else as HTML.
            if statement_link.endswith(".xml"):
                return BeautifulSoup(
                    statement_response.content, "lxml-xml", from_encoding="utf-8"
                )
            return BeautifulSoup(statement_response.content, "lxml")

    except requests.RequestException as e:
        # Chain explicitly so the original request error stays in the traceback.
        raise ValueError(f"Error fetching the statement: {e}") from e


soup = get_statement_soup(ticker, acc_num, "balance_sheet", headers, statement_keys_map)
display(soup)

<html><body><document>
<type>XML
<sequence>32
<filename>R3.htm
<description>IDEA: XBRL DOCUMENT
<text>
<title></title>
<link href="include/report.css" rel="stylesheet" type="text/css"/>
<script src="Show.js" type="text/javascript">/* Do Not Remove This Comment */</script><script type="text/javascript">
							function toggleNextSibling (e) {
							if (e.nextSibling.style.display=='none') {
							e.nextSibling.style.display='block';
							} else { e.nextSibling.style.display='none'; }
							}</script>
<span style="display: none;">v3.22.4</span><table border="0" cellspacing="2" class="report" id="idm140203797935344">
<tr>
<th class="tl" colspan="1" rowspan="1"><div style="width: 200px;"><strong>CONSOLIDATED BALANCE SHEETS - USD ($)<br/> $ in Millions</strong></div></th>
<th class="th"><div>Dec. 31, 2022</div></th>
<th class="th"><div>Dec. 31, 2021</div></th>
</tr>
<tr class="re">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="

In [64]:
import numpy as np
import calendar
import logging


# Empty containers; nothing in the visible cells fills them yet.
columns, values_set = [], []

# Bare last expression so the notebook renders the statement soup fetched above.
soup



<html><body><document>
<type>XML
<sequence>32
<filename>R3.htm
<description>IDEA: XBRL DOCUMENT
<text>
<title></title>
<link href="include/report.css" rel="stylesheet" type="text/css"/>
<script src="Show.js" type="text/javascript">/* Do Not Remove This Comment */</script><script type="text/javascript">
							function toggleNextSibling (e) {
							if (e.nextSibling.style.display=='none') {
							e.nextSibling.style.display='block';
							} else { e.nextSibling.style.display='none'; }
							}</script>
<span style="display: none;">v3.22.4</span><table border="0" cellspacing="2" class="report" id="idm140203797935344">
<tr>
<th class="tl" colspan="1" rowspan="1"><div style="width: 200px;"><strong>CONSOLIDATED BALANCE SHEETS - USD ($)<br/> $ in Millions</strong></div></th>
<th class="th"><div>Dec. 31, 2022</div></th>
<th class="th"><div>Dec. 31, 2021</div></th>
</tr>
<tr class="re">
<td class="pl" style="border-bottom: 0px;" valign="top"><a class="a" href="javascript:void(0);" onclick="