In [1]:
import pandas as pd
import os
import bs4
import pickle
import requests
from bs4 import BeautifulSoup
import sys
import matplotlib.pyplot as plt

In [31]:
company_list = ['Microsoft']
years = ['2018']
filing_type = '10-K'

In [3]:
#DO NOT NEED TO RUN: CSV Already Created
#Found at https://pythonprogramming.net/sp500-company-list-python-programming-for-finance/
#I added more lists to obtain the Company name and CIK numbers, then combined into a DataFrame and Exported it out

resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})

key_path = os.path.join('..','Financials', 'Ticker_and_CIK', 'Key.csv')

stock_tickers = []
companies = []
central_index_keys = []

for sp_row in table.findAll('tr')[1:]:
    stock_ticker = sp_row.findAll('td')[0].text
    stock_tickers.append(stock_ticker.rstrip())
    company = sp_row.findAll('td')[1].text
    companies.append(company.rstrip())
    central_index_key = sp_row.findAll('td')[7].text
    central_index_keys.append(central_index_key.rstrip())

sp500_df = pd.DataFrame(list(zip(stock_tickers, companies, central_index_keys)), columns = ["Ticker", "Company", "CIK"])
sp500_df.to_csv(key_path, index = False)

In [3]:
key_path = os.path.join('..','Financials', 'Ticker_and_CIK', 'Key.csv')

key_df = pd.read_csv(key_path)

key_dict = {}

for query in range(len(company_list)):
    try:
        company_df = key_df[key_df['Company'].str.contains(company_list[query].title())]
        key_dict[company_df.iloc[0, 1]] = [company_df.iloc[0, 0], company_df.iloc[0, 2]]
    except IndexError:
        print(f"{company_list[query]} could not be found")

if key_dict == {}:
    print ("Whoops, none of your companies were on the S&P 500. Please try again!")
else:
    print (f"Success! We were able to find {len(key_dict.keys())} of your queries in the S&P 500")
    print (f"We found: {list(key_dict.keys())} in the S&P 500")
    print (key_dict)

Success! We were able to find 1 of your queries in the S&P 500
We found: ['Microsoft Corp.'] in the S&P 500
{'Microsoft Corp.': ['MSFT', 789019]}


In [22]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements 

published_date_dict = {}
as_of_date_dict = {}

for companies_found in range(len(key_dict.keys())):
    
    company_name = list(key_dict.keys())[companies_found]
    ticker = key_dict.get(company_name)[0]
    
    cik = key_dict.get(company_name)[1]
    dateb = f'{years[0]}-12-31'
    
    # Obtain HTML for search page
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
    edgar_str = edgar_resp.text
    
    doc_links = []
    published_dates = []
    as_of_dates = []
            
    # Find the document link
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    rows = table_tag.find_all('tr')
        
    for year in years:
        for row in rows:
            cells = row.find_all('td')
            if len(cells) > 3:
                if year in cells[3].text:
                    doc_link = 'https://www.sec.gov' + cells[1].a['href']
                    published_date = cells[3].text
                    published_dates.append(published_date)
                    
                    # Obtain HTML for document page
                    doc_resp = requests.get(doc_link)
                    doc_str = doc_resp.text
                    # Find the XBRL link
                    xbrl_link = ''
                    soup_doc = BeautifulSoup(doc_str, 'html.parser')
                    table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
                    rows_doc = table_tag_doc.find_all('tr')
                    for row_doc in rows_doc:
                        cells_doc = row_doc.find_all('td')
                        if len(cells_doc) > 3:
                            if 'INS' in cells_doc[3].text:
                                xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                                print(xbrl_link)
                    # Obtain XBRL text from document
                    xbrl_resp = requests.get(xbrl_link)
                    xbrl_str = xbrl_resp.text
               
                    # Find and print stockholder's equity
                    soup = BeautifulSoup(xbrl_str, 'lxml')
                    tag_list = soup.find_all()
                    
                    df = pd.DataFrame()
                    
                    for tag in tag_list:
                        if 'schemaref' in tag.name:
                            ignored_href_index = len(ticker) + 1
                            as_of_date = tag['xlink:href'][ignored_href_index:(ignored_href_index+8)]
                            as_of_dates.append(as_of_date)
                            
                         #Modified from the original; The original searched through to find specific line items
                        #Modified to search through every 10-Q per given year, and extract any financial line item into its own 10-Q

                        if tag.name.find('us-gaap:') != -1:
                            try: 
                                if float(tag.text) > 0:
                                    i = [f'{tag.name}', tag.text, tag['contextref']]
                                    i_series = pd.Series(i)
                                    df = df.append(i_series, ignore_index = True)
                            except ValueError:
                                pass

                    folder_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}')
                    file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')

                    try:
                        os.mkdir(folder_path)
                    except FileExistsError:
                        pass
                    df.to_csv(file_path, index = False)

    published_date_dict[ticker] = published_dates
    as_of_date_dict[ticker] = as_of_dates
    
as_of_date_dict

https://www.sec.gov/Archives/edgar/data/789019/000156459018019062/msft-20180630.xml


{'MSFT': ['20180630']}

In [24]:
#

import difflib

cases=[('C_0000789019_20170331', 'C_0000789019_20160630'),
       ('As_Of_12_31_2016', 'As_Of_6_30_2017'),
       ('afrykanerskojęzycznym', 'afrykanerskojęzyczny'),
       ('nieafrykanerskojęzyczni', 'afrykanerskojęzyczni'),
       ('nieafrynerskojęzyczni', 'afrykanerskojzyczni'),
       ('abcdefg','xac')] 

for a,b in cases:     
    print('{} => {}'.format(a,b))  
    for i,s in enumerate(difflib.ndiff(a, b)):
        if s[0]==' ': continue
        elif s[0]=='-':
            print(u'Delete "{}" from position {}'.format(s[-1],i))
        elif s[0]=='+':
            print(u'Add "{}" to position {}'.format(s[-1],i))    
    print()  

C_0000789019_20170331 => C_0000789019_20160630
Delete "7" from position 16
Add "6" to position 17
Add "6" to position 19
Add "0" to position 21
Delete "3" from position 22
Delete "1" from position 23

As_Of_12_31_2016 => As_Of_6_30_2017
Add "6" to position 6
Delete "1" from position 7
Delete "2" from position 8
Delete "1" from position 11
Add "0" to position 12
Delete "6" from position 17
Add "7" to position 18

afrykanerskojęzycznym => afrykanerskojęzyczny
Delete "m" from position 20

nieafrykanerskojęzyczni => afrykanerskojęzyczni
Delete "n" from position 0
Delete "i" from position 1
Delete "e" from position 2

nieafrynerskojęzyczni => afrykanerskojzyczni
Delete "n" from position 0
Delete "i" from position 1
Delete "e" from position 2
Add "k" to position 7
Add "a" to position 8
Delete "ę" from position 16

abcdefg => xac
Add "x" to position 0
Delete "b" from position 2
Delete "d" from position 4
Delete "e" from position 5
Delete "f" from position 6
Delete "g" from position 7



In [None]:
MSFT_2017 = 'C_0000789019_20170331'
MSFT_20171 = 'C_0000789019_20160701_20170331'
MSFT_2016 = 'eol_PE8528----1510-Q0018_STD_0_20151231_0'

m_d = {'0331': 1, '0630': 2, '0930': 3, '1231': 4} 



In [29]:
cik = 'Microsoft Corp.'
filing_type = '10-Q'
dateb = f'2018-08-03'

years = ['2016']

# Obtain HTML for search page
base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
edgar_str = edgar_resp.text

doc_links = []
dates = []


# Find the document link
soup = BeautifulSoup(edgar_str, 'html.parser')
table_tag = soup.find('table', class_='tableFile2')
rows = table_tag.find_all('tr')

for year in years:
    for row in rows:
        cells = row.find_all('td')
        if len(cells) > 3:
            if year in cells[3].text:
                doc_link = 'https://www.sec.gov' + cells[1].a['href']
                date = cells[3].text
                dates.append(date)

                # Obtain HTML for document page
                doc_resp = requests.get(doc_link)
                doc_str = doc_resp.text
                # Find the XBRL link
                xbrl_link = ''
                soup_doc = BeautifulSoup(doc_str, 'html.parser')
                table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
                rows_doc = table_tag_doc.find_all('tr')
                for row_doc in rows_doc:
                    cells_doc = row_doc.find_all('td')
                    if len(cells_doc) > 3:
                        if 'INS' in cells_doc[3].text:
                            xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                # Obtain XBRL text from document
                xbrl_resp = requests.get(xbrl_link)
                xbrl_str = xbrl_resp.text
                # Find and print stockholder's equity
                soup_xbrl = BeautifulSoup(xbrl_str, 'lxml')
                tag_list = soup_xbrl.find_all()

                df = pd.DataFrame()

                for tag in tag_list:

                    #Modified from the original; The original searched through to find specific line items
                    #Modified to search through every 10-Q per given year, and extract any financial line item into its own 10-Q

                    if tag.name.find('us-gaap:') != -1:
                        try: 
                            if float(tag.text) > 0:
                                i = [f'{tag.name}', tag.text, tag['contextref']]
                                i_series = pd.Series(i)
                                df = df.append(i_series, ignore_index = True)
                        except ValueError:
                            pass

                folder_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}')
                file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')

                try:
                    os.mkdir(folder_path)
                except FileExistsError:
                    pass
                df.to_csv(file_path, index = False)

df

AttributeError: 'NoneType' object has no attribute 'find_all'

In [75]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements 

date_dict = {}

for companies_found in range(len(key_dict.keys())):
    
    company_name = list(key_dict.keys())[companies_found]
    ticker = key_dict.get(company_name)[0]
    
    cik = key_dict.get(company_name)[1]
    dateb = f'{years[0]}-12-31'
    
    # Obtain HTML for search page
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
    edgar_str = edgar_resp.text
    
    doc_links = []
    dates = []
    
            
    # Find the document link
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    rows = table_tag.find_all('tr')
        
    for year in years:
        for row in rows:
            cells = row.find_all('td')
            if len(cells) > 3:
                if year in cells[3].text:
                    doc_link = 'https://www.sec.gov' + cells[1].a['href']
                    date = cells[3].text
                    dates.append(date)
                    
                    # Obtain HTML for document page
                    doc_resp = requests.get(doc_link)
                    doc_str = doc_resp.text
                    # Find the XBRL link
                    xbrl_link = ''
                    soup_doc = BeautifulSoup(doc_str, 'html.parser')
                    table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
                    rows_doc = table_tag_doc.find_all('tr')
                    for row_doc in rows_doc:
                        cells_doc = row_doc.find_all('td')
                        if len(cells_doc) > 3:
                            if 'INS' in cells_doc[3].text:
                                xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                    # Obtain XBRL text from document
                    xbrl_resp = requests.get(xbrl_link)
                    xbrl_str = xbrl_resp.text
                    # Find and print stockholder's equity
                    soup_xbrl = BeautifulSoup(xbrl_str, 'lxml')
                    tag_list = soup_xbrl.find_all()

                    df = pd.DataFrame()

                    for tag in tag_list:

                        #Modified from the original; The original searched through to find specific line items
                        #Modified to search through every 10-Q per given year, and extract any financial line item into its own 10-Q

                        if tag.name.find('us-gaap:') != -1:
                            try: 
                                if float(tag.text) > 0:
                                    i = [f'{tag.name}', tag.text, tag['contextref']]
                                    i_series = pd.Series(i)
                                    df = df.append(i_series, ignore_index = True)
                            except ValueError:
                                pass

                    folder_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}')
                    file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')

                    try:
                        os.mkdir(folder_path)
                    except FileExistsError:
                        pass
                    df.to_csv(file_path, index = False)
                    
    date_dict[ticker]=dates
    
date_dict

{'AAPL': ['2016-07-27', '2016-04-27', '2016-01-27']}

In [71]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements 

example = ["Apple Inc."]

years = ['2018']

date_dict = {}

for companies_found in range(len(example)):
    
    company_name = list(example)[companies_found]
    ticker = key_dict.get(company_name)[0]
    
    cik = key_dict.get(company_name)[1]
    filing_type = '10-K'
    dateb = f'{years[0]}-12-31'
    
    # Obtain HTML for search page
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
    edgar_str = edgar_resp.text
    
    doc_links = []
    dates = []
    
            
    # Find the document link
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    rows = table_tag.find_all('tr')
        
    for year in years:
        for row in rows:
            cells = row.find_all('td')
            if len(cells) > 3:
                if year in cells[3].text:
                    doc_link = 'https://www.sec.gov' + cells[1].a['href']
                    date = cells[3].text
                    dates.append(date)
                    # Obtain HTML for document page
                    doc_resp = requests.get(doc_link)
                    doc_str = doc_resp.text
                    # Find the XBRL link
                    xbrl_link = ''
                    soup_doc = BeautifulSoup(doc_str, 'html.parser')
                    table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
                    rows_doc = table_tag_doc.find_all('tr')
                    for row_doc in rows_doc:
                        cells_doc = row_doc.find_all('td')
                        if len(cells_doc) > 3:
                            if 'INS' in cells_doc[3].text:
                                xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                        
                    # Obtain XBRL text from document
                    xbrl_resp = requests.get(xbrl_link)
                    xbrl_str = xbrl_resp.text
                    # Find and print stockholder's equity
                    soup_xbrl = BeautifulSoup(xbrl_str, 'lxml')
                    tag_list = soup_xbrl.find_all()

                    df = pd.DataFrame()

                    for tag in tag_list:
                        print(tag)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




<xbrli:instant>2017-09-30</xbrli:instant>
<xbrli:context id="FI2017Q4_us-gaap_BalanceSheetLocationAxis_us-gaap_OtherLiabilitiesMember_us-gaap_DerivativeInstrumentRiskAxis_us-gaap_InterestRateContractMember_us-gaap_FairValueByFairValueHierarchyLevelAxis_us-gaap_FairValueInputsLevel2Member">
<xbrli:entity>
<xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</xbrli:identifier>
<xbrli:segment>
<xbrldi:explicitmember dimension="us-gaap:BalanceSheetLocationAxis">us-gaap:OtherLiabilitiesMember</xbrldi:explicitmember>
<xbrldi:explicitmember dimension="us-gaap:DerivativeInstrumentRiskAxis">us-gaap:InterestRateContractMember</xbrldi:explicitmember>
<xbrldi:explicitmember dimension="us-gaap:FairValueByFairValueHierarchyLevelAxis">us-gaap:FairValueInputsLevel2Member</xbrldi:explicitmember>
</xbrli:segment>
</xbrli:entity>
<xbrli:period>
<xbrli:instant>2017-09-30</xbrli:instant>
</xbrli:period>
</xbrli:context>
<xbrli:entity>
<xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</x

</xbrli:entity>
<xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</xbrli:identifier>
<xbrli:segment>
<xbrldi:explicitmember dimension="us-gaap:ShareRepurchaseProgramAxis">aapl:May2018ShareRepurchaseProgramMember</xbrldi:explicitmember>
</xbrli:segment>
<xbrldi:explicitmember dimension="us-gaap:ShareRepurchaseProgramAxis">aapl:May2018ShareRepurchaseProgramMember</xbrldi:explicitmember>
<xbrli:period>
<xbrli:startdate>2017-10-01</xbrli:startdate>
<xbrli:enddate>2018-09-29</xbrli:enddate>
</xbrli:period>
<xbrli:startdate>2017-10-01</xbrli:startdate>
<xbrli:enddate>2018-09-29</xbrli:enddate>
<xbrli:context id="I2018Q3OldCapReturn_us-gaap_ShareRepurchaseProgramAxis_aapl_PreviousShareRepurchaseProgramMember">
<xbrli:entity>
<xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</xbrli:identifier>
<xbrli:segment>
<xbrldi:explicitmember dimension="us-gaap:ShareRepurchaseProgramAxis">aapl:PreviousShareRepurchaseProgramMember</xbrldi:explicitmember>
</xbrli:segment>
</xbrli:enti

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
print(years)

['2018']


Take year --> find year within Context ref
If cannot find year, take year before

MSFT_2017 = 'C_0000789019_20170331'
MSFT_20171 = 'C_0000789019_20160701_20170331'
MSFT_2016 = 'eol_PE8528----1510-Q0018_STD_0_20151231_0'
AAPL_2017 - 'FI2016Q4'
AAPL_2015 = 'eol_PE2035----1510-K0012_STD_0_20150926_0'
MMM_2016 = 'As_Of_12_31_2015'

m_d = {'0331': 1, '0630': 2, '0930': 3, '1231': 4}

if multiple with same year, take last 4 digits after year, take biggest

In [32]:
date = "2018-08-03"
year = 2018

#Get the saved 10-K     
file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')
df = pd.read_csv(file_path)

#Find the Current Ratio
c_assets_df = df[df['0'].str.contains('us-gaap:assetscurrent')]
print(c_assets_df)

print(c_assets_df['2'][c_assets_df['2']==2017])

reduced_df = c_assets_df[c_assets_df['2'].str.contains(f'{year}')]

#if len(reduced_df ==)
len(reduced_df)

                        0             1                      2
76  us-gaap:assetscurrent  1.696620e+11  C_0000789019_20180630
77  us-gaap:assetscurrent  1.626960e+11  C_0000789019_20170630
Series([], Name: 2, dtype: object)


1

In [None]:
# parsing through different types of href

def 



def parse_contextref ():
    
    return





if year exists
    if 1 entry
        extract
    if 2 entries
        if 20171231
        if 2017Q4
        if 12_31_2017
if not

In [106]:
test_date = '20180630'

y = test_date [0:4]
m = test_date [4:6]
d = test_date [6:8]

date_formats = [f'{y}{m}{d}', f'{y}-{m}-{d}', f'{m.lstrip("0")}_{d.lstrip("0")}_{y}']


quarter_dict = {'03': 'Q1', '06':'Q2', '09': 'Q3', "12": "Q4"}

context_ref = c_assets_df['2']

year_check_df = c_assets_df[context_ref.str.contains(f'{y}')]


test_df = pd.DataFrame()


if len(context_ref) == 1:
    test_df = test_df.append(c_assets_df)
    print(test_df)
elif len(context_ref) == 0:
    print('Error')
else:
    if len(year_check_df) == 1:
        test_df = test_df.append(year_check_df)
        print(test_df)
    elif len(year_check_df) == 0:
        print ("need to code loop")
    else: 
        if context_ref.str.contains(test_date):
            print("Help")
        elif len(c_assets_df[context_ref.str.contains(f'{quarter_dict.get(m)}{y}')]) == 1:
            test_df = test_df.append(c_assets_df[context_ref.str.contains(f'{quarter_dict.get(m)}{y}')])
        elif len(c_assets_df[context_ref.str.contains(f'{y}{quarter_dict.get(m)}')]) == 1:
            test_df = test_df.append(c_assets_df[context_ref.str.contains(f'{y}{quarter_dict.get(m)}')]) 
            print("idk fam")



                        0             1                      2
76  us-gaap:assetscurrent  1.696620e+11  C_0000789019_20180630


In [30]:
from bs4 import BeautifulSoup
import requests
import sys

# Access page
cik = '0000051143'
type = '10-K'
dateb = '20160101'

# Obtain HTML for search page
base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
edgar_resp = requests.get(base_url.format(cik, type, dateb))
edgar_str = edgar_resp.text

# Find the document link
doc_link = ''
soup = BeautifulSoup(edgar_str, 'html.parser')
table_tag = soup.find('table', class_='tableFile2')
rows = table_tag.find_all('tr')
for row in rows:
    cells = row.find_all('td')
    if len(cells) > 3:
        if '2015' in cells[3].text:
            doc_link = 'https://www.sec.gov' + cells[1].a['href']

# Exit if document link couldn't be found
if doc_link == '':
    print("Couldn't find the document link")
    sys.exit()

# Obtain HTML for document page
doc_resp = requests.get(doc_link)
doc_str = doc_resp.text

# Find the XBRL link
xbrl_link = ''
soup = BeautifulSoup(doc_str, 'html.parser')
table_tag = soup.find('table', class_='tableFile', summary='Data Files')
rows = table_tag.find_all('tr')
for row in rows:
    cells = row.find_all('td')
    if len(cells) > 3:
        if 'INS' in cells[3].text:
            xbrl_link = 'https://www.sec.gov' + cells[2].a['href']

# Obtain XBRL text from document
xbrl_resp = requests.get(xbrl_link)
xbrl_str = xbrl_resp.text

# Find and print stockholder's equity
soup = BeautifulSoup(xbrl_str, 'lxml')
tag_list = soup.find_all()
for tag in tag_list:
    if tag.name == 'us-gaap:stockholdersequity':
        print("Stockholder's equity: " + tag.text)

https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000051143&type=10-K&dateb=20181231
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml
https://www.sec.gov/Archives/edgar/data/51143/000104746917001061/ibm-20161231.xml


KeyError: 'As_Of_12_31_2015'

In [21]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements 

date_dict = {}

for companies_found in range(len(key_dict.keys())):
    
    company_name = list(key_dict.keys())[companies_found]
    ticker = key_dict.get(company_name)[0]
    
    cik = key_dict.get(company_name)[1]
    filing_type = '10-K'
    dateb = f'{years[0]}-12-31'
    
    # Obtain HTML for search page
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
    edgar_str = edgar_resp.text
    
    doc_links = []
    dates = []
    
            
    # Find the document link
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    rows = table_tag.find_all('tr')
        
    for year in years:
        for row in rows:
            cells = row.find_all('td')
            if len(cells) > 3:
                if year in cells[3].text:
                    doc_link = 'https://www.sec.gov' + cells[1].a['href']
                    date = cells[3].text
                    dates.append(date)
                    
                    # Obtain HTML for document page
                    doc_resp = requests.get(doc_link)
                    doc_str = doc_resp.text
                    # Find the XBRL link
                    xbrl_link = ''
                    soup_doc = BeautifulSoup(doc_str, 'html.parser')
                    table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
                    rows_doc = table_tag_doc.find_all('tr')
                    for row_doc in rows_doc:
                        cells_doc = row_doc.find_all('td')
                        if len(cells_doc) > 3:
                            if 'INS' in cells_doc[3].text:
                                xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                                print(xbrl_link)
                    # Obtain XBRL text from document
                    xbrl_resp = requests.get(xbrl_link)
                    xbrl_str = xbrl_resp.text
               
                    # Find and print stockholder's equity
                    soup = BeautifulSoup(xbrl_str, 'lxml')
                    tag_list = soup.find_all()

                    #This section of code creates a context table. 
                    #The context table is a dictionary of context names keys that reference dictionary values 
                    #containing date information for each context. For contexts with datetype of 'period' the table 
                    #contains the start and end date. For contexts with datetype of 'instant' the context
                    #contains the instant date of the context. All entries include a date and dateType value.
                    #For contexts with datetype of period, the date is equal to the enddate of the context.

                    contexts = {}
                    
                    for tag in tag_list:
                        if tag.name == 'xbrll:schemaref':
                            print(tag['xlink:href'])
                    
                    for tag in tag_list:
                        if tag.name == 'xbrli:context':

                            #This section of code finds the start date of the context if it exists.
                            start_date_tag = tag.find(name = 'xbrli:startdate')
                            if start_date_tag == None:
                                start_date = None
                            else:
                                start_date = start_date_tag.text

                            #This section of code finds the end date of the context if it exists.
                            end_date_tag = tag.find(name = 'xbrli:enddate')
                            if end_date_tag == None:
                                end_date = None
                            else:
                                end_date = end_date_tag.text
                                date = end_date_tag.text
                                datetype = 'period'

                            #This section of code finds the instant date of the context if it exists.
                            instant_date_tag = tag.find(name = 'xbrli:instant')
                            if instant_date_tag != None:
                                date = instant_date_tag.text
                                datetype = 'instant'

                            #build a dictionary of date information within a dictionary of context titles
                            dtinfo = {'date' : date, 'year' : date[0:4], 'datetype' : datetype, 'startdate' : start_date, 'enddate' : end_date}
                            contexts[tag.attrs['id']] = dtinfo
                  
                    # Find and print stockholder's equity
                    for tag in tag_list:
                        if tag.name == 'us-gaap:assetscurrent':
                            
                            print(contexts)
                            year = contexts[tag.attrs['contextref']]
                            print(year + " Current Assets: " + tag.text)


https://www.sec.gov/Archives/edgar/data/789019/000156459018019062/msft-20180630.xml
msft-20180630.xsd
{}


KeyError: 'C_0000789019_20180630'

In [92]:
from datetime import datetime

test_date = '20180630'

Y = test_date [0:3]
m = test_date [4:6]
d = test_date [6:8]
print(y)

2018
