In [1]:
import pandas as pd
import os
import bs4
import pickle
import requests
from bs4 import BeautifulSoup
import sys
import matplotlib.pyplot as plt

In [51]:
# Query configuration read by the cells below.
company_list = ["Apple"]   # company names to look up in the key file
years = ["2017"]           # filing years, kept as strings because they are matched against page text
filing_type = "10-Q"       # SEC form type to request

In [52]:
# Resolve every requested company name to a row of the ticker/CIK key file.
key_path = os.path.join('..','Financials', 'Ticker_and_CIK', 'Key.csv')

key_df = pd.read_csv(key_path)

# Maps the second column of the matched row -> [first column, third column].
# (The recorded run shows these to be company name -> [ticker, CIK].)
key_dict = {}

for company_query in company_list:
    # regex=False: the query is a literal name, so characters such as '(' or '.' must
    # not be interpreted as a regular expression.
    # na=False: empty 'Company' cells count as "no match" instead of yielding a NaN
    # mask, which cannot be used for boolean indexing.
    # NOTE: .title() turns e.g. 'eBay' into 'Ebay', so mixed-case names will not match.
    name_matches = key_df['Company'].str.contains(company_query.title(), regex=False, na=False)
    company_df = key_df[name_matches]

    if company_df.empty:
        print(f"{company_query} could not be found")
        continue

    # Only the first matching row is used when several companies contain the query.
    key_dict[company_df.iloc[0, 1]] = [company_df.iloc[0, 0], company_df.iloc[0, 2]]

if not key_dict:
    print ("Whoops, none of your companies were on the S&P 500. Please try again!")
else:
    print (f"Success! We were able to find {len(key_dict.keys())} of your queries in the S&P 500")
    print (f"We found: {list(key_dict.keys())} in the S&P 500")
    print (key_dict)

Success! We were able to find 1 of your queries in the S&P 500
We found: ['Apple Inc.'] in the S&P 500
{'Apple Inc.': ['AAPL', 320193]}


In [54]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements

# For every company in key_dict:
#   1. fetch the EDGAR filing index for `filing_type`,
#   2. open each filing whose date cell contains one of `years`,
#   3. download the XBRL instance document linked from the filing page,
#   4. save every positive numeric us-gaap fact as
#      ../Financials/<filing_type>s/<ticker>/<ticker>_<published date>.csv
# The CSV has three unnamed columns (header 0,1,2): tag name, value text, contextref.
#
# NOTE(review): sec.gov may reject automated requests that lack a descriptive
# User-Agent header -- confirm, and pass headers=... to requests.get if the filing
# table comes back missing.

published_date_dict = {}
as_of_date_dict = {}

base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"

# `dateb` presumably caps the filing date of the returned list (TODO confirm against
# EDGAR), so the cutoff has to be the end of the LATEST requested year; using
# years[0] dropped later years whenever the list was not sorted newest-first.
dateb = f'{max(years)}-12-31'

for company_name, (ticker, cik) in key_dict.items():

    published_dates = []
    as_of_dates = []
    # Register the lists up front so a company that is skipped below still gets
    # an (empty) entry; later appends are visible through the dict.
    published_date_dict[ticker] = published_dates
    as_of_date_dict[ticker] = as_of_dates

    # Obtain HTML for search page
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb))
    edgar_str = edgar_resp.text

    # Find the table that lists the filings
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    if table_tag is None:
        print(f"No {filing_type} filing table found for {company_name} (HTTP status {edgar_resp.status_code})")
        continue
    rows = table_tag.find_all('tr')

    for year in years:
        for row in rows:
            cells = row.find_all('td')
            # The fourth cell holds the filing date; skip header/short rows and other years
            if len(cells) <= 3 or year not in cells[3].text:
                continue

            doc_link = 'https://www.sec.gov' + cells[1].a['href']
            published_date = cells[3].text

            # Obtain HTML for document page
            doc_resp = requests.get(doc_link)
            doc_str = doc_resp.text

            # Find the XBRL link
            soup_doc = BeautifulSoup(doc_str, 'html.parser')
            table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
            if table_tag_doc is None:
                print(f"No data files table for {ticker} filing published {published_date}; skipping")
                continue

            xbrl_link = ''
            for row_doc in table_tag_doc.find_all('tr'):
                cells_doc = row_doc.find_all('td')
                if len(cells_doc) > 3 and 'INS' in cells_doc[3].text:
                    xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
                    print(xbrl_link)

            if not xbrl_link:
                print(f"No XBRL instance document for {ticker} filing published {published_date}; skipping")
                continue

            published_dates.append(published_date)

            # Obtain XBRL text from document
            xbrl_resp = requests.get(xbrl_link)
            xbrl_str = xbrl_resp.text

            # `rows` was already extracted above, so reusing the name `soup` here is safe
            soup = BeautifulSoup(xbrl_str, 'lxml')
            tag_list = soup.find_all()

            # Rows are collected in a plain list and turned into a DataFrame once;
            # DataFrame.append per row is quadratic and no longer exists in pandas 2.x.
            records = []

            for tag in tag_list:
                if 'schemaref' in tag.name:
                    # The href is sliced just past '<ticker>' plus one separator
                    # character, taking the following 8 characters as the as-of date.
                    ignored_href_index = len(ticker) + 1
                    as_of_date = tag['xlink:href'][ignored_href_index:(ignored_href_index+8)]
                    as_of_dates.append(as_of_date)

                #Modified from the original; The original searched through to find specific line items
                #Modified to search through every 10-Q per given year, and extract any financial line item into its own 10-Q

                if tag.name.find('us-gaap:') != -1:
                    try:
                        # Only strictly positive numeric facts are kept
                        if float(tag.text) > 0:
                            records.append([f'{tag.name}', tag.text, tag['contextref']])
                    except ValueError:
                        # Non-numeric fact (text block, date, ...) -- not a line item value
                        pass

            df = pd.DataFrame(records)

            folder_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}')
            file_path = os.path.join(folder_path, f'{ticker}_{published_date}.csv')

            # makedirs also creates missing parent folders, which os.mkdir does not
            os.makedirs(folder_path, exist_ok=True)
            df.to_csv(file_path, index = False)

print(as_of_date_dict)
print(published_date_dict)

https://www.sec.gov/Archives/edgar/data/320193/000032019317000009/aapl-20170701.xml
https://www.sec.gov/Archives/edgar/data/320193/000162828017004790/aapl-20170401.xml
https://www.sec.gov/Archives/edgar/data/320193/000162828017000717/aapl-20161231.xml
{'AAPL': ['20170701', '20170401', '20161231']}
{'AAPL': ['2017-08-02', '2017-05-03', '2017-02-01']}


In [60]:
date = "2017-02-01"
year = 2017

#Get the saved filing (the folder follows filing_type, so this is not necessarily a 10-K)
file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')
df = pd.read_csv(file_path)

#Select the current-assets rows (column '0' is the tag name, '2' the contextref)
c_assets_df = df[df['0'].str.contains('us-gaap:assetscurrent', regex=False)]
print(c_assets_df)

# Exact match on the context id. The ids are text, so the year must be compared as
# a string; comparing with the int 2017 could never match.
print(c_assets_df['2'][c_assets_df['2'] == str(year)])

# Substring match: rows whose context id mentions the year anywhere
reduced_df = c_assets_df[c_assets_df['2'].str.contains(f'{year}', regex=False)]

len(reduced_df)

                        0             1         2
25  us-gaap:assetscurrent  1.068690e+11  FI2016Q4
26  us-gaap:assetscurrent  1.033320e+11  FI2017Q1
Series([], Name: 2, dtype: object)


1

In [81]:
test_date = '20170201'

y = test_date[0:4]
m = test_date[4:6]
d = test_date[6:8]

# Spellings of the date that are searched for inside a context id
date_formats = [f'{y}{m}{d}', f'{y}-{m}-{d}', f'{m.lstrip("0")}_{d.lstrip("0")}_{y}']

quarter_list = ['Q4', 'Q3', "Q2", "Q1"]

context_ref = c_assets_df['2']


def select_unique_context(frame, contexts, base_year, date_formats, quarter_list):
    """Pick the one row of `frame` whose context id identifies the wanted period.

    Parameters
    ----------
    frame : DataFrame with the candidate rows.
    contexts : Series of context ids, aligned with `frame`.
    base_year : year (str or int) the search is centred on.
    date_formats : date spellings to look for inside a context id.
    quarter_list : quarter labels tried as '<quarter><year>' and '<year><quarter>'.

    Returns
    -------
    DataFrame with exactly one row, or an empty DataFrame when nothing matches
    uniquely (a message is printed when `frame` has no rows at all).

    The years base_year+1, base_year, base_year-1 are tried in that order. For each
    year the patterns are: the year itself, every date format, then every quarter
    label. The FIRST pattern contained in exactly one context id wins and the
    search stops, so a row is never selected twice.
    """
    if len(contexts) == 1:
        return frame.copy()
    if len(contexts) == 0:
        print('Error, There are no Entries for that')
        return pd.DataFrame()

    for year_check in range(int(base_year) + 1, int(base_year) - 2, -1):
        patterns = [f'{year_check}']
        patterns.extend(date_formats)
        for quarter in quarter_list:
            patterns.append(f'{quarter}{year_check}')
            patterns.append(f'{year_check}{quarter}')

        for pattern in patterns:
            # regex=False: patterns are literal text, not regular expressions
            hits = frame[contexts.str.contains(pattern, regex=False)]
            if len(hits) == 1:
                return hits

    return pd.DataFrame()


test_df = select_unique_context(c_assets_df, context_ref, y, date_formats, quarter_list)
print(test_df)

                        0             1         2
26  us-gaap:assetscurrent  1.033320e+11  FI2017Q1
