In [3]:
"""
Scrape edgar for all the xbrl filings from one company, extract all of 
the facts accross all the filings and output a unified set of facts by 
instant and period.
"""
import os, re
from itertools import groupby
import requests, requests_cache
from bs4 import BeautifulSoup

In [79]:
def download_filings(ticker, kind="10"):
    """ Download the text of all the edgar filings for ticker """
    requests_cache.install_cache("/data/swan/%s" % ticker)
    edgar_search_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&type={0}&output=atom&CIK={1}&count=100&start=0"
    edgar_search_url = edgar_search_url.format(kind, ticker)
    entries = BeautifulSoup(requests.get(edgar_search_url).text).find_all('entry')
    page_urls = [entry.find('filing-href').text for entry in entries if entry.find('xbrl_href')]
    pages = [requests.get(url).text for url in page_urls]
    regex = re.compile("href=\"(.*xml)\"")
    urls = ["http://www.sec.gov{0}".format(regex.findall(p)[0]) for p in pages]
    filings = [requests.get(url).text for url in urls]
    print "Found", len(filings), "filings for", ticker
    return filings
    
if __name__ == "__main__":
    ticker = 'AAPL'
    filings = download_filings(ticker)


Found 24 filings for AAPL


In [80]:
def get_dates_and_instants(filing_soup):
    """ Return all the unique dates and a list of instants for each of them. """
    instants = [c for c in filing_soup("context") if c("instant")]
    unique_instants = [(k, [i.attrs['id'] for i in list(v)]) for k, v in groupby(instants, lambda x: x.period.instant.text)]
    return unique_instants
    
if __name__ == "__main__":
    all_dates = [get_dates_and_instants(BeautifulSoup(f)) for f in filings]

Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates
Found 24 dates


In [74]:
def extract_gaap_facts(dates_instants, filing_soup):
    """ Return a list of (date, xbrl fact tag, value) for the given filing """
    try:
        for instant in dates_instants:
            for context in instant[1]:
                for e in filing_soup.find_all(contextref=re.compile(context)):
                    if  "xsi:nil" not in e.attrs and e.name.startswith("us-gaap"):
                        yield((instant[0], e.name, float(e.contents[0])))
    except:
        print "Problem extracting gaap facts"
                    
if __name__ == "__main__":
    soup = BeautifulSoup(filings[0])
    dates_instants = get_dates_and_instants(soup)
    gaap_facts = extract_gaap_facts(dates_instants, soup)
    facts = [f for f in gaap_facts]
    print "Found", len(facts), "facts"    

Found 919 facts


In [78]:
facts[3]

(u'2014-09-27',
 'us-gaap:debtinstrumentinterestrateeffectivepercentage',
 0.0079)

In [75]:
def get_all_facts(ticker):
    """ Return a list of (date, fact tag, value) for every filing for the given ticker """
    soups = [BeautifulSoup(f) for f in download_filings(ticker)]    
    facts = [[f for f in extract_gaap_facts(get_dates_and_instants(s), s)] for s in soups]
    return [item for sublist in facts for item in sublist]
    
if __name__ == "__main__":
    all_facts = get_all_facts(ticker)
    print "Found", len(all_facts), "all_facts"

['http://www.sec.gov/Archives/edgar/data/320193/000119312515023697/aapl-20141227.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312514383437/aapl-20140927.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312514277160/aapl-20140628.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312514157311/aapl-20140329.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312514024487/aapl-20131228.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312513416534/aapl-20130928.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312513300670/aapl-20130629.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312513168288/aapl-20130330.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312513022339/aapl-20121229.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312512444068/aapl-20120929.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119312512314552/aapl-20120630.xml', 'http://www.sec.gov/Archives/edgar/data/320193/000119

In [96]:
groups = [f for f in groupby(all_facts, lambda x: x[1])]

In [115]:
all_facts[0:10]

[(u'2012-12-31', u'us-gaap:operatinglosscarryforwards', 333000000.0),
 (u'2012-12-31', u'us-gaap:operatinglosscarryforwards', 1048000000.0),
 (u'2012-12-31', u'us-gaap:operatinglosscarryforwards', 384000000.0),
 (u'2012-12-31',
  u'us-gaap:futureminimumsubleaserentalssaleleasebacktransactions',
  22000000.0),
 (u'2012-12-31',
  u'us-gaap:operatingleasesfutureminimumpaymentsdue',
  475000000.0),
 (u'2012-12-31',
  u'us-gaap:futureminimumsubleaserentalssaleleasebacktransactions',
  1000000.0),
 (u'2012-12-31',
  u'us-gaap:operatingleasesfutureminimumpaymentsdue',
  1596000000.0),
 (u'2012-12-31',
  u'us-gaap:futureminimumsubleaserentalssaleleasebacktransactions',
  26000000.0),
 (u'2012-12-31',
  u'us-gaap:operatingleasesfutureminimumpaymentsdue',
  492000000.0),
 (u'2012-12-31',
  u'us-gaap:futureminimumsubleaserentalssaleleasebacktransactions',
  17000000.0)]

In [25]:
def get_sp500_tickers():
    import csv
    with open('sp500.csv', 'rU') as f:
        reader = csv.reader(f)
        return [r[1] for r in reader]

if __name__ == "__main__":
    tickers = get_sp500_tickers()
    print "Found", len(tickers), "tickers"
    print tickers[0:10]

Found 500 tickers
['MMM', 'ABT', 'ABBV', 'ANF', 'ACN', 'ACE', 'ACT', 'ADBE', 'AMD', 'AES']


In [34]:
for t in tickers[0:10]:
    filings = download_filings(t)
    print "Found", len(filings), "filings for", t
#     filing_soup = BeautifulSoup(filings[0])
#     dates_instants = get_dates_and_instants(filing_soup)
    # facts = extract_gaap_facts(filing_soup, )

Found 23 filings for MMM
Found 23 filings for ABT
Found 8 filings for ABBV
Found 19 filings for ANF
Found 21 filings for ACN
Found 23 filings for ACE
Found 14 filings for ACT
Found 23 filings for ADBE
Found 19 filings for AMD
Found 23 filings for AES


In [46]:
[len(get_dates_and_instants(BeautifulSoup(f))) for f in filings]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 69, 41, 32, 38, 14, 37, 5]