In [1]:
#import your libraries
#the third-party data analysis library pandas
#provides a spreadsheet-like tabular data object.
import pandas as pd 
#the third-party requests library makes fetching data (html, txt, csv)
#from the web easier
import requests

#re is the standard Python library for regular expressions
import re

Start by reading the SEC's documentation on what kinds of data are available on the web, and in which formats: https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

In [2]:
#every address we request in this notebook begins with the same prefix,
#so store it once in a variable and reuse it when building full urls
base_url = "https://www.sec.gov/Archives/"

In [3]:
#this is the part of the url that comes after base_url
#it points at the company index file for the 4th quarter of 2017
company_index = "edgar/full-index/2017/QTR4/company.idx"

In [4]:
#use the requests library
#make a "get" request to grab the text
#at this url: "https://www.sec.gov/Archives/edgar/full-index/2017/QTR4/company.idx"
#which is the index of company filings for the 4th quarter of 2017
#the SEC asks automated tools to identify themselves with a User-Agent header
#replace the placeholder below with your own name and email address
request_headers = {"User-Agent": "Your Name your.email@example.com"}
#timeout (in seconds) keeps the cell from hanging forever if the server never answers
r = requests.get(base_url+company_index, headers=request_headers, timeout=30)

In [5]:
#if the web request worked properly (was a 200 status code or OK==True)
if r.ok:
    #split the giant text blob you got from the URL
    #into a list with each item being one row of data
    rows = r.text.splitlines()
else:
    #start from an empty list so the cells below don't quietly reuse
    #rows left over from an earlier, successful run of this cell
    rows = []
    print("Your web request failed.")
    #this is called string formatting
    #you write out the string you want to print in quotes
    #with placeholders in {}
    #then add the .format method right after the string
    #(inside the parentheses of print, because .format belongs to the string)
    #including a list of variables
    #for what you want to print in the {} placeholders
    print("{0} status code".format(r.status_code))


In [6]:
#count the lines we got back: one list item per line of the index file
len(rows)

135113

In [7]:
#peek at the first ten lines of the file
#these are the file header, not filings
rows[0:10]

[u'Description:           Master Index of EDGAR Dissemination Feed by Company Name',
 u'Last Data Received:    November 29, 2017',
 u'Comments:              webmaster@sec.gov',
 u'Anonymous FTP:         ftp://ftp.sec.gov/edgar/',
 u' ',
 u' ',
 u' ',
 u' ',
 u'Company Name                                                  Form Type   CIK         Date Filed  File Name',
 u'---------------------------------------------------------------------------------------------------------------------------------------------']

In [8]:
#the filings themselves begin right after the ten header lines
#show the first five of them
rows[10:15]

[u'1 800 FLOWERS COM INC                                         10-Q        1084869     2017-11-09  edgar/data/1084869/0001437749-17-018857.txt         ',
 u'1 800 FLOWERS COM INC                                         4           1084869     2017-10-30  edgar/data/1084869/0001084869-17-000018.txt         ',
 u'1 800 FLOWERS COM INC                                         4           1084869     2017-11-01  edgar/data/1084869/0001084869-17-000019.txt         ',
 u'1 800 FLOWERS COM INC                                         4           1084869     2017-11-02  edgar/data/1084869/0001084869-17-000020.txt         ',
 u'1 800 FLOWERS COM INC                                         4           1084869     2017-11-02  edgar/data/1084869/0001084869-17-000021.txt         ']

In [9]:
#now we have two options
#first, let's try breaking this up with a for-loop
#and cleaning each row, filtering it manually

In [10]:
#this for-loop syntax does a few things
#do the same procedure one time
#for each item (name your variable representing each list item)
#in (your list variable)
for item in rows:
    #make sure each nested section is indented 4 spaces
    #so it executes within the larger loop
    #now we're setting a conditional that only looks at
    #the rows we're interested in: those with the string "EXXON"
    #in the larger string that is the item in our "rows" list
    if "EXXON" in item:
        #if that's the case, then split up the long string
        #into a list of items. Think of each piece in the "my_company"
        #list as one cell in a row of a spreadsheet
        #to break it up, we use the .split function of the re regular expressions library
        #this regex r'\s{2,}' says "find whitespace (\s) where there are 2 or more
        #in a row ({2,})"
        #the r in front of the quotes makes it a raw string, so Python
        #passes the backslash through to the regex untouched
        #.strip() removes the padding at the end of the line first,
        #so we don't get an empty string as the last piece
        #rubular.com is a fantastic resource for designing and testing regexes
        my_company = re.split(r'\s{2,}', item.strip())
        #a complete row has at least 5 pieces; skip anything shorter
        #so the indexing below can't fail
        #we count backwards from the end of the list for the columns we need
        #now, we only want to show filings that have "8-K" in the
        #form type column, which is the 4th piece from the right ([-4])
        if len(my_company) >= 5 and my_company[-4] == "8-K":
            print(item)
            #the last piece ([-1]) is the unique URL ending for this 8-K filing
            url_end = my_company[-1]
            #the date is the piece just before it ([-2])
            date = my_company[-2]
            #again, using string formatting, print off the date and the
            #full unique URL for the filing
            #you could also store that in a variable to make more requests
            print("{0}: {1}{2}".format(date, base_url, url_end))

EXXON MOBIL CORP                                              8-K         34088       2017-10-27  edgar/data/34088/0000034088-17-000044.txt           
2017-10-27: https://www.sec.gov/Archives/edgar/data/34088/0000034088-17-000044.txt
EXXON MOBIL CORP                                              8-K         34088       2017-10-31  edgar/data/34088/0000034088-17-000047.txt           
2017-10-31: https://www.sec.gov/Archives/edgar/data/34088/0000034088-17-000047.txt


In [11]:
#now, alter the loop above
#to find and print off the 10-Q
#for a company you're interested in

In [12]:
#now let's look at how we could do this in pandas

In [13]:
#pandas doesn't like to read in text that isn't delimited
#so we'll create a new_list from our rows list, where we split
#every line up into columns
#make an empty list
new_list = []
#cycle through every line (we name it "row") in our "rows" list
for row in rows:
    #use the regex library to split the string
    #into pieces whenever 2 or more whitespace characters in a row appear
    #the r in front of the quotes makes it a raw string, so Python
    #passes the backslash in \s through to the regex untouched
    filing = re.split(r'\s{2,}', row)
    #append this row to our new_list list
    new_list.append(filing)

In [14]:
#sanity check: new_list should have exactly one entry per line in rows
#so this number should match len(rows) from earlier
#a different number means some lines went missing along the way
len(new_list)

135113

In [15]:
#look at five split-up rows, starting just past the ten header lines
new_list[10:15]

[[u'1 800 FLOWERS COM INC',
  u'10-Q',
  u'1084869',
  u'2017-11-09',
  u'edgar/data/1084869/0001437749-17-018857.txt',
  u''],
 [u'1 800 FLOWERS COM INC',
  u'4',
  u'1084869',
  u'2017-10-30',
  u'edgar/data/1084869/0001084869-17-000018.txt',
  u''],
 [u'1 800 FLOWERS COM INC',
  u'4',
  u'1084869',
  u'2017-11-01',
  u'edgar/data/1084869/0001084869-17-000019.txt',
  u''],
 [u'1 800 FLOWERS COM INC',
  u'4',
  u'1084869',
  u'2017-11-02',
  u'edgar/data/1084869/0001084869-17-000020.txt',
  u''],
 [u'1 800 FLOWERS COM INC',
  u'4',
  u'1084869',
  u'2017-11-02',
  u'edgar/data/1084869/0001084869-17-000021.txt',
  u'']]

In [16]:
#the names of the columns we want to use, in order
#the split rows carry extra pieces at the end, so we name two
#placeholder columns ("Blank1", "Blank2") to hold them
column_names = ["Company_Name","Form_Type","CIK","Date_Filed","File_Name","Blank1","Blank2"]

#make a new pandas dataframe from our new_list, starting with the 11th row
#(everything after the header lines)
all_columns = pd.DataFrame(new_list[10:], columns=column_names)

#drop the two blank columns and call the result "sec_index"
sec_index = all_columns.drop(["Blank1", "Blank2"], axis=1)

In [17]:
#build a True/False value for every row of sec_index:
#True where the 'Company_Name' contains the substring "FORD MOTOR"
is_ford = sec_index['Company_Name'].str.contains('FORD MOTOR')
#then keep only the rows where that value is True
sec_index.loc[is_ford]

Unnamed: 0,Company_Name,Form_Type,CIK,Date_Filed,File_Name
45276,FORD MOTOR CO,10-Q,37996,2017-10-26,edgar/data/37996/0000037996-17-000092.txt
45277,FORD MOTOR CO,3,37996,2017-11-08,edgar/data/37996/0001209191-17-059706.txt
45278,FORD MOTOR CO,4,37996,2017-10-05,edgar/data/37996/0001209191-17-056144.txt
45279,FORD MOTOR CO,4,37996,2017-10-05,edgar/data/37996/0001209191-17-056147.txt
45280,FORD MOTOR CO,4,37996,2017-10-05,edgar/data/37996/0001209191-17-056148.txt
45281,FORD MOTOR CO,8-K,37996,2017-10-02,edgar/data/37996/0000037996-17-000080.txt
45282,FORD MOTOR CO,8-K,37996,2017-10-03,edgar/data/37996/0000037996-17-000082.txt
45283,FORD MOTOR CO,8-K,37996,2017-10-03,edgar/data/37996/0000037996-17-000084.txt
45284,FORD MOTOR CO,8-K,37996,2017-10-18,edgar/data/37996/0000037996-17-000086.txt
45285,FORD MOTOR CO,8-K,37996,2017-10-26,edgar/data/37996/0000037996-17-000090.txt


In [18]:
#now, say you want to create a new column
#with the full url for each filing
#how would you attempt that?