These functions will help you extract entire reports and place them into a pandas DataFrame. Please remember to download the conversion table (Dow 30 ticker to CIK #) first:
1) getting_crawler(year_number, qtr): this function will download the index file as provided by sec.gov. Please indicate the year as well as the quarter you would like to extract the data from. For more information, visit https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

2) milestone_search(file_name): milestone_search looks for where the listing begins in the index file.

3) convert_to_pandas(file_name, data_start): convert_to_pandas function converts index file to a pandas data frame

4) search_company_name(data, company_name, report_type): allows you to find the URL of the report for each of the specified companies. Please note that the function currently only takes CIK # for accuracy. Many larger companies have multiple reporting sub-companies, e.g., J P Morgan.

5) getting_data(company_list, year_number, qtr, report_type): will download the entire webpage of each report and put its text into a pandas DataFrame. For missing values, it will automatically assign np.NaN.

6) adding_company_name(data): will add the company name and other information back to the current DataFrame

7) putting_together(year_number, qtr, company_name, report_type): goes through the entire process and only returns the DataFrame with the full data.

8) the functions now support saving and reading in pickle files!

See below listing for the options for company_name

In [2]:
import urllib
import pandas as pd
import struct
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re
import pickle

In [3]:
# Load the ticker -> CIK conversion table; the preview below shows its
# "Company", "Name" and "CIK #" columns.
company_table = pd.read_csv("cik_list.csv")

In [5]:
company_table.head(3)

Unnamed: 0,Company,Name,CIK #
0,AAPL,Apple Inc.,320193
1,AXP,American Expression Co.,4962
2,BA,Boeing Co.,12927


In [77]:
# Read the raw lookup file into a single column named "company"; each row is
# expected to look like "NAME:0001234567:" (see the preview below).
cik_lookup = pd.read_table("cik_look_up.txt",names = ["company"])

In [78]:
cik_lookup.head(5)

Unnamed: 0,company
0,!J INC:0001438823:
1,"#1 A LIFESAFER HOLDINGS, INC.:0001509607:"
2,#1 ARIZONA DISCOUNT PROPERTIES LLC:0001457512:
3,#1 PAINTBALL CORP:0001433777:
4,$ LLC:0001427189:


In [79]:
def clean_up(pattern, x):
    """Return capture group 1 of ``pattern`` matched against ``x``, lower-cased.

    Args:
        pattern: Regular expression containing at least one capture group.
        x: Text to match. Matching is anchored at the start of the string
            because ``re.match`` is used.

    Returns:
        The lower-cased text of group 1, or ``np.nan`` when ``x`` does not
        match ``pattern``.
    """
    # Match once and reuse the result instead of running the regex twice.
    found = re.match(pattern, x)
    if found is None:
        # ``np.nan`` works on every NumPy version; the ``np.NaN`` alias was
        # removed in NumPy 2.0.
        return np.nan
    return found.group(1).lower()

In [80]:
# Company name = everything before the ":<digits>:" suffix. The pattern is a
# raw string so that "\d" is not treated as an (invalid) string escape.
cik_lookup.loc[:,"Name"] = cik_lookup["company"].apply(lambda x: clean_up(r"(.+):\d+:",x))

In [81]:
# Zero-padded CIK = the digits between the last two colons. The pattern is a
# raw string so that "\d" is not treated as an (invalid) string escape.
cik_lookup.loc[:,"cik #"] = cik_lookup["company"].apply(lambda x: clean_up(r".+:(\d+):",x))

In [82]:
cik_lookup.head(3)

Unnamed: 0,company,Name,cik #
0,!J INC:0001438823:,!j inc,1438823
1,"#1 A LIFESAFER HOLDINGS, INC.:0001509607:","#1 a lifesafer holdings, inc.",1509607
2,#1 ARIZONA DISCOUNT PROPERTIES LLC:0001457512:,#1 arizona discount properties llc,1457512


In [83]:
def stripping_0(x):
    """Remove the zero padding from the front of a CIK.

    Args:
        x: CIK value, normally a zero-padded string such as ``"0000320193"``.

    Returns:
        ``str(x)`` without its leading zeros, or ``np.nan`` when ``x`` is
        empty, ``None`` or NaN.
    """
    # ``x != x`` is true only for NaN, which is truthy and would otherwise be
    # turned into the string "nan".
    if not x or x != x:
        return np.nan
    # Only the left-hand padding may be removed: ``strip("0")`` would also eat
    # trailing zeros and corrupt any CIK that ends in 0.
    return str(x).lstrip("0")

In [84]:
# Un-padded CIK column, derived from the zero-padded "cik #" column.
cik_lookup["CIK #"] = cik_lookup["cik #"].apply(stripping_0)

In [85]:
cik_lookup.head(5)

Unnamed: 0,company,Name,cik #,CIK #
0,!J INC:0001438823:,!j inc,1438823,1438823
1,"#1 A LIFESAFER HOLDINGS, INC.:0001509607:","#1 a lifesafer holdings, inc.",1509607,1509607
2,#1 ARIZONA DISCOUNT PROPERTIES LLC:0001457512:,#1 arizona discount properties llc,1457512,1457512
3,#1 PAINTBALL CORP:0001433777:,#1 paintball corp,1433777,1433777
4,$ LLC:0001427189:,$ llc,1427189,1427189


In [86]:
cik_lookup.shape

(661046, 4)

In [87]:
# Drop every row that has a NaN in any column and renumber the rows from 0.
cik_nm = cik_lookup.dropna().reset_index(drop=True)

In [88]:
cik_nm.shape[0]

661042

In [89]:
# Persist the cleaned lookup table as CSV, without the row index.
cik_nm.to_csv("cik_nm.csv",index=False)
#cik_nm.to_pickle("cik_nm.txt")

In [90]:
def getting_crawler(year_number,qtr):
    """Download the EDGAR ``crawler.idx`` index of one quarter to a local file.

    Args:
        year_number: Year of the index, e.g. ``2017``.
        qtr: Quarter of the index; must be 1, 2, 3 or 4.

    Returns:
        Name of the file the index was written to,
        ``"<year_number>_QTR<qtr>_crawler.idx"``, relative to the current
        working directory.

    Raises:
        ValueError: If ``qtr`` is not one of 1, 2, 3, 4.
        requests.exceptions.HTTPError: If the server answers with an error
            status.
    """
    # Validate up front: previously an unexpected quarter surfaced as an
    # UnboundLocalError further down.
    if qtr not in (1, 2, 3, 4):
        raise ValueError("qtr must be 1, 2, 3 or 4, got %r" % (qtr,))

    base_url = "https://www.sec.gov/Archives/edgar/full-index/"
    full = base_url + str(year_number) + "/" + ("QTR%d/" % qtr) + "crawler.idx"
    file_name = str(year_number) + "_QTR" + str(qtr) + "_crawler.idx"

    # ``requests`` is already imported by this file and, unlike
    # ``urllib.URLopener``, exists on both Python 2 and Python 3.
    # NOTE(review): SEC's fair-access guidance asks automated clients to send a
    # descriptive User-Agent header - confirm whether one must be added here.
    response = requests.get(full, stream=True)
    try:
        # Fail loudly rather than saving an HTTP error page as the index file.
        response.raise_for_status()
        with open(file_name, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=65536):
                out_file.write(chunk)
    finally:
        response.close()
    return file_name

In [91]:
def milestone_search(file_name):
    """Find where the filing records start in a downloaded index file.

    The records begin on the line right after the dashed separator
    (a run of at least 18 ``-`` characters) that follows the column header.

    Args:
        file_name: Path of the index file.

    Returns:
        Zero-based line number of the first record.

    Raises:
        ValueError: If the file contains no dashed separator line.
    """
    with open(file_name) as index_file:
        for num, line in enumerate(index_file):
            if "------------------" in line:
                # Stop at the first separator: there is no need to read the
                # rest of the file, and a later record that happens to contain
                # dashes must not move the start position.
                return num + 1
    raise ValueError("no '------------------' separator line found in %r" % (file_name,))

In [92]:
def convert_to_pandas(file_name, data_start):
    """Parse the records of a downloaded index file into a DataFrame.

    Each record is split on runs of two or more spaces, so values that contain
    single spaces (company names, form types such as ``SC 13D``) stay intact.

    Args:
        file_name: Path of the index file.
        data_start: Zero-based line number of the first record, as returned by
            ``milestone_search``.

    Returns:
        DataFrame with the string columns ``company``, ``report``, ``CIK``,
        ``date`` and ``url`` plus ``company_l``, the lower-cased company name.

    Raises:
        ValueError: If a non-blank record has fewer than five fields.
    """
    columns = ["company", "report", "CIK", "date", "url"]
    rows = []
    with open(file_name) as myfile:
        for record in myfile.readlines()[data_start:]:
            # Build a real list: on Python 3 ``map``/``filter`` return
            # iterators, which cannot be indexed.
            pieces = [piece.strip() for piece in record.split("  ")]
            fields = [piece for piece in pieces if piece]
            if not fields:
                # Blank line (e.g. at the end of the file) - nothing to record.
                continue
            if len(fields) < len(columns):
                raise ValueError("cannot parse index record: %r" % (record,))
            rows.append(fields[:len(columns)])
    df = pd.DataFrame(rows, columns=columns)
    df["company_l"] = df["company"].str.lower()
    return df

In [160]:
def search_company_name(data, company_name, report_type):
    """Select the filings of one report type that belong to the listed companies.

    Args:
        data: Index DataFrame with at least the columns ``report``, ``CIK``,
            ``company``, ``date`` and ``url``.
        company_name: Path of a CSV file with ``Name`` and ``CIK #`` columns.
        report_type: Form type to keep, compared for equality with ``report``.

    Returns:
        DataFrame with the columns ``Name``, ``CIK #``, ``company``,
        ``report``, ``date`` and ``url``, one row per matching filing, indexed
        from 0.
    """
    output_columns = ["Name", "CIK #", "company", "report", "date", "url"]

    filings = data.loc[data["report"] == report_type]

    lookup = pd.read_csv(company_name)[["Name", "CIK #"]]
    # The index stores CIKs as text, so the lookup key is converted to match.
    lookup = lookup.assign(**{"CIK #": lookup["CIK #"].astype(str)})

    matched = filings.merge(lookup, how="inner", left_on="CIK", right_on="CIK #")
    return matched[output_columns].reset_index(drop=True)

In [161]:
def _fetch_filing_text(index_url):
    """Return the UTF-8 encoded text of the first document on a filing index page, or NaN."""
    if pd.isnull(index_url):
        return np.nan
    index_response = requests.get(index_url)
    # An error page would otherwise be parsed as if it were the filing index.
    index_response.raise_for_status()
    index_page = BeautifulSoup(index_response.text, "html.parser")

    target = index_page.find("table", {"class": "tableFile"})
    if target is None:
        return np.nan
    table_rows = target.find_all("tr")
    if len(table_rows) < 2:
        return np.nan
    # Row 0 is skipped; the document link is taken from the second row.
    anchors = table_rows[1].find_all("a")
    if not anchors or not anchors[0].get("href"):
        return np.nan

    final_link = "https://www.sec.gov" + anchors[0]["href"]
    report_response = requests.get(final_link)
    report_response.raise_for_status()
    content = BeautifulSoup(report_response.text, "html.parser").text
    return content.encode(encoding='UTF-8')


def getting_data(company_list, year_number, qtr, report_type):
    """Download the report text for every filing in ``company_list``.

    ``company_list`` is modified in place and also returned.

    Args:
        company_list: DataFrame with a ``url`` column holding the address of
            each filing's index page.
        year_number: Value stored in the new ``year`` column.
        qtr: Value stored in the new ``quarter`` column.
        report_type: Value stored in the new ``report_type`` column.

    Returns:
        ``company_list`` with the added columns ``full_text``, ``year``,
        ``quarter`` and ``report_type``. ``full_text`` holds the UTF-8 encoded
        page text, or ``np.nan`` when the URL is missing or the index page
        contains no document link.

    Raises:
        requests.exceptions.HTTPError: If a page request returns an error
            status.
    """
    # Assigning the whole column at once works for any row index; the previous
    # ``.loc[i, ...]`` loop over ``range(n)`` required labels 0..n-1.
    company_list["full_text"] = [
        _fetch_filing_text(url_address) for url_address in company_list["url"]
    ]
    company_list["year"] = year_number
    company_list["quarter"] = qtr
    company_list["report_type"] = report_type
    return company_list

In [162]:
def putting_together(year_number, qtr, company_name, report_type):
    """Run the whole pipeline for one quarter and return the resulting DataFrame.

    Downloads the quarter's index file, parses it, keeps the ``report_type``
    filings of the companies selected through ``company_name`` and fetches the
    text of each filing.

    Args:
        year_number: Year of the index to download.
        qtr: Quarter of the index to download.
        company_name: Passed through to ``search_company_name``.
        report_type: Form type to keep, e.g. ``"10-K"``.

    Returns:
        Whatever ``getting_data`` returns for the matching filings.
    """
    index_file = getting_crawler(year_number, qtr)
    listing = convert_to_pandas(index_file, milestone_search(index_file))
    matches = search_company_name(listing, company_name, report_type)
    return getting_data(matches, year_number, qtr, report_type)

Saving the final_dataset to a pickle file, and reading in a pickle file

In [163]:
def save_as_pickle(path_to_save, final_data):
    """Write ``final_data`` to ``path_to_save`` using its ``to_pickle`` method."""
    final_data.to_pickle(path=path_to_save)

In [164]:
def read_data(path_to_open):
    """Load and return the object stored in a pickle file.

    Args:
        path_to_open: Path of the pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code - only open files that
    # you created yourself or otherwise trust.
    # The ``with`` block closes the file even when unpickling fails.
    with open(path_to_open, "rb") as infile:
        return pickle.load(infile)

Demonstration - extracting the latest document for all 30 companies

In [165]:
# Full pipeline for 2017 Q1: 10-K filings of the companies listed in cik_list.csv.
data_2017_qrt1 = putting_together(2017, 1, "cik_list.csv","10-K")

In [166]:
data_2017_qrt1

Unnamed: 0,Name,CIK #,company,report,date,url,full_text,year,quarter,report_type
0,3 M Co.,66740,3M CO,10-K,2017-02-09,https://www.sec.gov/Archives/edgar/data/66740/...,\n10-K\n1\nmmm-20161231x10k.htm\n10-K\n\n\n\n\...,2017,1,10-K
1,American Expression Co.,4962,AMERICAN EXPRESS CO,10-K,2017-02-17,https://www.sec.gov/Archives/edgar/data/4962/0...,\n10-K\n1\nd321397d10k.htm\nFORM 10-K\n\n\nFor...,2017,1,10-K
2,American International Group,5272,AMERICAN INTERNATIONAL GROUP INC,10-K,2017-02-23,https://www.sec.gov/Archives/edgar/data/5272/0...,\n10-K\n1\nmaindocument001.htm\n10-K\n\n\n\n\n...,2017,1,10-K
3,Bank of America,70858,BANK OF AMERICA CORP /DE/,10-K,2017-02-23,https://www.sec.gov/Archives/edgar/data/70858/...,\n10-K\n1\nbac-1231201610xk.htm\n10-K\n\n\n\n\...,2017,1,10-K
4,Boeing Co.,12927,BOEING CO,10-K,2017-02-08,https://www.sec.gov/Archives/edgar/data/12927/...,\n10-K\n1\na201612dec3110k.htm\n10-K\n\n\n\n\n...,2017,1,10-K
5,Caterpillar Inc.,18230,CATERPILLAR INC,10-K,2017-02-15,https://www.sec.gov/Archives/edgar/data/18230/...,\n10-K\n1\ncat_10-kx12312016.htm\n10-K\n\n\n\n...,2017,1,10-K
6,Chevron Corp,93410,CHEVRON CORP,10-K,2017-02-23,https://www.sec.gov/Archives/edgar/data/93410/...,\n10-K\n1\ncvx-123116x10kdoc.htm\n10-K\n\n\n\n...,2017,1,10-K
7,Citi,831001,CITIGROUP INC,10-K,2017-02-24,https://www.sec.gov/Archives/edgar/data/831001...,\n10-K\n1\nc-12312016x10k.htm\n10-K\n\n\n\n\n\...,2017,1,10-K
8,Coca-Cola Co.,21344,COCA COLA CO,10-K,2017-02-24,https://www.sec.gov/Archives/edgar/data/21344/...,\n10-K\n1\na2016123110-k.htm\n10-K\n\n\n\n\n\n...,2017,1,10-K
9,Dupont E I De Numours and Co.,30554,DUPONT E I DE NEMOURS & CO,10-K,2017-02-02,https://www.sec.gov/Archives/edgar/data/30554/...,\n10-K\n1\ndd-12312016x10k.htm\n10-K\n\n\n\n\n...,2017,1,10-K


In [None]:
# old code
# NOTE: kept for reference only. The `def` line below is commented out, so the
# indented body is not runnable as written; the active search_company_name is
# the merge-based version defined earlier in this file.
#def search_company_name(data, company_name, report_type):
    # Here company_name is iterated as a collection of CIK values (each one
    # converted to str), not read as a CSV path.
    company_list = dict()
    url_list = list()
    date_list = list()
    data_report = data[data["report"] == report_type]
    for company in map(lambda x: str(x), company_name):
        data_company = data_report[data_report["CIK"]==(company)]
        url_company = list(data_company["url"])
        file_date = list(data_company["date"])
        # Only the first matching filing is kept; NaN marks "no filing found".
        if not url_company:
            url_list.append(np.NaN)
        else:
            url_list.append(url_company[0])
        if not file_date:
            date_list.append(np.NaN)
        else:
            date_list.append(file_date[0])
    # Result is a dict of two parallel lists, one entry per requested CIK.
    company_list["url"] = url_list
    company_list["date"] = date_list
    return company_list

In [None]:
# old code
# NOTE: kept for reference only. The `def` line below is commented out, so the
# indented body is not runnable as written; the active getting_data is the
# DataFrame-based version defined earlier in this file.
#def getting_data(company_list, year_number, qtr, report_type, company_name):
    # Here company_list is indexed by the keys "url" and "date".
    full_text_list = list()
    url_list = company_list["url"]
    for report in url_list:
        # A null URL produces a NaN placeholder so the lists stay aligned.
        if pd.isnull(report):
            full_text = np.NaN
            full_text_list.append(full_text)
            #print "not found"
        else:
            # Fetch the page at the URL and locate its "tableFile" table.
            text = requests.get(report).text
            text = BeautifulSoup(text, "html.parser")
            target = text.find("table",{"class":"tableFile"})
            # First link in the table's second row.
            link = target.find_all("tr")[1].find_all("a")[0]["href"]
            final_link = "https://www.sec.gov"+link
            text_report = requests.get(final_link).text
            text_report = BeautifulSoup(text_report, "html.parser")
            content = text_report.text
            content = content.encode(encoding='UTF-8')
            full_text_list.append(content)
            #print"found it"
    #print len(names)
    #print len(full_text_list)
    data_text = {"CIK":company_name,"full_text":full_text_list,"url":company_list["url"], "file_date":company_list["date"]}
    df_text = pd.DataFrame.from_dict(data_text)
    df_text["year"] = year_number
    df_text["quarter"] = qtr
    df_text["report_type"] = report_type
    return df_text