These functions will help you extract entire reports and place them into a pandas DataFrame - please remember to download the conversion table (Dow 30 to CIK #):
1) getting_crawler(year_number, qtr): this function downloads the index file provided by sec.gov. Please indicate the year as well as the quarter you would like to extract the data from. For more information, visit https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

2) milestone_search(file_name): milestone_search looks for where the listing begins in the index file.

3) convert_to_pandas(file_name, data_start): convert_to_pandas function converts index file to a pandas data frame

4) search_company_name(data, company_name, report_type): finds the report URL for each of the specified companies. Please note that the function currently only takes CIK numbers, for accuracy. Many larger companies have multiple reporting sub-companies, e.g., J P Morgan.

5) getting_data(url_list, year_number, qtr, report_type): downloads the entire web page of each report and puts the text into a pandas DataFrame. Missing reports are automatically assigned NaN.

6) adding_company_name(data): adds the company name and other information back to the current DataFrame

7) putting_together(year_number, qtr, company_name, report_type): goes through the entire process and returns only the DataFrame rows with full data.

See below listing for the options for company_name

In [7]:
# Load the company lookup table from cik_list.csv in the working directory.
company_table = pd.read_csv("cik_list.csv")

In [17]:
# Display only the ticker, company name and CIK columns of the lookup table.
company_table[["Company","Name","CIK #"]]

Unnamed: 0,Company,Name,CIK #
0,AAPL,Apple Inc.,320193
1,AXP,American Expression Co.,4962
2,BA,Boeing Co.,12927
3,CAT,Caterpillar Inc.,18230
4,CSCO,Cisco Systems Inc.,858877
5,CVX,Chevron Corp,93410
6,DD,Dupont E I De Numours and Co.,30554
7,DIS,Walt Disney Co.,1001039
8,GE,General Electric Co.,40545
9,GS,Goldman Sachs Group Inc.,886982


In [76]:
import urllib
import pandas as pd
import struct
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re

In [77]:
def getting_crawler(year_number,qtr):
    """Download the EDGAR crawler index for one quarter.

    The file is fetched from
    https://www.sec.gov/Archives/edgar/full-index/<year>/QTR<qtr>/crawler.idx
    and saved in the current working directory.

    Parameters:
        year_number: year of the index (anything str() turns into the year).
        qtr: quarter number, one of 1, 2, 3 or 4.

    Returns:
        The local file name, "<year>_QTR<qtr>_crawler.idx".

    Raises:
        ValueError: if qtr is not 1, 2, 3 or 4 (checked before any download).
    """
    if qtr not in (1, 2, 3, 4):
        raise ValueError("qtr must be 1, 2, 3 or 4, got %r" % (qtr,))

    # urllib.URLopener only exists on Python 2; on Python 3 the equivalent
    # download helper lives in urllib.request.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        import urllib
        urlretrieve = urllib.URLopener().retrieve

    base_url = "https://www.sec.gov/Archives/edgar/full-index/"
    full = "%s%s/QTR%d/crawler.idx" % (base_url, year_number, qtr)
    file_name = str(year_number) + "_QTR" + str(qtr) + "_crawler.idx"
    # NOTE(review): sec.gov may reject requests that do not declare a
    # User-Agent -- confirm against the SEC's current access policy.
    urlretrieve(full, file_name)
    return file_name

In [78]:
def milestone_search(file_name):
    """Find the line number where the listing starts in an index file.

    The listing is taken to start on the line after the first line that
    contains a run of 18 dashes (the separator under the column header).

    Parameters:
        file_name: path of the index file to scan.

    Returns:
        Zero-based line number of the first data line.

    Raises:
        ValueError: if the file contains no separator line.
    """
    with open(file_name) as myFile:
        for num, line in enumerate(myFile):
            if "------------------" in line:
                # Stop at the separator instead of reading the rest of the
                # (large) index file.
                return num + 1
    raise ValueError("no '------------------' separator line found in %s" % file_name)

In [79]:
def convert_to_pandas(file_name, data_start):
    """Convert the listing part of an index file to a pandas data frame.

    Each line from data_start onwards is split on runs of two spaces. The
    last four fields are read as form type, CIK, filing date and URL; what
    precedes them is the company name.

    Parameters:
        file_name: path of the index file.
        data_start: zero-based line number of the first data line.

    Returns:
        A data frame with string columns "company", "report", "CIK", "date",
        "url" and "company_l" (the company name in lower case).
    """
    company_name = list()
    report_type = list()
    CIK = list()
    date = list()
    url = list()
    with open(file_name) as myfile:
        for record in myfile.readlines()[data_start:]:
            # Strip before filtering so whitespace-only fragments are dropped;
            # a list (not map/filter) so indexing also works on Python 3.
            fields = [field.strip() for field in record.split("  ")]
            fields = [field for field in fields if field]
            if len(fields) < 5:
                # Blank or truncated line: there is no complete record here.
                continue
            # Taking the fixed columns from the right keeps them aligned if
            # the company name itself contains a double space.
            company_name.append(" ".join(fields[:-4]))
            report_type.append(fields[-4])
            CIK.append(fields[-3])
            date.append(fields[-2])
            url.append(fields[-1])
    data_dict = {"company":company_name,
                 "report":report_type,
                 "CIK":CIK,
                 "date":date,
                 "url":url}
    df = pd.DataFrame.from_dict(data_dict)
    df["company_l"] = df["company"].str.lower()
    return df

In [80]:
def search_company_name(data, company_name, report_type):
    """Look up the first report URL for each requested CIK.

    Parameters:
        data: data frame with "report", "CIK" and "url" columns; the CIK
            values are compared as strings.
        company_name: iterable of CIK numbers (each is converted with str()).
        report_type: form type to keep, compared for exact equality with the
            "report" column (e.g. "10-K").

    Returns:
        A list with one entry per requested CIK, in the same order: the URL
        of the first matching row, or NaN when there is no match.
    """
    data_report = data[data["report"] == report_type]
    # One pass over the filtered rows; setdefault keeps the first URL per CIK.
    first_url = {}
    for cik, url in zip(data_report["CIK"], data_report["url"]):
        first_url.setdefault(cik, url)
    # np.nan rather than np.NaN: the capitalised alias is gone in NumPy 2.
    return [first_url.get(str(company), np.nan) for company in company_name]

In [81]:
def getting_data(url_list, year_number, qtr, report_type, cik_list=None):
    """Download the text of each report and collect it in a data frame.

    For every non-null URL the filing index page is fetched, the first link
    in the second row of its "tableFile" table is followed, and the text of
    that page is stored UTF-8 encoded. Null URLs produce NaN.

    Parameters:
        url_list: filing index URLs, with NaN/None for missing reports.
        year_number: value written to the "year" column.
        qtr: value written to the "quarter" column.
        report_type: value written to the "report_type" column.
        cik_list: CIK numbers, one per entry of url_list in the same order.
            When omitted, the module-level `names` list is used, which is
            what this function read before the parameter existed.

    Returns:
        A data frame with columns "CIK", "full_text", "year", "quarter" and
        "report_type".
    """
    if cik_list is None:
        # Backward compatibility with callers that rely on the global.
        cik_list = names
    full_text_list = list()
    for report in url_list:
        if pd.isnull(report):
            # np.nan rather than np.NaN: the alias is gone in NumPy 2.
            full_text_list.append(np.nan)
        else:
            text = requests.get(report).text
            text = BeautifulSoup(text, "html.parser")
            target = text.find("table",{"class":"tableFile"})
            # Row 0 is skipped; the first link of row 1 is taken as the
            # report document.
            link = target.find_all("tr")[1].find_all("a")[0]["href"]
            final_link = "https://www.sec.gov"+link
            text_report = requests.get(final_link).text
            text_report = BeautifulSoup(text_report, "html.parser")
            content = text_report.text
            content = content.encode(encoding='UTF-8')
            full_text_list.append(content)
    data_text = {"CIK":list(cik_list),"full_text":full_text_list}
    df_text = pd.DataFrame.from_dict(data_text)
    df_text["year"] = year_number
    df_text["quarter"] = qtr
    df_text["report_type"] = report_type
    return df_text

In [82]:
def adding_company_name(data):
    """Attach ticker, company name and CIK # from cik_list.csv to data.

    Left-joins data's "CIK" column against the "CIK #" column of the CSV, so
    every row of data is kept and unmatched rows get NaN in the added columns.
    """
    wanted_columns = ["Company", "Name", "CIK #"]
    lookup = pd.read_csv("cik_list.csv")[wanted_columns]
    return data.merge(lookup, how="left", left_on="CIK", right_on="CIK #")

In [83]:
def putting_together(year_number,qtr, company_name, report_type):
    """Run the whole pipeline for one quarter and return the complete rows.

    Downloads the quarter's index, parses it, looks up the report URL for
    each CIK in company_name, downloads the report text and merges the
    company details back in. Rows containing any NaN are dropped and the
    index is renumbered from 0.

    NOTE(review): company_name is only handed to search_company_name; the
    getting_data call does not receive it -- confirm the CIK column it builds
    lines up with company_name.
    """
    index_path = getting_crawler(year_number, qtr)
    index_frame = convert_to_pandas(index_path, milestone_search(index_path))
    report_urls = search_company_name(index_frame, company_name, report_type)
    report_frame = getting_data(report_urls, year_number, qtr, report_type)
    complete_rows = adding_company_name(report_frame).dropna()
    return complete_rows.reset_index(drop=True)

Demonstration - extracting the latest document for all 30 companies

In [84]:
# CIK numbers to look up, taken from the "CIK #" column of the lookup table.
names = list(company_table["CIK #"])

In [85]:
# 10-K reports found in the 2017 first-quarter index for the CIKs in names.
data_2017_qtr1 = putting_together(2017, 1, names, "10-K")

In [86]:
# 10-K reports found in the 2016 fourth-quarter index for the CIKs in names.
data_2016_qtr4 = putting_together(2016, 4, names, "10-K")

In [87]:
# 10-K reports found in the 2016 third-quarter index for the CIKs in names.
data_2016_qtr3 = putting_together(2016, 3, names, "10-K")

In [90]:
# Stack the three quarterly frames, newest first, and renumber the rows from 0.
final = [data_2017_qtr1,data_2016_qtr4,data_2016_qtr3]
latest = pd.concat(final).reset_index(drop=True)

In [91]:
# Display the combined frame.
latest

Unnamed: 0,CIK,full_text,year,quarter,report_type,Company,Name,CIK #
0,4962,\n10-K\n1\nd321397d10k.htm\nFORM 10-K\n\n\nFor...,2017,1,10-K,AXP,American Expression Co.,4962
1,12927,\n10-K\n1\na201612dec3110k.htm\n10-K\n\n\n\n\n...,2017,1,10-K,BA,Boeing Co.,12927
2,18230,\n10-K\n1\ncat_10-kx12312016.htm\n10-K\n\n\n\n...,2017,1,10-K,CAT,Caterpillar Inc.,18230
3,93410,\n10-K\n1\ncvx-123116x10kdoc.htm\n10-K\n\n\n\n...,2017,1,10-K,CVX,Chevron Corp,93410
4,30554,\n10-K\n1\ndd-12312016x10k.htm\n10-K\n\n\n\n\n...,2017,1,10-K,DD,Dupont E I De Numours and Co.,30554
5,40545,\n10-K\n1\nge10k2016.htm\n\n\n\n\n\n\n\n\n\n\n...,2017,1,10-K,GE,General Electric Co.,40545
6,886982,\n10-K\n1\nd308759d10k.htm\nFORM 10-K\n\n\nFor...,2017,1,10-K,GS,Goldman Sachs Group Inc.,886982
7,354950,\n10-K\n1\nhd-01292017x10xk.htm\n10-K\n\n\n\n\...,2017,1,10-K,HD,Home Deport Inc.,354950
8,51143,\n10-K\n1\na2230222z10-k.htm\n10-K\n\n\n\n\n\n...,2017,1,10-K,IBM,International Business Machines Co.,51143
9,50863,\n10-K\n1\na10kdocument12312016q4.htm\n10-K\n\...,2017,1,10-K,INTC,Intel Co.,50863


In [92]:
#latest.to_csv("latest_10_k.csv")

In [97]:
# full_text values of the rows whose CIK is 4962 (ticker AXP in the lookup table).
full_text = list(latest[latest["CIK"]==4962]["full_text"])