In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Part 1: 10K Puller

### Part 1 of the code uses CIKs as an input and constructs the URLs for the relevant 10K files (that meets three conditions: CIK + filed in 2016 + 10-K's)

# Part 1.1: Navigating the Edgar Search Results Page

## Preparing search criteria

### Note: Code can be easily changed to read all the CIK's in a local .txt file. However, for the purpose of this assignment, I have deliberately avoided sending an additional .txt file (that would have all the CIK's). I have hard coded the CIK's in the code itself. In real world, it is much easier to save all the CIKs in a local .txt file and allow code to automatically read it. 

In [2]:
# INPUTS
cikList = ["4962","320193","732712","12927","18230","19617","93410","21344","1001039","30554","34088","40545","354950","50863","51143","200406","63908","310158","66740","320187","78003","80424","731766","101829","104169","1618921","789019","858877","86312","886982","1403161"]
filingType = "10-K"
dateBefore = "20170101"
count = "10"
yearOfInterest = "2016"
urlBase = "https://www.sec.gov/"
urlPt1 = "cgi-bin/browse-edgar?action=getcompany&CIK="
urlPt2 = "&type="
urlPt3 = "&dateb="
urlPt4 = "&owner=exclude&count="

In [3]:
# PADDING CIKS WITH ADDITIONAL 0S
cikListPadded = [cik.zfill(10) for cik in cikList]
print(cikListPadded[0:2], "\n")

# OBTAINING URL LIST
urlList = [urlBase + urlPt1 + cik + urlPt2 + filingType + urlPt3 
           + dateBefore + urlPt4 + count for cik in cikListPadded]
print(urlList[0:2])

# YEAR REGEX PATTERN
yearRegex = yearOfInterest + "-[0-9][0-9]-[0-9][0-9]"

['0000004962', '0000320193'] 

['https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000004962&type=10-K&dateb=20170101&owner=exclude&count=10', 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=10-K&dateb=20170101&owner=exclude&count=10']


## Extracting search result table

In [4]:
# TESTING WITH 1 URL AT A TIME
testUrl = urlList[30]

In [5]:
page = requests.get(testUrl)
pageSoup = BeautifulSoup(page.content, "html.parser")

In [6]:
tableCandidates = pageSoup.find_all("table", class_= "tableFile2")

#ENSURING ONLY ONE TABLE IS READ
if len(tableCandidates) == 1:
    resultTable = tableCandidates[0]

In [7]:
print(testUrl)

https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001403161&type=10-K&dateb=20170101&owner=exclude&count=10


## Finding the required rows

In [8]:
requiredRows = []

for tableRow in resultTable.find_all("tr"):
    for tableCell in tableRow.find_all("td"):
        if re.match(yearRegex, str(tableCell.contents[0])) is not None:
            requiredRows.append(tableRow)
            
print(len(requiredRows))

1


## Extracting the link to Filing Details page

In [9]:
filingLinks = []

for row in requiredRows:
    cell = row.find_all("a", id = "documentsbutton")
    link = urlBase + cell[0]["href"]
    filingLinks.append(link)
    
print(filingLinks)

['https://www.sec.gov//Archives/edgar/data/1403161/000140316116000058/0001403161-16-000058-index.htm']


# Part 1.2: Navigating Filing Links Page

In [10]:
links10K = []

for link in filingLinks:
    filingPage = requests.get(link)
    filingPageSoup = BeautifulSoup(filingPage.content, "html.parser")
    
    # EXTRACTING TABLE
    docTable = filingPageSoup.find_all("table", summary = "Document Format Files")[0]

    # EXTRACING REQUIRED ROW
    row10K = None
    for row in docTable.find_all("tr"):
        for tableCell in row.find_all("td"):
            if re.search("10-K", str(tableCell.contents)):
                row10K = row
                break
    
    # EXTRACTING REQUIRED LINK
    link = row10K.find_all("a")[0]["href"]
    link = urlBase + link
    links10K.append(link)

In [11]:
print(links10K)

['https://www.sec.gov//Archives/edgar/data/1403161/000140316116000058/v093016.htm']


# Part 1.3: Testing link validity by writing the file to a test output

In [12]:
testPage = requests.get(links10K[0])
testPageSoup = BeautifulSoup(testPage.content, "html.parser")

with open("testop.html", "w", encoding = "utf-8") as file:
    file.write(str(testPageSoup))