In [12]:
from bs4 import BeautifulSoup
import re
from sec_edgar_downloader import Downloader
import pandas as pd
from os import listdir
from os.path import isfile, join, isdir
import csv
import shutil
import unicodedata
import utils

In [13]:
# Load the company ticker list
df = pd.read_csv('../../tickerList.csv')
df.head()

Unnamed: 0,Ticker,Company Name,Industry,Top 100
0,AAPL,Apple Inc.,Technology Hardware Storage & Peripherals,1
1,MSFT,Microsoft Corporation,Software,1
2,AMZN,"Amazon.com, Inc.",Internet & Direct Marketing Retail,1
3,FB,"Facebook, Inc. Class A",Interactive Media & Services,1
4,BRKB,Berkshire Hathaway Inc. Class B,Diversified Financial Services,1


In [14]:
# set the location where all the downloaded filings will be saved
downloadPath = "../../"
dl = Downloader(downloadPath)

In [27]:
# Download 10 latest 10-K filings for the given company ticker
def downloadFilings(ticker, filing_type="10-K", latest=10):
    dl.get(filing_type, ticker, latest)

def removefilings(path):
    shutil.rmtree(path, ignore_errors=True)
# downloadFilings("BRK")

In [28]:
pos_dat = None
document = {}
itemStartEndMapping = {"item1":"item1a","item1a":"item1b","item1b":"item2", "item2":"item3","item3":"item4","item4":"item5",
                      "item5":"item6", "item6":"item7","item7":"item7a","item7a":"item8", "item8":"item9","item9":"item9a",
                       "item9a":"item9b","item9b":"item10","item10":"item11","item11":"item12","item12":"item13",
                       "item13":"item14","item14":"item15","item15":"item16"}
def extractHTMLSections(raw_10k):
    
    global pos_dat
    # Regex to find <DOCUMENT> tags
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    
    # Regex to find <TYPE> tag prceeding any characters, terminating at new line
    type_pattern = re.compile(r'<TYPE>[^\n]+')
    
    # Create 3 lists with the span idices for each regex
    
    ### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
    ### First filter will give us document tag start <end> and document tag end's <start> 
    ### We will use this to later grab content in between these tags
    doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

    ### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
    ### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
    ### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K' 
    ### as section names
    doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

    # Create a loop to go through each section type and save only the 10-K section in the dictionary
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K' and doc_type not in document.keys():
            document[doc_type] = raw_10k[doc_start:doc_end]
    
    # Write the regex to get different sections from the 10-K
    regex = re.compile(r'(>(\s|&#160;|&nbsp;)*item(\s|&#160;|&nbsp;)*(1(\s|&nbsp;|&#160;|&#160;\(|\(|)*a|6|1(\s|&nbsp;|&#160;|&#160;\(|\(|)*b|7(\s|&nbsp;|&#160;|&#160;\(|\(|)*a|7|8|2|4)\.{0,1})', re.I)
    
    # Use finditer to math the regex
    matches = regex.finditer(document['10-K'])
    
    # Create the dataframe
    test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

    test_df.columns = ['item', 'start', 'end']
    test_df['item'] = test_df.item.str.lower()
    
    # Get rid of unnesesary charcters from the dataframe
    test_df.replace('&#160;ris',' ',regex=True,inplace=True)
    test_df.replace('&#160;unresolve',' ',regex=True,inplace=True)
    test_df.replace('&#160;',' ',regex=True,inplace=True)
    test_df.replace('&nbsp;',' ',regex=True,inplace=True)
    test_df.replace(' ','',regex=True,inplace=True)
    test_df.replace('\.','',regex=True,inplace=True)
    test_df.replace('>','',regex=True,inplace=True)
    test_df.replace('\n','',regex=True,inplace=True)
    
    # Aggregate the different parts of the sane section
    pos_dat = test_df.groupby(['item']).agg({'start': utils.customsort, 'end': 'max'})
    print("Sections Extracted:{}".format(list(pos_dat.index)))
#     print("send any section name as printed")
    return list(pos_dat.index)

def extractTextFromSection(key, document_type = "10-K"):
    global pos_dat
    if key in list(pos_dat.index):
        if document_type not in document:
            print("{} not found".format(document_type))
            # Get Item 1a
        else:
            end = ""
            original = key
            while True:
                if original not in itemStartEndMapping.keys():
                    break
                elif itemStartEndMapping[original] not in list(pos_dat.index) or pos_dat["start"].loc[key]>= pos_dat["start"].loc[itemStartEndMapping[original]]:
                    original = itemStartEndMapping[original]
                else:
                    end = itemStartEndMapping[original]
                    break
            if len(end) == "0":
                print("Error Cannnot find End tag for {}".format(key))
                return
            else:                
                item_raw = document['10-K'][pos_dat['start'].loc[key]:pos_dat['end'].loc[end]]
                item_raw_content = BeautifulSoup(item_raw, "lxml")
                content = item_raw_content.get_text()
                content = unicodedata.normalize("NFKD", content)
                content = content.replace("\n", " ")
                text = content.replace("  ", " ")
                content = content.lower()
                return content
    else:
        print("{} not found".format(key))
        return ""



In [17]:
index = 0

In [29]:
## Get List of all the folders concatenated with its relative path
companyLessthan10 = []
companyWithNoData = []
## SEC-Edgar module creates a folder 'sec_edgar_filings' in which it downloades the filings
newPath = downloadPath + "sec_edgar_filings"
companyFilingsPath = ""
erroneousCompanies = []
## loop through all folders i.e. companies
with open('../../writeData.csv', mode='w+', encoding="utf-8", newline='') as file:
    fieldnames = ['Ticker', "Company Name", "Industry","Top 100","Year","Risk Factors", "MDA"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for i in range(len(df)):
#         try:
        row = df.iloc[i]
        ## Download latest 10 filings for the Company
        latest = 10
        print("Downloading {} latest Filings for {}".format(latest, row["Company Name"]))
        downloadFilings(ticker=row['Ticker'], latest = 10)

        ## SEC-Edgar module creates a subfolder named by the company ticker inside 'sec_edgar_filings'
        # Therfore join the path 
        companyFilingsPath = join(newPath, row['Ticker'], "10-K")
        print("companyfilingsPath {}".format(companyFilingsPath))

        ## Sanity Check: if the files are downloaded
        if not isdir(companyFilingsPath):
            print("No filings Downloaded for {}".format(row["Company Name"]))
            companyWithNoData.append(row["Company Name"])
            continue
        # List downloded filing path and file names
        companyFilings = [(join(companyFilingsPath, f), f) for f in listdir(companyFilingsPath)]

        ## Log the companies having less than 10 recent filings
        if len(companyFilings) <10:
            companyLessthan10.append(row["Company Name"])

        # Extract Section from each filings
        for filing in companyFilings:

            # Get filing path and filing path
            filingPath, filingName = filing

            f = open(filingPath,'r')
            textdata = f.read()
            f.close()
            sectionList = extractHTMLSections(textdata)
            risk_text = ""
            mda_text = ""
            ## Extracting Risk FActors Section Item 1A
            if "item1a" in sectionList:
                risk_text = extractTextFromSection(key ="item1a")
                print("Extracted risk text Length:{}".format(len(risk_text)))
            else:
                index = i
                print("iem1a not in section list", row["Company Name"], filing)
#                     break
            if "item7" in sectionList:
                mda_text = extractTextFromSection(key ="item7")
                print("Extracted MDA text Length:{}".format(len(mda_text)))
            else:
                index = i
                print("item7 not in section list", row["Company Name"], filing)
#                     break
            if len(risk_text) <100:
                index = i
                print("empty RISK text", row["Company Name"], filing)
#                 break
                
            if len(mda_text) <100:
                index = i
                print("empty MDA text", row["Company Name"], filing)
#                     break
            
            # Get year of the filling from the filing name
            # file name ex "0001283630-16-000038.txt", 2016 represents year in which 10k was filed and it was of 2015
            filingName = filingName.split("-")
            #way to write to csv file
            writer.writerow({"Ticker":row['Ticker'], "Company Name":row["Company Name"],"Industry": row["Industry"],"Top 100":row["Top 100"], "Year":int(filingName[1])-1 +2000,"Risk Factors":risk_text, "MDA":mda_text})
        removefilings(join(newPath, row['Ticker']))
#         except Exception as e:
#             erroneousCompanies.append((e, row["Company Name"]))
#             print("Exception {} Occured for {}, skipping".format(e,row["Company Name"]))

Downloading 10 latest Filings for Apple Inc.
companyfilingsPath ../../sec_edgar_filings\AAPL\10-K
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extract

companyfilingsPath ../../sec_edgar_filings\GOOGL\10-K
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Downloading 10 latest Filings for JPMorgan Chase & Co.
companyfilingsPath ../../sec_edgar_filings\JPM\10-K
Sections Extracted:['item1a', 'item1b',

Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections

Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections

Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Downloading 10 latest Filings for Pfizer Inc.
companyfilingsPath ../../sec_edgar_filings\PFE\10-K
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'it

Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Downloading 10 latest Filings for Wells Fargo & Company
companyfilingsPath ../../sec_edgar_filings\WFC\10-K
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted risk text Length:53078
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'i

ChunkedEncodingError: ('Connection broken: OSError("(10051, \'WSAENETUNREACH\')")', OSError("(10051, 'WSAENETUNREACH')"))

In [None]:
companyWithNoData

In [None]:
companyLessthan10

In [None]:
erroneousCompanies