In [134]:
from bs4 import BeautifulSoup
import re
from sec_edgar_downloader import Downloader
import pandas as pd
from os import listdir
from os.path import isfile, join, isdir
import csv
import shutil
import unicodedata
import utils

In [135]:
# Root directory under which the downloader saves every downloaded filing.
downloadPath = "./"
# Shared downloader instance; downloadFilings() calls dl.get() on it.
dl = Downloader(downloadPath)

In [136]:
# Company ticker whose filings are downloaded and parsed by the cells below.
ticker = "AAPL"
# ticker = "C"
# pos_dat = None
# Module-level placeholder; the extraction cell rebinds this name to the dict
# returned by extractHTMLSections().
document = {}

In [137]:
# Download filings for the given company ticker (defaults: filing type "10-K", latest=10)
def downloadFilings(ticker, filing_type="10-K", latest=10):
    """Download filings for ``ticker`` through the module-level ``dl`` downloader.

    Args:
        ticker: Company ticker symbol, forwarded to ``dl.get``.
        filing_type: Filing form to request; defaults to "10-K".
        latest: Forwarded as the third positional argument of ``dl.get``;
            presumably the number of most recent filings to fetch --
            TODO confirm against the installed sec_edgar_downloader version.
    """
    dl.get(filing_type, ticker, latest)

def removefilings(path):
    """Recursively delete the directory tree at ``path``.

    Errors are ignored (``ignore_errors=True``), so a missing path or a
    partially failed deletion does not raise.
    """
    shutil.rmtree(path, ignore_errors=True)
# downloadFilings("BRK")

In [138]:
# Fetch 10-Q filings for the ticker (latest=40 is forwarded to dl.get).
# NOTE(review): the extraction cell further down only reads the "10-K" folder,
# so these 10-Q files are not parsed in this notebook -- confirm this download is still needed.
downloadFilings(ticker, "10-Q", latest=40)

In [99]:
# 10-K item labels in filing order.
_ITEM_SEQUENCE = (
    "item1", "item1a", "item1b",
    "item2", "item3", "item4", "item5", "item6",
    "item7", "item7a",
    "item8",
    "item9", "item9a", "item9b",
    "item10", "item11", "item12", "item13", "item14", "item15", "item16",
)
# Maps each item label to the label that immediately follows it,
# e.g. "item1a" -> "item1b"; the last label ("item16") has no entry.
itemStartEndMapping = dict(zip(_ITEM_SEQUENCE[:-1], _ITEM_SEQUENCE[1:]))
def extractHTMLSections(raw_10k):
    """Isolate the 10-K document in a raw filing and index its item headings.

    The raw text holds several ``<DOCUMENT>...</DOCUMENT>`` blocks, each with a
    ``<TYPE>`` line (e.g. ``10-K``, ``EX-10.17``). Only the first block whose
    type is exactly ``10-K`` is kept; headings for items 1A, 1B, 2, 4, 6, 7,
    7A and 8 are then located inside it with a regular expression.

    Args:
        raw_10k (str): Full text of the downloaded filing.

    Returns:
        tuple: ``(section_names, pos_dat, document)`` where
            * ``section_names`` is the list of normalised labels found
              (e.g. ``'item1a'``),
            * ``pos_dat`` is a DataFrame indexed by label with ``start`` and
              ``end`` character offsets into ``document['10-K']``,
            * ``document`` maps ``'10-K'`` to the document text, or is empty
              when no 10-K document exists.
        When the 10-K document or its headings cannot be found, an empty list
        and an empty ``pos_dat`` are returned instead of raising.
    """
    document = {}

    # Regexes for the <DOCUMENT> boundaries and for the <TYPE> line (up to the newline)
    doc_start_pattern = re.compile(r'<DOCUMENT>')
    doc_end_pattern = re.compile(r'</DOCUMENT>')
    type_pattern = re.compile(r'<TYPE>[^\n]+')

    # Content of a document lies between the END of <DOCUMENT> and the START of </DOCUMENT>
    doc_start_is = [m.end() for m in doc_start_pattern.finditer(raw_10k)]
    doc_end_is = [m.start() for m in doc_end_pattern.finditer(raw_10k)]

    # Text following '<TYPE>' is the document type; strip() drops a trailing
    # '\r' or blanks that would otherwise prevent the equality check below.
    doc_types = [t[len('<TYPE>'):].strip() for t in type_pattern.findall(raw_10k)]

    # Keep only the first document typed exactly '10-K'
    for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
        if doc_type == '10-K' and doc_type not in document:
            document[doc_type] = raw_10k[doc_start:doc_end]

    # Returned whenever nothing can be indexed, so callers can still test
    # membership in the section list instead of handling an exception.
    empty_positions = pd.DataFrame(columns=['start', 'end'],
                                   index=pd.Index([], name='item'))

    if '10-K' not in document:
        print("{} not found".format('10-K'))
        return [], empty_positions, document

    # Regex matching the item headings of interest inside the 10-K
    regex = re.compile(r'(>(\s|&#160;|&nbsp;)*item(\s|&#160;|&nbsp;)*(1(\s|&nbsp;|&#160;|&#160;\(|\(|)*a|6|1(\s|&nbsp;|&#160;|&#160;\(|\(|)*b|7(\s|&nbsp;|&#160;|&#160;\(|\(|)*a|7|8|2|4)\.{0,1})', re.I)

    rows = [(m.group(), m.start(), m.end()) for m in regex.finditer(document['10-K'])]
    if not rows:
        # Assigning three column names to an empty frame would raise; bail out early.
        print("Sections Extracted:{}".format([]))
        return [], empty_positions, document

    test_df = pd.DataFrame(rows, columns=['item', 'start', 'end'])
    test_df['item'] = test_df['item'].str.lower()

    # Strip entities, blanks, dots, '>' and newlines so every heading collapses
    # to a bare label such as 'item1a'. Order matters: entities become blanks
    # first, then blanks are removed.
    cleanup_steps = (
        ('&#160;ris', ' '),
        ('&#160;unresolve', ' '),
        ('&#160;', ' '),
        ('&nbsp;', ' '),
        (' ', ''),
        (r'\.', ''),
        ('>', ''),
        ('\n', ''),
    )
    for pattern, replacement in cleanup_steps:
        test_df['item'] = test_df['item'].str.replace(pattern, replacement, regex=True)

    # Merge repeated occurrences of the same label: 'start' is chosen by
    # utils.customsort, 'end' is the largest end offset.
    pos_dat = test_df.groupby(['item']).agg({'start': utils.customsort, 'end': 'max'})
    print("Sections Extracted:{}".format(list(pos_dat.index)))
    return list(pos_dat.index), pos_dat, document

def extractTextFromSection(key,pos_dat, document,document_type = "10-K"):
    """Return the plain, lower-cased text of one item section.

    The section starts at the ``start`` offset of ``key`` and ends at the
    ``end`` offset of the next item heading that is present in ``pos_dat`` and
    located after it; successors are looked up through ``itemStartEndMapping``.

    Args:
        key (str): Normalised item label, e.g. ``'item1a'``.
        pos_dat: DataFrame indexed by item label with ``start``/``end`` offsets.
        document (dict): Maps a document type to its raw text.
        document_type (str): Key of ``document`` to slice; defaults to "10-K".

    Returns:
        str: The extracted text, or ``""`` when the section, the document or a
        closing heading cannot be found (a message is printed in each case).
    """
    section_names = set(pos_dat.index)
    if key not in section_names:
        print("{} not found".format(key))
        return ""
    if document_type not in document:
        print("{} not found".format(document_type))
        return ""

    section_start = pos_dat["start"].loc[key]

    # Walk the successor chain until a heading is found that exists in this
    # filing and lies after the section start.
    end = ""
    candidate = key
    while candidate in itemStartEndMapping:
        successor = itemStartEndMapping[candidate]
        if successor in section_names and section_start < pos_dat["start"].loc[successor]:
            end = successor
            break
        candidate = successor

    if not end:
        print("Error Cannnot find End tag for {}".format(key))
        return ""

    item_raw = document[document_type][section_start:pos_dat['end'].loc[end]]
    # Strip the markup, then fold compatibility characters (e.g. non-breaking
    # spaces) with NFKD normalisation.
    content = BeautifulSoup(item_raw, "lxml").get_text()
    content = unicodedata.normalize("NFKD", content)
    # Newlines become blanks; runs of blanks are not collapsed, so text
    # lengths reported by callers include them.
    content = content.replace("\n", " ")
    return content.lower()



In [103]:
# The SEC-Edgar module creates a folder 'sec_edgar_filings' in which it downloads the filings
newPath = downloadPath + "sec_edgar_filings"
text_data = {"risktext":[],"mdatext":[]}
dataStats = {}
fieldnames = ['Ticker', "Company Name", "Industry","Top 100","Year","Risk Factors", "MDA"]

# Download the latest 10 filings (default type "10-K") for the company
latest = 10
downloadFilings(ticker=ticker, latest=latest)
dataStats[ticker] = {"totalFilings":0,"MDAlen":{},"Risklen":{},"years":[]}

# The SEC-Edgar module creates a sub-folder named after the ticker inside
# 'sec_edgar_filings', therefore join the path
companyFilingsPath = join(newPath, ticker, "10-K")

# Sanity check: were any filings downloaded?
if isdir(companyFilingsPath):
    # (full path, file name) for every downloaded filing
    companyFilings = [(join(companyFilingsPath, f), f) for f in listdir(companyFilingsPath)]
    dataStats[ticker]["totalFilings"] = len(companyFilings)
    # Log companies with fewer filings than requested. The count is already
    # kept in "totalFilings"; the stats dict itself must stay intact because
    # the loop below appends to it.
    if len(companyFilings) < latest:
        print("Only {} filings available for {}".format(len(companyFilings), ticker))

    # Extract the sections from each filing
    for filing in companyFilings:
        filingPath, filingName = filing

        with open(filingPath, 'r') as f:
            textdata = f.read()

        # The middle field of a file name such as "0001283630-16-000038.txt"
        # is the two-digit year in which the 10-K was filed; the report covers
        # the year before. The value stays two-digit (e.g. 15), and a filing
        # made in year "00" yields -1.
        filingYear = int(filingName.split("-")[1]) - 1
        dataStats[ticker]["years"].append(filingYear)

        sectionList, data, document = extractHTMLSections(textdata)
        risk_text = ""
        mda_text = ""

        # Risk Factors: Item 1A
        if "item1a" in sectionList:
            # 'or ""' keeps len() below safe if the extractor yields nothing
            risk_text = extractTextFromSection("item1a", data, document) or ""
        else:
            print("item1a not in section list", filing)
        text_data["risktext"].append(risk_text)

        # Management's Discussion and Analysis: Item 7
        if "item7" in sectionList:
            mda_text = extractTextFromSection("item7", data, document) or ""
            print("Extracted MDA text Length:{}".format(len(mda_text)))
        else:
            print("item7 not in section list", filing)
        text_data["mdatext"].append(mda_text)

        dataStats[ticker]["MDAlen"][filingYear] = len(mda_text)
        dataStats[ticker]["Risklen"][filingYear] = len(risk_text)
else:
    print("No filings Downloaded for {}".format(ticker))

# Remove everything that was downloaded for this ticker
removefilings(join(newPath, ticker))

Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:60016
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:56851
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:33501
Sections Extracted:['item1a', 'item1b', 'item2', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:65410
Sections Extracted:['item1a', 'item1b', 'item2', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:61832
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:63361
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:65905
Sections Extracted:['item1a', 'item1b', 'item2', 'item4', 'item6', 'item7', 'item7a', 'item8']
Extracted MDA text Length:68342
Se

In [108]:
# Only the last expression of a cell is rendered, so this first line displays nothing.
dataStats
# Rendered output: the statistics collected for the current ticker.
dataStats[ticker]

{'totalFilings': 10,
 'MDAlen': {16: 60016,
  17: 56851,
  18: 33501,
  9: 65410,
  10: 61832,
  11: 63361,
  12: 65905,
  13: 68342,
  14: 64159,
  15: 58349},
 'Risklen': {16: 53078,
  17: 55255,
  18: 54781,
  9: 51523,
  10: 51092,
  11: 48705,
  12: 50965,
  13: 53250,
  14: 53687,
  15: 52302},
 'years': [16, 17, 18, 9, 10, 11, 12, 13, 14, 15]}

In [None]:
# Preview: first 500 characters of the first extracted risk-factor text.
text_data["risktext"][0][:500]

In [None]:
# Preview: first 500 characters of the second extracted risk-factor text.
text_data["risktext"][1][:500]

In [56]:
import difflib

In [93]:
# Tokenise two extracted risk-factor texts on single blanks. Despite the
# *_lines names, every list element is a blank-separated token, not a line.
# Which filings indices 4 and 2 refer to depends on the order in which
# os.listdir returned the files when text_data was filled.
text1_lines = text_data["risktext"][4].split(" ")
text2_lines = text_data["risktext"][2].split(" ")
# Alternatives tried earlier: difflib.Differ().compare, difflib.ndiff and
# difflib.unified_diff (plain-text diffs joined with '\n').

# Build the side-by-side comparison; `diff` is a string that the next cell
# encodes and writes to compare.html.
d = difflib.HtmlDiff()
diff = d.make_file(text1_lines, text2_lines)



In [94]:
# Write the HTML diff as UTF-8 bytes. The context manager closes the file even
# when the write fails; plain "wb" is enough because nothing is read back.
with open("compare.html", "wb") as f:
    f.write(diff.encode('utf8'))

In [155]:
""" 10-K Data Analysis """
import json

# Per-company statistics for the 10-K run (alternative file: "data_10K_1K/dataStats.txt")
with open("data_10K_3K/dataStats.txt") as f:
    data = json.load(f)

sections = ["item1a", "item7"]
min_section_length = 100   # a section counts as extracted only above this length
total = len(data)

# --- Per-year count of companies with a usable section ----------------------
ten_filing = 0
less_than_ten = 0
countdict = {}
# The loop variable is deliberately not called `ticker`, so the ticker selected
# at the top of the notebook is not overwritten by this cell.
for symbol, company in data.items():
    if company["totalfilings"] == 10:
        ten_filing += 1
    else:
        less_than_ten += 1
    available = [section for section in sections if section in company]
    for i, year in enumerate(company["years"]):
        year_counts = countdict.setdefault(year, {"item1a": 0, "item7": 0})
        for section in available:
            if company[section][i] > min_section_length:
                year_counts[section] += 1
print(countdict)

# --- Companies with complete coverage ---------------------------------------
# A company is covered when it has exactly 10 years on record, its earliest
# year value is 10, and even its shortest Item 1A / Item 7 text exceeds the
# minimum length.
coverage = 0
for company in data.values():
    if "item1a" not in company or "item7" not in company:
        continue
    years = sorted(company["years"])
    if (len(years) == 10
            and min(company["item1a"]) > min_section_length
            and min(company["item7"]) > min_section_length
            and years[0] == 10):
        coverage += 1
print(coverage)
print(total)
# Guard against an empty stats file instead of dividing by zero
print(coverage / total if total else 0.0)

{10: {'item1a': 1239, 'item7': 1295}, 14: {'item1a': 1420, 'item7': 1482}, 15: {'item1a': 1474, 'item7': 1522}, 16: {'item1a': 1532, 'item7': 1571}, 17: {'item1a': 1593, 'item7': 1632}, 18: {'item1a': 1666, 'item7': 1708}, 19: {'item1a': 1437, 'item7': 1481}, 11: {'item1a': 1287, 'item7': 1331}, 12: {'item1a': 1320, 'item7': 1359}, 13: {'item1a': 1383, 'item7': 1417}, 9: {'item1a': 229, 'item7': 238}, 3: {'item1a': 0, 'item7': 6}, 0: {'item1a': 0, 'item7': 3}, 1: {'item1a': 0, 'item7': 6}, 2: {'item1a': 0, 'item7': 6}, 4: {'item1a': 1, 'item7': 9}, 5: {'item1a': 9, 'item7': 9}, 7: {'item1a': 7, 'item7': 8}, 8: {'item1a': 6, 'item7': 8}, -1: {'item1a': 0, 'item7': 2}, 98: {'item1a': 0, 'item7': 0}, 6: {'item1a': 7, 'item7': 8}}
880
1856
0.47413793103448276


In [149]:
""" 10-Q Data Analysis """
import json
from collections import Counter
import numpy as np

# Per-company statistics for the 10-Q run
with open("data_10Q_1K/dataStats.txt") as f:
    data = json.load(f)

first_year, end_year = 9, 19   # year values considered: 9 <= year < 19
min_filings_per_year = 3       # filings with a usable Item 1A required per year
min_section_length = 100       # Item 1A counts as extracted only above this length

count = 0       # companies meeting the per-year minimum in every considered year
yearwise = {}   # year -> number of companies meeting the minimum in that year
# The loop variable is deliberately not called `ticker`, so the ticker selected
# at the top of the notebook is not overwritten by this cell.
for company in data.values():
    years = company["years"]
    # Usable Item 1A sections per year for this company
    yearTemp = Counter(
        year
        for i, year in enumerate(years)
        if first_year <= year < end_year and company["item1a"][i] > min_section_length
    )
    # Ascending iteration keeps the insertion order of `yearwise` (and therefore
    # its printed form) sorted by year.
    for year in sorted(yearTemp):
        if yearTemp[year] >= min_filings_per_year:
            yearwise[year] = yearwise.get(year, 0) + 1
    # A Counter yields 0 for absent years, so missing years fail the check
    if all(yearTemp[year] >= min_filings_per_year for year in range(first_year, end_year)):
        count += 1
print(count)
print(len(data.keys()))
print(yearwise)

244
449
{9: 303, 10: 318, 11: 319, 12: 334, 13: 336, 14: 334, 15: 341, 16: 348, 17: 354, 18: 368}


In [None]:
import json
from collections import Counter
import numpy as np

# Load the per-company statistics of the 10-K and the 10-Q runs; the context
# managers close each file even if parsing fails.
with open("data_10K_3K/dataStats.txt") as f:
    data_10K = json.load(f)

with open("data_10Q_1K/dataStats.txt") as f:
    data_10Q = json.load(f)

