In [1]:
# importing required libraries
import re
import os
import time
import urllib.request
import requests
import unicodedata
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# pip install selenium --user
# conda install -c anaconda beautifulsoup4

In [8]:
def dfSec10k(cik_inp):
    # set the central index key
    cik = cik_inp
    
    # pass the main url from SEC
    url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + cik + '&type=10-k&dateb=&owner=exclude&count=40&search_text='

    # grab the html
    try:
        page = urllib.request.urlopen(url)
    except:
        print("An error occured.")

    soup = BeautifulSoup(page, "html.parser")

    list_td = []
    for item in soup.find_all('td',class_=["small",""]):
        list_td.append(item.text)

    list_td_2 = []
    for item1 in soup.find_all('td',class_=["small",""]):
        for i in item1.find_next('td'):
            list_td_2.append(i)

    df = pd.DataFrame(list_td)
    df1 = pd.DataFrame(list_td_2)

    df.columns = ['table']
    df1.columns = ['date']

    result = pd.concat([df, df1], axis=1)

    # Data cleanup step - 1
    result['date'] = result['date'].astype(str)
    result['date'] = pd.to_datetime(result['date'])
    result['year'] = pd.DatetimeIndex(result['date']).year
    del result['date']
    
    # Data cleanup step - 2
    result['table'] = result['table'].astype(str) 
    result['table'] = result['table'].str.replace(' ', '').str.replace('[', '').str.replace(']', '').str.replace(',', '').str.replace('  ', '')
    result['table'] = result['table'].str.replace('(', '').str.replace(')', '').str.replace('-', '',2).str.replace('KB', '')

    result['table'] = result['table'].str.replace('AnnualreportSection13and15d', '').str.replace('Amend', '').str.replace('notSKItem405', '')
    result['table'] = result['table'].str.replace('34Act', '').str.replace('Size', '').str.replace('MB', '').str.replace('SKItem405', '')

    result['table'] = result['table'].str.replace('AnnualreportSections13and15d', '').str.replace('Accno', '').str.replace(':', '',1)

    result['acc_no'] = result['table'].str[:20]
    del result['table']
    
    index_check = []
    for index, row in result.iterrows():
        if row['acc_no'][0:2] == '00':
            index_check.append(index)

    result = result.iloc[index_check].reset_index(drop=True)

    # Data cleanup step - 3
    result['acc_no2'] = result['acc_no'].str.replace('-', '')
    
    result = result.drop_duplicates(subset='year', keep="first")
    result = result.reset_index(drop=True)
    
    result_r11 = result.head(11)
    result_r11
    
    # URL filing link extraction
    df_furl = []

    for x,y in result_r11.iterrows():
        # URL extraction step - 1
        acc_1 = y['acc_no']
        acc_2 = y['acc_no2']

        url_2 = 'https://www.sec.gov/Archives/edgar/data/'+cik+'/'+acc_2+'/'+acc_1+'-index.htm'

        try:
            page_1 = urllib.request.urlopen(url_2)
        except:
            print("An error occured.")

        soup_1 = BeautifulSoup(page_1, "html.parser")
        company_page = [item_page.get_text('\n',strip=True) for item_page in soup_1.select("div.formGrouping")][1]
        company_page = str(company_page)

        r = requests.get(url_2)
        df_table = pd.read_html(r.text)
        df_tp = df_table[0]
        del df_tp['Seq']
        del df_tp['Type']
        del df_tp['Size']
        df_tp = df_tp.astype(str)
        df_tp['Document'] = df_tp['Document'].str.replace(' iXBRL', '')

        # URL extraction step - 2
        doc = df_tp['Document'].iloc[-1]

        url_3 = 'https://www.sec.gov/Archives/edgar/data/'+cik+'/'+acc_2+'/'+doc
        df_furl.append(url_3)
        
    # Text URL output to dataframe
    df_furl
    df_out = pd.DataFrame(df_furl, columns=["final_url"])

    result_r11["final_url"] = df_out["final_url"]
    del result_r11['acc_no2']
    result_r11
    
    return result_r11

In [5]:
def pullTextAll(link):
    URL_text = str(link)

    # Grab the response
    responses = requests.get(URL_text)

    # Parse the response (the XML flag works better than HTML for 10Ks)
    souper = BeautifulSoup(responses.content, 'lxml')

    text_b = []
    text_r = []
    for filing_document in souper.find_all('document'):
        document_type = filing_document.type.find(text=True, recursive=False).strip()

        if document_type == "10-K":
            text = filing_document.find('text').extract().text

            text = re.sub('\n', ' ', text)
            text = re.sub('\xa0', ' ', text)
            matches = list(re.finditer(re.compile('I[tT][eE][mM] [0-9][a-zA-Z]*\s*[.|:|-]\s*'), text))

            # Business section
            try:
                start_b = max([i for i in range(len(matches)) if ((matches[i][0].upper().replace(" ","") == 'ITEM1.')|
                               (matches[i][0].upper().replace(" ","") == 'ITEM1:')|
                               (matches[i][0].upper().replace(" ","") == 'ITEM1-'))])
                end_b = start_b+1
                start_b = matches[start_b].span()[1]
                end_b = matches[end_b].span()[0]
                text_b = text[start_b:end_b]
            except:
                text_b = None
                
            # Risk section
            try:
                start_r = max([i for i in range(len(matches)) if ((matches[i][0].upper().replace(" ","") == 'ITEM1A.')|
                               (matches[i][0].upper().replace(" ","") == 'ITEM1A:')|
                               (matches[i][0].upper().replace(" ","") == 'ITEM1A-'))])
                end_r = start_r+1
                start_r = matches[start_r].span()[1]
                end_r = matches[end_r].span()[0]
                text_r = text[start_r:end_r]
            except:
                text_r = None

    return text_b, text_r

In [6]:
# Read cik data from excel
cik_in = pd.read_excel('S10K_Companies.xlsx')
del cik_in['cik_short']
del cik_in['name']
cik_in = cik_in.loc[21:21]
cik_in = cik_in['cik'].tolist()
cik_in = [str(x) for x in cik_in] 
cik_in

['1291080']

In [10]:
# Extraction loop and file creation
for rw in cik_in:
    cikNo = rw
    
    # Getting the company name 
    addr = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + cikNo + '&type=10-k&dateb=&owner=exclude&count=40&search_text='
    
    try:
        linOut = urllib.request.urlopen(addr)
    except:
        print("An error occured.")

    s = BeautifulSoup(linOut, "html.parser")
    company_name = [item.get_text(strip=True) for item in s.select("span.companyName")]
    company_name = str(company_name)
    company_name = re.sub('[^A-Za-z0-9]+', ' ', company_name)

    n1 = company_name.split()[0].upper()
    n2 = company_name.split()[1].upper().replace('CIK', '')
    print (n1 + "_" + n2)
    
    dataFrameOut = dfSec10k(cikNo)
    
    # Method to pull text
    df_text_bus = []
    df_text_risk = []

    for i in dataFrameOut.final_url:

        adder_bus, adder_risk = pullTextAll(str(i))
        df_text_bus.append(adder_bus)
        df_text_risk.append(adder_risk)     

    # Final output to excel spreadsheet
    df_text_out_1 = pd.DataFrame(df_text_bus, columns=["text_business"])
    dataFrameOut["text_business"] = df_text_out_1["text_business"]

    df_text_out_2 = pd.DataFrame(df_text_risk, columns=["text_risk"])
    dataFrameOut["text_risk"] = df_text_out_2["text_risk"]
    
    pd.DataFrame(dataFrameOut).to_excel(r'Temp_Files_2/' + n1 + '_' + n2 + '_' + cikNo + '.xlsx', sheet_name='Sheet1', index = False)

An error occured.


NameError: name 'linOut' is not defined