**All Imports**

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# logging
import sys
import logging
import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from datetime import date
import datetime

In [2]:
# Browser options for the Selenium-driven Chrome instance used by get_search_results().
opts = Options()
# NOTE(review): `use_chromium` looks like an Edge-specific option; confirm it has
# any effect on Chrome's Options object.
opts.use_chromium = True
# Run the browser without a visible window.
opts.headless = True
opts.add_argument("disable-gpu")
opts.add_argument("--log-level=3")

# Only ERROR (and above) records are written; filemode='w' truncates 'v.log' on every run.
logging.basicConfig(filename='v.log', filemode='w', level=logging.ERROR, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Start the browser using the chromedriver binary expected in the working directory.
# Any failure is logged and the program stops, since nothing below works without `driver`.
# NOTE(review): sys.exit() is called without a status, so the process reports success
# to its caller even though start-up failed — confirm whether that is intended.
try:
    driver = Chrome(options=opts, executable_path='./chromedriver.exe') 
except Exception as e:
    logging.error(f'FATAL ERROR! {e}')
    sys.exit()

# URL constants
# BASE_URL: prefix of the EDGAR search page; generate_search_url() appends the query to it.
BASE_URL = "https://www.sec.gov/edgar/search/#/"
# FILE_BASE_URL: prefix for filing documents; the literal "{cik}" placeholder is
# substituted per row inside get_search_results().
FILE_BASE_URL = "https://www.sec.gov/Archives/edgar/data/{cik}/" 


In [3]:
def generate_random_names():
    """Return a randomly assembled ``"First Last FirstLastNN@gmail.com"`` string.

    One entry is drawn from each of three fixed pools (first name, last name,
    two-digit number). The result is the spaced name followed by a gmail
    address made of the same two names joined together plus the number.
    """
    import random

    given_pool = (
        'Madelyn', 'Lamb', 'Ty', 'Long', 'Janiya', 'Burke',
        'Kameron', 'Mercer', 'August', 'Ibarra', 'Jabari', 'Hurley',
    )
    family_pool = (
        'Vicente', 'Stevens', 'Leonard', 'Stokes', 'Judah', 'Frost',
        'Sophie', 'Parsons', 'Sydnee', 'Ellison', 'Calvin', 'Calhoun',
    )
    number_pool = (11, 23, 45, 67, 89, 30, 91, 82, 73, 64, 65, 54, 43, 32, 21, 10)

    # Draw order (first name, last name, number) is kept so a seeded RNG
    # yields the same string as before.
    given = random.choice(given_pool)
    family = random.choice(family_pool)
    number = random.choice(number_pool)

    return f"{given} {family} {given}{family}{number}@gmail.com"

def request_data(url, max_attempts=3):
    """Fetch ``url`` from www.sec.gov and return the parsed HTML.

    Every attempt sends the same headers. Previously the final attempt was
    issued without any headers, i.e. without the declared User-Agent.

    Args:
        url: Address to download.
        max_attempts: Total number of GET requests to try before giving up.

    Returns:
        A ``BeautifulSoup`` document built from the response body of the
        first attempt that answers with HTTP 200, or ``False`` when no attempt
        does (the failure is printed and logged so the script can continue).
    """
    headers = {
        'User-Agent': generate_random_names(),
        'Accept-Encoding': 'gzip, deflate',
        'Host': 'www.sec.gov',
    }

    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # connection blocks indefinitely — consider adding one.
    for _ in range(max_attempts):
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            return BeautifulSoup(r.text, 'html.parser')

    print(f"FATAL ERROR: {max_attempts} retries failed to retrieve information from url:\n{url}")
    logging.error(f'FATAL ERROR! {max_attempts} retries failed to retrieve information from url:\n{url}')
    # No exit here: the caller decides what to do with a failed download.
    return False

def get_date_string(filing_date_start=None, filing_date_end=None):
    """Build the ``&startdt=...&enddt=...`` fragment of a search URL.

    Example of the default window: 2017-02-10 --- 2022-02-10
                                   start_date --- end_date

    Args:
        filing_date_start: First filing date as ``YYYY-MM-DD``, or a falsy
            value (``None`` / ``''``) when not given.
        filing_date_end: Last filing date as ``YYYY-MM-DD``, or a falsy value
            when not given.

    Returns:
        The query fragment. If only one date is supplied it is used for both
        bounds (single filing date); if both are supplied they form the
        range; if neither is supplied the range is the five years ending today.
    """
    if filing_date_start or filing_date_end:
        start = filing_date_start or filing_date_end
        end = filing_date_end or filing_date_start
        return f"&startdt={start}&enddt={end}"

    today = date.today()
    try:
        five_years_ago = today.replace(year=today.year - 5)
    except ValueError:
        # 29 February has no counterpart five years earlier; fall back to the 28th.
        five_years_ago = today.replace(year=today.year - 5, day=28)

    return f"&startdt={five_years_ago.isoformat()}&enddt={today.isoformat()}"



In [4]:
def get_search_results(SEARCH_URL):
    """Open a search page in the browser and return its last table as a DataFrame.

    The page is loaded through the module-level Selenium ``driver`` and the
    last ``<table>`` of the rendered HTML is read. The first row supplies the
    column names (``<th>`` cells). In the remaining rows, a cell whose link
    carries both ``data-adsh`` and ``data-file-name`` attributes is replaced
    by a document URL built from ``FILE_BASE_URL``; every other cell keeps its
    text. The ``{cik}`` placeholder in the first column is then filled from
    the row's ``CIK`` column.

    Args:
        SEARCH_URL: URL to load in the browser.

    Returns:
        A ``pandas.DataFrame`` sorted by the ``Filed`` column in descending
        order, or ``None`` when the page has no table, the table has no
        header cells, or it has no data rows.

    Raises:
        KeyError: If the table has no ``CIK`` or no ``Filed`` column.
    """
    # TODO: keep appending rows when the table has more pages in its pagination
    driver.get(SEARCH_URL)
    # NOTE(review): page_source is read right after driver.get(); if the
    # results are rendered asynchronously the table may not be filled yet — confirm.
    soup = BeautifulSoup(driver.page_source, "lxml")

    tables = soup.find_all("table")
    if not tables:
        # No table at all is treated like an empty result instead of raising IndexError.
        return None

    rows = tables[-1].find_all("tr")
    if len(rows) <= 1:  # 1 row = header only
        return None

    cols = [header_cell.text for header_cell in rows[0].find_all("th")]
    if not cols:
        return None

    data = []
    for row in rows[1:]:
        # Each row gets its own independent defaults (no shared mutable value).
        row_data = dict.fromkeys(cols, "")
        # zip() stops at the shorter sequence, so surplus cells cannot index past the header.
        for col_name, cell in zip(cols, row.find_all("td")):
            link = cell.find("a")
            adsh = link.get("data-adsh") if link else None
            file_name = link.get("data-file-name") if link else None
            if adsh and file_name:
                row_data[col_name] = FILE_BASE_URL + str(adsh).replace("-", "") + "/" + file_name
            else:
                row_data[col_name] = cell.text

        # The CIK cell text carries a "CIK" label around the number; strip it before substituting.
        cik_number = str(row_data['CIK']).replace("CIK", "").strip()
        row_data[cols[0]] = row_data[cols[0]].replace("{cik}", cik_number)
        data.append(row_data)

    df = pd.DataFrame(data)
    df.sort_values(by=['Filed'], inplace=True, ascending=False)
    return df



**Form input for Task 1**

In [5]:
def get_inputs_tsk1():
    """Interactively collect the Task 1 search parameters from standard input.

    Returns:
        A dict with the keys ``ticker``, ``filing_type``, ``search_type``
        (``T``/``I`` as typed), ``target`` (document name or item number),
        ``item_part`` (only for item searches, ``'1'`` or ``'2'``),
        ``filing_date_start`` and ``filing_date_end``. All answers are
        stripped of surrounding whitespace; empty date answers become ``None``.
    """
    form = {}
    form['ticker'] = input('Enter Stock Ticker Symbol>>  ').strip()
    # Stripped like every other answer so stray spaces do not leak into the search URL.
    form['filing_type'] = input('Enter Filing Type Required (10-K,10-Q,etc)>>  ').strip()

    # Is the user searching for a financial table or a particular item? Ask until valid.
    while True:
        form['search_type'] = input("Do you want to search for a Financial Table (T) or Particular Item in Document (I),Please Enter (T) or (I)>>  ").strip()
        if form['search_type'].lower() in ('t', 'i'):
            break

    if form['search_type'].lower() == 't':
        form['target'] = input('Enter Document Required>> ').strip()
    else:
        # Item search: ask for the part until the answer is 1 or 2, then the item number.
        while True:
            form['item_part'] = input('Do you want to get an item in Part I (1) or Part II (2),Please Enter (1) or (2)>>  ').strip()
            if form['item_part'] in ('1', '2'):
                break
        form['target'] = input('Enter Item Number>> ').strip()

    # Double-quoted literal so the inner quotes are actually printed.
    print("To enter 'Date of Filing' only enter one date. To enter 'Date period' enter 2 dates.  ")
    form['filing_date_start'] = input('Filed FROM Date: (YYYY-MM-DD)>>  ').strip() or None
    form['filing_date_end'] = input('Filed TO Date: (YYYY-MM-DD)>>  ').strip() or None
    return form



def generate_search_url(data):
    """Build the EDGAR search URL for the collected form values.

    ``None`` values in ``data`` are replaced by ``''`` in place (existing
    behaviour, kept for callers that read the dict afterwards).

    Args:
        data: Mapping with the keys ``target`` (phrase to search for),
            ``ticker``, ``filing_type``, ``filing_date_start`` and
            ``filing_date_end``. Missing keys are treated as empty.

    Returns:
        ``BASE_URL`` followed by the quoted phrase, entity name, form type
        and the date range produced by ``get_date_string``. The URL is also
        printed.
    """
    from urllib.parse import quote

    for key in data:
        if data[key] is None:
            data[key] = ''

    # The phrase is wrapped in double quotes and percent-encoded twice, which
    # yields the same %2522 / %2520 sequences as before for quotes and spaces
    # while also escaping characters such as '&' or '#' that would otherwise
    # break the URL fragment.
    phrase = '"' + (data.get('target') or '') + '"'
    document = quote(quote(phrase, safe=''), safe='')

    date_string = get_date_string(data.get('filing_date_start'), data.get('filing_date_end'))
    ticker = data.get('ticker') or ''
    filing_type = data.get('filing_type') or ''
    SEARCH_URL = f"{BASE_URL}q={document}&category=custom&entityName={ticker}&forms={filing_type}" + date_string
    print('SEARCH_URL',SEARCH_URL)
    return SEARCH_URL

**Form input for Task 2**

In [6]:
def get_inputs_tsk2():
    """Interactively collect the Task 2 search parameters from standard input.

    Returns:
        A dict with the keys ``ticker`` (always ``None``; no company is asked
        for), ``filing_type``, ``item_part`` (``'1'`` or ``'2'``),
        ``item_number``, ``target`` (keyword or phrase), ``filing_date_start``
        and ``filing_date_end``. All answers are stripped of surrounding
        whitespace; empty date answers become ``None``.
    """
    form = {}
    # No company is requested for this task.
    form['ticker'] = None
    # Stripped like every other answer so stray spaces do not leak into the search URL.
    form['filing_type'] = input('Enter Filing Type Required (10-K,10-Q,etc)>>  ').strip()

    # Ask for the part until the answer is 1 or 2.
    while True:
        form['item_part'] = input('Do you want to get an item in Part I (1) or Part II (2),Please Enter (1) or (2)>>  ').strip()
        if form['item_part'] in ('1', '2'):
            break

    form['item_number'] = input('Enter Item Number>> ').strip()

    # Keyword or phrase to search together with the item number.
    form['target'] = input('Enter keyword or phrase to search with the item number>> ').strip()

    # Double-quoted literal so the inner quotes are actually printed.
    print("To enter 'Date of Filing' only enter one date. To enter 'Date period' enter 2 dates.  ")
    form['filing_date_start'] = input('Filed FROM Date: (YYYY-MM-DD)>>  ').strip() or None
    form['filing_date_end'] = input('Filed TO Date: (YYYY-MM-DD)>>  ').strip() or None
    return form