**All Imports**

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# logging
import sys
import logging
import time

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from datetime import date
import datetime
import json

In [2]:
# Notebook-wide setup: browser options, error logging, the Selenium driver and
# the SEC URL constants used by the functions defined in later cells.
opts = Options()
# NOTE(review): `use_chromium` looks like a flag from the Edge options class;
# it is unclear whether Chrome's Options reads it - confirm or remove.
opts.use_chromium = True
# Run the browser without a visible window.
opts.headless = True
opts.add_argument("disable-gpu")
opts.add_argument("--log-level=3")

# Only ERROR and above are recorded; filemode='w' truncates v.log on every run.
logging.basicConfig(filename='v.log', filemode='w', level=logging.ERROR, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Start the browser from a chromedriver binary expected next to the notebook.
# Any failure is logged and the script stops (sys.exit() is called without an
# explicit status argument).
# NOTE(review): `executable_path` is presumably tied to the installed Selenium
# version - verify it is still accepted before upgrading Selenium.
try:
    driver = Chrome(options=opts, executable_path='./chromedriver.exe') 
except Exception as e:
    logging.error(f'FATAL ERROR! {e}')
    sys.exit()

# URL constants
# BASE_URL: prefix of the EDGAR search page URL; query parameters are appended to it.
BASE_URL = "https://www.sec.gov/edgar/search/#/"
# FILE_BASE_URL: prefix under which per-filing index pages and documents are addressed.
FILE_BASE_URL = "https://www.sec.gov/Archives/edgar/data/" 


In [3]:
def generate_random_names():
    """Return a random identity string of the form "First Last FirstLastNN@gmail.com".

    One given name, one family name and one two-digit suffix are each drawn
    independently from fixed pools. The request helpers in this file use the
    result as their User-Agent header value.
    """
    import random

    given_pool = (
        'Madelyn', 'Lamb', 'Ty', 'Long', 'Janiya', 'Burke',
        'Kameron', 'Mercer', 'August', 'Ibarra', 'Jabari', 'Hurley',
    )
    family_pool = (
        'Vicente', 'Stevens', 'Leonard', 'Stokes', 'Judah', 'Frost',
        'Sophie', 'Parsons', 'Sydnee', 'Ellison', 'Calvin', 'Calhoun',
    )
    suffix_pool = (11, 23, 45, 67, 89, 30, 91, 82, 73, 64, 65, 54, 43, 32, 21, 10)

    # Draw order (given, family, suffix) is kept so a seeded RNG yields the
    # same string as before.
    given = random.choice(given_pool)
    family = random.choice(family_pool)
    suffix = random.choice(suffix_pool)

    # None of the pool entries contain spaces, so the mailbox part is simply
    # the two names run together followed by the numeric suffix.
    return f"{given} {family} {given}{family}{suffix}@gmail.com"

def request_data(url, payload=None):
    """GET *url* and return the response parsed as HTML, trying up to three times.

    Args:
        url: Address to fetch.
        payload: Not used by this function; accepted only so existing callers
            that pass it keep working. Defaults to None instead of a shared
            mutable dict.

    Returns:
        A BeautifulSoup document (built with 'html.parser') for the first
        response with status 200, or False when all three attempts return a
        non-200 status. The failure is printed and logged, not raised, so the
        calling script can continue.
    """
    headers = {
        'User-Agent': generate_random_names(),
        'Accept-Encoding': 'gzip, deflate',
        # NOTE(review): Host is fixed to www.sec.gov, so this helper presumably
        # only targets that host - confirm against callers.
        'Host': 'www.sec.gov',
    }

    # Attempts one and two send the custom headers; the final attempt falls
    # back to requests' defaults (headers=None), exactly as before.
    # NOTE(review): the header-less last attempt looks deliberate - confirm.
    for attempt_headers in (headers, headers, None):
        r = requests.get(url, headers=attempt_headers)
        if r.status_code == 200:
            return BeautifulSoup(r.text, 'html.parser')

    print(f"FATAL ERROR: 3 retries failed to retrieve information from url:\n{url}")
    logging.error(f'FATAL ERROR! 3 retries failed to retrieve information from url:\n{url}')
    return False

def get_date_string(filing_date_start,filing_date_end=None):
    """Resolve the (start, end) filing-date pair used when building a search.

    * both dates given  -> (start, end) unchanged
    * only one given    -> that date is used for both ends (single-day search)
    * neither given     -> (today minus 3*365 days, today) as "YYYY-MM-DD" strings

    Empty strings count as "not given", the same as None.

    NOTE(review): the fallback window is 3*365 days, although an earlier
    comment in this function showed a five-year example range - confirm which
    span is intended.
    """
    if filing_date_start and filing_date_end:
        return (filing_date_start, filing_date_end)
    if filing_date_start:
        return (filing_date_start, filing_date_start)
    if filing_date_end:
        return (filing_date_end, filing_date_end)

    # Nothing supplied: fall back to a window that ends today.
    window_end = date.today()
    window_start = window_end - datetime.timedelta(days=3 * 365)
    iso_format = "%Y-%m-%d"
    return (window_start.strftime(iso_format), window_end.strftime(iso_format))



In [4]:
def get_generic_marketdata(url):
    """Download *url* and hand it back parsed with the 'lxml' parser.

    Up to three GET requests are made: the first two carry a randomised
    User-Agent plus fixed Accept-Encoding and Host headers, the third is sent
    with no custom headers at all. The first response with status 200 wins.

    Returns:
        A BeautifulSoup document on success, or False after three non-200
        responses (the failure is printed and logged, not raised).
    """
    custom_headers = {
        'User-Agent': generate_random_names(),
        'Accept-Encoding':'gzip, deflate',
        'Host': 'www.sec.gov'
    }
    # Keyword arguments for each successive attempt; the empty dict makes the
    # last attempt a plain requests.get(url).
    attempt_kwargs = ({'headers': custom_headers}, {'headers': custom_headers}, {})

    for kwargs in attempt_kwargs:
        response = requests.get(url, **kwargs)
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'lxml')

    # All attempts failed: report, but let the calling script carry on.
    print(f"FATAL ERROR: 3 retries failed to retrieve information from url:\n{url}")
    logging.error(f'FATAL ERROR! 3 retries failed to retrieve information from url:\n{url}')
    return False


def _filing_record_from_hit(hit):
    """Flatten one non-exhibit search hit into a row dict, or None if it lacks an accession number or CIK."""
    source = hit.get('_source', {})
    adsh = source.get('adsh')
    ciks = source.get('ciks')
    if not adsh or not ciks:
        return None

    filing_no = adsh.replace("-", "")
    # The 'ticker' column actually holds the first entry of the hit's `ciks` list.
    ticker = ciks[0]
    index_page = str(FILE_BASE_URL) + str(ticker) + "/" + str(filing_no)
    # The document file name is whatever follows the last ':' in the hit id.
    file_name = hit.get('_id', '').split(":")[-1]
    return {
        'file_date': source.get('file_date'),
        'filing_no': filing_no,
        'file_type': source.get('file_type'),
        'file_name': file_name,
        'ticker': ticker,
        'period_ending': source.get('period_ending'),
        'display_names': " ".join(source.get('display_names') or []),
        'filing_summa_xml_path': index_page + "/" + 'FilingSummary.xml',
        'file_path': index_page + "/" + file_name,
        'index_page': index_page,
    }


def get_search_results(form_data,max_results_needed=float('inf')):
    """Query the EDGAR full-text search endpoint and tabulate the non-exhibit hits.

    Args:
        form_data: Mapping read via ``.get`` for the keys 'target' (search
            phrase), 'ticker' (sent as entityName), 'filing_type'
            (comma-separated forms; falls back to "10-K,10-Q" when empty),
            'filing_date_start' and 'filing_date_end'.
        max_results_needed: Upper bound on the number of rows returned.

    Returns:
        * a pandas DataFrame sorted by 'file_date' (newest first) and cut to
          ``min(max_results_needed, reported total)`` rows;
        * None when the service reports no results, or when no hit in the
          response yields a usable (non-exhibit, well-formed) record;
        * False when three POST attempts all return a non-200 status.

    Only the hits contained in this single response are processed, even if the
    reported total is larger.
    """
    # Normalise the date range first.
    date_strings = get_date_string(form_data.get('filing_date_start'), form_data.get('filing_date_end'))
    # Use the caller's form types, or the default pair when none were given.
    forms = form_data.get('filing_type', '') or "10-K,10-Q"
    url = "https://efts.sec.gov/LATEST/search-index"
    payload = json.dumps({
        "q": form_data.get('target', ''),
        "category": "custom",
        "entityName": form_data.get('ticker', ''),
        "forms": forms.split(','),
        "startdt": date_strings[0],
        "enddt": date_strings[1],
    })
    headers = {
        'authority': 'efts.sec.gov',
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'content-type': 'application/json; charset=UTF-8',
        'sec-ch-ua-mobile': '?0',
        'user-agent': generate_random_names(),
        'origin': 'https://www.sec.gov',
        'sec-fetch-site': 'same-site',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://www.sec.gov/',
    }

    # Up to three identical attempts; the for/else branch runs only when none
    # of them returned 200.
    for _ in range(3):
        r = requests.post(url, headers=headers, data=payload)
        if r.status_code == 200:
            break
    else:
        print(f"FATAL ERROR: 3 retries failed to retrieve information from url:\n{url}")
        logging.error(f'FATAL ERROR! 3 retries failed to retrieve information from url:\n{url}')
        return False

    response_data = json.loads(r.text)
    hits = response_data.get('hits', {})
    total_results = hits.get('total', {}).get('value', 0)
    print("Total results are:", total_results, end="\n\n")
    if total_results <= 0:
        return None

    # Respect the caller's cap without exceeding what the service reported.
    limit = min(max_results_needed, total_results)

    clean_results = []
    exe = 0
    for result in hits.get('hits', []):
        source = result.get('_source', {})
        # A missing or null file_type is treated like an exhibit and skipped.
        file_type = source.get('file_type', "ex")
        if file_type is None:
            file_type = "ex"
        if "ex" in file_type.lower():
            exe += 1
            continue

        record = _filing_record_from_hit(result)
        if record is None:
            # Previously a hit without 'adsh'/'ciks' aborted the whole search
            # with an AttributeError/TypeError; skip just that hit instead.
            logging.error(f"Skipping search hit without accession number or CIK: {result.get('_id', '')}")
            continue
        clean_results.append(record)

    print('Exhibits Records (exe) Ignored:', exe, end="\n\n")

    # Sorting an empty, column-less frame by 'file_date' would raise KeyError,
    # so an all-exhibit response is reported the same way as "no results".
    if not clean_results:
        return None

    df = pd.DataFrame.from_dict(clean_results, orient='columns')
    df.sort_values(by=['file_date'], inplace=True, ascending=False)
    return df[:limit]
    
# Test function results
# get_search_results({'ticker': 'apple',
#  'filing_type': '',
#  'search_type': 'T',
#  'target': 'balance sheet',
#  'filing_date_start': '2020-01-01',
#  'filing_date_end': '2022-03-01'},3)


**Form input for Task 1**

In [5]:
def get_inputs_tsk1():
    """Interactively collect the Task 1 search form from standard input.

    Returns:
        dict with the keys, in prompt order:
        'ticker', 'filing_type', 'search_type' ('t'/'i' in whatever case the
        user typed), then either 'target' (table search) or 'item_part'
        ('1' or '2') followed by 'target' (item number), and finally
        'filing_date_start' / 'filing_date_end', each a stripped string or
        None when left blank.
    """
    form = {}
    form['ticker'] = input('Enter Stock Ticker Symbol>>  ').strip()
    # Stripped like every other answer so stray whitespace does not leak into
    # the comma-split form list or the search URL built from it.
    form['filing_type'] = input('Enter Filing Type Required (10-K,10-Q,etc)>>  ').strip()

    # Keep asking until the user picks table (T) or item (I) search.
    while True:
        form['search_type'] = input("Do you want to search for a Financial Table (T) or Particular Item in Document (I),Please Enter (T) or (I)>>  ").strip()
        if form['search_type'].lower() in ('t', 'i'):
            break

    if form['search_type'].lower() == 't':
        form['target'] = input('Enter Document Required>> ').strip()
    else:
        # Item search: first the part of the filing, then the item number.
        while True:
            form['item_part'] = input('Do you want to get an item in Part I (1) or Part II (2),Please Enter (1) or (2)>>  ').strip()
            if form['item_part'] in ('1', '2'):
                break
        form['target'] = input('Enter Item Number>> ').strip()

    # The doubled single quotes used before were swallowed by adjacent-literal
    # concatenation, so the quotation marks never reached the screen.
    print("To enter 'Date of Filing' only enter one date. To enter 'Date period' enter 2 dates.  ")
    form['filing_date_start'] = input('Filed FROM Date: (YYYY-MM-DD)>>  ').strip() or None
    form['filing_date_end'] = input('Filed TO Date: (YYYY-MM-DD)>>  ').strip() or None
    return form



def generate_search_url(data):
    """Build (and print) the EDGAR search-page URL for a filled-in search form.

    Args:
        data: Form dict. Read keys: 'target', 'ticker', 'filing_type',
            'filing_date_start', 'filing_date_end'. Side effect kept from the
            previous behaviour: every value that is None is replaced by '' in
            the caller's dict.

    Returns:
        The URL string: BASE_URL followed by the quoted search phrase, entity
        name, form types and resolved date range as query parameters.
    """
    # Normalise None answers to empty strings, in place (size of the dict does
    # not change, so mutating while iterating keys is safe).
    for key in data:
        if data[key] is None:
            data[key] = ''

    # Phrase wrapped in double-encoded quotes (%2522) with double-encoded
    # spaces (%2520). A missing 'target' key now yields an empty phrase
    # instead of raising AttributeError.
    phrase = data.get('target') or ''
    document = '%2522' + phrase.replace(' ', '%2520') + '%2522'

    date_strings = get_date_string( data.get('filing_date_start'), data.get('filing_date_end') )
    # Missing keys fall back to '' so the literal text "None" never lands in the URL.
    SEARCH_URL = f"{BASE_URL}q={document}&category=custom&entityName={data.get('ticker', '')}&forms={data.get('filing_type', '')}&startdt={date_strings[0]}&enddt={date_strings[1]}"
    print('SEARCH_URL',SEARCH_URL, end="\n\n")
    return SEARCH_URL

**Form input for Task 2**

In [6]:
def get_inputs_tsk2():
    """Interactively collect the Task 2 search form from standard input.

    Task 2 is not tied to a company, so 'ticker' is always None.

    Returns:
        dict with the keys, in prompt order: 'ticker' (None), 'filing_type',
        'item_part' ('1' or '2'), 'item_number', 'target' (keyword or phrase),
        'filing_date_start' and 'filing_date_end' (each a stripped string or
        None when left blank).
    """
    form = {}
    # No company filter for this task.
    form['ticker'] = None
    # Stripped like the other answers so stray whitespace does not leak into
    # the comma-split form list or the search URL built from it.
    form['filing_type'] = input('Enter Filing Type Required (10-K,10-Q,etc)>>  ').strip()

    # Keep asking until a valid part number is entered.
    while True:
        form['item_part'] = input('Do you want to get an item in Part I (1) or Part II (2),Please Enter (1) or (2)>>  ').strip()
        if form['item_part'] in ('1', '2'):
            break

    form['item_number'] = input('Enter Item Number>> ').strip()

    # Keyword or phrase to look for together with the item number.
    form['target'] = input('Enter keyword or phrase to search with the item number>> ').strip()

    # The doubled single quotes used before were swallowed by adjacent-literal
    # concatenation, so the quotation marks never reached the screen.
    print("To enter 'Date of Filing' only enter one date. To enter 'Date period' enter 2 dates.  ")
    form['filing_date_start'] = input('Filed FROM Date: (YYYY-MM-DD)>>  ').strip() or None
    form['filing_date_end'] = input('Filed TO Date: (YYYY-MM-DD)>>  ').strip() or None
    return form