## Multithreading

In [1]:
# Import modules
import time
import concurrent.futures
from sec_edgar_downloader import Downloader
import pandas as pd
from tqdm import tqdm
import requests

In [3]:
# The downloader (sec_edgar_downloader package) writes everything beneath the
# relative 'data/' directory.
dl = Downloader('data/')

# Form types requested for every company in the download loop.
filetypes = ['8-K','10-K','10KSB','10-Q','13F-NT','13F-HR','SC 13G','SD','S-1']

# Company index. Row 0 is skipped when collecting CIKs because it holds only a
# dashed separator line, not a company; the set removes repeated CIKs.
df = pd.read_excel('company.xlsx')
cik = [*set(df['CIK'].iloc[1:])]


In [3]:
# Preview the first rows of the company index; note that row 0 contains only dashes rather than company data.
df.head()

Unnamed: 0,Company Name,Unnamed: 1,Unnamed: 2,Form Type,CIK,Date Filed,File Name
0,-----------------------------------,------,-----------------,----------------,---------,-------------,---------------------------------------------
1,&VEST Domestic Fund II LP,,,D,1800903,2020-01-27 00:00:00,edgar/data/1800903/0001800903-20-000001.txt
2,&VEST Offshore Fund II L.P.,,,D,1800902,2020-01-27 00:00:00,edgar/data/1800902/0001800902-20-000001.txt
3,"&vest Domestic Fund II KPIV, L.P.",,,D,1802417,2020-02-06 00:00:00,edgar/data/1802417/0001802417-20-000001.txt
4,"024 Pharma, Inc.",,,8-K/A,1307969,2020-02-20 00:00:00,edgar/data/1307969/0001683168-20-000541.txt


In [4]:
# Total number of rows in the company index (this count includes the dash-only row 0).
len(df)

178819

In [4]:
# Number of distinct CIK values collected from the index.
len(cik)

51883

In [5]:
# Sort the CIKs so the download loop visits companies in a fixed ascending order.
c = sorted(cik)

In [6]:
def download_edgar(filetype, company=None):
    """Request one filing of a given form type for one company.

    Args:
        filetype: Form type handed to ``dl.get`` (an entry of ``filetypes``).
        company: CIK to download for. When omitted, the current value of the
            loop variable ``i`` is used, so one-argument calls keep working.

    Returns:
        None when ``dl.get`` succeeds, otherwise a ``(company, filetype, error)``
        tuple where ``error`` is the ``repr`` of the exception that was raised.
    """
    if company is None:
        company = i
    try:
        # The trailing 1 is presumably the number of filings to fetch -- TODO confirm
        # against the installed sec_edgar_downloader version.
        dl.get(filetype, company, 1)
    except Exception as exc:
        # Best-effort download: report the failure to the caller instead of
        # silently discarding it, so missing companies can be diagnosed later.
        return (company, filetype, repr(exc))
    return None


# One (company, filetype, error) entry per failed request; inspect after the run.
failed_downloads = []

# A single thread pool is reused for the whole run instead of being rebuilt for
# every company. Companies are still processed one at a time; within a company
# the form types are fetched concurrently.
with concurrent.futures.ThreadPoolExecutor() as executor:
    for i in tqdm(c):
        futures = [executor.submit(download_edgar, filetype, i) for filetype in filetypes]
        # Wait for every form type of this company before moving to the next one.
        for future in concurrent.futures.as_completed(futures):
            failure = future.result()
            if failure is not None:
                failed_downloads.append(failure)

100%|██████████| 51883/51883 [13:34:45<00:00,  1.50s/it]   


In [13]:
# Count the entries created directly under the downloader's output folder
# (presumably one per company that had at least one filing -- verify on disk).
import glob
filing_dirs = glob.glob('data/sec_edgar_filings/*')
len(filing_dirs)

17205

In [14]:
# Fetch the EDGAR form-type reference PDF and store it locally as 'documentation.pdf'.
# The timeout keeps the cell from hanging forever if the server never answers.
r = requests.get('https://www.sec.gov/info/edgar/forms/edgform.pdf', timeout=60)
# Fail loudly on an HTTP error instead of saving an error page under a .pdf name.
r.raise_for_status()
with open('documentation.pdf','wb') as pdf:
    pdf.write(r.content)


In [70]:
# Prerequisite: pdfplumber must be installed; if it is missing, run `%pip install pdfplumber` once.

In [46]:
## Extract the table from each page of the PDF documentation.
import pdfplumber
import pandas as pd

d = {}              # page number -> DataFrame built from that page's table
skipped_pages = {}  # page number -> repr of the error that made the page unusable
with pdfplumber.open("documentation.pdf") as pdf:
    for page in pdf.pages:
        try:
            table = page.extract_table()
            if not table:
                # No table found on this page (None or empty): nothing to store.
                continue
            # First extracted row supplies the column names, the rest are data rows.
            header, *rows = table
            d[page.page_number] = pd.DataFrame(rows, columns=header)
        except Exception as exc:
            # Stay best-effort across pages, but keep a record of what was skipped
            # and let KeyboardInterrupt stop the cell.
            skipped_pages[page.page_number] = repr(exc)

In [67]:
# Display the table stored under key 2 (keys of `d` are pdfplumber page numbers).
d[2]

Unnamed: 0,Submission Type,Description,Tool/ Template Number,Filer- Constructed Form Spec.
0,"1-A, 1-A/A",Offering statement under Regulation A,EDGAR \nFiling \nWebsite,EDGAR \nREG A XML \nTechnical \nSpecification
1,1-A POS,Post-qualification amendment to a 1-A offering...,EDGAR \nFiling \nWebsite,EDGAR \nREG A XML \nTechnical \nSpecification
2,"1-A-W, 1-A-W/A",Withdrawal of offering statement under Regulat...,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
3,"1-E, 1-E/A",Notification under Regulation E by small busin...,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
4,"1-E AD, 1-E AD/A",Sales material filed pursuant to Rule 607 unde...,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
5,"1-K, 1-K/A",Annual Report Pursuant to Regulation A,EDGAR \nFiling \nWebsite,EDGAR \nREG A XML \nTechnical \nSpecification
6,"1-SA, 1-SA/A",Semiannual Report Pursuant to Regulation A,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
7,"1-U, 1-U/A",Current Report pursuant to Regulation A,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
8,"1-Z, 1-Z/A",Exit Report under Regulation A,EDGAR \nFiling \nWebsite,EDGAR REG \nA XML \nTechnical \nSpecification
9,"1-Z-W, 1-Z-W/A",Withdrawal of Exit Report under Regulation A,EDGARLink \nOnline,EDGARLink \nOnline XML \nTechnical \nSpecifica...
