# Test SEC crawling

In [14]:
pip install requests_random_user_agent -q

Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import numpy as np
import re
import requests
import requests_random_user_agent
from tqdm.notebook import tqdm
import dask

In [10]:
#function that finds the CIK corresponding to a company's ticker, can return several at once
def get_CIK(Tickers, timeout=10):
    """Look up the SEC Central Index Key (CIK) for each ticker symbol.

    One request per ticker is sent to EDGAR's company-browse endpoint; the
    first 10-digit ``CIK=`` value the regex extracts from the returned page
    is recorded (the page is expected to repeat the same CIK several times).

    Parameters
    ----------
    Tickers : iterable of str
        Ticker symbols to resolve.
    timeout : float, optional
        Seconds ``requests.get`` may wait on sec.gov before giving up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        Maps each ticker to its zero-padded 10-digit CIK string, or to the
        sentinel ``'CIK NOT FOUND'`` when no CIK appears in the page.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` on connection errors or timeouts.
    """
    url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')
    cik = {}
    for ticker in Tickers:
        # No stream=True: the body is read in full right away via .text, and a
        # non-streamed response releases its connection back to the pool.
        r = requests.get(url.format(ticker), timeout=timeout)
        # find all CIKs in the page (it should be the same CIK several times)
        results = CIK_RE.findall(r.text)
        cik[ticker] = results[0] if results else 'CIK NOT FOUND'
    return cik

In [11]:
# Smoke test: resolve three tickers to CIKs (one live request to sec.gov per ticker).
get_CIK(['DIS','MSFT','AAPL'])

{'DIS': '0001744489', 'MSFT': '0000789019', 'AAPL': '0000320193'}

In [12]:
#function that finds the url of a specified file for a given company and a given year
#can return several at once
def get_url(tck, Year, FILE, cik = None, show = True):
    """Find the EDGAR filing URL of one form type for several companies.

    The quarterly ``master.idx`` files of ``Year`` are consulted in the order
    Q4, Q1, Q2, Q3 (a 10-K is normally filed in Q4, but not always).  The
    first quarter that contains a matching row decides a ticker's URL; within
    that quarter the last matching row wins.  Quarters are only downloaded
    while at least one ticker is still unresolved.

    Parameters
    ----------
    tck : sequence of str
        Ticker symbols.
    Year : str or int
        Year of the EDGAR full-index to search.  For 2017 the third quarter
        is never consulted (kept from the original notebook; the reason is
        not recorded there).
    FILE : str
        Form type to look for, matched as a substring of the index's
        "Form Type" column, so ``'14A'`` finds ``'DEF 14A'`` and ``'10-K'``
        also matches e.g. ``'10-K/A'`` or ``'NT 10-K'``.  When ``FILE`` is
        ``'14A'`` the other proxy-related form types listed in
        ``_scan_master_index`` are skipped.
    cik : sequence, optional
        CIKs aligned with ``tck``.  Any non-``None`` sequence (list, tuple,
        ``np.ndarray``) is used as given; when omitted the CIKs are looked up
        with ``get_CIK``.
    show : bool, optional
        Display a tqdm progress bar (one tick per ticker per quarter).

    Returns
    -------
    dict
        Maps each ticker to the filing URL, to ``'URL NOT FOUND'`` when no
        quarter contains a match, or to the CIK sentinel
        (``'CIK NOT FOUND'`` / ``'TICKER NOT FOUND'``) when it has no CIK.
    """
    import warnings  # local import: only needed to report failed index downloads

    Tickers = list(tck)
    if cik is None:
        CIKs = get_CIK(Tickers)
    else:
        CIKs = dict(zip(Tickers, cik))

    year = str(Year)
    # 10-K should be filed in Q4, but is sometimes filed in other quarters:
    # start with Q4, then Q1, Q2 and Q3.
    quarters = [q for q in (4, 1, 2, 3) if not (year == '2017' and q == 3)]

    url_found = {}
    wanted = {}  # ticker -> CIK without leading zeros, as written in master.idx
    for ticker in Tickers:
        CIK = CIKs[ticker]
        if CIK == 'CIK NOT FOUND' or CIK == 'TICKER NOT FOUND':
            url_found[ticker] = CIK
        else:
            url_found[ticker] = 'URL NOT FOUND'
            wanted[ticker] = str(int(CIK))

    pbar = tqdm(total=len(Tickers) * len(quarters)) if show else None
    try:
        for quarter in quarters:
            pending = {c for t, c in wanted.items()
                       if url_found[t] == 'URL NOT FOUND'}
            if pending:
                url = 'https://www.sec.gov/Archives/edgar/full-index/%s/QTR%s/master.idx'%(year, quarter)
                response = requests.get(url, timeout=60)
                if response.ok:
                    paths = _scan_master_index(response.text, pending, FILE)
                else:
                    # Keep going (the quarter may simply not exist yet), but do
                    # not let a blocked or failed download pass silently.
                    warnings.warn('could not download %s (HTTP %s)'
                                  % (url, response.status_code))
                    paths = {}
                for ticker, CIK in wanted.items():
                    if url_found[ticker] == 'URL NOT FOUND' and CIK in paths:
                        url_found[ticker] = 'https://www.sec.gov/Archives/' + paths[CIK]
            if pbar is not None:
                # Advance for every ticker, including those without a CIK, so
                # the bar always reaches its total.
                pbar.update(n=len(Tickers))
    finally:
        if pbar is not None:
            pbar.close()

    return url_found


def _scan_master_index(index_text, wanted_ciks, FILE):
    """Return {cik: filing path} for the rows of a master.idx that match.

    Rows have the shape ``CIK|Company Name|Form Type|Date Filed|Filename``.
    A row matches when its CIK column equals one of ``wanted_ciks`` exactly
    and ``FILE`` is a substring of its form-type column.  Later rows
    overwrite earlier ones for the same CIK.
    """
    # Form-type markers skipped when looking for the proxy statement
    # (FILE == '14A'); 'PRE' covers the preliminary variants.
    skip_markers = ('PX14A6G', 'DEFA14A', 'DEFC14A', 'DEFM14A', 'DEFN14A',
                    'DEFR14A', 'DFAN14A', 'DFRN14A', 'PRE', 'PRRN14A',
                    'PX14A6N', 'Schedule')
    paths = {}
    for row in index_text.splitlines():
        fields = row.split('|')
        if len(fields) < 5:
            continue  # header text, separator line or blank line
        # Index from both ends so a '|' inside a company name cannot shift
        # the form-type or filename columns.
        row_cik = fields[0].strip()
        form_type = fields[-3].strip()
        path = fields[-1].strip()
        if row_cik not in wanted_ciks or FILE not in form_type:
            continue
        if FILE == '14A' and any(marker in form_type for marker in skip_markers):
            continue
        if 'edgar/data/' in path:
            paths[row_cik] = path
    return paths

In [25]:
# Example: 10-K URLs from the 2022 index; cik is omitted, so the CIKs are looked up via get_CIK first.
get_url(['AAPL', 'MSFT', 'META', 'COIN'], '2022', '10-K')

  0%|          | 0/16 [00:00<?, ?it/s]

{'AAPL': 'https://www.sec.gov/Archives/edgar/data/320193/0000320193-22-000108.txt',
 'MSFT': 'https://www.sec.gov/Archives/edgar/data/789019/0001564590-22-026876.txt',
 'META': 'https://www.sec.gov/Archives/edgar/data/1326801/0001326801-22-000018.txt',
 'COIN': 'https://www.sec.gov/Archives/edgar/data/1679788/0001679788-22-000031.txt'}