# Importing Packages

In [1]:
# To perform basic text preprocessing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# To save files
import pickle

# To parse 10K and 10Q forms
from bs4 import BeautifulSoup

# To download 10K and 10Q forms
from secedgar import CompanyFilings, FilingType
# The following 2 lines are needed if using secedgar 
# to download files in Jupyter notebook environment
import nest_asyncio
nest_asyncio.apply()

# For tracking progress in loops
from tqdm import tqdm

# For file path and text manipulation/searching
import os
import re

[nltk_data] Downloading package stopwords to /Users/yashv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yashv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yashv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Helper Functions

In [2]:
def lemmatize_words(words: list) -> list:
    """
    Takes a list of words, and returns them after lemmatizing each word.

    Every word is lemmatized with the part-of-speech tag 'v' (verb).

    INPUTS:
        :words (list): List of input words.

    OUTPUTS:
        :(list): List of words after lemmatizing each input word, in the same order.
    """
    # Create the lemmatizer once and reuse it for every word,
    # rather than constructing a fresh instance per word
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, 'v') for word in words]

In [3]:
def lemmatize_text(text: str) -> list:
    """
    Takes a string, and returns a list of all words in the string after lemmatization.

    A "word" is any maximal run of word characters (the regex class `\\w`);
    everything else (whitespace, punctuation, ...) acts as a separator and is dropped.

    INPUTS:
        :text (str): Input text block.

    OUTPUTS:
        :(list): List of words after lemmatizing each word in the input text.
    """
    # Raw string so the backslash reaches the regex engine unchanged
    # (a plain '\w' is an invalid string escape and triggers a warning)
    word_pattern = re.compile(r'\w+')
    # Returns lemmatized versions of each word found by the above pattern
    return lemmatize_words(word_pattern.findall(text))

In [4]:
def stop_words_removal(words: list) -> list:
    """
    Takes a list of words, and returns a list of all words in the list that are not stopwords.

    The English stopwords are lemmatized with `lemmatize_words` before the comparison,
    so the input words are expected to be lemmatized the same way.

    INPUTS:
        :words (list): Input list of words.

    OUTPUTS:
        :(list): List of words after removing all the stopwords (original order preserved).
    """
    # English Stopwords
    stop_words = stopwords.words('english')
    # Lemmatizing stopwords; stored as a set so each membership test below is a
    # constant-time lookup instead of a scan over the whole stopword list
    lemma_stop_words = set(lemmatize_words(stop_words))
    # Returning only those words that are not in the stopwords set
    return [word for word in words if word not in lemma_stop_words]

# Downloading 10Q forms

Download the `10Q` forms using the `secedgar` Python package.

In [5]:
# Build the 10-Q request for the five tickers (count=28 is passed per request;
# see the secedgar documentation for its exact meaning)
filings_10q = CompanyFilings(
    user_agent='Yashveer Singh Sohi (yashveer@seas.upenn.edu)',
    filing_type=FilingType.FILING_10Q,
    cik_lookup=['fb', 'aapl', 'amzn', 'nflx', 'goog'],
    count=28,
)
# Write the fetched filings beneath the "10Q/" directory
filings_10q.save("10Q/")

100%|██████████| 140/140 [00:14<00:00,  9.92it/s]


Collecting all the file names, and file paths for the forms.

In [6]:
ten_qs_root_dir = "10Q/"
# ticker -> sorted file names, and ticker -> matching relative file paths
filenames, filepaths = {}, {}
for ticker in ('fb', 'aapl', 'amzn', 'nflx', 'goog'):
    # Directory holding the 10-Q files saved for this ticker
    ticker_dir = ten_qs_root_dir + ticker + "/10-Q/"
    # Ordered by file name (this is not necessarily chronological order)
    ticker_filenames = sorted(os.listdir(ticker_dir))
    filenames[ticker] = ticker_filenames
    filepaths[ticker] = [ticker_dir + ticker_filename for ticker_filename in ticker_filenames]

- Reading all the forms
- Parsing the content
- Lemmatizing the words in the form and removing stop words
- Storing these final processed forms
- Storing the dates for each form in another dictionary

In [7]:
# ticker -> list of processed forms (each a list of cleaned tokens, or "NA" on failure)
ten_q_forms = {}
# ticker -> list of filing-date strings (or "NA" when no date was found), parallel to ten_q_forms
ten_q_forms_dates = {}

# One entry per form: "NA" when the step succeeded, otherwise the path of the failing file
date_errors = []
content_errors = []
for ticker, filepath in tqdm(filepaths.items(), total=len(filepaths), desc=" outer", position=0):
    for path in tqdm(filepath, total=len(filepath), desc=" inner loop", position=1, leave=True):
        with open(path, "r") as f:
            form_content = f.read()

        # The filing date is made of the digits found in the 20 characters
        # that follow the "FILED AS OF DATE" label
        m = re.search(pattern="FILED AS OF DATE", string=form_content)
        if m is None:
            # Label missing: record the file explicitly instead of relying on an exception
            date = "NA"
            date_errors.append(path)
        else:
            date = "".join([s for s in form_content[m.end():m.end()+20] if s.isnumeric()])
            date_errors.append("NA")

        try:
            form_text = BeautifulSoup(form_content.lower(), 'html.parser').get_text()
            form_tokens = lemmatize_text(form_text)
            form_tokens_clean = stop_words_removal(form_tokens)
        except Exception:
            # Best effort: keep going and remember which file failed.
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            form_tokens_clean = "NA"
            content_errors.append(path)
        else:
            content_errors.append("NA")

        # Create the per-ticker lists on first use, then append
        ten_q_forms.setdefault(ticker, []).append(form_tokens_clean)
        ten_q_forms_dates.setdefault(ticker, []).append(date)

 outer:   0%|          | 0/5 [00:00<?, ?it/s]
 inner loop:   0%|          | 0/20 [00:00<?, ?it/s][A
 inner loop:   5%|▌         | 1/20 [00:04<01:32,  4.87s/it][A
 inner loop:  10%|█         | 2/20 [00:09<01:20,  4.44s/it][A
 inner loop:  15%|█▌        | 3/20 [00:12<01:09,  4.09s/it][A
 inner loop:  20%|██        | 4/20 [00:17<01:09,  4.34s/it][A
 inner loop:  25%|██▌       | 5/20 [00:21<01:06,  4.41s/it][A
 inner loop:  30%|███       | 6/20 [00:25<00:58,  4.16s/it][A
 inner loop:  35%|███▌      | 7/20 [00:29<00:52,  4.04s/it][A
 inner loop:  40%|████      | 8/20 [00:32<00:46,  3.90s/it][A
 inner loop:  45%|████▌     | 9/20 [00:36<00:42,  3.87s/it][A
 inner loop:  50%|█████     | 10/20 [00:42<00:42,  4.29s/it][A
 inner loop:  55%|█████▌    | 11/20 [00:46<00:39,  4.37s/it][A
 inner loop:  60%|██████    | 12/20 [00:50<00:35,  4.38s/it][A
 inner loop:  65%|██████▌   | 13/20 [00:56<00:33,  4.75s/it][A
 inner loop:  70%|███████   | 14/20 [01:02<00:30,  5.07s/it][A
 inner loop:

Sanity checks
- Check whether the date of any form could not be parsed (an empty list means every date was parsed).

In [8]:
# Paths of the forms whose filing date could not be extracted ("NA" marks a success)
[failed_path for failed_path in date_errors if failed_path != "NA"]

[]

- Check whether the content of any form could not be parsed (the paths listed are the forms that failed).

In [9]:
# Paths of the forms whose content could not be processed ("NA" marks a success)
[failed_path for failed_path in content_errors if failed_path != "NA"]

['10Q/amzn/10-Q/0001018724-16-000286.txt',
 '10Q/goog/10-Q/0001652044-18-000027.txt']

Sample structure of how the 10Q form's dates are stored.

In [10]:
# Display the collected 10-Q filing dates: a dict mapping each ticker to its list of date strings
ten_q_forms_dates

{'fb': ['20150731',
  '20151105',
  '20160428',
  '20160728',
  '20161103',
  '20170504',
  '20170727',
  '20171102',
  '20180426',
  '20180726',
  '20181031',
  '20190425',
  '20190725',
  '20191031',
  '20200430',
  '20200731',
  '20201030',
  '20210429',
  '20210729',
  '20211026'],
 'aapl': ['20170802',
  '20180202',
  '20180502',
  '20180801',
  '20190130',
  '20190501',
  '20190731',
  '20200129',
  '20200501',
  '20200731',
  '20210128',
  '20210429',
  '20210728',
  '20150428',
  '20150722',
  '20160127',
  '20160427',
  '20160727',
  '20170201',
  '20170503'],
 'amzn': ['20150724',
  '20151023',
  '20160429',
  '20160729',
  '20161028',
  '20170428',
  '20170728',
  '20171027',
  '20180427',
  '20180727',
  '20181026',
  '20190426',
  '20190726',
  '20191025',
  '20200501',
  '20200731',
  '20201030',
  '20210430',
  '20210730',
  '20211029'],
 'nflx': ['20150717',
  '20151016',
  '20160420',
  '20160719',
  '20161020',
  '20170719',
  '20171018',
  '20180418',
  '20180718',
 

Save the `ten_q_forms` dictionary and then, as a sanity check, reload it and verify that the reloaded dictionary equals the original.

In [11]:
# Persist the processed 10-Q forms with the highest available pickle protocol
ten_q_forms_pickle_path = 'ten_q_forms.pickle'
with open(ten_q_forms_pickle_path, 'wb') as out_file:
    pickle.dump(ten_q_forms, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read the file straight back to confirm the round trip
with open(ten_q_forms_pickle_path, 'rb') as in_file:
    ten_q_forms_loaded = pickle.load(in_file)

# Prints True when the reloaded dictionary equals the in-memory one
print(ten_q_forms_loaded == ten_q_forms)

True


Save the `ten_q_forms_dates` dictionary and then, as a sanity check, reload it and verify that the reloaded dictionary equals the original.

In [12]:
# Persist the 10-Q filing dates with the highest available pickle protocol
ten_q_dates_pickle_path = 'ten_q_forms_dates.pickle'
with open(ten_q_dates_pickle_path, 'wb') as out_file:
    pickle.dump(ten_q_forms_dates, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read the file straight back to confirm the round trip
with open(ten_q_dates_pickle_path, 'rb') as in_file:
    ten_q_forms_dates_loaded = pickle.load(in_file)

# Prints True when the reloaded dictionary equals the in-memory one
print(ten_q_forms_dates_loaded == ten_q_forms_dates)

True


# Downloading 10K forms

Download the `10K` forms using the `secedgar` Python package.

In [13]:
# Build the 10-K request for the five tickers (count=7 is passed per request;
# see the secedgar documentation for its exact meaning)
filings_10k = CompanyFilings(
    user_agent='Yashveer Singh Sohi (yashveer@seas.upenn.edu)',
    filing_type=FilingType.FILING_10K,
    cik_lookup=['fb', 'aapl', 'amzn', 'nflx', 'goog'],
    count=7,
)
# Write the fetched filings beneath the "10K/" directory
filings_10k.save("10K/")

40it [00:04,  9.93it/s]                        


Collecting all the file names, and file paths for the forms.

In [14]:
ten_ks_root_dir = "10K/"
# ticker -> sorted file names, and ticker -> matching relative file paths
# (these names are rebound here, replacing any earlier values)
filenames, filepaths = {}, {}
for ticker in ('fb', 'aapl', 'amzn', 'nflx', 'goog'):
    # Directory holding the 10-K files saved for this ticker
    ticker_dir = ten_ks_root_dir + ticker + "/10-K/"
    # Ordered by file name (this is not necessarily chronological order)
    ticker_filenames = sorted(os.listdir(ticker_dir))
    filenames[ticker] = ticker_filenames
    filepaths[ticker] = [ticker_dir + ticker_filename for ticker_filename in ticker_filenames]

- Reading all the forms
- Parsing the content
- Lemmatizing the words in the form and removing stop words
- Storing these final processed forms
- Storing the dates for each form in another dictionary

In [15]:
# ticker -> list of processed forms (each a list of cleaned tokens, or "NA" on failure)
ten_k_forms = {}
# ticker -> list of filing-date strings (or "NA" when no date was found), parallel to ten_k_forms
ten_k_forms_dates = {}

# One entry per form: "NA" when the step succeeded, otherwise the path of the failing file
date_errors = []
content_errors = []
for ticker, filepath in tqdm(filepaths.items(), total=len(filepaths), desc=" outer", position=0):
    for path in tqdm(filepath, total=len(filepath), desc=" inner loop", position=1, leave=True):
        with open(path, "r") as f:
            form_content = f.read()

        # The filing date is made of the digits found in the 20 characters
        # that follow the "FILED AS OF DATE" label
        m = re.search(pattern="FILED AS OF DATE", string=form_content)
        if m is None:
            # Label missing: record the file explicitly instead of relying on an exception
            date = "NA"
            date_errors.append(path)
        else:
            date = "".join([s for s in form_content[m.end():m.end()+20] if s.isnumeric()])
            date_errors.append("NA")

        try:
            form_text = BeautifulSoup(form_content.lower(), 'html.parser').get_text()
            form_tokens = lemmatize_text(form_text)
            form_tokens_clean = stop_words_removal(form_tokens)
        except Exception:
            # Best effort: keep going and remember which file failed.
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            form_tokens_clean = "NA"
            content_errors.append(path)
        else:
            content_errors.append("NA")

        # Create the per-ticker lists on first use, then append
        ten_k_forms.setdefault(ticker, []).append(form_tokens_clean)
        ten_k_forms_dates.setdefault(ticker, []).append(date)

 outer:   0%|          | 0/5 [00:00<?, ?it/s]
 inner loop:   0%|          | 0/7 [00:00<?, ?it/s][A
 inner loop:  14%|█▍        | 1/7 [00:11<01:07, 11.18s/it][A
 inner loop:  29%|██▊       | 2/7 [00:11<00:24,  4.86s/it][A
 inner loop:  43%|████▎     | 3/7 [00:22<00:30,  7.67s/it][A
 inner loop:  57%|█████▋    | 4/7 [00:32<00:26,  8.69s/it][A
 inner loop:  71%|███████▏  | 5/7 [00:44<00:19,  9.59s/it][A
 inner loop:  86%|████████▌ | 6/7 [00:55<00:10, 10.09s/it][A
 inner loop: 100%|██████████| 7/7 [01:07<00:00,  9.63s/it][A
 outer:  20%|██        | 1/5 [01:07<04:29, 67.42s/it]
 inner loop:   0%|          | 0/7 [00:00<?, ?it/s][A
 inner loop:  14%|█▍        | 1/7 [00:13<01:19, 13.25s/it][A
 inner loop:  29%|██▊       | 2/7 [00:25<01:02, 12.56s/it][A
 inner loop:  43%|████▎     | 3/7 [00:45<01:03, 15.98s/it][A
 inner loop:  57%|█████▋    | 4/7 [01:00<00:47, 15.69s/it][A
 inner loop:  71%|███████▏  | 5/7 [01:06<00:23, 11.98s/it][A
 inner loop:  86%|████████▌ | 6/7 [01:30<00:16, 

Sanity checks
- Check whether the date of any form could not be parsed (an empty list means every date was parsed).

In [16]:
# Paths of the forms whose filing date could not be extracted ("NA" marks a success)
[failed_path for failed_path in date_errors if failed_path != "NA"]

[]

- Check whether the content of any form could not be parsed (an empty list means every form was processed).

In [17]:
# Paths of the forms whose content could not be processed ("NA" marks a success)
[failed_path for failed_path in content_errors if failed_path != "NA"]

[]

Sample structure of how the 10K form's dates are stored.

In [18]:
# Display the collected 10-K filing dates: a dict mapping each ticker to its list of date strings
ten_k_forms_dates

{'fb': ['20160128',
  '20160427',
  '20170203',
  '20180201',
  '20190131',
  '20200130',
  '20210128'],
 'aapl': ['20171103',
  '20181105',
  '20191031',
  '20201030',
  '20211029',
  '20151028',
  '20161026'],
 'amzn': ['20150130',
  '20160129',
  '20170210',
  '20180202',
  '20190201',
  '20200131',
  '20210203'],
 'nflx': ['20180129',
  '20190129',
  '20190208',
  '20200129',
  '20210128',
  '20170127',
  '20180205'],
 'goog': ['20160329',
  '20190206',
  '20170203',
  '20180206',
  '20190205',
  '20200204',
  '20210203']}

Save the `ten_k_forms` dictionary and then, as a sanity check, reload it and verify that the reloaded dictionary equals the original.

In [19]:
# Persist the processed 10-K forms with the highest available pickle protocol
ten_k_forms_pickle_path = 'ten_k_forms.pickle'
with open(ten_k_forms_pickle_path, 'wb') as out_file:
    pickle.dump(ten_k_forms, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read the file straight back to confirm the round trip
with open(ten_k_forms_pickle_path, 'rb') as in_file:
    ten_k_forms_loaded = pickle.load(in_file)

# Prints True when the reloaded dictionary equals the in-memory one
print(ten_k_forms_loaded == ten_k_forms)

True


Save the `ten_k_forms_dates` dictionary and then, as a sanity check, reload it and verify that the reloaded dictionary equals the original.

In [20]:
# Persist the 10-K filing dates with the highest available pickle protocol
ten_k_dates_pickle_path = 'ten_k_forms_dates.pickle'
with open(ten_k_dates_pickle_path, 'wb') as out_file:
    pickle.dump(ten_k_forms_dates, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Read the file straight back to confirm the round trip
with open(ten_k_dates_pickle_path, 'rb') as in_file:
    ten_k_forms_dates_loaded = pickle.load(in_file)

# Prints True when the reloaded dictionary equals the in-memory one
print(ten_k_forms_dates_loaded == ten_k_forms_dates)

True
