## Project 1 Step 3 Parse, Clean, and Count Words
In this notebook, we will mainly do the following:
- Read in all downloaded files of 10-K and 10-Q
- Parse them and remove some unnecessary parts in the file using BeautifulSoup
- Tokenize and lemmatize all words in the file
- Remove stop words, non-English words, and words with digits
- Use the word list and dictionaries (H4N, Fin-Neg) to calculate negative word term weighting

### Import packages and list work paths

In [1]:
# Mount Google Drive (Colab only) so that the project files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")

from datetime import date
from tqdm import tqdm
from joblib import Parallel, delayed

from bs4 import BeautifulSoup
import re
from pathlib import Path
import json

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [3]:
# Download necessary sources
# - 'stopwords': read by remove_stop_words through stopwords.words('english')
# - 'words': read by tokenize_remove_nonenglish through nltk.corpus.words.words()
# - 'wordnet' / 'omw-1.4': presumably the data behind WordNetLemmatizer -- TODO confirm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
# Root folder of the project on the mounted Google Drive. Every other path is
# derived from it, so relocating the project only requires editing this line.
project_path = "/content/drive/MyDrive/Mini 5/Natural Language Processing/Project 1/"

# Data folder (the generated JSON word lists are written here) and the
# sub-folders holding the downloaded 10-Q and 10-K filings.
data_path = project_path + "data/"
data_path_10q = data_path + "10Q/"
data_path_10k = data_path + "10K/"

# Auxiliary CSV files located directly under the project root.
cik_lookup_filename = project_path + "CIK_lookup_results_cleaned.csv"
sp500_constituents_path = project_path + "sp500_constituents.csv"
sp500_id_path = project_path + "sp500_w_addl_id.csv"

### Define helper functions
#### Parse the file and remove unnecessary elements

In [5]:
def parse_remove_tables(soup):
  """
  Strip table markup from a parsed document.

  Every <td> element, then every <tr> element, then every <table> element
  found in `soup` is destroyed in place with decompose().

  Returns the same (now mutated) soup object.
  """
  for tag_name in ("td", "tr", "table"):
    for element in soup.find_all(tag_name):
      element.decompose()
  return soup

In [6]:
def parse_remove_link(soup):
  """
  Drop every <a ...> element (hyperlinks) from the parsed document.

  The elements are destroyed in place; the same soup object is returned.
  """
  anchors = soup.find_all("a")
  for anchor in anchors:
    anchor.decompose()
  return soup

In [7]:
def parse_remove_xml(soup):
  """
  Drop every element whose tag name is "xml" from the parsed document.

  The elements are destroyed in place; the same soup object is returned.
  """
  xml_nodes = soup.find_all("xml")
  for node in xml_nodes:
    node.decompose()
  return soup

In [8]:
def parse_remove_xbrl(soup):
  """
  Drop every element whose tag name is "xbrl" from the parsed document.

  The elements are destroyed in place; the same soup object is returned.
  """
  xbrl_nodes = soup.find_all("xbrl")
  for node in xbrl_nodes:
    node.decompose()
  return soup

In [9]:
def parse_translate_ascii(soup):
  """
  Function purpose: translate the "encoded" characters &nbsp and &amp back to their ASCII form

  Parameters:
    soup: the raw document text as a str (not a BeautifulSoup object, despite the name)

  Returns:
    the text with every &amp / &amp; replaced by "&" and every &nbsp / &nbsp;
    replaced by a single blank space

  Notes:
    - Matching is case-insensitive, so &AMP; and &NBSP; are translated too.
    - The terminating ";" is consumed when present. The previous pattern
      `(&amp)[.|\\s]*` left it behind ("AT&amp;T" became "AT&;T") and instead
      swallowed any following ".", "|" or whitespace characters.
    - Both entities are handled in a single pass, so text such as "&amp;nbsp;"
      is decoded once (to "&nbsp;") and not a second time.
  """
  replacements = {'amp': '&', 'nbsp': ' '}
  return re.sub(r'&(amp|nbsp);?',
                lambda match: replacements[match.group(1).lower()],
                soup,
                flags=re.IGNORECASE)

#### Remove html/xml tags to convert file into usual string

In [10]:
def parse_remove_htmltags(soup):
  """
  Turn the parsed document into plain lower-case text.

  The markup is discarded by soup.get_text() and the resulting string is
  lower-cased. Punctuation and digits are NOT removed here.
  """
  return soup.get_text().lower()

#### Tokenize and remove non-English words

In [11]:
def tokenize_remove_nonenglish(text):
  """
  Function purpose: tokenize our text and remove those non-English words from future consideration

  Parameters:
    text: the document as a single string

  Returns:
    a string made of the surviving tokens joined by single spaces

  The text is split with nltk.wordpunct_tokenize. A purely alphabetic token is
  kept only when it appears in the NLTK `words` corpus; tokens that are not
  purely alphabetic (isalpha() is False) are always kept at this stage.
  """
  # Building the vocabulary set is expensive, so it is built on the first call
  # only and cached on the function object instead of once per document.
  english_words = getattr(tokenize_remove_nonenglish, "_english_words", None)
  if english_words is None:
    english_words = set(nltk.corpus.words.words())
    tokenize_remove_nonenglish._english_words = english_words

  kept_tokens = (token for token in nltk.wordpunct_tokenize(text)
                 if token in english_words or not token.isalpha())
  return " ".join(kept_tokens)

#### Lemmatization of the string

In [12]:
def lemmatize_words(words):
  """
  Function purpose: Lemmatize our word list
  Source: https://towardsdatascience.com/nlp-in-the-stock-market-8760d062eb92

  Parameters:
    words: an iterable of word strings

  Returns:
    a list with one lemma per input word, in the same order; every word is
    lemmatized with the part-of-speech argument 'v'
  """
  # One lemmatizer is shared by all words instead of constructing a new one per word.
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word, 'v') for word in words]

In [13]:
def lemmatization(text):
  """
  Function purpose: Lemmatize the text
  Source: https://towardsdatascience.com/nlp-in-the-stock-market-8760d062eb92

  Parameters:
    text: the document as a single string

  Returns:
    a list of lemmas, one per run of word characters (regex \\w+) found in text
  """
  # Raw string: '\w' in a normal string literal is an invalid escape sequence.
  word_pattern = re.compile(r'\w+')
  return lemmatize_words(word_pattern.findall(text))

#### Remove stop words from the text

In [14]:
def remove_stop_words(text_lemmatize):
  """
  Function purpose: remove English stop words from a lemmatized word list
  Source: https://towardsdatascience.com/nlp-in-the-stock-market-8760d062eb92

  Parameters:
    text_lemmatize: list of lemmatized words

  Returns:
    a new list, in the original order, without the words that match a
    lemmatized NLTK English stop word
  """
  # The stop words are lemmatized the same way as the text so both sides are
  # comparable; a set makes each membership test O(1) instead of a list scan.
  lemma_english_stopwords = set(lemmatize_words(stopwords.words('english')))
  return [word for word in text_lemmatize if word not in lemma_english_stopwords]

#### Only keep those words with at least 2 characters

In [15]:
def remove_single_letter_words(text_lemmatize):
  """
  Keep only the words that are at least two characters long.

  Returns a new list; the relative order of the kept words is unchanged.
  """
  long_enough = []
  for token in text_lemmatize:
    if len(token) >= 2:
      long_enough.append(token)
  return long_enough

#### Remove all words with digits

In [16]:
def remove_words_with_digits(text_lemmatize):
  """
  Discard every word that contains at least one digit character.

  Returns a new list; the relative order of the kept words is unchanged.
  """
  def has_digit(token):
    for ch in token:
      if ch.isdigit():
        return True
    return False

  return [token for token in text_lemmatize if not has_digit(token)]

#### Streamline the parsing, cleaning and process of making word lists
We consider two ways of parsing and cleaning the files, and experiment with both.

In [17]:
def clean_method1(file_path):
  """
  Function purpose: clean the text only use beautifulsoup objects

  Parameters:
    file_path: path of the filing to clean; it is read as UTF-8

  Returns:
    the cleaned word list: lemmatized words without stop words, words shorter
    than two characters, or words containing digits
  """
  # The context manager closes the file handle once BeautifulSoup has consumed
  # it; previously the handle returned by open() was never closed.
  with open(file_path, 'r', encoding='utf-8') as filing:
    soup = BeautifulSoup(filing, 'xml')
  soup = parse_remove_tables(soup)
  soup = parse_remove_link(soup)
  # parse_remove_xml and parse_remove_xbrl are currently not applied:
  # soup = parse_remove_xml(soup)
  # soup = parse_remove_xbrl(soup)
  text = parse_remove_htmltags(soup)
  text = tokenize_remove_nonenglish(text)
  text = lemmatization(text)
  text = remove_stop_words(text)
  text = remove_single_letter_words(text)
  text = remove_words_with_digits(text)
  return text

In [18]:
def clean_method2(file_path):
  """
  Function purpose: clean the text by first stripping parts of the raw file
  with regular expressions, then continuing with beautifulsoup objects

  Parameters:
    file_path: path of the filing to clean; it is read as UTF-8

  Returns:
    the cleaned word list: lemmatized words without stop words, words shorter
    than two characters, or words containing digits

  Steps on the raw text:
    1. remove the <SEC-HEADER> ... </SEC-HEADER> section
    2. remove every <DOCUMENT> section whose <TYPE> starts with EX, GRAPHIC,
       ZIP or EXCEL
    3. translate the &amp / &nbsp entities with parse_translate_ascii
  The remaining text then goes through the same table/link removal and word
  cleaning steps that clean_method1 applies.
  """
  # Explicit encoding, consistent with clean_method1, so the result does not
  # depend on the platform's default locale.
  soup = Path(file_path).read_text(encoding='utf-8')

  # With DOTALL, '.' already matches newlines; the former '(.|\s)*?' alternation
  # matched the same text but backtracked needlessly.
  soup = re.sub(r'<SEC-HEADER>.*?</SEC-HEADER>\n', '', soup, flags=re.DOTALL)

  # The same substitution is applied for each document type, in the original order.
  # 'EXCEL' is already covered by the 'EX' prefix and is kept only for explicitness.
  for doc_type in ('EX', 'GRAPHIC', 'ZIP', 'EXCEL'):
    soup = re.sub(r'<DOCUMENT>\n<TYPE>' + doc_type + r'(.*?)</DOCUMENT>\n', '', soup, flags=re.DOTALL)
  soup = parse_translate_ascii(soup)

  soup2 = BeautifulSoup(soup, 'xml')
  soup2 = parse_remove_tables(soup2)
  soup2 = parse_remove_link(soup2)
  text = parse_remove_htmltags(soup2)
  text = tokenize_remove_nonenglish(text)
  text = lemmatization(text)
  text = remove_stop_words(text)
  text = remove_single_letter_words(text)
  text = remove_words_with_digits(text)
  return text

In [19]:
def clean_a_file(file_path, which_method=True):
  """
  Clean one filing with the selected cleaning method.

  clean_method1 is used when `which_method` compares equal to True (the
  default); clean_method2 is used for any other value.

  Returns the word list produced by the chosen method.
  """
  cleaner = clean_method1 if which_method == True else clean_method2
  return cleaner(file_path)

In [20]:
# example_path = os.path.join(data_path, "example.txt")

In [21]:
# example_wl = clean_a_file(example_path)
# print(len(example_wl))
# example_wl2 = clean_a_file(example_path, False)
# print(len(example_wl2))

In [22]:
# example10q_path = os.path.join(data_path, "example10q.txt")

In [23]:
# example10q_wl = clean_a_file(example10q_path)
# print(len(example10q_wl))
# example10q_wl2 = clean_a_file(example10q_path, False)
# print(len(example10q_wl2))

In [24]:
# example_str = Path(example_path).read_text()
# print(len(example_str))
# example10q_str = Path(example10q_path).read_text()
# print(len(example10q_str))

#### Make word lists for files with different tickers and filing dates
In this section, we will read in all 10-K and 10-Q files, parse and clean them, and save the generated word lists. Since this step can take a long time to execute, we will save the result in a JSON file for future use.
Our goal is to generate a dictionary of dictionaries like the one below, which allows fast O(1) lookup by ticker and then by filing date:

{"XXX": {"20190101": [...],
         "20200101": [...]
        },

 "YYY": {"20210101": [...],
         "20220101": [...]
        },
        
 "ZZZ": {"20170101": [...],
         "20160101": [...]}
}

In [25]:
def pipeline_function_10k(some_tickers, i):
  """
  Function purpose: build the word lists of all 10-K files of a group of tickers
  and save them, together with a processing summary, as JSON files

  Parameters:
    some_tickers: iterable of ticker names; each must be a sub-folder of data_path_10k
    i: group number, only used to name the output files

  Output files (written into data_path):
    wordlist10k_{i}.json          {ticker: {filing_date: word_list}}
    wordlist10k_{i}_summary.json  problem/success counts and the list of
                                  (ticker, file name) pairs that failed

  A file counts as a problem when reading or cleaning it raises an exception,
  or when its cleaned word list has fewer than 500 words. Problem files are
  recorded and skipped; they never abort the run.
  """
  header_line_count = 9   # number of leading lines read from each filing
  filing_date_line = 7    # 0-based header line whose tail holds the filing date
                          # (presumably the "FILED AS OF DATE" line -- TODO confirm)
  min_word_count = 500    # shorter word lists are treated as failed cleanings

  # Make dictionary of word list
  dict_10k = {}
  # Record files that cannot be cleaned using current algorithms
  problem_files_10k = []
  problem_cnt_10k = 0
  success_cnt_10k = 0

  for ticker in tqdm(some_tickers):
    # list all 10-K files under a certain ticker
    ticker_dir = os.path.join(data_path_10k, ticker)
    dict_for_a_ticker = {}
    for fi in os.listdir(ticker_dir):
      file_path = os.path.join(ticker_dir, fi)
      # parse & clean the file, and make the word list
      try:
        with open(file_path, 'r') as fp:
          file_header = [next(fp) for _ in range(header_line_count)]
        # the 8 characters before the trailing newline of that header line
        filing_date = str(file_header[filing_date_line][-9:-1])
        word_list = clean_a_file(file_path)
      except Exception:
        # Best effort: any processing error is recorded, but KeyboardInterrupt
        # and SystemExit are no longer swallowed as a bare `except:` did.
        problem_cnt_10k += 1
        problem_files_10k.append((ticker, fi))
        continue
      if len(word_list) < min_word_count:
        problem_cnt_10k += 1
        problem_files_10k.append((ticker, fi))
        continue
      dict_for_a_ticker[filing_date] = word_list
      success_cnt_10k += 1
    dict_10k[ticker] = dict_for_a_ticker

  # save the result dictionary into a json file
  with open(os.path.join(data_path, "wordlist10k_{}.json".format(str(i))), "w") as outfile_10k:
    json.dump(dict_10k, outfile_10k)
  # save the summary dictionary into a json file
  dict_10k_summary = {"problem_count": problem_cnt_10k, "success_count": success_cnt_10k, "problem_files": problem_files_10k}
  with open(os.path.join(data_path, "wordlist10k_{}_summary.json".format(str(i))), "w") as outfile_10k_summary:
    json.dump(dict_10k_summary, outfile_10k_summary)

#### List all tickers under the 10-K directory and split them into several groups for execution due to the RAM usage limit

In [26]:
# Each entry of the 10-K directory is treated as one ticker
tickers_10k = os.listdir(data_path_10k)
# Cut the ticker list into 5 consecutive groups that are processed one at a time
# to limit RAM usage; the last group also takes whatever remains after the
# integer division.
begin_index = [k * (len(tickers_10k) // 5) for k in range(5)]
end_index = [*begin_index[1:], len(tickers_10k)]

In [27]:
# Show the group start positions, the group end positions and the total number of tickers
print(begin_index, end_index, len(tickers_10k), sep="\n")

[0, 108, 216, 324, 432]
[108, 216, 324, 432, 540]
540


#### Iterate over all tickers across the groups split above to calculate the word list for each file, by ticker and filing date

In [28]:
# Process one ticker group after another; each group writes its own JSON files.
for i, index_pair in enumerate(zip(begin_index, end_index)):
  print(i, *index_pair)
  some_tickers = tickers_10k[slice(*index_pair)]
  pipeline_function_10k(some_tickers, i)
  print("Finish {} - {}".format(*index_pair))

0 0 108


100%|██████████| 108/108 [1:47:49<00:00, 59.90s/it]


Finish 0 - 108
1 108 216


100%|██████████| 108/108 [1:53:32<00:00, 63.08s/it]


Finish 108 - 216
2 216 324


100%|██████████| 108/108 [1:20:56<00:00, 44.97s/it]


Finish 216 - 324
3 324 432


100%|██████████| 108/108 [47:20<00:00, 26.30s/it]


Finish 324 - 432
4 432 540


100%|██████████| 108/108 [15:34<00:00,  8.65s/it]


Finish 432 - 540


In [29]:
# # Make dictionary of word list
# dict_10k = {}
# # List all tickers under the 10-K directory
# tickers_10k = os.listdir(data_path_10k)[:3]
# # Record files that cannot be cleaned using current algorithms
# problem_files_10k = []
# problem_cnt_10k = 0
# success_cnt_10k = 0
# # we only use 2016-2021 data per updated instructions
# year_range = set(range(16, 22))

In [30]:
# for ticker in tqdm(tickers_10k):
#   # list all 10-K files under a certain ticker and check whether it is from 2016-2021
#   file_list = os.listdir(os.path.join(data_path_10k, ticker))
#   # file_list = [x for x in file_list if int(x.split("-")[1]) == year_range]
#   dict_for_a_ticker = {}
#   for fi in file_list:
#     # parse & clean the file, and make the word list
#     try: 
#       with open(os.path.join(data_path_10k, ticker, fi), 'r') as fp:
#         file_header = [next(fp) for i in range(9)]
#         filing_date = str(file_header[7][-9:-1])
#       word_list = clean_a_file(os.path.join(data_path_10k, ticker, fi))
#       if len(word_list) < 500:
#         problem_cnt_10k += 1
#         problem_files_10k.append((ticker, fi))
#         continue
#       dict_for_a_ticker[filing_date] = word_list
#       success_cnt_10k += 1
#     except:
#       problem_cnt_10k += 1
#       # if the file cannot be processed for some reason, record its path
#       problem_files_10k.append((ticker, fi))
#   dict_10k[ticker] = dict_for_a_ticker

In [31]:
# # save the result dictionary into a json file
# with open(os.path.join(data_path, "wordlist10k_{}.json".format()), "w") as outfile_10k:
#   json.dump(dict_10k, outfile_10k)

In [32]:
# for key, item in dict_10k.items():
#   for _, i in item.items():
#     print(len(i))

In [33]:
# # Make dictionary of word list
# dict_10q = {}
# # List all tickers under the 10-K directory
# tickers_10q = os.listdir(data_path_10q)
# # Record files that cannot be cleaned using current algorithms
# problem_files_10q = []
# problem_cnt_10q = 0
# success_cnt_10q = 0

In [34]:
# for ticker in tqdm(tickers_10q):
#   # list all 10-Q files under a certain ticker and check whether it is from 2016-2021
#   file_list = os.listdir(os.path.join(data_path_10q, ticker))
#   # print(file_list)
#   # file_list = [x for x in file_list if int(x.split("-")[1]) in year_range]
#   # print(file_list)
#   dict_for_a_ticker = {}
#   for fi in file_list:
#     # parse & clean the file, and make the word list
#     try: 
#       with open(os.path.join(data_path_10q, ticker, fi), 'r') as fp:
#         file_header = [next(fp) for i in range(9)]
#         filing_date = str(file_header[7][-9:-1])
#       word_list = clean_a_file(os.path.join(data_path_10q, ticker, fi))
#       if len(word_list) < 500:
#         problem_cnt_10q += 1
#         problem_files_10q.append((ticker, fi))
#         continue
#       dict_for_a_ticker[filing_date] = word_list
#       success_cnt_10q += 1
#     except:
#       problem_cnt_10q += 1
#       # if the file cannot be processed for some reason, record its path
#       problem_files_10q.append((ticker, fi))
#   dict_10q[ticker] = dict_for_a_ticker

In [35]:
# # save the result dictionary into a json file
# with open(os.path.join(data_path, "wordlist10q.json"), "w") as outfile_10q:
#   json.dump(dict_10q, outfile_10q)

In [36]:
# print("success number of 10-K: {}".format(success_cnt_10k))
# print("success number of 10-Q: {}".format(success_cnt_10q))
# print("failing number of 10-K: {}".format(problem_cnt_10k))
# print("failing number of 10-Q: {}".format(problem_cnt_10q))