<a href="https://colab.research.google.com/github/thowley1207/capstone_project/blob/07/07_obtain_and_clean_8k_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/thowley1207/capstone_project/main/colab_initialization/initializer.py
!pip install --no-dependencies wrds

import numpy as np
import pandas as pd
import os
import requests
import re
import string
import pickle
from bs4 import BeautifulSoup as bs

import initializer
initializer.initialize_colab()

In [None]:
# Sub-directories for processed data, plus the form-type prefix.
# Where applicable, the form-type prefix is prepended to file names
# from this point forward.
linking_data_subdir = 'data/edgar_wrds_linking/'
q_cleaned_text_data_subdir = 'data/sec_edgar/8k_text_cleaned_quarterly/'
sec_edgar_data_subdir = 'data/sec_edgar/'
file_prefix = '8k_'

# File names carried down from prior work.
event_subset_file_name = 'event_subset.pkl'
master_index_all_periods_file_name = 'master_index_all_periods.pkl'

# New file names used further down in this notebook.
text_cleaned_file_name = 'text_cleaned.pkl'

In [None]:
# Read in the event subset data (file name carried down from prior work).
event_subset_path = (
    linking_data_subdir + file_prefix + event_subset_file_name)
event_subset = pd.read_pickle(event_subset_path)

# Read in the all-period master index data.
master_index_path = (
    linking_data_subdir + file_prefix + master_index_all_periods_file_name)
master_index_all_periods = pd.read_pickle(master_index_path)

**HELPER FUNCTION: for use obtaining each event's raw 8K HTML**

        get_8k_text(webloc_8k)

* **Function input parameters are:**
        # The web location of a given event's 8K document
        webloc_8k

* **Function returns:**
        # The raw HTML version of the event's 8K document text
        r.text

In [None]:
# Headers sent with every request made by get_8k_text.
header_content = {'User-Agent': 'Georgia Tech thowley3@georgiatech.edu',
                  'Host': 'www.sec.gov'}

def get_8k_text(webloc_8k, timeout=30):
    """Download the raw text of a single 8K document.

    Parameters
    ----------
    webloc_8k : str
        The web location (URL) of a given event's 8K document.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight
        to ``requests.get``. Without it a stalled connection would block
        the download loop indefinitely.

    Returns
    -------
    str
        The response body as decoded by ``requests`` (``r.text``).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        For connection problems and timeouts.
    """
    request_args = {'url': webloc_8k,
                    'headers': header_content,
                    'timeout': timeout}
    r = requests.get(**request_args)

    # Fail loudly on error statuses: otherwise the body of an error page
    # would be returned and treated downstream as the filing's text.
    r.raise_for_status()
    return r.text

**HELPER FUNCTION: for use in cleaning each event's raw 8K HTML**

* **NOTE: Code leverages the text cleaning process defined in the NLP Code Tutorial Part 1 provided in class resources**

        clean_8k_text(input_text)

* **Function input parameters are:**
        # The raw HTML output obtained from the SEC EDGAR data related to an individual event (8K)
        input_text

* **Function returns:**
        # The string version of the section of the event's 8K text relevant for use in LLM classification
        text

In [None]:
def clean_8k_text(input_text):
    """Reduce the raw HTML/SGML of one 8K submission to plain text.

    Processing order matters and is kept as follows:
      1. Flatten line breaks, then drop embedded non-text documents and
         the SEC/IMS header sections with regular expressions.
      2. Decode a few common entities and blank out the remaining ones.
      3. Parse with BeautifulSoup; remove <xbrl> tags that contain no
         "item <digit>" text and remove digit-heavy tables.
      4. Extract the text, strip leftover markup/entities, and delete
         everything up to and including the first "item <digit>" match.
      5. Collapse runs of whitespace to a single space.

    Parameters
    ----------
    input_text : str
        The raw output obtained from SEC EDGAR for an individual event.

    Returns
    -------
    str
        The cleaned text. Because step 4 consumes the first digit that
        follows "item", the result begins immediately after that digit
        (for example ".01. Entry into ...").
    """
    # Flatten to a single line so the non-DOTALL `.*?` patterns below can
    # span content that was originally spread over several lines.
    input_text = re.sub(r'(\r\n|\r|\n)',' ', input_text)

    # remove certain text with regex query
    input_text = re.sub(
        r'<DOCUMENT>\s*<TYPE>(?:GRAPHIC|ZIP|EXCEL|PDF|XML|JSON).*?</DOCUMENT>',
        ' ', input_text)
    input_text = re.sub(r'<SEC-HEADER>.*?</SEC-HEADER>',' ', input_text)
    input_text = re.sub(r'<IMS-HEADER>.*?</IMS-HEADER>',
                        ' ', input_text)

    # replace characters to correct them
    input_text = re.sub(r'&nbsp;', ' ', input_text)
    input_text = re.sub(r'&#160;', ' ', input_text)
    input_text = re.sub(r'&amp;', '&', input_text)
    input_text = re.sub(r'&#38;','&', input_text)

    # replace other encoded characters to whitespace
    input_text = re.sub(r'&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
                        ' ', input_text)

    soup = bs(input_text, 'html.parser')

    # Remove <xbrl> tags (and their content) unless they contain item
    # detail. `string=` is the current name of bs4's former `text=`
    # keyword; the inline (?i) flag was redundant with re.IGNORECASE.
    item_detail = re.compile(r'item\s*\d', re.IGNORECASE)
    for tag in soup.find_all('xbrl'):
        if not tag(string=item_detail):
            tag.decompose()

    # remove tables whose alphanumeric content is more than 10% digits
    for tag in soup.find_all('table'):
        table_text = tag.get_text()
        numbers = sum(c.isdigit() for c in table_text)
        letters = sum(c.isalpha() for c in table_text)

        # A table with no alphanumeric characters keeps the default ratio
        # of 1.0 and is therefore removed as well.
        ratio_number_letter = 1.0
        if (numbers + letters) > 0:
            ratio_number_letter = numbers/(numbers + letters)

        if ratio_number_letter > 0.1:
            tag.decompose()

    ## remove other text between tags used for styling
    text = soup.get_text()
    text = re.sub(r'<(?:ix|link|xbrli|xbrldi).*?>.*?<\/.*?>', ' ', text)

    ## remove extra whitespace from sentences
    text = "".join(line.strip() for line in text.split("\n"))

    ## some additional cleaning
    text = re.sub(r'--;', ' ', text)
    text = re.sub(r'__', ' ', text)

    cleanr = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    text = cleanr.sub(' ', text)

    # Drop everything up to and including the first "item <digit>".
    # re.sub leaves the text untouched when there is no match, so no
    # separate re.search guard is needed.
    text = re.sub(r'^.*?item(\s)*\d', '', text, count=1,
                  flags=re.IGNORECASE)

    ## replace more than one whitespace with single whitespace
    text = re.sub(r'\s+', ' ', text)
    return text

 **Step 1:**

* **Create A List Containing a Sample of Record Dicts From The Event Subset Data**
    * For each quarterly period in the event subset, generate a sample of 500 events for which we will obtain 8K data
    * Create a list containing each period's sample
    * Will allow us to write the output in pieces for each period to monitor the text retrieval process and manage any errors (disconnecting, etc.)
    * First period is 2005 Q1, last period is 2018 Q4
    * 56 periods; 28,000 events
    * Each period's sample is stored as a list of record dictionaries, with each individual dictionary corresponding to an individual event
    * Each dictionary holds the event ID we created when we initially created the all-period master index dataset (`event_id`) and the event's `period`
    * Each dictionary also holds the web location (`webloc`) corresponding to that event ID's 8k doc for the event

In [None]:
sec_html_prefix = 'https://www.sec.gov/Archives/'

# Number of events sampled per quarterly period.
n = 500

event_subset = event_subset[['event_id','period']]
event_subset = event_subset.groupby('period').sample(
    n=n, random_state=1).reset_index(drop=True)

# Add the full web location to the master index. The master index is not
# narrowed in place, so its `filename` column is still there if this cell
# is run again.
master_index_all_periods['webloc'] = (
    sec_html_prefix + master_index_all_periods['filename'])

webloc_lookup = master_index_all_periods[['event_id','webloc']]

event_subset = event_subset.merge(webloc_lookup, on='event_id')

# Build one list of record dicts per period by grouping on `period`
# rather than slicing every n rows: the inner merge above can drop or
# duplicate rows, and positional slices would then straddle periods.
# sort=False keeps the groups in their order of appearance in the frame.
event_subset_lst = [period_events.to_dict('records')
                    for _, period_events
                    in event_subset.groupby('period', sort=False)
                    if not period_events.empty]

 **Step 2:**

* **Obtain, Clean, and Write 8K Text For Each Event**
    * For each period in the event subset:
        * Set the name of the output location for the event-8K text data using the name of the current period
        * Create an empty dictionary clean_text_output_dict for storing the cleaned text output for the current period
        * For each event in the period sample:
            * Get the raw 8K text using the get_8k_text helper function
            * Get the clean 8K text using the clean_8k_text helper function
            * Add the clean 8K text to the clean_text_output_dict
        * Write the period's clean_text_output_dict to the file location specified as a pickle file

* **NOTE:**
    * A handful of the events' raw 8K text cause an error when attempting to clean the text due to the HTML being irregularly formatted
    * To handle this data, I've included a try-except clause that skips any data that raises an exception and does not include this data in the cleaned text output
    * Try-except clause simply skips this event and continues iteration
    * Insignificant number of documents cause this issue (20 / 28,000) so decision was made not to replace this data

In [None]:
# Make sure the output directory exists before the first write.
os.makedirs(q_cleaned_text_data_subdir, exist_ok=True)

# Iterate over every period (the previous range started at 1 and never
# produced the first period's file, which the combine step reads).
for period_events in event_subset_lst:

    file_name = f"{period_events[0]['period']}.pkl"
    output_loc = q_cleaned_text_data_subdir + file_name

    # Periods already written are skipped so an interrupted run can be
    # resumed; delete the file to force a period to be regenerated.
    if os.path.exists(output_loc):
        print(f"""
Cleaned 8k text output already exists at {output_loc}; skipping.""")
        continue

    clean_text_output_dict = {}

    for event in period_events:

        # Download errors are deliberately not caught: they abort the
        # run before this period's file is written.
        text_8k = get_8k_text(event['webloc'])
        try:
            text_8k = clean_8k_text(text_8k)
        except Exception as exc:
            # Irregular HTML: skip this event, but report why.
            print(f"""
Error with index: {event['event_id']}, webloc: {event['webloc']}""")
            print(f"Reason: {exc!r}")
            continue
        clean_text_output_dict[event['event_id']] = text_8k


    with open(output_loc, 'wb') as f:
        pickle.dump(clean_text_output_dict, f)

    print(f"""
Cleaned 8k text output generated and written to {output_loc}.""")

 **Step 3:**

* **Create a Combined Cleaned 8K Text Dataframe From The Cleaned 8K Text Data**
    * Create an empty list for holding each individual 8k text dataframe
    * Read in the event_id - 8k text dictionary created for each period above
    * Create a dataframe containing the event_id - 8K text data from each dict
    * Add each dataframe to the list, and create a single combined dataframe from the list of dataframes above
        * Each period's dictionary is read from the pickle file named after that period
        * Each dataframe has an event_id column and a text_8k column holding the cleaned 8K text
        * The combined dataframe is sorted by event_id and its index is reset
        * The combined dataframe is then written to a single pickle file

In [None]:
combined_event_8k_text_lst = []

for period_records in event_subset_lst:
    # Each period's pickle is named after the period of its first record.
    period_label = period_records[0]['period']
    input_loc = q_cleaned_text_data_subdir + f"{period_label}.pkl"

    with open(input_loc, 'rb') as pickle_file:
        period_text = pickle.load(pickle_file)

    # Dict keys become the index, which is then moved into `event_id`.
    period_df = pd.DataFrame.from_dict(
        period_text, orient='index', columns=['text_8k'])
    period_df = period_df.reset_index(names=['event_id'])

    combined_event_8k_text_lst.append(period_df)

# Stack all periods, order by event ID, and renumber the rows.
combined_cleaned_8k_text = pd.concat(combined_event_8k_text_lst)
combined_cleaned_8k_text = combined_cleaned_8k_text.sort_values(
    by=['event_id'])
combined_cleaned_8k_text = combined_cleaned_8k_text.reset_index(drop=True)

In [None]:
# Write the combined cleaned-text frame to a single pickle file.
text_cleaned_path = (
    sec_edgar_data_subdir + file_prefix + text_cleaned_file_name)
combined_cleaned_8k_text.to_pickle(text_cleaned_path)

In [None]:
# Read the combined cleaned-text pickle back from disk.
# NOTE(review): this rebinds `event_subset`, which earlier in the notebook
# held the event subset, to the cleaned 8K text frame; a distinct name
# would be clearer.
event_subset = pd.read_pickle(
    sec_edgar_data_subdir +
    file_prefix +
    text_cleaned_file_name)

In [None]:
# Preview the first rows of the frame currently bound to `event_subset`.
event_subset.head()

Unnamed: 0,event_id,text_8k
0,32,.01. Entry into a Material Definitive Agreemen...
1,37,.01. Entry into a Material Definitive Agreemen...
2,60,.01. Entry into a Material Definitive Agreemen...
3,202,.01. COMPLETION OF ACQUISITION OR DISPOSITION ...
4,234,.01. Entry into a Material Definitive Agreemen...
