# EDGAR Financial Data Parser

Full documentation on usage and dependencies is available in [README.md](README.md).

# 1. Import

In [12]:
import os
import re
import pandas as pd

# 2. Data Cleaning

Define the functions used for preprocessing the HTML and extracting the EPS value.

In [13]:
def clean_html_content(html_content: str) -> str:
    """
    Strip the HTML markup from a document and tidy up the remaining text.

    Closing tags are turned into line breaks, every other tag is dropped,
    blank lines are discarded, and currency signs, percent signs and closing
    parentheses that ended up on their own line are joined back to the
    neighbouring text.
    """

    # Markup-stripping rules; the order matters, so they are applied as listed
    tag_rules = (
        (r'</\w+>', '\n'),     # closing tag -> line break
        (r'<\w+>', ''),        # opening tag without attributes
        (r'<\w+/>', ''),       # self-closing tag
        (r'<\w+\s.*>', ''),    # tag with attributes (greedy up to the last '>' on the line)
        (r'<!.*>', ''),        # markup starting with '<!'
    )
    for pattern, replacement in tag_rules:
        html_content = re.sub(pattern, replacement, html_content)

    # Keep only the lines that still carry some text
    kept_lines = [line for line in html_content.split('\n') if line.strip()]
    html_content = '\n'.join(kept_lines)

    # Text-reshaping rules, again applied strictly in order
    layout_rules = (
        (r'\n\s+', '\n'),                    # drop whitespace at the start of a line
        (r'\n\)', ')'),                      # pull a lone ')' up to the previous line
        (r'\$\s*\n', '$'),                   # join a trailing '$' with the next line
        (r'\n\s*%', '%'),                    # pull a lone '%' up to the previous line
        (r'\(\$', '$('),                     # '($' -> '$('
        (r'\s+\)', ')'),                     # no whitespace before ')'
        (r'(\$[\d.,]+)\s+\n', r'\1\n'),      # no trailing whitespace after a dollar amount
    )
    for pattern, replacement in layout_rules:
        html_content = re.sub(pattern, replacement, html_content)

    return html_content

def extract_eps(html_content: str) -> str:
    """
    Function to extract the raw EPS value from the cleaned text.

    The EPS patterns are tried first, in the order listed; the loss patterns
    are only consulted when none of the EPS patterns matches.

    Parameters:
        html_content: the text to search (the notebook passes the output of
            clean_html_content).

    Returns:
        The first capture group of the first matching pattern, as an
        unconverted string (it may still be wrapped in parentheses).

    Raises:
        ValueError: if neither an EPS pattern nor a loss pattern matches.
    """

    # Define eps_patterns for searching (searched with re.MULTILINE)
    eps_patterns = [
        r'[Bb]asic and diluted earnings per share\n(.*)\n',
        r'[Cc]ore earnings per share\n\$(.*)\n',
        r'Earnings \(loss\) per share from continuing operations\nNet loss attributable to Valaris\n.*\n\$(.*)\n',
        r'(?i)earnings per basic\/diluted share.*?\n(?:.*?\n)*?\$([\d.]+)\s?\/\s?\$',
        r'(?i)(?:^.*(?:earnings|income|per|share).*\n)?basic.*?(?:per|share)?.*?\n(?:.*?\n){0,2}?\$([\d.\(\)]+)\n',
    ]

    # Define loss patterns in case there is no eps
    loss_patterns = [
        r'(?i)Net \(loss\).*?per share.*?\n\$(.*)\n',
        r'(?i)Net \(Loss\) Earnings\nBasic\n\$(.*)\n',
    ]

    # An EPS match always takes precedence over a loss match
    for eps_pattern in eps_patterns:
        eps_match = re.search(eps_pattern, html_content, re.MULTILINE)
        if eps_match:
            return eps_match.group(1)

    # Fall back to the loss value only when no EPS pattern matched
    for loss_pattern in loss_patterns:
        loss_match = re.search(pattern=loss_pattern, string=html_content)
        if loss_match:
            return loss_match.group(1)

    # Fail with a clear error instead of returning an unbound variable
    raise ValueError('No EPS or loss per share value could be found in the content')

def extract_negative_value(eps: str) -> float:
    """
    Function to convert a raw EPS string to a float, turning a value
    enclosed in parentheses into a negative number.

    Dollar signs and thousands separators (commas) are removed before the
    conversion, so captures such as '$1.12' or '(1,234.50)' are accepted.

    Parameters:
        eps: the raw value, e.g. '1.12', '(0.41)' or '$(3.15)'.

    Returns:
        The value as a float; negative when it was written in parentheses.

    Raises:
        ValueError: if the remaining text cannot be parsed as a number.
    """

    # Remove the characters that float() cannot parse
    normalized = eps.replace('$', '').replace(',', '').strip()

    # A number wrapped in parentheses denotes a negative value
    negative_match = re.search(pattern=r'\(([\d.]+)\)', string=normalized)
    if negative_match:
        return -1 * float(negative_match.group(1))

    return float(normalized)

In [14]:
# Get the base names (without extension) of the HTML files in the data folder.
# Other entries (hidden files, checkpoint folders, ...) are skipped because the
# processing loop re-appends '.html' to every name, and the list is sorted
# because os.listdir returns the entries in arbitrary order.
file_names = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(path=r'./data/')
    if file_name.endswith('.html')
)

In [15]:
# Collect one record per file; the list is converted to a dataframe later
df = []

for file_name in file_names:
    # Build the path of the HTML file that belongs to this name
    input_file_path = os.path.join('./data/', f'{file_name}.html')

    # Load the raw document; the file is closed before any parsing happens
    with open(file=input_file_path, mode='r', encoding='utf-8') as file:
        raw_html = file.read()

    # Strip the markup from the document
    html_content = clean_html_content(raw_html)

    # Find the EPS value and convert it to a float
    # (a value enclosed in parentheses becomes negative)
    eps = extract_negative_value(extract_eps(html_content))

    # Store the result for this file
    df.append({'filename': file_name, 'EPS': eps})

# 3. Generate the Results and Unit Testing

Define a unit test function

In [16]:
def test_eps_extraction(df: pd.DataFrame) -> None:
    """
    Function to test the EPS extraction from the dataframe.
    This function asserts that the EPS values for specific filenames
    in the dataframe are as expected.
    """
    
    # Check the given test cases
    assert df.loc[df.filename == '0001564590-20-019726', 'EPS'].values[0] == 0.08
    assert df.loc[df.filename == '0000066570-20-000013', 'EPS'].values[0] == 1.12
    assert df.loc[df.filename == '0000008947-20-000044', 'EPS'].values[0] == -0.41
    assert df.loc[df.filename == '0001564590-20-019431', 'EPS'].values[0] == 1.08
    assert df.loc[df.filename == '0001564590-20-019396', 'EPS'].values[0] == -3.15

In [17]:
# Turn the collected records into a dataframe
df = pd.DataFrame(data=df)

# Verify the known EPS values before presenting the table
test_eps_extraction(df)

# Display the resulting table
df

Unnamed: 0,filename,EPS
0,0000004977-20-000054,0.78
1,0000008947-20-000044,-0.41
2,0000046080-20-000050,-0.51
3,0000066570-20-000013,1.12
4,0000314808-20-000062,-15.19
5,0000706129-20-000012,0.26
6,0000846617-20-000024,0.47
7,0000874766-20-000033,1.34
8,0001323885-20-000027,-0.42
9,0001373715-20-000098,0.25


Running the cell below writes the results to the CSV file [results.csv](results.csv).

In [18]:
# Save the result to results.csv in the current working directory.
# A forward slash is used because a backslash is only a path separator on
# Windows; elsewhere '.\results.csv' would become the literal file name.
df.to_csv('./results.csv', index=False, header=True)