In [224]:
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#setting the directory to current directory, put the ipynb in the folder with the html files
folder_path = os.getcwd()

#hold the content of each HTML file
html_contents = []
#hold the names of the html file (same order as html_contents)
html_ls = []
#result df to hold the final output
result = pd.DataFrame(columns=['filename', 'EPS'])

#loop each file in the directory
#os.listdir returns entries in arbitrary order, so sort them to keep the
#file order (and therefore the output rows) reproducible between runs
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.html'):  # keep html files only
        continue
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):  # skip a directory that happens to end in .html
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        html_contents.append(file.read())
    html_ls.append(filename)

In [225]:
# Reference phrasings of the EPS row label; table cells are compared against
# every entry and the best-scoring cell is treated as the EPS label.
texts = [
    'Net earnings per share - basic',
    'basic',
    'GAAP net income and earnings per share',
    'Earnings per share  to common shareholders',
    'Earnings (loss) per common share',
    'Net earnings Per Common Share',
    'Adjusted earnings per share',
    'Net income per common share',
    'Earnings per share common shareholders:',
    'LOSS PER SHARE - BASIC',
]

# Function to normalize text
# A single stemmer is shared by every call: normalize_text runs once per table
# cell, so building a new PorterStemmer each time is wasted work.
_STEMMER = PorterStemmer()


def normalize_text(text):
    """Lower-case, de-punctuate, tokenize and stem ``text``.

    Args:
        text: Raw string (a dictionary label or the text of a table cell).

    Returns:
        The stemmed tokens joined by single spaces.
    """
    text = text.lower()  #convert all str to lower case
    # Replace (rather than delete) every non-alphanumeric character with a
    # space, so words joined only by punctuation ("share—basic", "income/loss")
    # stay separate tokens instead of fusing into one unmatched token.
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    tokens = word_tokenize(text)  #tokenize
    return ' '.join(_STEMMER.stem(token) for token in tokens)  #stemming


In [226]:
# In the soup, use TF-IDF cosine similarity to find the target cell that holds EPS label
def find_most_similar_cell(soup, labels=None, min_words=3):
    """Return the text of the ``<td>`` cell that best matches a reference label.

    Every cell text and every reference label is normalized with
    ``normalize_text``, vectorized together with TF-IDF, and compared by cosine
    similarity. For each reference label, the highest-scoring cell that has at
    least ``min_words`` whitespace-separated words is taken as that label's
    candidate; the candidate with the highest score overall is returned.

    Args:
        soup: Parsed document exposing ``find_all('td')``.
        labels: Reference label strings. Defaults to the module-level ``texts``.
        min_words: Minimum number of words a cell must contain to be eligible
            (filters out one-word cells such as 'Basic').

    Returns:
        The original (un-normalized) text of the best cell, the string
        "No cells found." when the document has no ``<td>`` cells, or ""
        when no cell satisfies ``min_words``.
    """
    if labels is None:
        labels = texts

    #extract all cells from soup
    cells = soup.find_all('td')
    if not cells:
        return "No cells found."

    #extract and normalize cell texts
    cell_texts = [cell.get_text(" ", strip=True) for cell in cells]
    normalized_cell_texts = [normalize_text(cell_text) for cell_text in cell_texts]

    #normalize dictionary texts
    normalized_labels = [normalize_text(label) for label in labels]

    #vectorize all texts together so labels and cells share one vocabulary
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(normalized_labels + normalized_cell_texts)

    #one similarity matrix (labels x cells) instead of one call per label
    n_labels = len(normalized_labels)
    similarity = cosine_similarity(tfidf_matrix[:n_labels], tfidf_matrix[n_labels:])

    #eligibility depends only on the cell, so compute it once
    long_enough = [len(cell_text.split()) >= min_words for cell_text in cell_texts]

    highest_similarity = -1
    most_similar_cell_text = ""
    for label_scores in similarity:
        #iterate over cell indices from highest to lowest similarity
        for idx in label_scores.argsort()[::-1]:
            if not long_enough[idx]:
                continue
            if label_scores[idx] > highest_similarity:
                highest_similarity = label_scores[idx]
                most_similar_cell_text = cell_texts[idx]
            break  #only the best eligible cell per label is considered

    return most_similar_cell_text




#finetune our text dictionary for accuracy
# Preview the matched label for at most PREVIEW_LIMIT files. Slicing instead of
# range(50) avoids an IndexError when fewer files were loaded.
PREVIEW_LIMIT = 50
for html_content in html_contents[:PREVIEW_LIMIT]:
    soup = BeautifulSoup(html_content, 'html.parser')
    print(find_most_similar_cell(soup))


Net earnings per share – basic
Earnings (loss) per common share
Per Common Share
Adjusted earnings per diluted share
LOSS PER SHARE - BASIC AND DILUTED
Basic earnings per share
Basic earnings per common share
Diluted earnings per common share:
Net income per common share:
BASIC EARNINGS PER SHARE:
Basic and diluted loss per share
Net income per common share:
Net income per share attributable to UCT common stockholders:
Net income (loss) per common share:
Net loss per common share:
Loss Per Common Share:
Shares used to compute basic net income per share
Earnings per share attributable
    to Tetra Tech:
Income (loss) per share—basic
NET EARNINGS PER COMMON SHARE:
Basic Earnings per Share
Net (loss) income per share attributable to the Company
Earnings per common share:
Basic net income per share
Earnings per share:
Net loss per share:
Net income per common share
Basic and diluted earnings per share
Earnings per share:
Net income per common share
Basic and diluted net (loss) income per c

In [227]:
#write a function to parse EPS value from cell with string information, it inputs str and output float
def parse_number(price):
    """Convert a financial-table cell string to a number.

    Dollar signs, thousands separators and closing parentheses are removed.
    A leading "(" marks a negative value (accounting notation), with or
    without its closing parenthesis.

    Args:
        price: Cell text such as "$1,234.50", "(0.41)" or "$ (0.41".

    Returns:
        The parsed float, or 0 when nothing is left after cleaning.

    Raises:
        ValueError: If the cleaned text is not a valid number (the offending
            text is printed before re-raising).
    """
    try:
        # Remove the dollar sign, commas and right bracket first and strip
        # afterwards: stripping first leaves the space in "$ (0.41)" in front
        # of "(", which hides the negative marker and makes float() fail.
        price = price.replace("$", "").replace(",", "").replace(")", "").strip()

        # Return 0 if the stripped price is an empty string
        if price == "":
            return 0

        # A leading left parenthesis means negative, even without a closing one
        if price.startswith("("):
            return -float(price[1:])

        return float(price)
    except ValueError as e:
        print(f"Error converting '{price}' to float: {e}")
        raise

In [228]:
#write a function to find the EPS for the label in the soup tables, it will return a EPS value
def _first_numeric_value(cells):
    """Return the parsed number from the first cell containing a digit, else None."""
    for value_cell in cells:
        value_text = value_cell.get_text(strip=True)
        if re.search(r'[0-9]', value_text):  # Make sure there are numbers in the cell
            return parse_number(value_text)
    return None


def find_value_after_label(soup, label):
    """Find the first numeric value that follows ``label`` in any table.

    For each ``<td>`` whose text contains ``label``, the remaining cells of the
    same row are scanned for the first one containing a digit. If the row has
    none, the next row is scanned starting from its second cell (the first is
    assumed to hold a sub-label such as 'Basic').

    Args:
        soup: Parsed document exposing ``find_all('table')``.
        label: Label text to look for inside table cells.

    Returns:
        The value parsed by ``parse_number``, or 0 if the label is empty or no
        value is found.

    Raises:
        ValueError: Propagated from ``parse_number`` when a cell containing a
            digit is not a parsable number.
    """
    # An empty label is a substring of every cell, so it would "match" the very
    # first cell and return an unrelated number.
    if not label:
        return 0

    for table in soup.find_all('table'):
        rows = table.find_all('tr')

        for i, row in enumerate(rows):
            cells = row.find_all(['td'])

            for j, cell in enumerate(cells):
                # Join nested text nodes with a space so the comparison uses
                # the same cell text form as labels built with get_text(" ", strip=True).
                if label not in cell.get_text(" ", strip=True):
                    continue

                # Label found, check the same row for a valid value
                value = _first_numeric_value(cells[j+1:])

                # If not found in the same row, check the next row if it exists
                if value is None and i + 1 < len(rows):
                    next_row_cells = rows[i + 1].find_all(['td'])
                    value = _first_numeric_value(next_row_cells[1:])  #start from second cell so no 'Basic'

                if value is not None:
                    return value

    return 0  # Return 0 if no valid value is found


# Extract the EPS label and value for every loaded file.
# Iterating over the loaded files (instead of range(50)) avoids an IndexError
# with fewer than 50 files and no longer skips files beyond the 50th.
# Rows are collected first and the frame is built once, so re-running this cell
# replaces `result` instead of appending duplicate rows to it.
eps_records = []
for file_number, (filename, html_content) in enumerate(zip(html_ls, html_contents), start=1):
    soup = BeautifulSoup(html_content, 'html.parser')
    print('file', file_number, filename)
    expression = find_most_similar_cell(soup)
    val = find_value_after_label(soup, expression)
    eps_records.append((filename, val))
    print(expression, val)

result = pd.DataFrame(eps_records, columns=['filename', 'EPS'])

file 1 0000004977-20-000054.html
Net earnings per share – basic 0.78
file 2 0000008947-20-000044.html
Earnings (loss) per common share -0.41
file 3 0000046080-20-000050.html
Per Common Share 0
file 4 0000066570-20-000013.html
Adjusted earnings per diluted share 1.18
file 5 0000314808-20-000062.html
LOSS PER SHARE - BASIC AND DILUTED -15.19
file 6 0000706129-20-000012.html
Basic earnings per share 0.26
file 7 0000846617-20-000024.html
Basic earnings per common share 0.47
file 8 0000874766-20-000033.html
Diluted earnings per common share: 0.74
file 9 0000875320-20-000014.html
Net income per common share: 2.32
file 10 0000892537-20-000010.html
BASIC EARNINGS PER SHARE: 0.71
file 11 0000895419-20-000042.html
Basic and diluted loss per share -0.57
file 12 0000939057-20-000186.html
Net income per common share: 0.61
file 13 0000950103-20-008424.html
Net income per share attributable to UCT common stockholders: 0.24
file 14 0001008654-20-000048.html
Net income (loss) per common share: -0.16
fi

In [229]:
#control for outliers: any EPS whose magnitude exceeds 100 is reset to 0
is_outlier = result['EPS'].abs() > 100
result.loc[is_outlier, 'EPS'] = 0
result

Unnamed: 0,filename,EPS
0,0000004977-20-000054.html,0.78
1,0000008947-20-000044.html,-0.41
2,0000046080-20-000050.html,0.0
3,0000066570-20-000013.html,1.18
4,0000314808-20-000062.html,-15.19
5,0000706129-20-000012.html,0.26
6,0000846617-20-000024.html,0.47
7,0000874766-20-000033.html,0.74
8,0000875320-20-000014.html,2.32
9,0000892537-20-000010.html,0.71


In [230]:
# Write the filename/EPS table to output.csv without the row index
result.to_csv('output.csv', encoding='utf-8', index=False)