In [None]:
import json
from pathlib import Path
import time
import datetime

import requests
from fake_useragent import UserAgent
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains

import stock

# Shared fake_useragent instance; process_data reads `ua.chrome` from it to
# build the User-Agent header for its HTTP requests.
ua = UserAgent()

In [None]:
def convert_to_number(text) -> int | float:
    try:
        text = text.replace("$", "")
        if "." in text:
            return float(text)
        else:
            return int(text)
    except:
        #stock.logger.exception("Failed to convert to number : \"{}\"".format(text))
        return np.nan

In [None]:
def process_data(base_url, ticker, data_json, timeout=30):
    """Scrape per-period metrics and merge them into ``data_json``.

    For each (URL suffix, JSON key) pair below, the page ``base_url + suffix``
    is downloaded and the second element with class ``historical_data_table``
    is read (presumably the quarterly table — confirm against the site).
    Each row's first cell is a date and its second cell the value.

    A row is merged into the first existing ``data_json`` entry whose key
    contains the row's date with its last ``-``-separated component removed;
    a metric already present in that entry is left unchanged. Rows with no
    matching entry create a new entry keyed by the full date text.

    Args:
        base_url: URL prefix to which each metric suffix is appended.
        ticker: Not used by this function; kept for caller compatibility.
        data_json: Mapping of date string to a dict of metrics. Mutated in place.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot hang the run indefinitely.

    Returns:
        The same ``data_json`` object, after merging.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        IndexError: If a page has fewer than two ``historical_data_table``
            elements.
    """
    target_suffix_and_keys = [
        ("eps-earnings-per-share-diluted", "Diluted EPS"),
        ("revenue", "Total Revenue"),
        ("gross-profit", "Gross Profit"),
        ("operating-income", "Operating Income"),
        # NOTE(review): the "ebitda" page is stored under the "EBIT" key.
        # Confirm this mapping is intended; the key is left unchanged because
        # existing JSON files may depend on it.
        ("ebitda", "EBIT"),
        ("net-income", "Net Income"),
    ]

    headers = {'User-Agent': str(ua.chrome)}
    for suffix, json_key in target_suffix_and_keys:
        url = base_url + suffix
        res = requests.get(url, headers=headers, timeout=timeout)
        # Fail on HTTP errors here rather than on a confusing IndexError when
        # the error page turns out to contain no data tables.
        res.raise_for_status()

        # Name the parser explicitly: results no longer depend on which
        # optional parsers happen to be installed, and no warning is emitted.
        soup = BeautifulSoup(res.content, "html.parser")
        tables = soup.find_all(attrs={"class": "historical_data_table"})
        if len(tables) < 2:
            raise IndexError(
                f"Expected at least 2 historical_data_table elements at {url}, "
                f"found {len(tables)}"
            )
        table = tables[1]
        # Fall back to the table itself when the markup has no <tbody>.
        body = table.find("tbody") or table
        for row in body.find_all("tr"):
            tds = row.find_all("td")
            if len(tds) < 2:
                # Header or spacer rows carry no (date, value) pair.
                continue
            date_text = tds[0].text
            value = convert_to_number(tds[1].text)
            date_prefix = date_text.rsplit("-", 1)[0]
            for key in data_json:
                if date_prefix in key:
                    # Never overwrite a metric that is already recorded.
                    if json_key not in data_json[key]:
                        data_json[key][json_key] = value
                    break
            else:
                # No existing entry matched: start a new one for this date.
                data_json[date_text] = {json_key: value}

        # Short pause between metric pages.
        time.sleep(0.1)

    return data_json

In [None]:
# Chrome options for the browser session that the scraping loop drives.
options = webdriver.ChromeOptions()
# NOTE(review): presumably redundant, since Chrome runs JavaScript by default — confirm.
options.add_argument("--enable-javascript")
# NOTE(review): "detach" is presumably meant to keep the browser open after the
# driver goes away; confirm it has any effect on a Remote session.
options.add_experimental_option("detach", True)
# Open a session against the WebDriver endpoint at localhost:4444
# (presumably a Selenium server that must already be running — TODO confirm).
driver = webdriver.Remote(
    command_executor="http://localhost:4444/wd/hub",
    options=options
)

In [None]:
stock_list_path = stock.DATA_DIR / "us_stock_codes.csv"

# The first line of the CSV is skipped; every remaining line contributes the
# text of its first comma-separated column.
with open(stock_list_path, "r") as f:
    next(f, None)
    stock_list = [line.strip().split(",")[0] for line in f]

In [None]:
# Scratch inspection of the link inside the current typeahead suggestion.
# `candidate` is only bound by the scraping loop in a later cell, so the lookup
# is guarded: on a fresh kernel ("Restart & Run All") this cell evaluates to
# None instead of raising NameError.
candidate.find_element(By.TAG_NAME, "a") if "candidate" in globals() else None

In [None]:
base_url = "https://www.macrotrends.net"

# For each ticker: look it up in the site's research search box, derive the
# company's page prefix from the first suggestion, scrape its metrics, and
# merge them into the ticker's existing JSON file.
# NOTE(review): the [2:] slice skips the first two tickers — presumably a
# leftover from resuming an earlier run; confirm before a full run.
for ticker in stock_list[2:]:
    try:
        driver.get(f"{base_url}/stocks/research")
        # Named `search_box` rather than `input` so the builtin input() is not
        # shadowed for the rest of the notebook session.
        search_box = driver.find_element(By.TAG_NAME, "input")
        search_box.send_keys(ticker)
        # Give the typeahead suggestions time to load and settle.
        time.sleep(1)
        candidate = driver.find_element(By.CLASS_NAME, "typeahead__item")
        href = candidate.find_element(By.TAG_NAME, "a").get_attribute("href")
        # Drop the last path segment so metric suffixes can be appended.
        url = href.rsplit("/", 1)[0] + "/"

        data_json_path = stock.DATA_DIR / "codes" / f"{ticker}.json"
        with open(data_json_path, "r") as f:
            data_json = json.load(f)

        data_json = process_data(url, ticker, data_json)

        # Serialise before opening the file for writing, so a serialisation
        # error cannot leave the existing file truncated.
        serialized = json.dumps(data_json, indent=4)
        with open(data_json_path, "w") as f:
            f.write(serialized)

        stock.logger.info(f"Success : {data_json_path}")
    except KeyboardInterrupt:
        # Stop the whole run on interrupt. Swallowing it here would only skip
        # the current ticker and make the loop impossible to cancel.
        stock.logger.info(f"Interrupted : {ticker}")
        break
    except Exception:
        # Best effort per ticker: log the failure and continue with the next.
        stock.logger.exception(f"Failed : {ticker}")