In [2]:
import keyboard
import nest_asyncio
import os
import pandas as pd
import pymupdf
import re
import time
from dotenv import load_dotenv
from IPython.display import display, clear_output
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.remote.webdriver import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from seleniumbase import SB


In [3]:
# Patch asyncio so that nested event loops are permitted inside the notebook kernel.
nest_asyncio.apply()
# Load variables from a local .env file into the process environment;
# EMAIL and PASSWORD are read from it later via os.getenv().
load_dotenv()

### Select stocks to query

In [None]:
# Load the index members with their weights for the 2024-09-20 snapshot,
# heaviest weight first.
df = (
    pd.read_csv('SPX Index Weightings Composition.csv', usecols=['Members', '2024-09-20'])
    .sort_values(by='2024-09-20', ascending=False)
)

# Keep the 55 heaviest members; the ticker is the first whitespace-separated
# token of each 'Members' entry.
top_stocks = [member.split()[0] for member in df['Members'].head(55)]
print(top_stocks)

In [None]:
# Hand-maintained ticker list used by the scraping loop below.
# NOTE: this assignment replaces the `top_stocks` list derived from the CSV in
# the previous cell, so running both cells leaves only this list in effect.
top_stocks = ['AAPL', 'ABBV', 'ABT', 'ACGBY', 'ACN', 'ADBE', 'AI', 'ALV', 'AMD', 'AMGN', 'AMZN', 'ASML', 'AVGO', 'BABA', 'BAC', 'BACHF', 'BX', 'CAT', 'CHL', 'CICHY', 'CIHKY', 'COST', 'CRM', 'CSCO', 'CVX', 'DHR', 'GE', 'GOOG', 'HD', 'HDB', 'HYMTF', 'IBM', 'IBN', 'IDCBY', 'INFY', 'INTU', 'ISRG', 'JNJ', 'JPM', 'KB', 'KO', 'KWEIF', 'LIN', 'LLY', 'LTOUF', 'MA', 'MC', 'MCD', 'META', 'MPNGF', 'MRK', 'MSFT', 'NFLX', 'NOW', 'NVDA', 'OR', 'ORCL', 'PCCYF', 'PEP', 'PG', 'PKX', 'PM', 'PNGAY', 'QCOM', 'RS', 'SAN', 'SAP', 'SHG', 'SIE', 'SIEGY', 'SONY', 'SSDIY', 'SSNLF', 'SU', 'TCEHY', 'TMO', 'TSLA', 'TTE', 'TXN', 'UNH', 'V', 'WFC', 'WMT', 'XOM']

# Second ticker list; it is not referenced by any other cell visible in this
# notebook (presumably TOPIX 500 names queried in a separate run — confirm).
TPX500 = ['HTHIY', 'ITOCY', 'MITSY', 'TAK', 'NTDOY', 'HMC']

### Access data

In [6]:
def find_quarter(title):
    """Extract the first quarter label from a transcript title.

    A label is the letter ``Q``, one digit, a space and a four-digit number
    starting with ``2`` (for example ``"Q3 2021"``).

    Args:
        title: Title text to search.

    Returns:
        The matched label as a string, or ``None`` when the title has none.
    """
    found = re.search(r'Q\d 2\d{3}', title)
    return found.group() if found else None

def reformat_quarters(quarters):
    """Flip quarter labels from ``"Q1 2019"`` to ``"2019 Q1"``.

    Putting the year first makes the labels sort chronologically as plain
    strings.

    Args:
        quarters: Iterable of labels, each made of exactly two
            whitespace-separated parts (quarter, then year).

    Returns:
        A set with every label rewritten as ``"<year> <quarter>"``.
    """
    year_first = set()
    for label in quarters:
        quarter, year = label.split()
        year_first.add(f"{year} {quarter}")
    return year_first

def scroll_to_bottom(sb, pause=0.1, max_rounds=None):
    """Scroll the current page down until its height stops growing.

    The page height is read through ``sb.cdp.evaluate`` and the page is
    scrolled with ``sb.cdp.scroll_to_y``. After each scroll the function waits
    ``pause`` seconds and re-reads the height; it stops as soon as two
    consecutive readings are equal.

    Args:
        sb: Object exposing ``cdp.evaluate(js)`` and ``cdp.scroll_to_y(y)``.
        pause: Seconds to wait after each scroll so lazily loaded content can
            be appended. Defaults to the previously hard-coded 0.1.
        max_rounds: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps the original unbounded behaviour; pass an
            integer to guarantee termination on pages that keep growing.
    """
    last_height = sb.cdp.evaluate("document.body.scrollHeight")
    rounds = 0

    while max_rounds is None or rounds < max_rounds:
        # Scroll down to the last known bottom of the page
        sb.cdp.scroll_to_y(last_height)
        rounds += 1

        # Give the page time to append more content
        time.sleep(pause)

        # A stable height means nothing more was loaded
        new_height = sb.cdp.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

def sanitise_filename(filename):
    """Replace characters that are unsafe in file names with a space.

    Each of ``\\ / * ? : " < > |`` is replaced by a single space; all other
    characters are left untouched.

    Args:
        filename: Candidate file name (without directory part).

    Returns:
        The cleaned string, the same length as the input.
    """
    forbidden = re.compile(r'[\\/*?:"<>|]')
    return forbidden.sub(" ", filename)

In [None]:
# Credentials are read from the environment (EMAIL / PASSWORD). Fail fast with a
# clear message instead of handing None to press_keys() during login.
email = os.getenv('EMAIL')
password = os.getenv('PASSWORD')
if not email or not password:
    raise RuntimeError("EMAIL and PASSWORD must be set in the environment before scraping.")

# Quarter labels, written as they appear in transcript titles, that every stock
# is checked against (Q1 2019 through Q4 2024).
EXPECTED_QUARTERS = frozenset(
    f'Q{quarter_number} {year}'
    for year in range(2019, 2025)
    for quarter_number in range(1, 5)
)

with SB(uc=True, test=True) as sb:
    sb.activate_cdp_mode("https://seekingalpha.com/")

    # Login
    sb.cdp.click_if_visible('[data-test-id="header-button-sign-in"]')
    sb.cdp.press_keys('input[autocomplete="username"]', email, timeout=2)
    sb.cdp.press_keys('input[name="password"]', password, timeout=2)
    sb.cdp.click_if_visible('[data-test-id="sign-in-button"]')

    for stock in top_stocks:
        # Set folder paths
        folder_path = stock
        earnings_path = f'{stock}/Earnings Call Transcripts'
        others_path = f'{stock}/Other Transcripts'
        os.makedirs(folder_path, exist_ok=True)
        os.makedirs(earnings_path, exist_ok=True)
        os.makedirs(others_path, exist_ok=True)

        # Quarters still missing for this stock; a label is discarded each time
        # an earnings transcript carrying it is saved.
        quarters = set(EXPECTED_QUARTERS)

        # Match the ticker as a whole word. A plain substring test lets short
        # symbols such as 'V', 'AI' or 'MA' match unrelated titles.
        ticker_pattern = re.compile(rf'\b{re.escape(stock)}\b')

        time.sleep(1)

        # Search stock in search bar
        sb.cdp.press_keys('[data-test-id="search-input"]', stock, timeout=2)

        # Click on first symbol item in search results
        sb.cdp.click('ul.mb-0.list-none.px-0 > li:nth-child(1) a')

        # Click on transcripts
        sb.cdp.click_if_visible('[data-test-id="Transcripts & Insights"]')

        time.sleep(1)

        scroll_to_bottom(sb)

        # Collect (date, title, url) for every listed post before navigating
        # away, so the loop below never touches elements of a page that has
        # since been replaced.
        post_list_info = []
        for post in sb.cdp.find_elements('[data-test-id="post-list-item"]'):
            title_element = post.query_selector('[data-test-id="post-list-item-title"]')
            date_element = post.query_selector('[data-test-id="post-list-date"]')
            # Skip malformed items instead of aborting the whole multi-stock run.
            if title_element is None or date_element is None:
                continue

            relative_url = title_element.href
            if not relative_url:
                continue

            sanitised_title = sanitise_filename(title_element.text)
            full_url = f"https://seekingalpha.com{relative_url}"
            post_list_info.append((date_element.text, sanitised_title, full_url))

        for date, title, url in post_list_info:
            # Stop at the first post dated 2017 (the listing appears to be
            # ordered newest first, judging by the recorded output).
            if '2017' in date:
                break

            # Select only transcripts from stock
            if 'Transcript' not in title or not ticker_pattern.search(title):
                continue

            sb.cdp.open(url)
            scroll_to_bottom(sb)

            # Prep html
            html_source = sb.cdp.get_page_source()

            # Prep txt: one line per <p> element, each terminated by a newline
            content_string = "".join(
                paragraph.text + "\n" for paragraph in sb.cdp.find_elements('p')
            )

            # Save to local ('Earning' also covers 'Earnings')
            if 'Earning' in title:
                target_dir = earnings_path

                quarter = find_quarter(title)
                if quarter is not None:
                    quarters.discard(quarter)
            else:
                target_dir = others_path

            html_file_path = os.path.join(target_dir, f'{title}.html')
            txt_file_path = os.path.join(target_dir, f'{title}.txt')

            with open(html_file_path, "w", encoding="utf-8") as file:
                file.write(html_source)
            with open(txt_file_path, "w", encoding="utf-8") as file:
                file.write(content_string)
            print(title)

        # Check for missing quarters
        if quarters:
            # Year-first labels sort chronologically as plain strings.
            quarters_list = sorted(reformat_quarters(quarters))
            log_file_path = os.path.join(earnings_path, "missing_quarters_log.txt")
            with open(log_file_path, "w", encoding="utf-8") as log_file:
                log_file.write("Missing quarters between 2019 Q1 and 2024 Q4:\n")
                for quarter in quarters_list:
                    log_file.write(f"{quarter}\n")
            print(f"Missing {len(quarters)} quarters.")
        else:
            print("No missing quarters.")

        sb.cdp.refresh()

Toyota Motor Corporation (TM) Q2 2025 Earnings Conference Call Transcript
Toyota Motor Corporation (TM) Q4 2024 Earnings Call Transcript
Toyota Motor Corporation (TM) Q4 2023 Earnings Call Transcript
Toyota Motor Corporation (TM) Q2 2023 Earnings Call Transcript
Toyota Motor Corporation (TM) Management on Q4 2022 Results - Earnings Call Transcript
Toyota Motor Corporation (TM) Management on Q2 2022 Results - Earnings Call Transcript
Toyota Motor Corporation (TM) Management on Q3 2021 Results - Earnings Call Transcript
Toyota Motor Corp (TM) CEO Akio Toyoda on Q2 2021 (Session 2) Results - Earnings Call Transcript
Toyota Motor Corp (TM) CEO Akio Toyoda on Q2 2021 (Session 1) Results - Earnings Call Transcript
Toyota Motor Corp (TM) CEO Akio Toyoda on Q4 2020 Results (Session 2) - Earnings Call Transcript
Toyota Motor Corp (TM) on Q4 2020 Results (Session 1) - Earnings Call Transcript
Toyota Motor (TM) Management on Q1 2019 Results - Earnings Call Transcript
Missing 15 quarters.
Sony Gro

### Old Selenium Implementation

In [None]:
# Legacy Selenium version of the scraping loop, kept for reference.
# NOTE(review): `test_stocks`, `wait`, `driver` and `quarters` are not defined in
# any cell visible in this notebook, so this cell depends on kernel state from
# elsewhere. `quarters` is also never reset per stock inside this loop.
# NOTE(review): scroll_to_bottom() as defined in this notebook accesses `.cdp`
# on its argument; confirm that `driver` exposes it before re-running.
for stock in test_stocks:

    # Set folder paths
    folder_path = stock
    earnings_path = f'{stock}/Earnings Call Transcripts'
    others_path = f'{stock}/Other Transcripts'
    os.makedirs(folder_path, exist_ok=True)
    os.makedirs(earnings_path, exist_ok=True)
    os.makedirs(others_path, exist_ok=True)

    # Type the ticker into the search box and open the first symbol result.
    search_bar = wait.until(EC.presence_of_element_located((By.ID, 'main-search-id')))
    search_bar.send_keys(stock)
    time.sleep(1)
    search_result_symbols = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test-id="symbol-search-result-item"]')))
    most_relevant_link = search_result_symbols[0].find_element(By.CSS_SELECTOR, 'a.fVsJB')
    most_relevant_link.uc_click()

    # Open the "Transcripts & Insights" tab.
    transcripts = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test-id="Transcripts & Insights"]')))
    transcripts.uc_click()

    scroll_to_bottom(driver)
    time.sleep(1)

    # Copy date, title and URL out of every list item before navigating away,
    # so the download loop below works on plain strings rather than elements.
    post_list_items = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test-id="post-list-item"]')))
    post_list_info = []
    for item in post_list_items:
        date_element = item.find_element(By.CSS_SELECTOR, 'span[data-test-id="post-list-date"]')
        date = date_element.text
        title_element = item.find_element(By.CSS_SELECTOR, 'a[data-test-id="post-list-item-title"]')
        title = title_element.text
        url = title_element.get_attribute('href')
        post_list_info.append((date, title, url))

    for date, title, url in post_list_info:
        # Stop at the first post whose date text contains '2018'.
        if '2018' in date:
            break

        # Select only transcripts
        if 'Transcript' in title:
            driver.uc_open_with_reconnect(url, reconnect_time=1)
        else:
            continue
        
        # Prep html
        wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        html_source = driver.page_source

        # Prep txt
        content_string = ""
        # Waits for the content container; the returned element itself is not
        # used below (paragraphs are collected page-wide).
        content_container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.T2G6W[data-test-id="content-container"]')))
        scroll_to_bottom(driver)
        time.sleep(1)
        paragraphs = wait.until(EC.presence_of_all_elements_located((By.TAG_NAME, 'p')))
        for paragraph in paragraphs:
            content_string += paragraph.text + "\n"

        # Save to local
        # NOTE(review): `title` is used as a file name without sanitise_filename().
        if 'Earnings Call Transcript' in title:
            html_file_path = os.path.join(earnings_path, f'{title}.html')
            txt_file_path = os.path.join(earnings_path, f'{title}.txt')

            quarter = find_quarter(title)
            if quarter is not None:
                quarters.discard(quarter)
        else:
            html_file_path = os.path.join(others_path, f'{title}.html')
            txt_file_path = os.path.join(others_path, f'{title}.txt')

        with open(html_file_path, "w", encoding="utf-8") as file:
                file.write(html_source)
        with open(txt_file_path, "w", encoding="utf-8") as file:
                file.write(content_string)
        print(title)

    # Check for missing quarters
    if quarters:
        quarters_list = sorted(list(quarters))
        log_file_path = os.path.join(earnings_path, "missing_quarters_log.txt")
        with open(log_file_path, "w", encoding="utf-8") as log_file:
            log_file.write("Missing quarters between Q1 2019 and Q4 2024:\n")
            for quarter in quarters_list:
                log_file.write(f"{quarter}\n")
        print(f"Missing quarters logged to {log_file_path}.")
    else:
        print("No missing quarters.")    

Microsoft Corporation (MSFT) Presents at Morgan Stanley TMT Conference (Transcript)
Microsoft Corporation (MSFT) Q2 2025 Earnings Call Transcript
Microsoft Corporation (MSFT) Q1 2025 Earnings Call Transcript
Microsoft Corporation (MSFT) Goldman Sachs Communacopia + Technology Conference (Transcript)
Microsoft Corporation (MSFT) Citi 2024 Global TMT Conference (Transcript)
Microsoft Corporation (MSFT) Deutsche Bank's 2024 Technology Conference (Transcript)
Microsoft Corporation (MSFT) Q4 2024 Earnings Call Transcript
Microsoft Corporation (MSFT) BofA Securities 2024 Global Technology Conference (Transcript)
Microsoft Corporation (MSFT) Jefferies Software Conference (Transcript)
Microsoft Corporation (MSFT) J.P. Morgan's 52nd Annual Global Technology, Media and Communications Conference (Transcript)
Microsoft Corporation (MSFT) Q3 2024 Earnings Call Transcript
Microsoft Corporation (MSFT) Morgan Stanley Technology, Media and Telecom Conference (Transcript)
Microsoft Corporation (MSFT) Q2

TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF773ABDF85+26693]
	(No symbol) [0x00007FF773A1EAD0]
	(No symbol) [0x00007FF7738A91CA]
	(No symbol) [0x00007FF7738FF747]
	(No symbol) [0x00007FF7738FF97C]
	(No symbol) [0x00007FF7739533C7]
	(No symbol) [0x00007FF77392792F]
	(No symbol) [0x00007FF7739500B8]
	(No symbol) [0x00007FF7739276C3]
	(No symbol) [0x00007FF7738F0490]
	(No symbol) [0x00007FF7738F1743]
	GetHandleVerifier [0x00007FF773E1436D+3525677]
	GetHandleVerifier [0x00007FF773E27F3B+3606523]
	GetHandleVerifier [0x00007FF773E1CEE3+3561379]
	GetHandleVerifier [0x00007FF773B87C0A+853194]
	(No symbol) [0x00007FF773A2990F]
	(No symbol) [0x00007FF773A25674]
	(No symbol) [0x00007FF773A25816]
	(No symbol) [0x00007FF773A14D89]
	BaseThreadInitThunk [0x00007FF8C5ABE8D7+23]
	RtlUserThreadStart [0x00007FF8C657BF2C+44]


In [33]:
# Do it for one stock first
# Single-stock walkthrough: type 'AAPL' into the search box and open the first result.
# NOTE(review): `wait` is not defined in any visible cell — presumably a WebDriverWait; confirm.
search_bar = wait.until(EC.presence_of_element_located((By.ID, 'main-search-id')))
search_bar.send_keys('AAPL')
# Fixed pause before the search results are looked up.
time.sleep(1)
search_result_symbols = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test-id="symbol-search-result-item"]')))
# Click the link inside the first symbol result.
most_relevant_link = search_result_symbols[0].find_element(By.CSS_SELECTOR, 'a.fVsJB')
most_relevant_link.click()

In [36]:
# Wait until the "Transcripts & Insights" element is present, then click it.
transcripts = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-test-id="Transcripts & Insights"]')))
transcripts.click()

In [173]:
# Scroll the listing to the bottom so that all posts are loaded.
# NOTE(review): `driver` is not defined in any visible cell, and scroll_to_bottom()
# as defined in this notebook accesses `.cdp` on its argument — confirm before re-running.
scroll_to_bottom(driver)

In [174]:
# Wait for, and collect, every element marked data-test-id="post-list-item".
post_list_items = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '[data-test-id="post-list-item"]')))

In [175]:

    # Copy date text, title text and link target out of each list item into
    # plain (date, title, url) tuples.
    post_list_info = []
    for item in post_list_items:
        date_element = item.find_element(By.CSS_SELECTOR, 'span[data-test-id="post-list-date"]')
        date = date_element.text
        title_element = item.find_element(By.CSS_SELECTOR, 'a[data-test-id="post-list-item-title"]')
        title = title_element.text
        url = title_element.get_attribute('href')
        post_list_info.append((date, title, url))

In [176]:
# Number of (date, title, url) tuples collected from the listing.
len(post_list_info)

84

In [177]:
    # Debug walk over the collected posts: print each date and title and open
    # only the posts whose title contains 'Transcript'.
    for date, title, url in post_list_info:
        print(date)
        print(title)

        # Stop downloading transcripts before 2019
        if '2018' in date:
            break

        # Select only transcripts
        if 'Transcript' in title:
            driver.get(url)
            print('clicked')
        else:
            print('continue')
            continue


Thu, Jan. 30
Apple outlines 2025 revenue growth targets amid strong Q1 results and Apple Intelligence expansion
continue
Thu, Jan. 30
Apple Inc. (AAPL) Q1 2025 Earnings Call Transcript
clicked
Thu, Oct. 31, 2024
Apple Inc. (AAPL) Q4 2024 Earnings Call Transcript
clicked
Thu, Aug. 01, 2024
Apple Inc. (AAPL) Q3 2024 Earnings Call Transcript


KeyboardInterrupt: 

In [120]:
# Experiment: click each transcript title directly and navigate back.
# NOTE: the recorded run of this cell raised StaleElementReferenceException
# after the first click/back round trip; the items collected before navigating
# were reported as no longer present in the current frame.
for item in post_list_items:
    # time.sleep(1)
    date_element = item.find_element(By.CSS_SELECTOR, 'span[data-test-id="post-list-date"]')
    date = date_element.text

    # Stop downloading transcripts before 2019
    if '2018' in date:
        break

    title_element = item.find_element(By.CSS_SELECTOR, 'a[data-test-id="post-list-item-title"]')
    title = title_element.text

    # Open every post whose title contains 'Transcript', then go back to the listing
    if 'Transcript' in title:
        title_element.click()
        print(1)
        driver.back()

    


1


StaleElementReferenceException: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=134.0.6998.36); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x0056CED3+25523]
	(No symbol) [0x004F23B4]
	(No symbol) [0x003C06E3]
	(No symbol) [0x003C70CE]
	(No symbol) [0x003C935A]
	(No symbol) [0x003C93D7]
	(No symbol) [0x00408401]
	(No symbol) [0x00408ECB]
	(No symbol) [0x003FE201]
	(No symbol) [0x0042D844]
	(No symbol) [0x003FE124]
	(No symbol) [0x0042DA74]
	(No symbol) [0x0044F273]
	(No symbol) [0x0042D5F6]
	(No symbol) [0x003FC55F]
	(No symbol) [0x003FD8A4]
	GetHandleVerifier [0x00872713+3193843]
	GetHandleVerifier [0x008869E9+3276489]
	GetHandleVerifier [0x00880F0C+3253228]
	GetHandleVerifier [0x0060B0C0+673184]
	(No symbol) [0x004FB43D]
	(No symbol) [0x004F8568]
	(No symbol) [0x004F8709]
	(No symbol) [0x004EAE90]
	BaseThreadInitThunk [0x75025D49+25]
	RtlInitializeExceptionChain [0x76F7CDEB+107]
	RtlGetAppContainerNamedObjectPath [0x76F7CD71+561]


In [45]:
# Inspect one list item (index 3) to check the title and date selectors.
test = post_list_items[3]
title_element = test.find_element(By.CSS_SELECTOR, 'a[data-test-id="post-list-item-title"]')
date_element = test.find_element(By.CSS_SELECTOR, 'span[data-test-id="post-list-date"]')

title = title_element.text
date = date_element.text
    
print(f"Title: {title}")
print(f"Date: {date}")

Title: Apple Inc. (AAPL) Q3 2024 Earnings Call Transcript
Date: Thu, Aug. 01, 2024


In [100]:
# Click the first list item whose visible text contains 'Earnings', then stop.
for item in post_list_items:
    if 'Earnings' in item.text:
        item.click()
        break

# For AAPL the first list item is an earnings-call *insights* article. It is not
# clicked because its text does not contain 'Earnings'. That is acceptable here:
# the task needs transcripts, not insights.


In [60]:
# Dump the HTML of the page currently open in `driver` to
# "Data Scraping/page_source.html" for offline inspection.
html_source = driver.page_source

# Define the folder path
folder_path = "Data Scraping"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_path, "page_source.html")

# Save the HTML source to a file
with open(file_path, "w", encoding="utf-8") as file:
    file.write(html_source)

print(f"HTML file created successfully at {file_path}.")

HTML file created successfully at Data Scraping\page_source.html.


In [182]:
# Reset the text accumulator and wait for the transcript content container.
content_string = ""

# NOTE(review): 'T2G6W' looks like a generated class name that may change
# between site builds — confirm the selector still matches.
content_container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.T2G6W[data-test-id="content-container"]')))

In [183]:
# Collect every <p> inside the content container and append its text to
# content_string, one paragraph per line.
paragraphs = content_container.find_elements(By.TAG_NAME, 'p')

for paragraph in paragraphs:
    content_string += paragraph.text + "\n"

In [51]:
# Show the extracted transcript text.
print(content_string)

Apple Inc. (NASDAQ:AAPL) Q1 2025 Earnings Conference Call January 30, 2025 5:00 PM ET
Company Participants
Suhasini Chandramouli - Director, Investor Relations
Tim Cook - Chief Executive Officer
Kevan Parekh - Chief Financial Officer
Conference Call Participants
Erik Woodring - Morgan Stanley
Ben Reitzes - Melius
Michael Ng - Goldman Sachs
Amit Daryanani - Evercore
Wamsi Mohan - Bank of America
Samik Chatterjee - JPMorgan
David Vogt - UBS
Krish Sankar - TD Cowen
Richard Kramer - Arete Research
Atif Malik - Citi
Ben Bollin - Cleveland Research Company
Suhasini Chandramouli
Good afternoon, and welcome to the Apple Q1 Fiscal Year 2025 Earnings Conference Call. My name is Suhasini Chandramouli, Director of Investor Relations. Today's call is being recorded.
Speaking first today are Apple CEO, Tim Cook, and he will be followed by CFO, Kevan Parekh. After that, we'll open the call to questions from analysts.
Please note that some of the information you'll hear during our discussion today wil

In [57]:
# Experiment: write the extracted text into a new PDF with PyMuPDF.
# NOTE(review): the whole of content_string is inserted at a single point on a
# single page; confirm whether insert_text wraps or paginates long text, since a
# full transcript is unlikely to fit on one page.
pdf_document = pymupdf.open()

# Add a page
page = pdf_document.new_page()

# Define the text insertion point
text_insertion_point = pymupdf.Point(72, 72)  # 1 inch from top-left corner

# Add text to the page
page.insert_text(text_insertion_point, content_string, fontsize=12)

# Save the PDF with name .pdf
pdf_document.save("text_string.pdf")

print("PDF created successfully.")

PDF created successfully.


In [59]:
# Write the extracted transcript text to text_string.txt (UTF-8) in the working directory.
with open("text_string.txt", "w", encoding="utf-8") as file:
    file.write(content_string)

In [104]:
# Return to the previous page in the browser history.
driver.back()

In [None]:
# Experiment: append paragraph text by index; when a paragraph has gone stale,
# re-locate the container and its paragraphs, then retry the same index once.
# NOTE(review): the retry is not itself guarded. If the refreshed list is shorter
# or the element goes stale again, the exception propagates. The 'yes'/'no'
# prints are debugging output.
for i in range(len(paragraphs)):
    try:
        content_string += paragraphs[i].text + "\n"
        print('yes')
    except StaleElementReferenceException:
        content_container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.T2G6W[data-test-id="content-container"]')))
        paragraphs = content_container.find_elements(By.TAG_NAME, 'p')
        content_string += paragraphs[i].text + "\n"
        print('no')