In [1]:
import os
import time
import logging
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Selenium-related imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Configure logging
# Root logger setup: emit INFO and above, each record prefixed with its
# timestamp and severity level.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)

In [3]:
def selenium_manual_login(chrome_driver_path=None):
    """
    Opens a visible Chrome window on the Seeking Alpha login page and returns the
    browser's cookies once the user has logged in by hand.

    The call blocks on ``input()`` until the user presses ENTER in the notebook.
    It then waits up to 15 seconds for a profile link to appear; if none shows up
    only a warning is logged and the cookies are read anyway. The browser is
    always closed before returning.

    Args:
        chrome_driver_path: Optional path to a ChromeDriver executable. When
            omitted, the ``CHROMEDRIVER_PATH`` environment variable is used; if
            that is unset, the legacy hard-coded location is used when the file
            exists; otherwise webdriver-manager provides the driver.

    Returns:
        The list of cookie dicts from ``driver.get_cookies()``, or ``None`` when
        an exception is raised after the browser has started.
    """
    chrome_options = Options()
    # Headless mode is deliberately NOT enabled: the user has to see and
    # interact with the login page.
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/105.0.0.0 Safari/537.36"
    )

    # Resolve the ChromeDriver binary: explicit argument > environment variable
    # > legacy machine-specific path (kept so existing setups keep working)
    # > webdriver-manager, which installs/locates a driver automatically.
    legacy_driver_path = "F:/OneDrive - KAIST/Labs/N22_Prof_Ryonhee/chromedriver-win64/chromedriver.exe"
    if not chrome_driver_path:
        chrome_driver_path = os.environ.get("CHROMEDRIVER_PATH")
    if not chrome_driver_path:
        if os.path.isfile(legacy_driver_path):
            chrome_driver_path = legacy_driver_path
        else:
            chrome_driver_path = ChromeDriverManager().install()

    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        logging.info("Navigating to Seeking Alpha login page...")
        driver.get("https://seekingalpha.com/account/login")

        # Hand control to the user; execution resumes when ENTER is pressed.
        logging.info("Please log in manually in the opened browser window. Once you have logged in and the page has fully loaded, press ENTER here to continue...")
        input("Press ENTER after logging in...")

        # Best-effort confirmation only: a missing indicator is not treated as
        # a failure, so cookies are still extracted below.
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//a[contains(@class, 'user-profile')]")))
            logging.info("Login success indicator detected.")
        except Exception:
            logging.warning("Login indicator not detected. Ensure you have successfully logged in.")

        # Extract cookies from the current browser session.
        return driver.get_cookies()
    except Exception as e:
        logging.error(f"Error during manual login: {e}")
        return None
    finally:
        # Close the browser on every path, including errors.
        driver.quit()


In [4]:
def convert_cookies_for_requests(cookies_list):
    """
    Flattens Selenium's cookie records into the plain ``{name: value}`` mapping
    that `requests` accepts for its ``cookies`` argument.

    Each record must provide "name" and "value" keys; when a name appears more
    than once, the last record wins.
    """
    return {record["name"]: record["value"] for record in cookies_list}

In [5]:
# --- API Access Section ---

# Request headers sent with every API call made through `requests`:
# a desktop-Chrome User-Agent string plus a Referer pointing at the site root.
HEADERS = {
    "User-Agent": "".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
        "AppleWebKit/537.36 (KHTML, like Gecko) ",
        "Chrome/105.0.0.0 Safari/537.36",
    )),
    "Referer": "https://seekingalpha.com/",
}


In [6]:
def fetch_listing_page(page_number, cookies):
    """
    Requests one page (50 entries) of the earnings-call-transcript listing from
    the articles API, sending the shared HEADERS and the given session cookies.

    Returns the decoded JSON body on HTTP 200. Any other status code, or any
    exception raised while requesting or decoding, is logged as an error and
    yields None.
    """
    query = {
        "filter[category]": "earnings::earnings-call-transcripts",
        "filter[since]": "0",
        "filter[until]": "0",
        "include": "author,primaryTickers,secondaryTickers",
        "isMounting": "true",
        "page[size]": "50",
        "page[number]": str(page_number)
    }
    try:
        resp = requests.get(
            "https://seekingalpha.com/api/v3/articles",
            headers=HEADERS,
            params=query,
            cookies=cookies,
            timeout=10
        )
        # Guard clause: anything other than 200 is reported and abandoned.
        if resp.status_code != 200:
            logging.error(f"Error fetching listing page {page_number}: HTTP {resp.status_code}")
            return None
        logging.info(f"Listing page {page_number} fetched successfully.")
        return resp.json()
    except Exception as e:
        logging.error(f"Exception fetching listing page {page_number}: {e}")
        return None

In [7]:
def fetch_article_details(article_id, cookies):
    """
    Requests the full record for a single article (an earnings call transcript)
    by ID, asking the API to include the related author, ticker, tag,
    presentation and sentiment resources.

    Returns the decoded JSON body on HTTP 200. Any other status code, or any
    exception raised while requesting or decoding, is logged as an error and
    yields None.
    """
    related_resources = (
        "author", "primaryTickers", "secondaryTickers", "otherTags",
        "presentations", "presentations.slides", "author.authorResearch",
        "author.userBioTags", "co_authors", "promotedService", "sentiments",
    )
    query = {"include": ",".join(related_resources)}
    try:
        resp = requests.get(
            f"https://seekingalpha.com/api/v3/articles/{article_id}",
            headers=HEADERS,
            params=query,
            cookies=cookies,
            timeout=10
        )
        # Guard clause: anything other than 200 is reported and abandoned.
        if resp.status_code != 200:
            logging.error(f"Error fetching details for article {article_id}: HTTP {resp.status_code}")
            return None
        logging.info(f"Article {article_id} details fetched successfully.")
        return resp.json()
    except Exception as e:
        logging.error(f"Exception fetching details for article {article_id}: {e}")
        return None


In [8]:
def _publication_period(published_at):
    """Return ``(year, quarter)`` for an ISO timestamp, or ("Unknown", "Unknown")."""
    if not published_at:
        return "Unknown", "Unknown"
    try:
        # A trailing "Z" is stripped before parsing because
        # datetime.fromisoformat() rejects it on older Python versions.
        pub_date = datetime.fromisoformat(published_at.rstrip("Z"))
    except (AttributeError, TypeError, ValueError) as e:
        logging.warning(f"Error parsing published_at date: {e}")
        return "Unknown", "Unknown"
    # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
    return pub_date.year, f"Q{(pub_date.month - 1) // 3 + 1}"


def _ticker_label(entry):
    """Return the ticker text stored on an ``included`` entry, or None."""
    ticker_attributes = entry.get("attributes") or {}
    # NOTE(review): "name" is an assumed fallback for responses that do not
    # carry "symbol" -- confirm against a live API payload.
    return ticker_attributes.get("symbol") or ticker_attributes.get("name")


def _primary_ticker_symbol(article_json):
    """Return the article's primary ticker symbol, or "Unknown_Company"."""
    included = article_json.get("included") or []

    # First choice: an included resource whose type is literally
    # "primaryTickers". Only the first such entry is inspected.
    for entry in included:
        if entry.get("type") == "primaryTickers":
            label = _ticker_label(entry)
            if label:
                return label
            break

    # Fallback: follow the JSON:API resource linkage under
    # data.relationships.primaryTickers and match each (type, id) pair against
    # the included resources.
    # NOTE(review): assumes the API follows JSON:API compound-document
    # conventions -- confirm against a live response.
    relationships = (article_json.get("data") or {}).get("relationships") or {}
    linkage = (relationships.get("primaryTickers") or {}).get("data") or []
    if isinstance(linkage, dict):
        linkage = [linkage]
    for reference in linkage:
        wanted = (reference.get("type"), reference.get("id"))
        for entry in included:
            if (entry.get("type"), entry.get("id")) == wanted:
                label = _ticker_label(entry)
                if label:
                    return label

    return "Unknown_Company"


def parse_article_data(article_json):
    """
    Extracts the transcript text (HTML converted to plain text), the publication
    year/quarter, and a company/ticker symbol from an article's JSON data.

    Args:
        article_json: Decoded article response with "data" (holding
            "attributes" and optionally "relationships") and optionally
            "included". Missing or null sections are tolerated.

    Returns:
        A dict with keys:
          - "company": primary ticker symbol, or "Unknown_Company".
          - "year": publication year as an int, or "Unknown".
          - "quarter": "Q1".."Q4" from the month of ``published_at`` (the
            calendar quarter of publication, not necessarily the fiscal
            quarter the call covers), or "Unknown".
          - "transcript": body text with HTML tags removed; "" when the body
            is missing or null.
    """
    attributes = (article_json.get("data") or {}).get("attributes") or {}
    # `or ""` also covers an explicit null body, which BeautifulSoup cannot parse.
    transcript_html = attributes.get("body") or ""

    year, quarter = _publication_period(attributes.get("published_at"))

    # The primary ticker serves as a proxy for the company name.
    company = _primary_ticker_symbol(article_json)

    # Use BeautifulSoup to strip HTML tags.
    soup = BeautifulSoup(transcript_html, "html.parser")
    transcript_text = soup.get_text(separator="\n").strip()

    return {
        "company": company,
        "year": year,
        "quarter": quarter,
        "transcript": transcript_text,
    }


In [9]:
def save_transcript(data, base_dir="Transcripts"):
    """
    Saves the transcript into a file under a directory structure:
    Transcripts/<Company>/<Year>/<Quarter>.txt

    The company name is sanitised before use as a directory name: spaces become
    "_", path separators become "-", other characters that are invalid in
    Windows file names become "_", and leading/trailing dots are removed (so a
    value such as ".." cannot escape ``base_dir``). An empty or null company
    falls back to "Unknown_Company".

    An existing file at the same company/year/quarter path is overwritten.

    Args:
        data: Dict with "company", "year", "quarter" and "transcript" keys.
        base_dir: Root directory for the output tree.

    Returns:
        The path of the written file, or None if creating the directory or
        writing the file failed (the error is logged, not raised).
    """
    # Built-in translation table; avoids pulling in `re` for a fixed char set.
    unsafe_chars = str.maketrans({
        " ": "_", "/": "-", "\\": "-",
        ":": "_", "*": "_", "?": "_", '"': "_", "<": "_", ">": "_", "|": "_",
    })
    company_dir = str(data["company"] or "Unknown_Company").translate(unsafe_chars).strip(".")
    if not company_dir:
        company_dir = "Unknown_Company"

    directory = os.path.join(base_dir, company_dir, str(data["year"]))
    file_path = os.path.join(directory, f"{data['quarter']}.txt")
    try:
        # Directory creation is inside the try so that it is handled the same
        # way as a failed write: logged, not raised.
        os.makedirs(directory, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(data["transcript"])
    except Exception as e:
        logging.error(f"Error saving transcript to {file_path}: {e}")
        return None
    logging.info(f"Saved transcript: {file_path}")
    return file_path


In [None]:
# Driver cell: log in through the browser once, then page through the
# transcript listing and write each transcript to disk.
# This cell blocks on user input (inside selenium_manual_login) and performs
# network and file I/O; the page loop has no upper bound and runs until one of
# the stop conditions below is hit.
logging.info("Starting manual login process...")
cookies = selenium_manual_login()
# `cookies` is falsy when the login helper returned None (an error occurred)
# or when the browser reported an empty cookie list.
if not cookies:
    logging.error("Login failed. Exiting.")
else:
    # Convert cookies for use with requests.
    session_cookies = convert_cookies_for_requests(cookies)
    
    # Loop through listing pages to fetch and save earnings call transcripts.
    page_number = 1
    while True:
        logging.info(f"Fetching listing page {page_number}...")
        listing_json = fetch_listing_page(page_number, session_cookies)
        # Stop condition 1: the fetch helper returned a falsy value. It returns
        # None on a non-200 status or on an exception, so any failed listing
        # request ends the whole run.
        if not listing_json:
            logging.info("No data returned for listing page; stopping.")
            break

        articles = listing_json.get("data", [])
        # Stop condition 2: the page carries no articles.
        if not articles:
            logging.info("No more articles found; reached the end of listings.")
            break

        for article in articles:
            article_id = article.get("id")
            if not article_id:
                continue
            details_json = fetch_article_details(article_id, session_cookies)
            # A failed detail fetch skips this article; note that `continue`
            # also skips the 1-second pause at the bottom of the loop body.
            if not details_json:
                continue

            data = parse_article_data(details_json)
            # Only non-empty transcripts are written to disk.
            if data["transcript"]:
                save_transcript(data)
            else:
                logging.warning(f"No transcript found for article {article_id}")
            
            time.sleep(1)  # Be polite to the server
        
        page_number += 1
        # Additional pause between listing pages.
        time.sleep(1)

2025-01-15 18:12:34,818 - INFO - Starting manual login process...
2025-01-15 18:12:36,257 - INFO - Navigating to Seeking Alpha login page...
2025-01-15 18:12:38,045 - INFO - Please log in manually in the opened browser window. Once you have logged in and the page has fully loaded, press ENTER here to continue...
