In [17]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in later cells can see them. The call is left as the
# cell's last expression, so its return value is what the notebook displays.
load_dotenv()

True

In [18]:
import requests

def _redact_api_key(message, api_key=None):
    """Return `message` as text with the Google API key masked.

    requests embeds the full request URL (including the `key=` query parameter)
    in the text of its exceptions, so that text must be scrubbed before it is
    returned, printed or logged.
    """
    import re  # local import: only needed on the error path

    text = str(message)
    if api_key:
        text = text.replace(str(api_key), "REDACTED")
    # Also catches a URL-encoded form of the key that a plain replace would miss.
    return re.sub(r"([?&]key=)[^&\s]+", r"\1REDACTED", text)


def search_financial_report(company_name, api_key, cse_id, timeout=10):
    """
    Uses Google Custom Search API to request a financial report for the given company.

    Args:
        company_name (str): Name of the company.
        api_key (str): Your Google API key.
        cse_id (str): Your Google Custom Search Engine ID.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: Parsed JSON response from Google Custom Search API on success.
        On failure, a dict of the form ``{"error": {"code", "message", "status", ...}}``:
        - status "INVALID_ARGUMENT" when an argument is missing/blank (no request is sent),
        - the API's own error fields when the API returned a JSON error body,
        - status "HTTP_ERROR" for an HTTP error without a usable JSON error body,
        - status "UNKNOWN_ERROR" for anything else (network failure, timeout, bad JSON).
        The API key is never included in any returned message.
    """
    # os.getenv() yields None for unset variables; fail early with a clear message
    # instead of sending a request that is guaranteed to be rejected.
    missing = [
        name
        for name, value in (
            ("company_name", company_name),
            ("api_key", api_key),
            ("cse_id", cse_id),
        )
        if not value or not str(value).strip()
    ]
    if missing:
        return {
            "error": {
                "code": None,
                "message": f"Missing or empty argument(s): {', '.join(missing)}",
                "status": "INVALID_ARGUMENT",
            }
        }

    query = f"{company_name} financial report"
    url = "https://www.googleapis.com/customsearch/v1"
    params = {
        'q': query,
        'key': api_key,
        'cx': cse_id,
        'safe': 'active',
        'lr': 'lang_en',
        'filter': 0,
        'num': 10,
        'gl': 'us',
        'hl': 'en',
    }

    try:
        # Without a timeout requests may block indefinitely on a stalled connection.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError as e:
        safe_message = _redact_api_key(e, api_key)
        # Prefer the structured error body the API sends along with 4xx/5xx responses.
        try:
            error_obj = response.json().get("error")
        except (ValueError, AttributeError):
            # Body is not JSON, or is JSON but not an object.
            error_obj = None
        if isinstance(error_obj, dict):
            return {
                "error": {
                    "code": error_obj.get("code", None),
                    "message": error_obj.get("message", safe_message),
                    "errors": error_obj.get("errors", []),
                    "status": error_obj.get("status", None),
                }
            }
        return {"error": {"code": response.status_code, "message": safe_message, "status": "HTTP_ERROR"}}
    except Exception as e:
        # Connection errors, timeouts, undecodable bodies, ...
        return {"error": {"code": None, "message": _redact_api_key(e, api_key), "status": "UNKNOWN_ERROR"}}

    # An error can also be reported inside a JSON body, not just via the HTTP status code.
    if isinstance(result, dict) and "error" in result:
        error_obj = result["error"]
        if isinstance(error_obj, dict):
            return {
                "error": {
                    "code": error_obj.get("code", None),
                    "message": error_obj.get("message", "Unknown error."),
                    "errors": error_obj.get("errors", []),
                    "status": error_obj.get("status", None),
                }
            }
        return {"error": {"code": None, "message": str(error_obj), "errors": [], "status": None}}
    return result



In [19]:
# Read the Google Custom Search credentials from the environment.
# Either value is None when the corresponding variable is not set.
api_key, cse_id = (
    os.environ.get(env_name)
    for env_name in ("GOOGLE_SEARCH_API", "GOOGLE_SEARCH_CX")
)

In [23]:
def _mask_secret(value, visible=4):
    """Return a display-safe form of a credential: a short prefix followed by ***."""
    if not value:
        return "<not set>"
    return f"{value[:visible]}***"


# Never print raw credentials: notebook outputs are saved with the file and get
# shared/committed. A masked prefix is enough to confirm the values were loaded.
print(_mask_secret(api_key))
print(_mask_secret(cse_id))
search_response = search_financial_report("Apple", api_key, cse_id)

AIza***
e02c***


In [25]:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# `search_response` (set in the previous cell) is the output of search_financial_report("Apple", api_key, cse_id)

def find_annual_report_link(result):
    """Pick the page to scrape from a Custom Search response.

    Args:
        result: The dict returned by the search call. Expected to hold an
            "items" list of result dicts, each with a "link" key; may instead
            hold an "error" dict. Any other value is treated as "no results".

    Returns:
        str | None: The link of the first result that has one, or None when
        there are no results, the response is an error, or no result has a link.
    """
    items = result.get("items") if isinstance(result, dict) else None
    if not isinstance(items, list) or not items:
        # Surface the API's own explanation when there is one.
        error = result.get("error") if isinstance(result, dict) else None
        if isinstance(error, dict) and error.get("message"):
            print(f"Search API error: {error['message']}")
        print("No search results or error in result.")
        return None

    # Take the highest-ranked result that actually carries a link, rather than
    # giving up when only the very first entry lacks one.
    for position, item in enumerate(items):
        link = item.get("link", None) if isinstance(item, dict) else None
        if link:
            if position == 0:
                print(f"Using the first search result: {link}")
            else:
                print(f"Using search result #{position + 1} (earlier results had no link): {link}")
            return link

    print("No valid link found in the search results.")
    return None

# Choose the page to scrape from the search response produced earlier in the
# notebook; `link` is None when the search yielded nothing usable.
link = find_annual_report_link(search_response)




Using the first search result: https://investor.apple.com/investor-relations/default.aspx
Annual report page loaded in Selenium browser.


KeyboardInterrupt: 

In [None]:
import os
from urllib.parse import unquote, urljoin, urlparse

from bs4 import BeautifulSoup

# Link texts that mark a report download, and how long to let the page's
# scripts run after the initial load before reading the DOM.
REPORT_KEYWORDS = ("annual report", "financial report")
PAGE_SETTLE_SECONDS = 5


def _fetch_rendered_html(url, settle_seconds=PAGE_SETTLE_SECONDS):
    """Open `url` in Chrome via Selenium and return the rendered page source."""
    chrome_options = Options()
    # Initialize driver (path to chromedriver may be required)
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        print("Annual report page loaded in Selenium browser.")
        time.sleep(settle_seconds)  # Wait for a few seconds to let page load
        return driver.page_source
    finally:
        # Always close the browser, including on errors and KeyboardInterrupt,
        # so no orphaned Chrome/chromedriver processes are left behind.
        driver.quit()


def _find_report_pdf_hrefs(html):
    """Return the de-duplicated hrefs of <a> tags whose text names a report and whose target is a PDF."""
    soup = BeautifulSoup(html, "html.parser")
    hrefs = []
    for a in soup.find_all("a", href=True):
        link_text = a.get_text(strip=True).lower()
        href = a["href"]
        is_report = any(keyword in link_text for keyword in REPORT_KEYWORDS)
        if is_report and ".pdf" in href.lower() and href not in hrefs:
            hrefs.append(href)
    return hrefs


def _pdf_filename_from_url(pdf_url):
    """Build a local file name from the URL *path* only, ignoring query string and fragment."""
    filename = os.path.basename(unquote(urlparse(pdf_url).path)) or "report"
    if not filename.lower().endswith(".pdf"):
        filename += ".pdf"
    return filename


def _download_pdf(pdf_url, timeout=15):
    """Download `pdf_url` into the working directory; return the file name or None on failure."""
    print(f"Attempting to download PDF: {pdf_url}")
    try:
        resp = requests.get(pdf_url, timeout=timeout)
        if not resp.ok:
            print(f"Failed to download PDF {pdf_url}, status code: {resp.status_code}")
            return None
        filename = _pdf_filename_from_url(pdf_url)
        with open(filename, "wb") as f:
            f.write(resp.content)
        print(f"Downloaded and saved: {filename}")
        return filename
    except (requests.exceptions.RequestException, OSError) as ex:
        print(f"Exception downloading PDF {pdf_url}: {ex}")
        return None


def _print_page_title(url, timeout=10):
    """Fallback when Selenium is unavailable: fetch the page with requests and print its title."""
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.ok:
            soup = BeautifulSoup(resp.text, "html.parser")
            title = soup.title.string if soup.title else "No title found"
            print(f"Fetched page successfully. Title: {title}")
        else:
            print(f"Failed to fetch the page, status code: {resp.status_code}")
    except requests.exceptions.RequestException as ex:
        print(f"Failed to fetch the page using requests: {ex}")


if link:
    html = None
    try:
        html = _fetch_rendered_html(link)
    except Exception as e:
        # Selenium can fail in many ways (missing driver, browser crash, timeout);
        # only those failures trigger the requests-based fallback.
        print(f"Error loading the page with Selenium: {e}")
        print("Attempting to fetch the page title with requests instead.")
        _print_page_title(link)

    if html is not None:
        pdf_links = _find_report_pdf_hrefs(html)
        if pdf_links:
            print(f"Found annual/financial report PDF links: {pdf_links}")
            for href in pdf_links:
                # Make href absolute if needed
                _download_pdf(urljoin(link, href))
        else:
            print("No Annual Report or Financial Report PDF link found on the page.")
else:
    print("No valid link to annual report found.")