In [1]:
# Cell 1: Imports
# -----------------------------------------------------------------
import os
import json
from scrapers.web_scraper import Scraper
from tools.download_tools import CbreTitleParserTool, CbrePDFDownloaderTool
from utils.file_utils import check_existing_files

In [2]:
# Cell 2: (Optional) Discovery Phase
# -----------------------------------------------------------------
# Optional helper step: opens the insights page, asks the scraper which filter
# options are on offer, and prints them so you can pick values for the
# Control Panel cell below.
print("--- Starting Discovery Phase ---")
discover_scraper = Scraper(headless=True)
available_filters = {}  # stays empty if the page setup does not succeed
try:
    page_ready = discover_scraper.setup_cbre_insights_page(
        "https://www.cbre.com/insights#market-reports"
    )
    if page_ready:
        available_filters = discover_scraper.discover_filters()
finally:
    # Always release the scraper, even if setup or discovery raised.
    discover_scraper.close()

# Show the discovered options as indented JSON.
print("\n--- Available Filter Options ---")
print(json.dumps(available_filters, indent=2))
print("-" * 50)

--- Starting Discovery Phase ---
🤖 Selenium WebDriver Initialized.
Navigated to: https://www.cbre.com/insights#market-reports
✓ Accepted cookies.
✓ Clicked 'Market Reports' tab.
✓ Switched to iframe.

🔎 Discovering available filters...
✓ Discovery complete.
🤖 WebDriver closed.
🤖 WebDriver closed.

--- Available Filter Options ---
{
  "Region": [
    "Americas",
    "APAC",
    "Europe",
    "MENAT",
    "Global"
  ],
  "Country": [
    "United States",
    "Australia",
    "New Zealand",
    "Canada",
    "Mainland China",
    "Search"
  ],
  "Market": [
    "New York City",
    "Los Angeles",
    "Thailand",
    "Tampa",
    "San Jose",
    "Search"
  ],
  "Property Type": [
    "Office",
    "Industrial and Logistics",
    "Retail",
    "Residential",
    "Hotel",
    "Search"
  ],
  "Industry": [
    "Banking and Financial Services",
    "Professional Services",
    "Retail",
    "Law Firms",
    "Food and Beverage",
    "Search"
  ]
}
-----------------------------------------------

In [3]:
# Cell 3: Control Panel - Define Your Intelligent Search
# -----------------------------------------------------------------
# Edit the values in this cell, then run the execution cell that follows.

# Extra search keywords. The execution cell copies this list and appends the
# target year to the copy.
TARGET_KEYWORDS_ADDITIONAL = []

# Year and period of the reports to look for; both are passed to the scraper
# as "target_year" / "target_period".
TARGET_YEAR = 2025
TARGET_PERIOD = "H1"

# Sort order requested on the results page. The execution cell turns on early
# stopping only when this is "Most Recent" and both year and period are set.
TARGET_SORT_ORDER = "Most Recent"

# Filter name -> filter value; applied one by one in the order written here.
TARGET_FILTERS = {
    "Property Type": "Industrial and Logistics",
    "Country": "United States",
}

In [4]:
# Cell 4: Execution Phase - Find URLs and Download New Reports
# -----------------------------------------------------------------
# Workflow:
#   1. Derive the keyword list and the early-stopping flag from the Control Panel.
#   2. Open the insights page, apply filters and sorting, and collect {url: title}.
#   3. Parse every title in one batch, skip files that already exist, and
#      download the rest.
# Filenames downloaded in this run are collected in `newly_downloaded_files`,
# which the summary cell reads.
print("--- Starting Execution Phase ---")

BASE_REPORT_PATH = "CBRE_Reports"
existing_files = check_existing_files(BASE_REPORT_PATH)
newly_downloaded_files = []  # List to track files downloaded in this run

# --- Part 1: Build the search terms and settings from the Control Panel ---
# Copy the list so the Control Panel value is never mutated by re-running this cell.
final_keywords = list(TARGET_KEYWORDS_ADDITIONAL)
if TARGET_YEAR:
    final_keywords.append(str(TARGET_YEAR))

# Early stopping is only switched on when results are sorted "Most Recent"
# and both a year and a period have been chosen.
enable_smart_stopping = all([
    TARGET_SORT_ORDER == "Most Recent",
    TARGET_YEAR is not None,
    TARGET_PERIOD is not None
])

# --- Part 2: Main Execution Workflow ---
report_urls_with_titles = {}
scraper = Scraper(headless=True)

try:
    # The helper tools are created inside the try block so that the scraper is
    # still closed by the `finally` clause if either constructor raises.
    title_parser = CbreTitleParserTool()
    downloader = CbrePDFDownloaderTool(driver=scraper.driver, download_dir=scraper.download_dir)

    # 2a. Find all relevant report URLs & Titles
    if scraper.setup_cbre_insights_page("https://www.cbre.com/insights#market-reports"):
        if TARGET_FILTERS:
            for name, value in TARGET_FILTERS.items():
                scraper.apply_filter(filter_name=name, filter_value=value)
        if TARGET_SORT_ORDER:
            scraper.sort_results_by(TARGET_SORT_ORDER)

        scrape_config = {
            "content_container_selector": ".coveo-result-list-container",
            "link_selector": ".coveo-result-list-container a",
            "search_terms": final_keywords,
            "next_page_selector": "li.coveo-pager-next span[role='button']",
            "enable_early_stopping": enable_smart_stopping,
            "target_year": TARGET_YEAR,
            "target_period": TARGET_PERIOD
        }
        report_urls_with_titles = scraper.extract_links_from_pages(scrape_config)

        # 2b. Parse ALL titles in a single batch
        if not report_urls_with_titles:
            print("\n--- No reports found matching criteria on the website. ---")
        else:
            titles_to_parse = list(report_urls_with_titles.values())
            parsed_reports_data = title_parser._run(titles=titles_to_parse)

            # Reverse lookup from title back to URL.
            # NOTE(review): if two URLs share the same title, the later URL wins
            # here - confirm titles are unique on the results page.
            url_map = {title: url for url, title in report_urls_with_titles.items()}

            # 2c. Loop through PARSED data to download
            print(f"\n--- Download phase: processing {len(parsed_reports_data)} parsed reports ---")
            for report_data in parsed_reports_data:
                # Build the filename used for the "already exists" check.
                # NOTE(review): presumably mirrors the name the downloader
                # saves under - verify against CbrePDFDownloaderTool.
                market = report_data['market_name'].replace(' ', '_').replace('/', '_').replace('.', '')
                filename = f"{market}_{report_data['year']}_{report_data['period']}.pdf"

                if filename in existing_files:
                    print(f"Skipped: '{filename}' already exists.")
                    continue

                report_url = url_map.get(report_data['original_title'])
                if not report_url:
                    print(f"Warning: Could not find original URL for title '{report_data['original_title']}'")
                    continue

                result = downloader._run(
                    report_url=report_url,
                    parsed_info=report_data,
                    base_save_path=BASE_REPORT_PATH
                )
                print(result)

                # Only string results can signal success; the isinstance guard
                # keeps a non-string return value from raising and aborting
                # the remaining downloads.
                if isinstance(result, str) and result.startswith("Success"):
                    newly_downloaded_files.append(filename)
finally:
    # Always release the scraper, whether the run succeeded or failed.
    scraper.close()

--- Starting Execution Phase ---
Base directory 'CBRE_Reports' not found. It will be created when a file is saved.
🤖 Selenium WebDriver Initialized.
Navigated to: https://www.cbre.com/insights#market-reports
✓ Accepted cookies.
✓ Clicked 'Market Reports' tab.
✓ Switched to iframe.
Applying filter: 'Property Type' -> 'Industrial and Logistics'...
✓ Filter applied.
Applying filter: 'Country' -> 'United States'...
✓ Filter applied.
Sorting results by 'Most Recent'...
✓ Sort option applied.
📄 Scraping Page 1...
   ✅ Found matching report: Northern Colorado Industrial Figures H1 2025
   ✅ Found matching report: Colorado Springs Industrial Figures H1 2025
   Navigating to page 2...
📄 Scraping Page 2...
   Navigating to page 3...
📄 Scraping Page 3...
   Navigating to page 4...
📄 Scraping Page 4...
   Navigating to page 5...
📄 Scraping Page 5...
   Navigating to page 6...
📄 Scraping Page 6...
   Navigating to page 7...
📄 Scraping Page 7...
   Navigating to page 8...
📄 Scraping Page 8...
   Nav

In [5]:
# Cell 5: Final Summary
# -----------------------------------------------------------------
# Reports which files (if any) were downloaded by the execution cell.
divider = "-" * 50
print(divider)
print("Archival process complete.")

if not newly_downloaded_files:
    print("\nℹ️ No new reports were found to download in this session.")
else:
    print(f"\n✅ A total of {len(newly_downloaded_files)} new reports were downloaded in this session:")
    # List the new files alphabetically, one per line.
    for downloaded_name in sorted(newly_downloaded_files):
        print(f"  - {downloaded_name}")

print(divider)

--------------------------------------------------
Archival process complete.

✅ A total of 2 new reports were downloaded in this session:
  - Colorado_Springs_2025_H1.pdf
  - Northern_Colorado_2025_H1.pdf
--------------------------------------------------
