In [28]:
# Repurposed for PDF scraping - Major Course List and Sequence Downloader
"""
This program scrapes a website to download PDF files for each major.
It finds majors on the main page, navigates to each major's detail page,
and downloads the Course List and Course Sequence PDFs.
"""
%load_ext autoreload
%autoreload 2 
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
# Modules needed
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import re

# Selenium for JavaScript-rendered content
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# ============================================================================
# Base URL Configuration
# ============================================================================
# URL of the Miami Dade College Associate Degree Programs page
# This page contains lists of majors in both "Associate in Arts" and "Associate in Science" tabs
base_url = "https://www.mdc.edu/academics/programs/associate.aspx"
# ============================================================================

response = requests.get(base_url)
response

<Response [200]>

In [30]:
# ============================================================================
# SETUP: Initialize Selenium WebDriver for JavaScript-rendered content
# ============================================================================
# The majors list is loaded via JavaScript, so we need Selenium to render it
# ============================================================================

def setup_selenium_driver():
    """Setup Chrome WebDriver with options"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Run in background (no browser window)
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')
    # Add user agent to avoid being blocked
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    try:
        # Try using webdriver-manager first (automatically downloads ChromeDriver)
        try:
            from webdriver_manager.chrome import ChromeDriverManager
            from selenium.webdriver.chrome.service import Service as ChromeService
            print("Using webdriver-manager to auto-download ChromeDriver...")
            service = ChromeService(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            return driver
        except ImportError:
            # Fallback to manual ChromeDriver (must be in PATH)
            print("webdriver-manager not found. Using system ChromeDriver...")
            print("  Tip: Install webdriver-manager for automatic setup: pip install webdriver-manager")
            driver = webdriver.Chrome(options=chrome_options)
            return driver
    except Exception as e:
        print(f"Error setting up Chrome driver: {e}")
        print("\n" + "=" * 80)
        print("INSTALLATION INSTRUCTIONS:")
        print("=" * 80)
        print("Option 1 (Recommended): Install webdriver-manager")
        print("  pip install webdriver-manager")
        print("  This will automatically handle ChromeDriver installation")
        print("\nOption 2: Manual ChromeDriver installation")
        print("  1. Install Chrome browser if not installed")
        print("  2. Install selenium: pip install selenium")
        print("  3. Download ChromeDriver from https://chromedriver.chromium.org/")
        print("  4. Add ChromeDriver to your PATH or place it in the project directory")
        print("=" * 80)
        return None

# Verify we can get the page (static HTML check)
soup = BeautifulSoup(response.content, "html.parser")
print(f"Static HTML - Page title: {soup.title.string if soup.title else 'No title'}")
print(f"Static HTML - Response status: {response.status_code}")
print(f"\n⚠ Note: Majors are loaded via JavaScript, so we'll need Selenium to scrape them.")

Static HTML - Page title: Associate Degree Programs | Academics | Miami Dade College
Static HTML - Response status: 200

⚠ Note: Majors are loaded via JavaScript, so we'll need Selenium to scrape them.


In [45]:
# ============================================================================
# Finding Major Links using Selenium (JavaScript-rendered content)
# ============================================================================
# The majors are loaded via JavaScript, so we need Selenium to render the page
# Structure: <div class="tab-content"> > <div role="tabpanel" id="aa"> 
# > <ul class="list-unstyled list-divider"> > <li><a href="/accounting/">Accounting</a>
# ============================================================================

print("=" * 80)
print("Loading page with Selenium to render JavaScript content...")
print("=" * 80)

# Initialize Selenium driver
driver = setup_selenium_driver()

if driver is None:
    print("\n❌ ERROR: Could not initialize Selenium driver.")
    print("Please install ChromeDriver and try again.")
    major_links = []
else:
    try:
        # Load the page
        print(f"\nLoading URL: {base_url}")
        driver.get(base_url)
        
        # Wait for the majors list to load (wait for the ul with class to appear)
        print("Waiting for JavaScript to load majors...")
        wait = WebDriverWait(driver, 15)
        
        # Wait for at least one majors list to appear
        try:
            # Wait for the list to appear in either tab panel
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.tab-content ul.list-unstyled.list-divider")
                )
            )
            print("✓ Majors list loaded!")
        except TimeoutException:
            print("⚠ Timeout waiting for majors list. Trying to proceed anyway...")
        
        # Give a bit more time for all content to load
        time.sleep(2)
        
        # Get the page source after JavaScript execution
        page_source = driver.page_source
        soup_selenium = BeautifulSoup(page_source, "html.parser")
        
        # Now find the major links
        major_links = []
        
        # Find tab-content container
        tab_content = soup_selenium.find('div', class_='tab-content')
        
        if tab_content:
            print(f"\n✓ Found tab-content container")
            
            # Find all tab panels
            tab_panels = tab_content.find_all('div', role='tabpanel')
            print(f"✓ Found {len(tab_panels)} tab panels")
            
            for tab_panel in tab_panels:
                tab_id = tab_panel.get('id', 'unknown')
                
                # Find the majors list
                majors_list = tab_panel.find('ul', class_='list-unstyled list-divider')
                
                if majors_list:
                    # Find all links in this list
                    links = majors_list.find_all('a', href=True)
                    print(f"  ✓ Tab '{tab_id}': Found {len(links)} major links")
                    
                    for link in links:
                        href = link.get('href', '')
                        text = link.get_text(strip=True)
                        
                        # Validate it's a major link
                        if href and href.startswith('/') and not href.startswith('#') and text:
                            # Determine degree type based on tab panel ID
                            if tab_id == 'aa':
                                degree_type = 'AA'  # Associate in Arts
                            elif tab_id == 'as':
                                degree_type = 'AS'  # Associate in Science
                            else:
                                degree_type = 'Unknown'
                            
                            major_links.append({
                                'text': text,
                                'href': href,
                                'degree_type': degree_type,
                                'element': link  # Keep reference for compatibility
                            })
                else:
                    print(f"  ✗ Tab '{tab_id}': No majors list found")
        else:
            print("✗ tab-content container not found in rendered page")
        
        # Remove duplicates
        seen = set()
        unique_major_links = []
        
        for link_info in major_links:
            href = link_info['href']
            if href not in seen:
                seen.add(href)
                unique_major_links.append(link_info)
        
        major_links = unique_major_links
        
        print("\n" + "=" * 80)
        print(f"RESULT: Found {len(major_links)} unique major links")
        print("=" * 80)
        
        if major_links:
            print("\nFirst 15 majors found:")
            for i, link_info in enumerate(major_links[:15]):
                print(f"  {i+1}. {link_info['text']} - {link_info['href']}")
            
            if len(major_links) > 15:
                print(f"\n  ... and {len(major_links) - 15} more")
            
            # Verify we're getting actual majors
            print("\nSample verification:")
            sample_majors = ['Accounting', 'Biology', 'Economics', 'Psychology']
            for major_name in sample_majors:
                found = any(major_name.lower() in link_info['text'].lower() for link_info in major_links)
                status = "✓" if found else "✗"
                print(f"  {status} {major_name}")
        else:
            print("\n⚠ WARNING: No major links found even after JavaScript rendering!")
            print("The page structure might have changed.")
        
        # Keep driver open for now (we'll close it later or reuse it)
        print("\n✓ Selenium driver initialized successfully")
        print("  Note: Driver will be reused for scraping individual major pages")
        
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        major_links = []
        if driver:
            driver.quit()

Loading page with Selenium to render JavaScript content...
webdriver-manager not found. Using system ChromeDriver...
  Tip: Install webdriver-manager for automatic setup: pip install webdriver-manager

Loading URL: https://www.mdc.edu/academics/programs/associate.aspx
Waiting for JavaScript to load majors...
✓ Majors list loaded!

✓ Found tab-content container
✓ Found 2 tab panels
  ✓ Tab 'aa': Found 79 major links
  ✓ Tab 'as': Found 82 major links

RESULT: Found 156 unique major links

First 15 majors found:
  1. Accounting - /accounting/
  2. Agriculture - /agriculture
  3. Anthropology - /anthropology
  4. Architecture - /architectureaa/
  5. Area & Ethnic Studies - /ethnicstudies
  6. Art or Art Education - /art
  7. Atmospheric Science & Meteorology - /meteorology
  8. Biology - /biology/
  9. Biotechnology - /biotechnologyaa/
  10. Building Construction - /buildingconstruction
  11. Business Administration - /businessadministration/
  12. Chemistry - /chemistry
  13. Computer Ar

In [None]:
# ============================================================================
# Verify Major Links Found (Optional Check)
# ============================================================================
# This cell verifies that major links were found correctly
# ============================================================================

if 'major_links' in globals() and major_links:
    print("=" * 80)
    print(f"VERIFICATION: Found {len(major_links)} major links")
    print("=" * 80)
    
    # Count by degree type
    aa_count = 0
    as_count = 0
    unknown_count = 0
    
    for link_info in major_links:
        if isinstance(link_info, dict):
            degree_type = link_info.get('degree_type', 'Unknown')
            if degree_type == 'AA':
                aa_count += 1
            elif degree_type == 'AS':
                as_count += 1
            else:
                unknown_count += 1
    
    print(f"\nBy Degree Type:")
    print(f"  - Associate in Arts (AA): {aa_count}")
    print(f"  - Associate in Science (AS): {as_count}")
    if unknown_count > 0:
        print(f"  - Unknown: {unknown_count}")
    
    # Sample a few to verify they're actual majors
    print("\nSample of majors found:")
    for i, link_info in enumerate(major_links[:20]):
        if isinstance(link_info, dict):
            major_name = link_info.get('text', '')
            major_url = link_info.get('href', '')
            degree_type = link_info.get('degree_type', 'Unknown')
            print(f"  {i+1}. {major_name} ({degree_type}) - {major_url}")
        else:
            major_name = link_info.get_text(strip=True)
            major_url = link_info.get('href', '')
            print(f"  {i+1}. {major_name} - {major_url}")
    
    if len(major_links) > 20:
        print(f"  ... and {len(major_links) - 20} more")
    
    print(f"\n✓ Ready to proceed! You can now:")
    print("  1. Run Cell 12 to test with Accounting page")
    print("  2. Run Cell 8 to download all PDFs (will be organized by degree type)")
    
else:
    print("⚠ WARNING: No major links found in Cell 3!")
    print("Make sure Cell 3 ran successfully and found majors.")
    print("If Cell 3 failed, you may need to install ChromeDriver.")
    print("\nInstall: pip install selenium webdriver-manager")

VERIFICATION: Found 156 major links

Sample of majors found:
  1. Accounting - /accounting/
  2. Agriculture - /agriculture
  3. Anthropology - /anthropology
  4. Architecture - /architectureaa/
  5. Area & Ethnic Studies - /ethnicstudies
  6. Art or Art Education - /art
  7. Atmospheric Science & Meteorology - /meteorology
  8. Biology - /biology/
  9. Biotechnology - /biotechnologyaa/
  10. Building Construction - /buildingconstruction
  11. Business Administration - /businessadministration/
  12. Chemistry - /chemistry
  13. Computer Arts Animation - /animation/
  14. Computer Information Systems - /cis
  15. Computer Science - /computerscience
  16. Criminal Justice Administration - /criminaljustice/
  17. Dance - /dance
  18. Dietetics - /dietetics
  19. Drama or Drama Education - /drama
  20. Economics - /economics/
  ... and 136 more

✓ Ready to proceed! You can now:
  1. Run Cell 12 to test with Accounting page
  2. Run Cell 8 to download all PDFs


In [37]:
# Create directories to save PDFs if they don't exist
# Separate folders for Associate in Arts (AA) and Associate in Science (AS)
output_dir_aa = "downloaded_pdfs/Associates_in_Arts"
output_dir_as = "downloaded_pdfs/Associates_in_Science"
os.makedirs(output_dir_aa, exist_ok=True)
os.makedirs(output_dir_as, exist_ok=True)

print(f"PDFs will be saved to:")
print(f"  - Associate in Arts: {os.path.abspath(output_dir_aa)}")
print(f"  - Associate in Science: {os.path.abspath(output_dir_as)}")

# Store information about downloaded PDFs
download_info = []

PDFs will be saved to:
  - Associate in Arts: c:\Users\dylan\Documents\Dylans Doc's\Dylan's Hackathons\2025\SharkByte\sharkbyte2025\DataCollection\downloaded_pdfs\Associates_in_Arts
  - Associate in Science: c:\Users\dylan\Documents\Dylans Doc's\Dylan's Hackathons\2025\SharkByte\sharkbyte2025\DataCollection\downloaded_pdfs\Associates_in_Science


In [33]:
# Helper functions for PDF downloading
def clean_major_name(name):
    """Clean major name to be used in filename"""
    # Remove special characters and replace spaces with underscores
    name = re.sub(r'[^\w\s-]', '', name)
    name = name.replace(' ', '_')
    name = name.replace('__', '_')  # Remove double underscores
    return name.strip('_')

def download_pdf(url, filepath):
    """Download a PDF from URL to filepath"""
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return False

print("Helper functions defined successfully")

Helper functions defined successfully


In [None]:
# ============================================================================
# PDF Detection Logic - Updated for Miami Dade College
# ============================================================================
# The PDFs are loaded via JavaScript into a <p id="prg_link"> element
# We need to use Selenium to find them, or wait for JavaScript to execute
# ============================================================================

def find_pdf_links_selenium(driver, major_url):
    """
    Find Course List and Course Sequence PDF links using Selenium.
    The PDFs are loaded via JavaScript into <p id="prg_link"> element.
    We need to wait for the JavaScript function GetPgmSheetLink to execute.
    """
    course_list_pdf = None
    course_sequence_pdf = None
    
    try:
        # First, wait for the prg_link element to exist
        wait = WebDriverWait(driver, 15)
        try:
            prg_link_element = wait.until(
                EC.presence_of_element_located((By.ID, "prg_link"))
            )
        except TimeoutException:
            # Element doesn't exist - this might not be a program page
            print(f"    ⚠ prg_link element not found on {major_url}")
            return None, None
        
        # Wait for JavaScript to populate the links
        # The GetPgmSheetLink function populates the links dynamically
        # Strategy: Wait for links to appear OR wait for JavaScript to complete
        max_wait_attempts = 3
        pdf_links_found = False
        
        for attempt in range(max_wait_attempts):
            # Check if element has any links
            pdf_links = prg_link_element.find_elements(By.TAG_NAME, "a")
            pdf_links_with_href = [link for link in pdf_links if link.get_attribute('href')]
            
            if pdf_links_with_href:
                # Check if any have .pdf in href
                for link in pdf_links_with_href:
                    href = link.get_attribute('href') or ''
                    if '.pdf' in href.lower():
                        pdf_links_found = True
                        break
                
                if pdf_links_found:
                    break
            
            # Wait a bit and try again
            if attempt < max_wait_attempts - 1:
                time.sleep(2)
        
        # Additional wait to ensure JavaScript has fully populated
        time.sleep(2)
        
        # Find all links within the prg_link element
        # Refresh the element reference to get latest content
        pdf_links = driver.find_elements(By.CSS_SELECTOR, "#prg_link a")
        
        # Filter for PDF links
        pdf_links_filtered = []
        for link in pdf_links:
            href = link.get_attribute('href') or ''
            if href and '.pdf' in href.lower():
                pdf_links_filtered.append(link)
        
        # If no PDF links found, check element text and wait for JavaScript
        if not pdf_links_filtered:
            element_text = prg_link_element.text.strip()
            element_html = prg_link_element.get_attribute('innerHTML') or ''
            
            if not element_text and not element_html:
                # Element exists but is completely empty - JavaScript hasn't populated it yet
                print(f"    ⚠ prg_link element exists but appears empty - waiting for JavaScript...")
                # Wait for JavaScript function to complete
                time.sleep(5)
                # Try again
                pdf_links = driver.find_elements(By.CSS_SELECTOR, "#prg_link a")
                pdf_links_filtered = [link for link in pdf_links 
                                     if link.get_attribute('href') and '.pdf' in link.get_attribute('href').lower()]
            elif element_text and 'course' in element_text.lower():
                # Element has text but links might not be loaded yet
                print(f"    ⚠ prg_link has text but no PDF links found - waiting for JavaScript...")
                time.sleep(3)
                pdf_links = driver.find_elements(By.CSS_SELECTOR, "#prg_link a")
                pdf_links_filtered = [link for link in pdf_links 
                                     if link.get_attribute('href') and '.pdf' in link.get_attribute('href').lower()]
        
        # Process the PDF links
        for link in pdf_links_filtered:
            href = link.get_attribute('href')
            text = link.text.strip().lower()
            
            if not href:
                continue
            
            # Check for course list PDF
            if ('course' in text and 'list' in text) or \
               ('complete' in text and 'course' in text) or \
               ('pathway' in text and 'guide' in text) or \
               '/ps/' in href.lower():
                if not course_list_pdf:  # Only set if not already found
                    course_list_pdf = href
            # Check for course sequence PDF
            elif ('course' in text and 'sequence' in text) or \
                 ('sequence' in text and 'guide' in text) or \
                 '/csg/' in href.lower():
                if not course_sequence_pdf:  # Only set if not already found
                    course_sequence_pdf = href
        
        # If we didn't find by text, try by position (first link = list, second = sequence)
        if not course_list_pdf and not course_sequence_pdf and len(pdf_links_filtered) >= 2:
            course_list_pdf = pdf_links_filtered[0].get_attribute('href')
            course_sequence_pdf = pdf_links_filtered[1].get_attribute('href')
        elif not course_list_pdf and len(pdf_links_filtered) >= 1:
            # Try to determine which is which by URL pattern
            for link in pdf_links_filtered:
                href = link.get_attribute('href')
                if '/ps/' in href.lower() or 'pathway' in href.lower():
                    course_list_pdf = href
                elif '/csg/' in href.lower() or 'csg' in href.lower():
                    course_sequence_pdf = href
        
    except TimeoutException:
        print(f"    ⚠ Timeout waiting for prg_link element on {major_url}")
        # Check if page is a program page or something else
        page_title = driver.title if driver else "Unknown"
        if 'campus' in page_title.lower() or 'hialeah' in major_url.lower() or 'homestead' in major_url.lower():
            print(f"    ⚠ This appears to be a campus page, not a program page: {major_url}")
            return None, None
        
        # Fallback: Try to find PDF links anywhere on the page
        print(f"    ⚠ Trying fallback: searching entire page for PDF links...")
        try:
            all_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='.pdf']")
            for link in all_links:
                href = link.get_attribute('href') or ''
                text = link.text.strip().lower()
                
                if '/ps/' in href.lower() or 'pathway' in href.lower() or ('course' in text and 'list' in text):
                    if not course_list_pdf:
                        course_list_pdf = href
                elif '/csg/' in href.lower() or ('course' in text and 'sequence' in text):
                    if not course_sequence_pdf:
                        course_sequence_pdf = href
        except Exception as fallback_error:
            print(f"    ⚠ Fallback search also failed: {fallback_error}")
            
    except Exception as e:
        print(f"    ⚠ Error finding PDFs with Selenium: {e}")
        import traceback
        traceback.print_exc()
    
    return course_list_pdf, course_sequence_pdf


def find_pdf_links(major_soup, major_url):
    """
    Find Course List and Course Sequence PDF links from BeautifulSoup parsed HTML.
    This is a fallback method if Selenium is not available.
    """
    course_list_pdf = None
    course_sequence_pdf = None
    
    # Method 1: Look for prg_link element (may be empty if JavaScript hasn't run)
    prg_link = major_soup.find('p', id='prg_link')
    if prg_link:
        pdf_links = prg_link.find_all('a', href=re.compile(r'\.pdf', re.I))
        
        for pdf_link in pdf_links:
            link_text = pdf_link.get_text(strip=True).lower()
            link_href = pdf_link.get('href', '').lower()
            
            # Check for course list PDF
            if ('course' in link_text and 'list' in link_text) or \
               ('complete' in link_text and 'course' in link_text):
                course_list_pdf = pdf_link.get('href')
            # Check for course sequence PDF
            elif ('course' in link_text and 'sequence' in link_text) or \
                 ('sequence' in link_text and 'guide' in link_text):
                course_sequence_pdf = pdf_link.get('href')
        
        # If found by position
        if len(pdf_links) >= 2:
            if not course_list_pdf:
                course_list_pdf = pdf_links[0].get('href')
            if not course_sequence_pdf:
                course_sequence_pdf = pdf_links[1].get('href')
    
    # Method 2: Fallback - search all PDF links on page
    if not course_list_pdf or not course_sequence_pdf:
        all_pdfs = major_soup.find_all('a', href=re.compile(r'\.pdf$', re.I))
        
        for pdf_link in all_pdfs:
            link_text = pdf_link.get_text(strip=True).lower()
            link_href = pdf_link.get('href', '').lower()
            
            if not course_list_pdf:
                if ('course' in link_text and 'list' in link_text) or \
                   ('pathway-guide' in link_href) or ('/ps/' in link_href):
                    course_list_pdf = pdf_link.get('href')
            
            if not course_sequence_pdf:
                if ('course' in link_text and 'sequence' in link_text) or \
                   ('csg' in link_href) or ('sequence' in link_href):
                    course_sequence_pdf = pdf_link.get('href')
    
    return course_list_pdf, course_sequence_pdf

print("PDF finding functions defined (Selenium and BeautifulSoup methods)")

PDF finding functions defined (Selenium and BeautifulSoup methods)


In [None]:
# ============================================================================
# MAIN SCRAPING LOOP
# ============================================================================
# This cell will iterate through all major links and download the PDFs
# Make sure Cell 3 ran successfully and found major links before running this
# ============================================================================

# Check if we have major links (from Cell 3)
if 'major_links' not in globals() or not major_links:
    print("❌ ERROR: No major links found!")
    print("Please run Cell 3 first to get the list of majors.")
else:
    # Filter out any invalid entries
    valid_major_links = []
    for link_info in major_links:
        if isinstance(link_info, dict) and link_info.get('href') and link_info.get('text'):
            valid_major_links.append(link_info)
        elif hasattr(link_info, 'get'):  # Fallback for BeautifulSoup elements
            href = link_info.get('href')
            text = link_info.get_text(strip=True)
            if href and text:
                valid_major_links.append({'text': text, 'href': href})
    
    major_links = valid_major_links
    
    if not major_links:
        print("❌ ERROR: No valid major links found!")
    else:
        print(f"Starting to process {len(major_links)} majors...")
        print("=" * 80)
        
        # Extract base domain
        base_domain = '/'.join(base_url.split('/')[:3])  # Gets https://www.mdc.edu
        
        for i, link_info in enumerate(major_links):
            # Get major name, URL, and degree type
            if isinstance(link_info, dict):
                major_name = link_info.get('text', '')
                major_url = link_info.get('href', '')
                degree_type = link_info.get('degree_type', 'Unknown')
            else:
                major_name = link_info.get_text(strip=True)
                major_url = link_info.get('href', '')
                degree_type = 'Unknown'
            
            # Skip if no name or URL
            if not major_name or not major_url:
                continue
            
            # Handle relative URLs - join with base domain
            if not major_url.startswith('http'):
                major_url = urljoin(base_domain, major_url)
            
            # Select output directory based on degree type
            if degree_type == 'AA':
                output_dir = output_dir_aa
                degree_label = 'Associate in Arts'
            elif degree_type == 'AS':
                output_dir = output_dir_as
                degree_label = 'Associate in Science'
            else:
                # Default to AA if unknown
                output_dir = output_dir_aa
                degree_label = 'Unknown (defaulting to AA)'
            
            print(f"\n[{i+1}/{len(major_links)}] Processing: {major_name} ({degree_label})")
            print(f"URL: {major_url}")
            
            try:
                # Use Selenium to get the page (PDFs are loaded via JavaScript)
                if 'driver' not in globals() or driver is None:
                    # Create a new driver if we don't have one
                    driver = setup_selenium_driver()
                    if driver is None:
                        print(f"  ✗ Could not create Selenium driver. Skipping {major_name}.")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Error',
                            'Filename': 'N/A',
                            'URL': major_url,
                            'Status': 'Error: Selenium driver not available'
                        })
                        continue
                
                # Load the page with Selenium
                driver.get(major_url)
                time.sleep(2)  # Wait for JavaScript to load
                
                # Find PDF links using Selenium
                course_list_pdf, course_sequence_pdf = find_pdf_links_selenium(driver, major_url)
                
                # Clean major name for filename
                clean_name = clean_major_name(major_name)
                
                # Download Course List PDF
                if course_list_pdf:
                    if not course_list_pdf.startswith('http'):
                        course_list_pdf = urljoin(major_url, course_list_pdf)
                    
                    filename_list = f"{clean_name}_Course_list_2025.pdf"
                    filepath_list = os.path.join(output_dir, filename_list)
                    
                    # Skip if already downloaded
                    if os.path.exists(filepath_list):
                        print(f"  ⊙ Already exists: {filename_list}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Already Exists'
                        })
                    elif download_pdf(course_list_pdf, filepath_list):
                        print(f"  ✓ Downloaded: {filename_list} -> {output_dir}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Success'
                        })
                    else:
                        print(f"  ✗ Failed to download: {filename_list}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Failed'
                        })
                else:
                    print(f"  ✗ Course List PDF not found")
                    download_info.append({
                        'Major': major_name,
                        'Degree Type': degree_type,
                        'Type': 'Course List',
                        'Filename': 'N/A',
                        'URL': 'N/A',
                        'Status': 'Not Found'
                    })
                
                # Download Course Sequence PDF
                if course_sequence_pdf:
                    if not course_sequence_pdf.startswith('http'):
                        course_sequence_pdf = urljoin(major_url, course_sequence_pdf)
                    
                    filename_sequence = f"{clean_name}_Course_Sequence_2025.pdf"
                    filepath_sequence = os.path.join(output_dir, filename_sequence)
                    
                    # Skip if already downloaded
                    if os.path.exists(filepath_sequence):
                        print(f"  ⊙ Already exists: {filename_sequence}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Already Exists'
                        })
                    elif download_pdf(course_sequence_pdf, filepath_sequence):
                        print(f"  ✓ Downloaded: {filename_sequence} -> {output_dir}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Success'
                        })
                    else:
                        print(f"  ✗ Failed to download: {filename_sequence}")
                        download_info.append({
                            'Major': major_name,
                            'Degree Type': degree_type,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Failed'
                        })
                else:
                    print(f"  ✗ Course Sequence PDF not found")
                    download_info.append({
                        'Major': major_name,
                        'Degree Type': degree_type,
                        'Type': 'Course Sequence',
                        'Filename': 'N/A',
                        'URL': 'N/A',
                        'Status': 'Not Found'
                    })
                
                # Be polite - add a small delay between requests
                time.sleep(1)
                
            except Exception as e:
                print(f"  ✗ Error processing {major_name}: {e}")
                download_info.append({
                    'Major': major_name,
                    'Degree Type': degree_type,
                    'Type': 'Error',
                    'Filename': 'N/A',
                    'URL': major_url,
                    'Status': f'Error: {str(e)}'
                })
        
        print("\n" + "=" * 80)
        print("Scraping complete!")
        
        # Note: We keep the driver open in case user wants to run more cells
        # Uncomment the following lines to close the driver:
        # if 'driver' in globals() and driver:
        #     try:
        #         driver.quit()
        #         print("Selenium driver closed.")
        #     except:
        #         pass

Starting to process 156 majors...

[1/156] Processing: Accounting
URL: https://www.mdc.edu/accounting/
  ✓ Downloaded: Accounting_Course_list_2025.pdf
  ✓ Downloaded: Accounting_Course_Sequence_2025.pdf

[2/156] Processing: Agriculture
URL: https://www.mdc.edu/agriculture
  ✓ Downloaded: Agriculture_Course_list_2025.pdf
  ✓ Downloaded: Agriculture_Course_Sequence_2025.pdf

[3/156] Processing: Anthropology
URL: https://www.mdc.edu/anthropology
  ✓ Downloaded: Anthropology_Course_list_2025.pdf
  ✓ Downloaded: Anthropology_Course_Sequence_2025.pdf

[4/156] Processing: Architecture
URL: https://www.mdc.edu/architectureaa/
  ✓ Downloaded: Architecture_Course_list_2025.pdf
  ✓ Downloaded: Architecture_Course_Sequence_2025.pdf

[5/156] Processing: Area & Ethnic Studies
URL: https://www.mdc.edu/ethnicstudies
  ✓ Downloaded: Area_Ethnic_Studies_Course_list_2025.pdf
  ✓ Downloaded: Area_Ethnic_Studies_Course_Sequence_2025.pdf

[6/156] Processing: Art or Art Education
URL: https://www.mdc.edu/art

In [48]:
# Create a DataFrame with download information
if download_info:
    df_downloads = pd.DataFrame(download_info)
    print(f"Created DataFrame with {len(df_downloads)} records")
    print(f"\nColumns: {list(df_downloads.columns)}")
    print(f"\nFirst few records:")
    df_downloads.head(10)
else:
    print("⚠ No download information available.")
    print("\nOptions:")
    print("1. Run Cell 8 to download PDFs (will populate download_info)")
    print("2. Run Cell 16 to scan existing PDF files and generate download_info")
    df_downloads = pd.DataFrame()

Created DataFrame with 302 records

Columns: ['Major', 'Degree Type', 'Type', 'Filename', 'URL', 'Status']

First few records:


In [49]:
# Summary statistics
if not df_downloads.empty:
    print("=" * 80)
    print("DOWNLOAD SUMMARY")
    print("=" * 80)
    print(f"Total records: {len(df_downloads)}")
    
    # Count by degree type
    if 'Degree Type' in df_downloads.columns:
        aa_count = len(df_downloads[df_downloads['Degree Type'] == 'AA'])
        as_count = len(df_downloads[df_downloads['Degree Type'] == 'AS'])
        print(f"\nBy Degree Type:")
        print(f"  - Associate in Arts (AA): {aa_count} records")
        print(f"  - Associate in Science (AS): {as_count} records")
    
    # Download results
    successful = len(df_downloads[df_downloads['Status'] == 'Success'])
    failed = len(df_downloads[df_downloads['Status'] == 'Failed'])
    not_found = len(df_downloads[df_downloads['Status'] == 'Not Found'])
    already_exists = len(df_downloads[df_downloads['Status'] == 'Already Exists'])
    errors = len(df_downloads[df_downloads['Status'].str.contains('Error', na=False)])
    
    print(f"\nDownload Results:")
    print(f"  Successful: {successful}")
    print(f"  Already Exists: {already_exists}")
    print(f"  Failed: {failed}")
    print(f"  Not Found: {not_found}")
    print(f"  Errors: {errors}")
    
    print(f"\nStatus breakdown:")
    print(df_downloads['Status'].value_counts())
    
    # Show file counts per folder
    print("\n" + "=" * 80)
    print("Files in folders:")
    print("=" * 80)
    if os.path.exists(output_dir_aa):
        aa_files = [f for f in os.listdir(output_dir_aa) if f.endswith('.pdf')]
        print(f"  {output_dir_aa}: {len(aa_files)} PDF files")
    if os.path.exists(output_dir_as):
        as_files = [f for f in os.listdir(output_dir_as) if f.endswith('.pdf')]
        print(f"  {output_dir_as}: {len(as_files)} PDF files")
    print("=" * 80)
else:
    print("No download information available. Make sure you ran the scraping loop.")

DOWNLOAD SUMMARY
Total records: 302

By Degree Type:
  - Associate in Arts (AA): 154 records
  - Associate in Science (AS): 148 records

Download Results:
  Successful: 0
  Already Exists: 302
  Failed: 0
  Not Found: 0
  Errors: 0

Status breakdown:
Status
Already Exists    302
Name: count, dtype: int64

Files in folders:
  downloaded_pdfs/Associates_in_Arts: 154 PDF files
  downloaded_pdfs/Associates_in_Science: 148 PDF files


In [50]:
# Save download log to CSV
if not df_downloads.empty:
    log_filename = 'pdf_download_log.csv'
    df_downloads.to_csv(log_filename, index=False)
    print(f"\nDownload log saved to: {log_filename}")
    print(f"Total PDFs downloaded: {len(df_downloads[df_downloads['Status'] == 'Success'])}")
else:
    print("No data to save.")


Download log saved to: pdf_download_log.csv
Total PDFs downloaded: 0


In [35]:
# ============================================================================
# TESTING SECTION - Test with Accounting page using Selenium
# ============================================================================
# This will test PDF detection using Selenium (required for JavaScript-loaded content)
# ============================================================================

# Test with Accounting page
test_major_url = "https://www.mdc.edu/accounting/"

print(f"Testing with: Accounting")
print(f"URL: {test_major_url}\n")
print("=" * 80)

# Check if we have a Selenium driver from Cell 3
if 'driver' not in globals() or driver is None:
    print("⚠ No Selenium driver found. Creating a new one...")
    driver = setup_selenium_driver()
    if driver is None:
        print("❌ Could not create Selenium driver. Please run Cell 2 first.")
    else:
        print("✓ Selenium driver created")

if 'driver' in globals() and driver:
    try:
        # Load the page with Selenium
        print(f"\nLoading page with Selenium...")
        driver.get(test_major_url)
        
        # Wait for page to load
        time.sleep(3)
        
        # Test Selenium-based PDF finding
        print("\nTesting Selenium-based PDF detection...")
        course_list, course_sequence = find_pdf_links_selenium(driver, test_major_url)
        
        print(f"\n✓ Course List PDF found: {course_list}")
        print(f"✓ Course Sequence PDF found: {course_sequence}")
        
        if course_list:
            print(f"\n  Course List URL: {course_list}")
            # Test download
            if course_list.startswith('http'):
                test_url = course_list
            else:
                base_domain = '/'.join(test_major_url.split('/')[:3])
                test_url = urljoin(base_domain, course_list)
            print(f"  Full URL: {test_url}")
        
        if course_sequence:
            print(f"\n  Course Sequence URL: {course_sequence}")
            # Test download
            if course_sequence.startswith('http'):
                test_url = course_sequence
            else:
                base_domain = '/'.join(test_major_url.split('/')[:3])
                test_url = urljoin(base_domain, course_sequence)
            print(f"  Full URL: {test_url}")
        
        # Also check what's in the prg_link element
        try:
            prg_link = driver.find_element(By.ID, "prg_link")
            print(f"\n  prg_link element text: {prg_link.text}")
            links_in_prg = prg_link.find_elements(By.TAG_NAME, "a")
            print(f"  Number of links in prg_link: {len(links_in_prg)}")
            for i, link in enumerate(links_in_prg):
                print(f"    Link {i+1}: '{link.text}' -> {link.get_attribute('href')}")
        except Exception as e:
            print(f"  ⚠ Could not inspect prg_link element: {e}")
        
        if course_list and course_sequence:
            print("\n" + "=" * 80)
            print("✓ SUCCESS: Both PDFs found! The scraper should work correctly.")
            print("=" * 80)
        elif course_list or course_sequence:
            print("\n" + "=" * 80)
            print("⚠ PARTIAL: Only one PDF found. Check the detection logic.")
            print("=" * 80)
        else:
            print("\n" + "=" * 80)
            print("✗ FAILED: No PDFs found. The page structure may have changed.")
            print("=" * 80)
            
            # Debug: Check page source
            page_source = driver.page_source
            if 'prg_link' in page_source:
                print("\n  prg_link element exists in page source")
                if 'GetPgmSheetLink' in page_source:
                    print("  GetPgmSheetLink function found - content is loaded via JavaScript")
            else:
                print("\n  prg_link element NOT found in page source")
    
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
else:
    print("❌ Cannot test without Selenium driver. Please install selenium and webdriver-manager.")


Testing with: Accounting
URL: https://www.mdc.edu/accounting/


Loading page with Selenium...

Testing Selenium-based PDF detection...

✓ Course List PDF found: https://www.mdc.edu/academics/programs/ps/associate-in-arts-pathway-guide-effective-2025.pdf
✓ Course Sequence PDF found: https://www.mdc.edu/academics/programs/csg/2025/AA_Accounting_csg.pdf

  Course List URL: https://www.mdc.edu/academics/programs/ps/associate-in-arts-pathway-guide-effective-2025.pdf
  Full URL: https://www.mdc.edu/academics/programs/ps/associate-in-arts-pathway-guide-effective-2025.pdf

  Course Sequence URL: https://www.mdc.edu/academics/programs/csg/2025/AA_Accounting_csg.pdf
  Full URL: https://www.mdc.edu/academics/programs/csg/2025/AA_Accounting_csg.pdf

  prg_link element text: See a complete course list (2025)
See a course sequence guide (2025)
  Number of links in prg_link: 2
    Link 1: 'See a complete course list (2025)' -> https://www.mdc.edu/academics/programs/ps/associate-in-arts-pathway-guide-

In [None]:
# ============================================================================
# ALTERNATIVE: Try to find API endpoint or check JavaScript loading
# ============================================================================
# If majors are loaded via JavaScript, we might be able to find the API endpoint
# or we'll need to use Selenium to render the page
# ============================================================================

print("=" * 80)
print("Checking for API endpoints or JavaScript loading patterns")
print("=" * 80)

# Look for script tags that might contain API calls
scripts = soup.find_all('script')
api_endpoints = []

for script in scripts:
    if script.string:
        script_content = script.string
        
        # Look for GetProgramsSimpleList function call
        if 'GetProgramsSimpleList' in script_content:
            print("\n✓ Found 'GetProgramsSimpleList' function call")
            # Try to extract the endpoint
            import re
            # Look for URL patterns in the script
            urls = re.findall(r'https?://[^\s"\'<>)]+', script_content)
            if urls:
                print("  Potential API endpoints found:")
                for url in set(urls[:5]):  # Show first 5 unique URLs
                    print(f"    - {url}")
        
        # Look for common API patterns
        api_patterns = [
            r'/api/[^\s"\'<>)]+',
            r'/webservices/[^\s"\'<>)]+',
            r'GetProgramsSimpleList\([^)]+\)'
        ]
        
        for pattern in api_patterns:
            matches = re.findall(pattern, script_content)
            if matches:
                api_endpoints.extend(matches)

if api_endpoints:
    print(f"\nFound {len(set(api_endpoints))} potential API patterns")
    for endpoint in set(api_endpoints)[:10]:
        print(f"  - {endpoint}")

# Check the script that loads the program widget
program_widget_script = soup.find('script', src=re.compile(r'program-widgets'))
if program_widget_script:
    print("\n✓ Found program-widgets script")
    script_src = program_widget_script.get('src', '')
    print(f"  Script source: {script_src}")

print("\n" + "=" * 80)
print("NEXT STEPS:")
print("1. If no majors found in Cell 3, the content is loaded via JavaScript")
print("2. Check Cell 4 output to see what's actually in the HTML")
print("3. We may need to use Selenium to render the JavaScript")
print("4. Alternatively, we might be able to call the API directly")
print("=" * 80)


In [None]:
# ============================================================================
# NEXT STEPS - READ THIS CAREFULLY
# ============================================================================
"""
STEP-BY-STEP INSTRUCTIONS:

1. INSTALL REQUIRED PACKAGES:
   - Install Selenium: pip install selenium
   - Install webdriver-manager (recommended): pip install webdriver-manager
     * This automatically handles ChromeDriver installation
   - OR manually install ChromeDriver if you prefer

2. RUN CELL 1:
   - Loads modules and fetches the page
   - Base URL is already configured for Miami Dade College

3. RUN CELL 2:
   - Sets up Selenium driver function
   - Verifies static HTML can be fetched

4. RUN CELL 3 (IMPORTANT):
   - Uses Selenium to render JavaScript and find major links
   - Should find ~79 Associate in Arts + ~82 Associate in Science majors
   - If it fails, check the error message and install ChromeDriver

5. TEST WITH ONE MAJOR (Cell 12):
   - Tests the Accounting page to see PDF structure
   - Run this to verify PDF detection works correctly
   - Check the output to see what PDFs are found

6. UPDATE PDF DETECTION LOGIC (Cell 7) IF NEEDED:
   - Based on Cell 12 output, update find_pdf_links() function if needed
   - The function should identify:
     * Course List PDF (usually contains "list" or "courses" in text/URL)
     * Course Sequence PDF (usually contains "sequence" in text/URL)
   - Adjust the text matching conditions based on actual PDF links

7. RUN THE FULL SCRAPER (Cell 8):
   - Downloads PDFs for all majors found in Cell 3
   - PDFs will be saved in the "downloaded_pdfs" folder
   - A log file "pdf_download_log.csv" will be created
   - Skips files that already exist

8. REVIEW RESULTS (Cells 9-11):
   - Check the download summary
   - Review the CSV log file
   - Re-run for any failed downloads

TROUBLESHOOTING:
- If Cell 3 fails: Install ChromeDriver or webdriver-manager
- If no majors found: Check if the page structure changed
- If PDFs not found: Update find_pdf_links() based on Cell 12 output
- If downloads fail: Check if the website requires authentication

NOTE: The majors list is loaded via JavaScript, so Selenium is required!
"""
print("=" * 80)
print("NEXT STEPS:")
print("1. Install selenium and webdriver-manager: pip install selenium webdriver-manager")
print("2. Run Cell 1 - Load modules")
print("3. Run Cell 2 - Setup Selenium")
print("4. Run Cell 3 - Find major links (uses Selenium)")
print("5. Run Cell 12 - Test with Accounting page")
print("6. Update Cell 7 if PDF detection needs adjustment")
print("7. Run Cell 8 - Download all PDFs")
print("8. Review results in Cells 9-11")
print("=" * 80)



In [38]:
# ============================================================================
# UTILITY: Reorganize Existing PDFs into Degree Type Folders
# ============================================================================
# If you've already downloaded PDFs to a single folder, this cell will
# reorganize them into the AA and AS folders based on degree type
# ============================================================================

import shutil

# Check if there's an old single folder with PDFs
old_output_dir = "downloaded_pdfs"
reorganized_count = 0

if os.path.exists(old_output_dir) and os.path.isdir(old_output_dir):
    # Check if there are PDFs in the root folder (not in subfolders)
    pdf_files = [f for f in os.listdir(old_output_dir) 
                 if f.endswith('.pdf') and os.path.isfile(os.path.join(old_output_dir, f))]
    
    if pdf_files and 'major_links' in globals():
        print("=" * 80)
        print("REORGANIZING EXISTING PDFs")
        print("=" * 80)
        print(f"Found {len(pdf_files)} PDF files in {old_output_dir}")
        print("Organizing them by degree type...\n")
        
        # Create a mapping of major names to degree types
        major_to_degree = {}
        for link_info in major_links:
            if isinstance(link_info, dict):
                major_name = link_info.get('text', '')
                degree_type = link_info.get('degree_type', 'Unknown')
                clean_name = clean_major_name(major_name)
                major_to_degree[clean_name] = degree_type
        
        # Move each PDF to the appropriate folder
        for pdf_file in pdf_files:
            # Extract major name from filename
            # Format: MajorName_Course_list_2025.pdf or MajorName_Course_Sequence_2025.pdf
            base_name = pdf_file.replace('_Course_list_2025.pdf', '').replace('_Course_Sequence_2025.pdf', '')
            
            # Find matching major
            degree_type = None
            for clean_name, deg_type in major_to_degree.items():
                if clean_name == base_name or base_name in clean_name or clean_name in base_name:
                    degree_type = deg_type
                    break
            
            if degree_type == 'AA':
                dest_dir = output_dir_aa
            elif degree_type == 'AS':
                dest_dir = output_dir_as
            else:
                # If we can't determine, check if it's already in a subfolder
                if 'Associates_in_Arts' in pdf_file or 'AA' in pdf_file:
                    dest_dir = output_dir_aa
                elif 'Associates_in_Science' in pdf_file or 'AS' in pdf_file:
                    dest_dir = output_dir_as
                else:
                    # Default to AA if unknown
                    dest_dir = output_dir_aa
                    print(f"  ⚠ Unknown degree type for {pdf_file}, moving to AA folder")
            
            # Move the file
            src_path = os.path.join(old_output_dir, pdf_file)
            dest_path = os.path.join(dest_dir, pdf_file)
            
            # Skip if file already exists in destination
            if os.path.exists(dest_path):
                print(f"  ⊙ Skipping {pdf_file} (already in {dest_dir})")
                # Optionally remove from old location
                # os.remove(src_path)
            else:
                try:
                    shutil.move(src_path, dest_path)
                    print(f"  ✓ Moved {pdf_file} -> {dest_dir}")
                    reorganized_count += 1
                except Exception as e:
                    print(f"  ✗ Error moving {pdf_file}: {e}")
        
        print(f"\n✓ Reorganized {reorganized_count} PDF files")
        print(f"  - Associate in Arts folder: {output_dir_aa}")
        print(f"  - Associate in Science folder: {output_dir_as}")
    elif not pdf_files:
        print("No PDF files found in the root downloaded_pdfs folder.")
        print("PDFs may already be organized, or you need to run the scraper first.")
    else:
        print("Major links not found. Please run Cell 3 first to load major information.")
else:
    print("No existing downloaded_pdfs folder found.")
    print("PDFs will be organized automatically when you run Cell 8.")


REORGANIZING EXISTING PDFs
Found 302 PDF files in downloaded_pdfs
Organizing them by degree type...

  ⚠ Unknown degree type for Accounting_Course_list_2025.pdf, moving to AA folder
  ✓ Moved Accounting_Course_list_2025.pdf -> downloaded_pdfs/Associates_in_Arts
  ⚠ Unknown degree type for Accounting_Course_Sequence_2025.pdf, moving to AA folder
  ✓ Moved Accounting_Course_Sequence_2025.pdf -> downloaded_pdfs/Associates_in_Arts
  ⚠ Unknown degree type for Accounting_Technology_Course_list_2025.pdf, moving to AA folder
  ✓ Moved Accounting_Technology_Course_list_2025.pdf -> downloaded_pdfs/Associates_in_Arts
  ⚠ Unknown degree type for Accounting_Technology_Course_Sequence_2025.pdf, moving to AA folder
  ✓ Moved Accounting_Technology_Course_Sequence_2025.pdf -> downloaded_pdfs/Associates_in_Arts
  ⚠ Unknown degree type for Agriculture_Course_list_2025.pdf, moving to AA folder
  ✓ Moved Agriculture_Course_list_2025.pdf -> downloaded_pdfs/Associates_in_Arts
  ⚠ Unknown degree type for Agri

In [47]:
# ============================================================================
# UTILITY: Generate Download Info from Existing Files
# ============================================================================
# This cell scans the downloaded PDF folders and creates download_info
# so you can see statistics even if files were moved/reorganized manually
# Run this cell after reorganizing files or if download_info is empty
# ============================================================================

# Check if output directories are defined
if 'output_dir_aa' not in globals() or 'output_dir_as' not in globals():
    print("⚠ Output directories not defined. Please run Cell 5 first.")
    output_dir_aa = "downloaded_pdfs/Associates_in_Arts"
    output_dir_as = "downloaded_pdfs/Associates_in_Science"

# Clear existing download_info (or create new if it doesn't exist)
download_info = []

aa_count = 0
as_count = 0
old_count = 0

# Scan AA folder
if os.path.exists(output_dir_aa):
    aa_files = [f for f in os.listdir(output_dir_aa) 
                if f.endswith('.pdf') and os.path.isfile(os.path.join(output_dir_aa, f))]
    aa_count = len(aa_files)
    
    for pdf_file in aa_files:
        # Extract major name from filename
        if '_Course_list_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_list_2025.pdf', '').replace('_', ' ')
            file_type = 'Course List'
        elif '_Course_Sequence_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_Sequence_2025.pdf', '').replace('_', ' ')
            file_type = 'Course Sequence'
        else:
            major_name = pdf_file.replace('.pdf', '').replace('_', ' ')
            file_type = 'Unknown'
        
        filepath = os.path.join(output_dir_aa, pdf_file)
        file_exists = os.path.exists(filepath)
        
        download_info.append({
            'Major': major_name,
            'Degree Type': 'AA',
            'Type': file_type,
            'Filename': pdf_file,
            'URL': 'N/A (existing file)',
            'Status': 'Already Exists' if file_exists else 'Not Found'
        })
else:
    print(f"⚠ Folder not found: {output_dir_aa}")

# Scan AS folder
if os.path.exists(output_dir_as):
    as_files = [f for f in os.listdir(output_dir_as) 
                if f.endswith('.pdf') and os.path.isfile(os.path.join(output_dir_as, f))]
    as_count = len(as_files)
    
    for pdf_file in as_files:
        # Extract major name from filename
        if '_Course_list_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_list_2025.pdf', '').replace('_', ' ')
            file_type = 'Course List'
        elif '_Course_Sequence_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_Sequence_2025.pdf', '').replace('_', ' ')
            file_type = 'Course Sequence'
        else:
            major_name = pdf_file.replace('.pdf', '').replace('_', ' ')
            file_type = 'Unknown'
        
        filepath = os.path.join(output_dir_as, pdf_file)
        file_exists = os.path.exists(filepath)
        
        download_info.append({
            'Major': major_name,
            'Degree Type': 'AS',
            'Type': file_type,
            'Filename': pdf_file,
            'URL': 'N/A (existing file)',
            'Status': 'Already Exists' if file_exists else 'Not Found'
        })
else:
    print(f"⚠ Folder not found: {output_dir_as}")

# Also check the old folder if it still exists
old_output_dir = "downloaded_pdfs"
if os.path.exists(old_output_dir) and os.path.isdir(old_output_dir):
    # Only check for files in the root, not in subdirectories
    old_files = [f for f in os.listdir(old_output_dir) 
                 if f.endswith('.pdf') and os.path.isfile(os.path.join(old_output_dir, f))]
    old_count = len(old_files)
    
    for pdf_file in old_files:
        if '_Course_list_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_list_2025.pdf', '').replace('_', ' ')
            file_type = 'Course List'
        elif '_Course_Sequence_2025.pdf' in pdf_file:
            major_name = pdf_file.replace('_Course_Sequence_2025.pdf', '').replace('_', ' ')
            file_type = 'Course Sequence'
        else:
            major_name = pdf_file.replace('.pdf', '').replace('_', ' ')
            file_type = 'Unknown'
        
        # Try to determine degree type from major_links if available
        degree_type = 'Unknown'
        if 'major_links' in globals():
            for link_info in major_links:
                if isinstance(link_info, dict):
                    clean_major = clean_major_name(link_info.get('text', ''))
                    clean_file_major = major_name.replace(' ', '_')
                    if clean_major == clean_file_major or clean_major in clean_file_major or clean_file_major in clean_major:
                        degree_type = link_info.get('degree_type', 'Unknown')
                        break
        
        download_info.append({
            'Major': major_name,
            'Degree Type': degree_type,
            'Type': file_type,
            'Filename': pdf_file,
            'URL': 'N/A (existing file in old folder)',
            'Status': 'Already Exists'
        })

# Print summary
print("=" * 80)
print("SCANNED EXISTING PDF FILES")
print("=" * 80)
print(f"Found {len(download_info)} PDF file records:")
print(f"  - Associate in Arts (AA): {aa_count} files")
print(f"  - Associate in Science (AS): {as_count} files")
if old_count > 0:
    print(f"  - Old folder (root): {old_count} files")
    print(f"    ⚠ Consider running Cell 15 to reorganize these files")
print("=" * 80)
print(f"\n✓ Download info generated! Now run:")
print("  - Cell 9: Create DataFrame")
print("  - Cell 10: View summary statistics")
print("=" * 80)


SCANNED EXISTING PDF FILES
Found 302 PDF file records:
  - Associate in Arts (AA): 154 files
  - Associate in Science (AS): 148 files

✓ Download info generated! Now run:
  - Cell 9: Create DataFrame
  - Cell 10: View summary statistics


In [55]:
# ============================================================================
# BACHELOR'S DEGREE PROGRAMS - Configuration
# ============================================================================
# URL of the Miami Dade College Bachelor Degree Programs page
# This page contains lists of bachelor's degree programs
# ============================================================================

bachelor_base_url = "https://www.mdc.edu/academics/programs/bachelors.aspx"

# Create directory to save Bachelor's PDFs (single folder for all programs)
output_dir_bachelor = "downloaded_pdfs/Bachelors"
os.makedirs(output_dir_bachelor, exist_ok=True)

print(f"Bachelor's PDFs will be saved to: {os.path.abspath(output_dir_bachelor)}")

# Store information about downloaded Bachelor's PDFs
bachelor_download_info = []

# Test the URL
print(f"\nTesting Bachelor's page URL: {bachelor_base_url}")
bachelor_response = requests.get(bachelor_base_url)
print(f"Response status: {bachelor_response.status_code}")
if bachelor_response.status_code == 200:
    print("✓ Bachelor's page is accessible")
else:
    print(f"⚠ Warning: Bachelor's page returned status {bachelor_response.status_code}")


Bachelor's PDFs will be saved to: c:\Users\dylan\Documents\Dylans Doc's\Dylan's Hackathons\2025\SharkByte\sharkbyte2025\DataCollection\downloaded_pdfs\Bachelors

Testing Bachelor's page URL: https://www.mdc.edu/academics/programs/bachelors.aspx
Response status: 200
✓ Bachelor's page is accessible


In [61]:
# ============================================================================
# Finding Bachelor's Program Links using Selenium
# ============================================================================
# The bachelor's programs are loaded via JavaScript, so we need Selenium to render the page
# ============================================================================

print("=" * 80)
print("Loading Bachelor's page with Selenium to render JavaScript content...")
print("=" * 80)

# Initialize Selenium driver if not already available
if 'driver' not in globals() or driver is None:
    driver = setup_selenium_driver()

if driver is None:
    print("\n❌ ERROR: Could not initialize Selenium driver.")
    print("Please install ChromeDriver and try again.")
    bachelor_program_links = []
else:
    try:
        # Load the bachelor's page
        print(f"\nLoading URL: {bachelor_base_url}")
        driver.get(bachelor_base_url)
        
        # Wait for the programs list to load
        print("Waiting for JavaScript to load bachelor's programs...")
        wait = WebDriverWait(driver, 15)
        
        # Wait for programs list to appear (similar structure to associate page)
        # Also wait for page to be fully loaded
        try:
            # Wait for page to be in a ready state
            wait.until(lambda d: d.execute_script('return document.readyState') == 'complete')
            time.sleep(2)  # Additional wait for JavaScript
            
            # Try to find program lists
            wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.tab-content, ul.list-unstyled.list-divider, .program-list, main, #siteContent")
                )
            )
            print("✓ Page loaded!")
        except TimeoutException:
            print("⚠ Timeout waiting for page elements. Trying to proceed anyway...")
        
        # Give a bit more time for all content to load
        time.sleep(3)
        
        # Get the page source after JavaScript execution
        page_source = driver.page_source
        soup_bachelor = BeautifulSoup(page_source, "html.parser")
        
        # Now find the program links
        bachelor_program_links = []
        
        # List of patterns to exclude (campus pages, navigation, etc.)
        exclude_patterns = [
            '/home', '/about', '/contact', '/admissions', '/enroll',
            '/hialeah', '/homestead', '/kendall', '/medical', '/north',
            '/padron', '/west', '/wolfson', '/meek', '/gibson',
            '/campus', '/academics/programs/associate', '/academics/programs/bachelor',
            '/academics/programs/', '/search', '/catalog', '/online',
            '/advisement', '/registration', '/tuition', '/financialaid',
            '/academics/', '/future-students', '/employees', '/jobs',
            '/privacy', '/sitemap', '/mobile', '/catalog/'
        ]
        
        # Method 1: Look for tab-content structure (like associate page)
        tab_content = soup_bachelor.find('div', class_='tab-content')
        if tab_content:
            print(f"\n✓ Found tab-content container")
            tab_panels = tab_content.find_all('div', role='tabpanel')
            print(f"✓ Found {len(tab_panels)} tab panels")
            
            for tab_panel in tab_panels:
                tab_id = tab_panel.get('id', 'unknown')
                majors_list = tab_panel.find('ul', class_='list-unstyled list-divider')
                
                if majors_list:
                    links = majors_list.find_all('a', href=True)
                    print(f"  ✓ Tab '{tab_id}': Found {len(links)} links")
                    
                    for link in links:
                        href = link.get('href', '')
                        text = link.get_text(strip=True)
                        
                        if href and href.startswith('/') and not href.startswith('#') and text:
                            href_lower = href.lower()
                            # Skip excluded patterns
                            if any(skip in href_lower for skip in exclude_patterns):
                                continue
                            # Include links that look like program pages (e.g., /bsn/, /biological-sciences/)
                            bachelor_program_links.append({
                                'text': text,
                                'href': href,
                                'degree_type': 'Bachelor'
                            })
        
        # Method 2: Look in main content area for program links
        # Bachelor's programs might be listed directly without tabs
        main_content = soup_bachelor.find('div', id='siteContent') or soup_bachelor.find('main') or soup_bachelor.find('div', class_='container')
        
        if main_content:
            # Look for lists in the main content
            program_lists = main_content.find_all('ul', class_='list-unstyled list-divider')
            if not program_lists:
                # Try finding any ul with links that look like programs
                all_lists = main_content.find_all('ul')
                for ul in all_lists:
                    links = ul.find_all('a', href=True)
                    # If this list has several links that look like program links, use it
                    program_like_count = 0
                    for link in links:
                        href = link.get('href', '')
                        if href and href.startswith('/') and not href.startswith('#') and len(href.split('/')) <= 3:
                            # Short paths like /bsn/ are likely program links
                            href_lower = href.lower()
                            if not any(skip in href_lower for skip in exclude_patterns):
                                program_like_count += 1
                    if program_like_count >= 3:  # If 3+ program-like links, this is probably the program list
                        program_lists.append(ul)
                        break
            
            print(f"\n✓ Found {len(program_lists)} potential program lists in main content")
            
            for program_list in program_lists:
                links = program_list.find_all('a', href=True)
                print(f"  Checking list with {len(links)} links...")
                
                for link in links:
                    href = link.get('href', '')
                    text = link.get_text(strip=True)
                    
                    # Filter for program links
                    if href and href.startswith('/') and not href.startswith('#') and text:
                        href_lower = href.lower()
                        
                        # Skip excluded patterns
                        if any(skip in href_lower for skip in exclude_patterns):
                            continue
                        
                        # Program links are typically:
                        # - Short paths like /bsn/, /biological-sciences/
                        # - Not in common navigation directories
                        # - Have meaningful text (not just "Home", "About", etc.)
                        path_parts = [p for p in href.split('/') if p]  # Get non-empty path parts
                        
                        # Skip if too many path parts (probably not a program page)
                        if len(path_parts) > 2:
                            continue
                        
                        # Skip common navigation words in text
                        text_lower = text.lower()
                        if any(word in text_lower for word in ['home', 'about', 'contact', 'directions', 'map', 'campus information']):
                            continue
                        
                        # This looks like a program link
                        bachelor_program_links.append({
                            'text': text,
                            'href': href,
                            'degree_type': 'Bachelor'
                        })
        
        # Method 3: Fallback - search all links on page and filter for program-like patterns
        if not bachelor_program_links:
            print("\n⚠ No program links found in structured containers. Searching all links...")
            all_links = soup_bachelor.find_all('a', href=True)
            print(f"  Found {len(all_links)} total links on page")
            
            for link in all_links:
                href = link.get('href', '')
                text = link.get_text(strip=True)
                
                if href and href.startswith('/') and not href.startswith('#') and text:
                    href_lower = href.lower()
                    path_parts = [p for p in href.split('/') if p]
                    
                    # Skip excluded patterns
                    if any(skip in href_lower for skip in exclude_patterns):
                        continue
                    
                    # Look for program-like links: short paths (1-2 parts), ending with /
                    if len(path_parts) <= 2 and href.endswith('/'):
                        text_lower = text.lower()
                        # Skip navigation words
                        if not any(word in text_lower for word in ['home', 'about', 'contact', 'directions', 'map']):
                            bachelor_program_links.append({
                                'text': text,
                                'href': href,
                                'degree_type': 'Bachelor'
                            })
        
        # Remove duplicates and filter out non-program links
        seen = set()
        unique_bachelor_links = []
        
        # Additional filtering: Program pages typically don't have these patterns
        program_exclude_patterns = [
            '/hialeah', '/homestead', '/kendall', '/medical', '/north',
            '/padron', '/west', '/wolfson', '/meek', '/gibson',
            '/campus', '/about/', '/contact/', '/admissions/', '/enroll/',
            '/search', '/catalog', '/online/', '/advisement/', '/registration/',
            '/tuition/', '/financialaid/', '/academics/programs/associate',
            '/academics/programs/bachelor', '/academics/programs/'
        ]
        
        for link_info in bachelor_program_links:
            href = link_info['href']
            text = link_info.get('text', '').lower()
            
            # Skip if already seen
            if href in seen:
                continue
            
            # Skip campus/navigation links
            href_lower = href.lower()
            if any(pattern in href_lower for pattern in program_exclude_patterns):
                continue
            
            # Skip if text suggests it's not a program (e.g., "Home", "About", campus names)
            if any(word in text for word in ['home', 'about', 'contact', 'admissions', 'campus', 'directions']):
                continue
            
            # Only include if it looks like a program page
            # Program pages often have specific paths like /bsn/, /biological-sciences/, etc.
            # or are short paths that aren't common directories
            seen.add(href)
            unique_bachelor_links.append(link_info)
        
        bachelor_program_links = unique_bachelor_links
        
        print("\n" + "=" * 80)
        print(f"RESULT: Found {len(bachelor_program_links)} unique bachelor's program links")
        print("=" * 80)
        
        if bachelor_program_links:
            print("\nFirst 15 programs found:")
            for i, link_info in enumerate(bachelor_program_links[:15]):
                print(f"  {i+1}. {link_info['text']} - {link_info['href']}")
            
            if len(bachelor_program_links) > 15:
                print(f"\n  ... and {len(bachelor_program_links) - 15} more")
            
            # Verify we have actual program links (not campus/navigation)
            program_count = len([link for link in bachelor_program_links 
                                if not any(campus in link['href'].lower() 
                                          for campus in ['/hialeah', '/homestead', '/kendall', '/medical', '/north', '/padron', '/west', '/wolfson'])])
            if program_count < len(bachelor_program_links):
                print(f"\n⚠ WARNING: Found {len(bachelor_program_links) - program_count} non-program links (campuses/navigation)")
                print(f"  Actual program links: {program_count}")
        else:
            print("\n⚠ WARNING: No bachelor's program links found!")
            print("The page structure might be different. Check the page manually.")
            print("\nDebug info:")
            print(f"  Page title: {soup_bachelor.title.string if soup_bachelor.title else 'No title'}")
            print(f"  Page URL: {bachelor_base_url}")
            
            # Debug: Show what we did find
            all_links = soup_bachelor.find_all('a', href=True)
            print(f"\n  Total links on page: {len(all_links)}")
            program_like_links = [link for link in all_links 
                                 if link.get('href', '').startswith('/') 
                                 and not link.get('href', '').startswith('#')
                                 and 'program' in link.get('href', '').lower()]
            if program_like_links:
                print(f"  Links with 'program' in URL: {len(program_like_links)}")
                print("  Sample:")
                for link in program_like_links[:5]:
                    print(f"    - {link.get_text(strip=True)}: {link.get('href')}")
        
    except Exception as e:
        print(f"\n❌ ERROR: {e}")
        import traceback
        traceback.print_exc()
        bachelor_program_links = []


Loading Bachelor's page with Selenium to render JavaScript content...

Loading URL: https://www.mdc.edu/academics/programs/bachelors.aspx
Waiting for JavaScript to load bachelor's programs...
✓ Page loaded!

✓ Found 1 potential program lists in main content
  Checking list with 9 links...

RESULT: Found 9 unique bachelor's program links

First 15 programs found:
  1. Bachelor of Applied Sciences in Leadership and Management Innovation - /leadershipandmanagementinnovation/
  2. Leadership and Management Innovation - Accounting - /accountingmanagement
  3. Leadership and Management Innovation - Hospitality Management - /hospitalitymanagement
  4. Leadership and Management Innovation - Human Resource Management - /humanresourcemanagement
  5. Leadership and Management Innovation - Digital Marketing Concentration - /digitalmarketingbas
  6. Bachelor of Applied Sciences in Supply Chain Management - /supplychainmanagement/
  7. Supply Chain Management – Procurement Management - /procurement-

In [63]:
# ============================================================================
# MAIN SCRAPING LOOP - Bachelor's Degree Programs
# ============================================================================
# This cell will iterate through all bachelor's program links and download the PDFs
# Make sure the previous cell ran successfully and found program links
# ============================================================================

# Check if we have bachelor's program links
if 'bachelor_program_links' not in globals() or not bachelor_program_links:
    print("❌ ERROR: No bachelor's program links found!")
    print("Please run the previous cell first to get the list of programs.")
else:
    # Filter out any invalid entries
    valid_bachelor_links = []
    for link_info in bachelor_program_links:
        if isinstance(link_info, dict) and link_info.get('href') and link_info.get('text'):
            valid_bachelor_links.append(link_info)
    
    bachelor_program_links = valid_bachelor_links
    
    if not bachelor_program_links:
        print("❌ ERROR: No valid bachelor's program links found!")
    else:
        print(f"Starting to process {len(bachelor_program_links)} bachelor's programs...")
        print("=" * 80)
        
        # Extract base domain
        base_domain = '/'.join(bachelor_base_url.split('/')[:3])  # Gets https://www.mdc.edu
        
        for i, link_info in enumerate(bachelor_program_links):
            # Get program name and URL
            program_name = link_info.get('text', '')
            program_url = link_info.get('href', '')
            
            # Skip if no name or URL
            if not program_name or not program_url:
                continue
            
            # Handle relative URLs - join with base domain
            if not program_url.startswith('http'):
                program_url = urljoin(base_domain, program_url)
            
            print(f"\n[{i+1}/{len(bachelor_program_links)}] Processing: {program_name}")
            print(f"URL: {program_url}")
            
            try:
                # Use Selenium to get the page (PDFs are loaded via JavaScript)
                if 'driver' not in globals() or driver is None:
                    driver = setup_selenium_driver()
                    if driver is None:
                        print(f"  ✗ Could not create Selenium driver. Skipping {program_name}.")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Error',
                            'Filename': 'N/A',
                            'URL': program_url,
                            'Status': 'Error: Selenium driver not available'
                        })
                        continue
                
                # Load the page with Selenium
                driver.get(program_url)
                time.sleep(2)  # Wait for JavaScript to load
                
                # Find PDF links using Selenium
                course_list_pdf, course_sequence_pdf = find_pdf_links_selenium(driver, program_url)
                
                # Clean program name for filename
                clean_name = clean_major_name(program_name)
                
                # Download Course List PDF
                if course_list_pdf:
                    if not course_list_pdf.startswith('http'):
                        course_list_pdf = urljoin(program_url, course_list_pdf)
                    
                    filename_list = f"{clean_name}_Course_list_2025.pdf"
                    filepath_list = os.path.join(output_dir_bachelor, filename_list)
                    
                    # Skip if already downloaded
                    if os.path.exists(filepath_list):
                        print(f"  ⊙ Already exists: {filename_list}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Already Exists'
                        })
                    elif download_pdf(course_list_pdf, filepath_list):
                        print(f"  ✓ Downloaded: {filename_list}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Success'
                        })
                    else:
                        print(f"  ✗ Failed to download: {filename_list}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course List',
                            'Filename': filename_list,
                            'URL': course_list_pdf,
                            'Status': 'Failed'
                        })
                else:
                    print(f"  ✗ Course List PDF not found")
                    bachelor_download_info.append({
                        'Program': program_name,
                        'Type': 'Course List',
                        'Filename': 'N/A',
                        'URL': 'N/A',
                        'Status': 'Not Found'
                    })
                
                # Download Course Sequence PDF
                if course_sequence_pdf:
                    if not course_sequence_pdf.startswith('http'):
                        course_sequence_pdf = urljoin(program_url, course_sequence_pdf)
                    
                    filename_sequence = f"{clean_name}_Course_Sequence_2025.pdf"
                    filepath_sequence = os.path.join(output_dir_bachelor, filename_sequence)
                    
                    # Skip if already downloaded
                    if os.path.exists(filepath_sequence):
                        print(f"  ⊙ Already exists: {filename_sequence}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Already Exists'
                        })
                    elif download_pdf(course_sequence_pdf, filepath_sequence):
                        print(f"  ✓ Downloaded: {filename_sequence}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Success'
                        })
                    else:
                        print(f"  ✗ Failed to download: {filename_sequence}")
                        bachelor_download_info.append({
                            'Program': program_name,
                            'Type': 'Course Sequence',
                            'Filename': filename_sequence,
                            'URL': course_sequence_pdf,
                            'Status': 'Failed'
                        })
                else:
                    print(f"  ✗ Course Sequence PDF not found")
                    bachelor_download_info.append({
                        'Program': program_name,
                        'Type': 'Course Sequence',
                        'Filename': 'N/A',
                        'URL': 'N/A',
                        'Status': 'Not Found'
                    })
                
                # Be polite - add a small delay between requests
                time.sleep(1)
                
            except Exception as e:
                print(f"  ✗ Error processing {program_name}: {e}")
                bachelor_download_info.append({
                    'Program': program_name,
                    'Type': 'Error',
                    'Filename': 'N/A',
                    'URL': program_url,
                    'Status': f'Error: {str(e)}'
                })
        
        print("\n" + "=" * 80)
        print("Bachelor's programs scraping complete!")
        print("=" * 80)


Starting to process 9 bachelor's programs...

[1/9] Processing: Bachelor of Applied Sciences in Leadership and Management Innovation
URL: https://www.mdc.edu/leadershipandmanagementinnovation/
  ✓ Downloaded: Bachelor_of_Applied_Sciences_in_Leadership_and_Management_Innovation_Course_list_2025.pdf
  ✓ Downloaded: Bachelor_of_Applied_Sciences_in_Leadership_and_Management_Innovation_Course_Sequence_2025.pdf

[2/9] Processing: Leadership and Management Innovation - Accounting
URL: https://www.mdc.edu/accountingmanagement
  ✓ Downloaded: Leadership_and_Management_Innovation_-_Accounting_Course_list_2025.pdf
  ✓ Downloaded: Leadership_and_Management_Innovation_-_Accounting_Course_Sequence_2025.pdf

[3/9] Processing: Leadership and Management Innovation - Hospitality Management
URL: https://www.mdc.edu/hospitalitymanagement
  ✓ Downloaded: Leadership_and_Management_Innovation_-_Hospitality_Management_Course_list_2025.pdf
  ✓ Downloaded: Leadership_and_Management_Innovation_-_Hospitality_Mana

In [64]:
# ============================================================================
# Bachelor's Programs Summary Statistics
# ============================================================================

# Create a DataFrame with bachelor's download information
if bachelor_download_info:
    df_bachelor_downloads = pd.DataFrame(bachelor_download_info)
    print("=" * 80)
    print("BACHELOR'S PROGRAMS DOWNLOAD SUMMARY")
    print("=" * 80)
    print(f"Total records: {len(df_bachelor_downloads)}")
    
    # Download results
    successful = len(df_bachelor_downloads[df_bachelor_downloads['Status'] == 'Success'])
    failed = len(df_bachelor_downloads[df_bachelor_downloads['Status'] == 'Failed'])
    not_found = len(df_bachelor_downloads[df_bachelor_downloads['Status'] == 'Not Found'])
    already_exists = len(df_bachelor_downloads[df_bachelor_downloads['Status'] == 'Already Exists'])
    errors = len(df_bachelor_downloads[df_bachelor_downloads['Status'].str.contains('Error', na=False)])
    
    print(f"\nDownload Results:")
    print(f"  Successful: {successful}")
    print(f"  Already Exists: {already_exists}")
    print(f"  Failed: {failed}")
    print(f"  Not Found: {not_found}")
    print(f"  Errors: {errors}")
    
    print(f"\nStatus breakdown:")
    print(df_bachelor_downloads['Status'].value_counts())
    
    # Show file count in folder
    print("\n" + "=" * 80)
    print("Files in Bachelor's folder:")
    print("=" * 80)
    if os.path.exists(output_dir_bachelor):
        bachelor_files = [f for f in os.listdir(output_dir_bachelor) if f.endswith('.pdf')]
        print(f"  {output_dir_bachelor}: {len(bachelor_files)} PDF files")
    print("=" * 80)
    
    # Display first few records
    print("\nFirst few records:")
    print(df_bachelor_downloads.head(10))
    
    # Save to CSV
    bachelor_log_filename = 'bachelor_download_log.csv'
    df_bachelor_downloads.to_csv(bachelor_log_filename, index=False)
    print(f"\n✓ Download log saved to: {bachelor_log_filename}")
else:
    print("No bachelor's download information available. Make sure you ran the scraping loop.")
    df_bachelor_downloads = pd.DataFrame()


BACHELOR'S PROGRAMS DOWNLOAD SUMMARY
Total records: 22

Download Results:
  Successful: 18
  Already Exists: 0
  Failed: 0
  Not Found: 4
  Errors: 0

Status breakdown:
Status
Success      18
Not Found     4
Name: count, dtype: int64

Files in Bachelor's folder:
  downloaded_pdfs/Bachelors: 18 PDF files

First few records:
                                             Program             Type  \
0                                    Future Students      Course List   
1                                    Future Students  Course Sequence   
2                                           Retirees      Course List   
3                                           Retirees  Course Sequence   
4  Bachelor of Applied Sciences in Leadership and...      Course List   
5  Bachelor of Applied Sciences in Leadership and...  Course Sequence   
6  Leadership and Management Innovation - Accounting      Course List   
7  Leadership and Management Innovation - Accounting  Course Sequence   
8  Leadership and 

In [62]:
# ============================================================================
# TESTING: Test Bachelor's Program PDF Detection
# ============================================================================
# Test with one bachelor's program page to verify PDF detection works
# ============================================================================

# Use a known working bachelor's program URL for testing (BSN)
test_program_url = "https://www.mdc.edu/bsn/"
test_program_name = "Bachelor of Science in Nursing (BSN)"

# Alternatively, try to use a program from the list if available
if 'bachelor_program_links' in globals() and bachelor_program_links:
    # Try to find BSN first (known working program)
    test_program = None
    for prog in bachelor_program_links:
        href = prog.get('href', '').lower()
        text = prog.get('text', '').lower()
        if 'bsn' in href or 'nursing' in text:
            test_program = prog
            break
    
    # If BSN found, use it
    if test_program:
        test_program_url_temp = test_program.get('href', '')
        if test_program_url_temp and not test_program_url_temp.startswith('http'):
            base_domain = '/'.join(bachelor_base_url.split('/')[:3])
            test_program_url = urljoin(base_domain, test_program_url_temp)
        else:
            test_program_url = test_program_url_temp
        test_program_name = test_program.get('text', 'BSN')
        print(f"Using program from list: {test_program_name}")
    else:
        print(f"Using known program URL: {test_program_url}")
        print("(BSN not found in program links list)")
else:
    print(f"Using known program URL: {test_program_url}")
    print("(Program links list not available)")

print(f"\nTesting with: {test_program_name}")
print(f"URL: {test_program_url}\n")
print("=" * 80)

# Check if we have a Selenium driver
if 'driver' not in globals() or driver is None:
    print("⚠ No Selenium driver found. Creating a new one...")
    driver = setup_selenium_driver()
    if driver is None:
        print("❌ Could not create Selenium driver. Please run Cell 2 first.")
    else:
        print("✓ Selenium driver created")

if 'driver' in globals() and driver:
    try:
        # Load the page with Selenium
        print(f"\nLoading page with Selenium...")
        driver.get(test_program_url)
        
        # Wait for page to load and JavaScript to execute
        print("Waiting for page and JavaScript to load...")
        time.sleep(5)  # Give more time for JavaScript
        
        # Test Selenium-based PDF finding
        print("\nTesting Selenium-based PDF detection...")
        course_list, course_sequence = find_pdf_links_selenium(driver, test_program_url)
        
        print(f"\n✓ Course List PDF found: {course_list}")
        print(f"✓ Course Sequence PDF found: {course_sequence}")
        
        if course_list:
            print(f"\n  Course List URL: {course_list}")
        if course_sequence:
            print(f"\n  Course Sequence URL: {course_sequence}")
        
        # Check what's in the prg_link element
        try:
            prg_link = driver.find_element(By.ID, "prg_link")
            print(f"\n  prg_link element text: {prg_link.text}")
            links_in_prg = prg_link.find_elements(By.TAG_NAME, "a")
            print(f"  Number of links in prg_link: {len(links_in_prg)}")
            for i, link in enumerate(links_in_prg):
                print(f"    Link {i+1}: '{link.text}' -> {link.get_attribute('href')}")
        except Exception as e:
            print(f"  ⚠ Could not inspect prg_link element: {e}")
        
        if course_list and course_sequence:
            print("\n" + "=" * 80)
            print("✓ SUCCESS: Both PDFs found! The scraper should work correctly.")
            print("=" * 80)
        elif course_list or course_sequence:
            print("\n" + "=" * 80)
            print("⚠ PARTIAL: Only one PDF found. Check the detection logic.")
            print("=" * 80)
        else:
            print("\n" + "=" * 80)
            print("✗ FAILED: No PDFs found. The page structure may be different for bachelor's programs.")
            print("=" * 80)
            
            # Debug: Check page source
            page_source = driver.page_source
            if 'prg_link' in page_source:
                print("\n  prg_link element exists in page source")
                if 'GetPgmSheetLink' in page_source:
                    print("  GetPgmSheetLink function found - content is loaded via JavaScript")
            else:
                print("\n  prg_link element NOT found in page source")
                print("  This might not be a program page")
    
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
else:
    print("❌ Cannot test without Selenium driver.")


Using known program URL: https://www.mdc.edu/bsn/
(BSN not found in program links list)

Testing with: Bachelor of Science in Nursing (BSN)
URL: https://www.mdc.edu/bsn/


Loading page with Selenium...
Waiting for page and JavaScript to load...

Testing Selenium-based PDF detection...

✓ Course List PDF found: https://www.mdc.edu/academics/programs/ps/N9100.pdf
✓ Course Sequence PDF found: https://www.mdc.edu/academics/programs/csg/BS_Nursing_csg.pdf

  Course List URL: https://www.mdc.edu/academics/programs/ps/N9100.pdf

  Course Sequence URL: https://www.mdc.edu/academics/programs/csg/BS_Nursing_csg.pdf

  prg_link element text: See a complete course list
See a course sequence guide
  Number of links in prg_link: 2
    Link 1: 'See a complete course list' -> https://www.mdc.edu/academics/programs/ps/N9100.pdf
    Link 2: 'See a course sequence guide' -> https://www.mdc.edu/academics/programs/csg/BS_Nursing_csg.pdf

✓ SUCCESS: Both PDFs found! The scraper should work correctly.


In [46]:
# ============================================================================
# UTILITY: Reorganize PDFs by Degree Type (AA vs AS)
# ============================================================================
# This cell will scan the Associates_in_Arts folder and move AS files to
# the Associates_in_Science folder based on major_links degree_type information
# ============================================================================

import shutil

# Check if major_links exists
if 'major_links' not in globals() or not major_links:
    print("❌ ERROR: major_links not found!")
    print("Please run Cell 3 first to load the major links with degree type information.")
else:
    # Check if output directories are defined
    if 'output_dir_aa' not in globals() or 'output_dir_as' not in globals():
        output_dir_aa = "downloaded_pdfs/Associates_in_Arts"
        output_dir_as = "downloaded_pdfs/Associates_in_Science"
        os.makedirs(output_dir_aa, exist_ok=True)
        os.makedirs(output_dir_as, exist_ok=True)
    
    # Create a mapping of major names (cleaned) to degree types
    major_to_degree = {}
    for link_info in major_links:
        if isinstance(link_info, dict):
            major_name = link_info.get('text', '')
            degree_type = link_info.get('degree_type', 'Unknown')
            clean_name = clean_major_name(major_name)
            major_to_degree[clean_name] = degree_type
    
    print("=" * 80)
    print("REORGANIZING PDFs BY DEGREE TYPE")
    print("=" * 80)
    print(f"Found {len(major_to_degree)} majors in major_links")
    print(f"  - AA: {sum(1 for v in major_to_degree.values() if v == 'AA')}")
    print(f"  - AS: {sum(1 for v in major_to_degree.values() if v == 'AS')}")
    print()
    
    # Scan the AA folder (where all files currently are)
    moved_count = 0
    kept_count = 0
    unknown_count = 0
    
    if os.path.exists(output_dir_aa):
        pdf_files = [f for f in os.listdir(output_dir_aa) 
                     if f.endswith('.pdf') and os.path.isfile(os.path.join(output_dir_aa, f))]
        
        print(f"Scanning {output_dir_aa}...")
        print(f"Found {len(pdf_files)} PDF files\n")
        
        for pdf_file in pdf_files:
            # Extract major name from filename
            # Format: MajorName_Course_list_2025.pdf or MajorName_Course_Sequence_2025.pdf
            if '_Course_list_2025.pdf' in pdf_file:
                base_name = pdf_file.replace('_Course_list_2025.pdf', '')
            elif '_Course_Sequence_2025.pdf' in pdf_file:
                base_name = pdf_file.replace('_Course_Sequence_2025.pdf', '')
            else:
                base_name = pdf_file.replace('.pdf', '')
            
            # Find matching major and get degree type
            degree_type = None
            matched_major = None
            
            # Try exact match first
            if base_name in major_to_degree:
                degree_type = major_to_degree[base_name]
                matched_major = base_name
            else:
                # Try partial matching (in case of naming differences)
                for clean_name, deg_type in major_to_degree.items():
                    # Check if base_name matches clean_name (accounting for variations)
                    if (base_name == clean_name or 
                        base_name.lower() == clean_name.lower() or
                        base_name.replace('_', '') == clean_name.replace('_', '') or
                        clean_name in base_name or 
                        base_name in clean_name):
                        degree_type = deg_type
                        matched_major = clean_name
                        break
            
            # Determine destination
            src_path = os.path.join(output_dir_aa, pdf_file)
            
            if degree_type == 'AS':
                # Move to AS folder
                dest_path = os.path.join(output_dir_as, pdf_file)
                if os.path.exists(dest_path):
                    print(f"  ⊙ Skipping {pdf_file} (already in AS folder)")
                    # Remove duplicate from AA folder
                    try:
                        os.remove(src_path)
                        print(f"    ✓ Removed duplicate from AA folder")
                    except:
                        pass
                else:
                    try:
                        shutil.move(src_path, dest_path)
                        print(f"  ✓ Moved {pdf_file} -> AS (matched: {matched_major})")
                        moved_count += 1
                    except Exception as e:
                        print(f"  ✗ Error moving {pdf_file}: {e}")
            elif degree_type == 'AA':
                # Keep in AA folder
                print(f"  ✓ Keeping {pdf_file} in AA (matched: {matched_major})")
                kept_count += 1
            else:
                # Unknown - keep in AA for now
                print(f"  ⚠ Unknown degree type for {pdf_file} (keeping in AA)")
                unknown_count += 1
        
        print("\n" + "=" * 80)
        print("REORGANIZATION SUMMARY")
        print("=" * 80)
        print(f"Files moved to AS folder: {moved_count}")
        print(f"Files kept in AA folder: {kept_count}")
        print(f"Files with unknown degree type: {unknown_count}")
        print("=" * 80)
        
        # Show final counts
        if os.path.exists(output_dir_aa):
            aa_files_final = [f for f in os.listdir(output_dir_aa) 
                              if f.endswith('.pdf') and os.path.isfile(os.path.join(output_dir_aa, f))]
            print(f"\nFinal file counts:")
            print(f"  {output_dir_aa}: {len(aa_files_final)} PDF files")
        
        if os.path.exists(output_dir_as):
            as_files_final = [f for f in os.listdir(output_dir_as) 
                              if f.endswith('.pdf') and os.path.isfile(os.path.join(output_dir_as, f))]
            print(f"  {output_dir_as}: {len(as_files_final)} PDF files")
        print("=" * 80)
        print("\n✓ Reorganization complete!")
        print("  Run Cell 16 to regenerate download_info, then Cell 9 and Cell 10 for statistics.")
    else:
        print(f"❌ Folder not found: {output_dir_aa}")
        print("Please make sure the folders exist and files are in the AA folder.")


REORGANIZING PDFs BY DEGREE TYPE
Found 154 majors in major_links
  - AA: 77
  - AS: 77

Scanning downloaded_pdfs/Associates_in_Arts...
Found 302 PDF files

  ✓ Keeping Accounting_Course_list_2025.pdf in AA (matched: Accounting)
  ✓ Keeping Accounting_Course_Sequence_2025.pdf in AA (matched: Accounting)
  ✓ Moved Accounting_Technology_Course_list_2025.pdf -> AS (matched: Accounting_Technology)
  ✓ Moved Accounting_Technology_Course_Sequence_2025.pdf -> AS (matched: Accounting_Technology)
  ✓ Keeping Agriculture_Course_list_2025.pdf in AA (matched: Agriculture)
  ✓ Keeping Agriculture_Course_Sequence_2025.pdf in AA (matched: Agriculture)
  ✓ Moved Animation_Game_Art_Course_list_2025.pdf -> AS (matched: Animation_Game_Art)
  ✓ Moved Animation_Game_Art_Course_Sequence_2025.pdf -> AS (matched: Animation_Game_Art)
  ✓ Keeping Anthropology_Course_list_2025.pdf in AA (matched: Anthropology)
  ✓ Keeping Anthropology_Course_Sequence_2025.pdf in AA (matched: Anthropology)
  ✓ Moved Applied_Artifi

In [None]:
# ============================================================================
# FINAL NOTES
# ============================================================================
"""
After completing the scraping:

1. Verify all PDFs downloaded correctly by checking the file sizes
2. Review the download log CSV file for any errors
3. Manually check a few PDFs to ensure they're the correct files
4. If some PDFs failed to download:
   - Check the error messages in the log
   - Verify the URLs are correct
   - Check if the website requires authentication
   - Some PDFs might be loaded via JavaScript (may need Selenium)

5. Organize the PDFs:
   - The PDFs are saved with naming format: {Major}_Course_list_2025.pdf
   - You can move them to appropriate folders based on program type
   - Use the cleaner.py module if you need to process the PDF content

6. If you need to re-run for specific majors:
   - Filter the major_links list before running the main loop
   - Or modify the loop to skip already-downloaded files
"""
print("Scraping workflow complete!")
print(f"Check the '{output_dir}' folder for downloaded PDFs")
print("Check 'pdf_download_log.csv' for download status")