# **SMU Course Scraping Using Selenium**

<div style="background-color:#FFD700; padding:15px; border-radius:5px; border: 2px solid #FF4500;">
    
  <h1 style="color:#8B0000;">⚠️🚨 SCRAPE THIS DATA AT YOUR OWN RISK 🚨⚠️</h1>
  
  <p><strong>📌 If you need the data, please contact me directly.</strong> Only available for <strong>existing students</strong>.</p>

  <h3>🔗 📩 How to Get the Data?</h3>
  <p>📨 <strong>Reach out to me for access</strong> instead of scraping manually.</p>

</div>

<br>

<div style="background-color:#FFF8DC; padding:12px; border-radius:5px; border: 1px solid #DAA520;">
    
  <h2 style="color:#8B8000;">✨ Looking for the Latest Model? Consider V4! ✨</h2>
  <p>👉 <a href="V4_example_prediction.ipynb"><strong>Check out V4 Here</strong></a></p>

</div>

### **Objective**
This script is designed to scrape SMU course details from the BOSS system using Selenium. The process involves:
1. Logging into the system manually to bypass authentication.
2. Iteratively scraping class details for specified academic years and terms.
3. Writing the scraped data to structured CSV files.

### **Script Structure**
1. **Setup**: Import libraries and initialize Selenium WebDriver.
2. **Login**: Wait for manual login and authentication.
3. **Scraping Logic**:
    - `scrape_class_details`: Scrapes course details for a specific class number, academic year, and term.
    - `main`: Manages the scraping process for multiple academic years and terms.
4. **Execution**: Log in and start scraping.


---

## **1. Setup**

In [1]:
import os
os.environ['PGGSSENCMODE'] = 'disable'

import re
import csv
import time
import pickle
import logging
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from dotenv import load_dotenv
from typing import Dict, List, Optional, Tuple
import psycopg2
from urllib.parse import parse_qs, urlparse
from pathlib import Path
from typing import Dict, List, Optional, Tuple


---

## **2. Login Handling**

In [2]:
def wait_for_manual_login(driver):
    """
    Block until the user has finished logging in to BOSS by hand.

    The function polls the page (for up to 120 seconds per element) until the
    username label and the "Sign out" link are both present, then prints the
    text of the username label as confirmation.

    Args:
        driver: WebDriver instance that has already been pointed at the login page

    Returns:
        True once both login markers were detected

    Raises:
        Exception: with message "Login failed" if a marker did not appear in time
    """
    print("Please log in manually and complete the Microsoft Authenticator process.")
    print("Waiting for BOSS dashboard to load...")

    # Allow two minutes for the manual login / authenticator round trip
    login_wait = WebDriverWait(driver, 120)
    username_locator = (By.ID, "Label_UserName")
    sign_out_locator = (By.XPATH, "//a[contains(text(),'Sign out')]")

    try:
        # The username label only shows up after a successful login ...
        login_wait.until(EC.presence_of_element_located(username_locator))
        # ... and the sign-out link confirms the session is fully established
        login_wait.until(EC.presence_of_element_located(sign_out_locator))

        username = driver.find_element(*username_locator).text
        print(f"Login successful! Logged in as {username}")
    except TimeoutException:
        print("Login failed or timed out. Could not detect login elements.")
        raise Exception("Login failed")

    # Short grace period so the rest of the page can finish loading
    time.sleep(1)

    return True


---

## **3. Scrape data**

### **3.1 Scrape all data from BOSS**
1. Take all existing AddedInfo files with `SelectedClassNumber` min and max.
2. Scrape the entire web page of every class on BOSS and separate the saved pages by AY and Term.
3. Create an overall scraping logic for future terms (past AY2024T3B).

In [9]:
def read_class_number_ranges(directory='classTimings'):
    """
    Read existing CSV files to determine min and max class numbers for each AY_TERM.

    Only files whose full name has the form ``[AY]_[Term]AddedInfo.csv``
    (e.g. ``2021-22_T1AddedInfo.csv``) are considered. For each of them the
    integer values of the ``SelectedClassNumber`` column are reduced to a
    min/max pair; rows whose value is missing, blank or non-numeric are skipped.

    Args:
        directory: Directory containing CSV files (format: [AY]_[Term]AddedInfo.csv).
            It is created if it does not exist.

    Returns:
        Dictionary mapping AY_TERM to ``{'min': int, 'max': int}``. A file
        without any valid class number maps to the default
        ``{'min': 1000, 'max': 5000}``. An empty dictionary is returned when no
        matching file is found.
    """
    class_number_ranges = {}

    # Create directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    # Regex pattern to match files like 2021-22_T1AddedInfo.csv
    pattern = re.compile(r'(\d{4}-\d{2}_T\d[AB]?)AddedInfo\.csv')

    try:
        for filename in os.listdir(directory):
            # fullmatch: ignore look-alikes such as "...AddedInfo.csv.bak"
            match = pattern.fullmatch(filename)
            if not match:
                continue

            ay_term = match.group(1)
            filepath = os.path.join(directory, filename)

            class_numbers = []
            # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM that
            # would otherwise become part of the first column name.
            with open(filepath, 'r', encoding='utf-8-sig', newline='') as file:
                for row in csv.DictReader(file):
                    # Short rows yield None for the missing column, hence `or ''`
                    raw_value = row.get('SelectedClassNumber') or ''
                    try:
                        class_numbers.append(int(raw_value.strip()))
                    except ValueError:
                        continue

            if class_numbers:
                class_number_ranges[ay_term] = {
                    'min': min(class_numbers),
                    'max': max(class_numbers),
                }
            else:
                # Default values if no valid class numbers found
                class_number_ranges[ay_term] = {'min': 1000, 'max': 5000}

        # Every matching file adds an entry, so an empty result means no files
        if not class_number_ranges:
            print("No class timing files found. Will use default ranges.")
            return {}

    except FileNotFoundError:
        # Only reachable if the directory or a file disappears while scanning
        print(f"Directory '{directory}' not found. Creating it.")
        os.makedirs(directory, exist_ok=True)

    return class_number_ranges

In [10]:
def scrape_and_save_html(driver, start_ay_term='2021-22_T1', end_ay_term='2024-25_T3B', base_dir='classTimingsFull'):
    """
    Download BOSS class-detail pages term by term and store them as HTML files.

    For every AY_TERM from start_ay_term to end_ay_term (inclusive) the class
    numbers in the range returned by read_class_number_ranges('classTimings')
    (1000-5000 when the term is unknown) are requested one at a time. Pages
    that hold a record are written to
    <base_dir>/<AY_TERM>/SelectedAcadTerm=<code>&SelectedClassNumber=<num>.html.
    A term is abandoned after 100 consecutive class numbers without a record.

    Args:
        driver: WebDriver instance that is already logged in
        start_ay_term: First academic year and term to scrape (e.g., '2021-22_T1')
        end_ay_term: Last academic year and term to scrape (e.g., '2024-25_T3B')
        base_dir: Base directory to save the HTML files

    Note:
        The driver is quit at the end of this function. If either term is not
        in the known list, the full list of terms is scraped instead.
    """
    # URL parameter codes for each term
    term_codes = {'T1': '10', 'T2': '20', 'T3A': '31', 'T3B': '32'}
    term_order = ['T1', 'T2', 'T3A', 'T3B']
    academic_years = ['2021-22', '2022-23', '2023-24', '2024-25', '2025-26', '2026-27']

    # Chronological list of every AY_TERM combination
    timeline = [f"{year}_{term}" for year in academic_years for term in term_order]

    try:
        first_idx = timeline.index(start_ay_term)
        last_idx = timeline.index(end_ay_term)
    except ValueError:
        print("Invalid start or end term provided. Using full range.")
        first_idx, last_idx = 0, len(timeline) - 1

    selected_terms = timeline[first_idx:last_idx + 1]

    # Previously recorded class-number ranges narrow the search per term
    known_ranges = read_class_number_ranges('classTimings')
    print(f"Found class number ranges: {known_ranges}")

    os.makedirs(base_dir, exist_ok=True)

    for ay_term in selected_terms:
        print(f"Processing {ay_term}...")

        # e.g. '2025-26_T1' -> '25' + '10' = '2510'
        year_part, term_part = ay_term.split('_')
        acad_term_code = f"{year_part[2:4]}{term_codes.get(term_part, '10')}"

        bounds = known_ranges.get(ay_term, {'min': 1000, 'max': 5000})
        first_class = bounds.get('min', 1000)
        last_class = bounds.get('max', 5000)

        term_dir = os.path.join(base_dir, ay_term)
        os.makedirs(term_dir, exist_ok=True)

        empty_streak = 0

        for class_num in range(first_class, last_class + 1):
            url = f"https://boss.intranet.smu.edu.sg/ClassDetails.aspx?SelectedClassNumber={class_num:04}&SelectedAcadTerm={acad_term_code}&SelectedAcadCareer=UGRD"

            try:
                driver.get(url)

                try:
                    # The page is ready once EITHER the class header OR the
                    # error label is present
                    WebDriverWait(driver, 15).until(EC.any_of(
                        EC.presence_of_element_located((By.ID, "lblClassInfoHeader")),
                        EC.presence_of_element_located((By.ID, "lblErrorDetails"))
                    ))

                    error_labels = driver.find_elements(By.ID, "lblErrorDetails")
                    has_data = not any("No record found" in label.text for label in error_labels)
                except Exception as e:
                    # A failed wait is treated the same as an empty page
                    print(f"Wait error: {e}")
                    has_data = False

                if has_data:
                    empty_streak = 0

                    filename = f"SelectedAcadTerm={acad_term_code}&SelectedClassNumber={class_num:04}.html"
                    filepath = os.path.join(term_dir, filename)

                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(driver.page_source)

                    print(f"Saved {filepath}")

                    # Small pause between requests
                    time.sleep(2)
                    continue

                empty_streak += 1
                print(f"No record found for {ay_term}, class {class_num:04}. Consecutive empty: {empty_streak}")

                if empty_streak >= 100:
                    print(f"100 consecutive empty records reached for {ay_term}, moving on.")
                    break

                # Small pause before next request
                time.sleep(2)

            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                time.sleep(5)  # Wait a bit longer after an error

    print("Scraping completed.")
    driver.quit()

In [11]:
def generate_scraped_filepaths_csv(base_dir='classTimingsFull', output_csv='scraped_filepaths.csv'):
    """
    Write a one-column CSV listing every saved HTML page that holds a record.

    The directory tree below base_dir is walked; each ``.html`` file whose
    content does not contain the text "No record found" is listed. Files that
    cannot be read are reported and left out.

    Args:
        base_dir: Base directory where HTML files are stored
        output_csv: Name of the output CSV file (header row: ``Filepath``)

    Returns:
        Path to the generated CSV file, or None if base_dir does not exist
    """
    if not os.path.exists(base_dir):
        print(f"Directory '{base_dir}' does not exist.")
        return None

    valid_paths = []
    for folder, _subfolders, names in os.walk(base_dir):
        for name in names:
            if not name.endswith('.html'):
                continue

            html_path = os.path.join(folder, name)
            try:
                with open(html_path, 'r', encoding='utf-8') as page:
                    has_record = 'No record found' not in page.read()
            except Exception as e:
                print(f"Error reading file {html_path}: {str(e)}")
                continue

            if has_record:
                valid_paths.append(html_path)

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filepath'])
        writer.writerows([path] for path in valid_paths)

    print(f"Generated CSV file with {len(valid_paths)} valid file paths at {output_csv}")
    return output_csv

In [12]:
if __name__ == "__main__":
    # Set up WebDriver - headless mode is deliberately NOT enabled so the user
    # can complete the manual login in the visible browser window
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    service = Service(ChromeDriverManager().install())

    # Start as None so the finally block is safe even if Chrome fails to launch.
    # The browser is created exactly once (with the options above); creating a
    # second instance would leave an extra Chrome window that is never quit.
    driver = None

    try:
        # Initialize the driver
        driver = webdriver.Chrome(service=service, options=options)

        # Step 1: Navigate to login page and wait for manual login
        driver.get("https://boss.intranet.smu.edu.sg/")
        wait_for_manual_login(driver)

        # Step 2: Now that we're logged in, proceed with scraping
        # You can optionally run a test scrape first
        # test_scrape_class_details(driver)

        # Step 3: Run the main scraping function with the authenticated driver
        scrape_and_save_html(driver, '2025-26_T1', '2025-26_T1', 'classTimingsFull')

        # Step 4: Generate CSV with valid file paths
        generate_scraped_filepaths_csv('classTimingsFull', 'scraped_filepaths.csv')

    finally:
        # Ensure driver is closed properly. scrape_and_save_html already calls
        # driver.quit() when it completes; this also covers the failure paths.
        if driver:
            driver.quit()
        print("Process completed!")

Please log in manually and complete the Microsoft Authenticator process.
Waiting for BOSS dashboard to load...
Login successful! Logged in as Welcome, TAN ZHONG YAN
Found class number ranges: {'2021-22_T1': {'min': 1002, 'max': 2889}, '2021-22_T2': {'min': 1002, 'max': 2957}, '2021-22_T3A': {'min': 1002, 'max': 1038}, '2021-22_T3B': {'min': 1002, 'max': 1033}, '2022-23_T1': {'min': 1002, 'max': 2954}, '2022-23_T2': {'min': 1002, 'max': 2920}, '2022-23_T3A': {'min': 1002, 'max': 1031}, '2022-23_T3B': {'min': 1002, 'max': 1027}, '2023-24_T1': {'min': 1002, 'max': 2982}, '2023-24_T2': {'min': 1002, 'max': 2964}, '2023-24_T3A': {'min': 1002, 'max': 1028}, '2023-24_T3B': {'min': 1003, 'max': 1033}, '2024-25_T1': {'min': 1002, 'max': 2945}, '2024-25_T2': {'min': 1002, 'max': 2786}}
Processing 2025-26_T1...
No record found for 2025-26_T1, class 1000. Consecutive empty: 1
No record found for 2025-26_T1, class 1001. Consecutive empty: 2
Saved classTimingsFull\2025-26_T1\SelectedAcadTerm=2510&Se

### **3.2 Extract needed data from all scraped websites**

### 3.2.1 Test Script

In [None]:
import csv
import os
import re
import pandas as pd
import psycopg2
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import logging
import random
import importlib.util

# Set up logging: INFO level, each line prefixed with timestamp and severity
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by SchemaConformanceTestRunner below
logger = logging.getLogger(__name__)

class SchemaConformanceTestRunner:
    def __init__(self):
        self.test_results = {
            'database_connection': False,
            'table_access': {},
            'scraping_success_count': 0,
            'scraping_error_count': 0,
            'total_files_tested': 0,
            'missing_files_count': 0,
            'csv_generation': {},
            'schema_validation': {},
            'errors': []
        }
        
        # Test database configuration for localhost Supabase
        self.test_db_config = {
            'host': os.getenv('DB_HOST'),
            'database': os.getenv('DB_NAME'),
            'user': os.getenv('DB_USER'),
            'password': os.getenv('DB_PASSWORD'),
            'port': int(os.getenv('DB_PORT')),
            'gssencmode': 'disable'  # ✅ FIX GSSAPI ISSUE
        }
        
        # Expected schema structure based on Prisma schema
        self.expected_schemas = {
            'courses_updates.csv': {
                'required_columns': ['id', 'course_area', 'enrolment_requirements'],  # Only ID + update fields
                'update_columns': ['course_area', 'enrolment_requirements'],
                'table_name': 'courses'
            },
            'classes_updates.csv': {
                'required_columns': ['id', 'grading_basis', 'course_outline_url'],  # Only ID + update fields
                'update_columns': ['grading_basis', 'course_outline_url'],
                'table_name': 'classes'  
            },
            'acad_term.csv': {
                'required_columns': ['id', 'acad_year_start', 'acad_year_end', 'term', 'boss_id', 'start_dt', 'end_dt'],
                'table_name': 'acad_term'
            },
            'class_timing.csv': {
                'required_columns': ['class_id', 'start_date', 'end_date', 'day_of_week', 'start_time', 'end_time', 'venue'],
                'table_name': 'class_timing'
            },
            'class_exam_timing.csv': {
                'required_columns': ['class_id', 'date', 'day_of_week', 'start_time', 'end_time', 'venue'],
                'table_name': 'class_exam_timing'
            },
            # New temp CSV files for missing class IDs
            'temp_classes_new.csv': {
                'required_columns': ['temp_class_id', 'section', 'course_id', 'professor_id', 'acad_term_id', 'grading_basis', 'course_outline_url', 'boss_id'],
                'table_name': 'classes'
            },
            'temp_class_timing.csv': {
                'required_columns': ['temp_class_id', 'start_date', 'end_date', 'day_of_week', 'start_time', 'end_time', 'venue'],
                'table_name': 'class_timing'
            },
            'temp_class_exam_timing.csv': {
                'required_columns': ['temp_class_id', 'date', 'day_of_week', 'start_time', 'end_time', 'venue'],
                'table_name': 'class_exam_timing'
            },
            'existing_classes_for_cleanup.csv': {
                'required_columns': ['course_id', 'class_id', 'section', 'acad_term_id', 'professor_id', 'grading_basis', 'course_outline_url', 'boss_id'],
                'table_name': 'classes'
            },
            'new_courses.csv': {
                'required_columns': ['id', 'code', 'name', 'description', 'credit_units', 'belong_to_university', 'belong_to_faculty', 'course_area', 'enrolment_requirements'],
                'table_name': 'courses'
            },
            'courses_manual_review.csv': {
                'required_columns': ['id', 'code', 'name', 'description', 'credit_units', 'belong_to_university', 'belong_to_faculty', 'course_area', 'enrolment_requirements', 'filepath'],
                'table_name': 'courses'
            }
        }

    def test_database_connection(self):
        """
        Test connection to localhost Supabase database and table access.

        Opens a connection with the settings in ``self.test_db_config``, runs
        ``SELECT 1`` and, if that succeeds, checks the required tables via
        ``test_table_access``. The outcome is stored in
        ``self.test_results['database_connection']``; any failure is logged
        and appended to ``self.test_results['errors']`` instead of being raised.
        """
        logger.info("Testing database connection...")
        connection = None
        cursor = None
        try:
            # Pass the whole config so that 'gssencmode' is applied as well
            connection = psycopg2.connect(**self.test_db_config)

            cursor = connection.cursor()
            cursor.execute('SELECT 1')
            result = cursor.fetchone()

            if result == (1,):
                logger.info("✅ Database connection successful")
                self.test_results['database_connection'] = True
                self.test_table_access(cursor)

        except Exception as e:
            logger.error(f"❌ Database connection failed: {e}")
            self.test_results['errors'].append(f"Database connection error: {e}")
            self.test_results['database_connection'] = False

        finally:
            # Release the cursor and connection on failure paths too
            if cursor is not None:
                cursor.close()
            if connection is not None:
                connection.close()

    def test_table_access(self, cursor):
        """
        Count the rows of every required table and record whether it is readable.

        For each table the result is stored under
        ``self.test_results['table_access'][<table>]`` as either
        ``{'accessible': True, 'record_count': n}`` or
        ``{'accessible': False, 'error': <message>}``.

        Args:
            cursor: Open database cursor; its connection is rolled back before
                each query and again after a failed one.
        """
        for table in ('courses', 'classes', 'acad_term', 'class_timing', 'class_exam_timing'):
            try:
                # Start every check from a clean transaction state
                cursor.connection.rollback()
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                row = cursor.fetchone()
                count = row[0]

                outcome = {'accessible': True, 'record_count': count}
                self.test_results['table_access'][table] = outcome
                logger.info(f"✅ Table '{table}': {count} records accessible")

            except Exception as e:
                cursor.connection.rollback()
                logger.error(f"❌ Table '{table}' access failed: {e}")
                outcome = {'accessible': False, 'error': str(e)}
                self.test_results['table_access'][table] = outcome

    def load_actual_filepaths(self, sample_size=20):
        """Load actual filepaths from scraped_filepaths.csv"""
        try:
            if not os.path.exists('scraped_filepaths.csv'):
                logger.error("❌ scraped_filepaths.csv not found")
                return []
            
            filepaths_df = pd.read_csv('scraped_filepaths.csv')
            filepath_column = 'Filepath' if 'Filepath' in filepaths_df.columns else 'filepath'
            
            existing_files = []
            for filepath in filepaths_df[filepath_column]:
                if pd.notna(filepath) and os.path.exists(str(filepath).strip()):
                    existing_files.append(str(filepath).strip())
            
            logger.info(f"Found {len(existing_files)} existing files")
            
            if len(existing_files) > sample_size:
                selected_files = random.sample(existing_files, sample_size)
            else:
                selected_files = existing_files
                
            logger.info(f"Selected {len(selected_files)} files for testing")
            return selected_files
            
        except Exception as e:
            logger.error(f"Error loading filepaths: {e}")
            return []

    def import_extractor_class(self):
        """Import AfterClassDataExtractor class"""
        try:
            # In Jupyter notebook, try to access the globally defined class
            if 'AfterClassDataExtractor' in globals():
                return globals()['AfterClassDataExtractor']
            else:
                raise ImportError("AfterClassDataExtractor not found in global scope")
        except Exception as e:
            raise ImportError(f"Could not access AfterClassDataExtractor: {e}")

    def test_extractor_functionality(self, test_filepaths):
        """
        Test the AfterClassDataExtractor with schema validation.

        Creates an extractor from ``self.test_db_config``, starts its Selenium
        driver and database connection, then runs the caching, file-processing
        and CSV-generation checks. Failures are logged and appended to
        ``self.test_results['errors']`` rather than raised.

        Args:
            test_filepaths: Paths of the HTML files to process
        """
        logger.info("Testing AfterClassDataExtractor functionality...")

        extractor = None
        try:
            AfterClassDataExtractor = self.import_extractor_class()
            logger.info("✅ Successfully imported AfterClassDataExtractor")

            extractor = AfterClassDataExtractor(self.test_db_config)
            extractor.setup_selenium_driver()
            logger.info("✅ Selenium WebDriver initialized")

            extractor.connect_database()
            logger.info("✅ Extractor database connection successful")

            # Test caching functionality
            self.test_caching_functionality(extractor)

            # Test file processing with schema validation
            self.test_file_processing_with_validation(extractor, test_filepaths)

            # Test CSV generation and schema conformance
            self.test_csv_generation_and_schema(extractor)

        except Exception as e:
            error_msg = f"Extractor functionality test failed: {e}"
            logger.error(f"❌ {error_msg}")
            self.test_results['errors'].append(error_msg)

        finally:
            # Always release the extractor's resources, including when a step
            # above failed; previously cleanup was skipped on any error.
            if extractor is not None:
                try:
                    extractor.cleanup()
                    logger.info("✅ Extractor cleanup completed")
                except Exception as e:
                    error_msg = f"Extractor functionality test failed: {e}"
                    logger.error(f"❌ {error_msg}")
                    self.test_results['errors'].append(error_msg)

    def test_caching_functionality(self, extractor):
        """
        Check that the extractor can load its table cache, downloading it if needed.

        Afterwards the number of cached course and class records is logged
        when the corresponding attributes are present and not None. Failures
        are logged and appended to ``self.test_results['errors']``.

        Args:
            extractor: Extractor instance under test
        """
        logger.info("Testing database caching...")
        try:
            if extractor.load_cached_tables():
                logger.info("✅ Cached tables loaded successfully")
            else:
                logger.info("Cache not found, downloading from database...")
                extractor.download_and_cache_tables()
                logger.info("✅ Database tables downloaded and cached")

            for label, attribute in (('Courses', 'courses_df'), ('Classes', 'classes_df')):
                cached = getattr(extractor, attribute, None)
                if cached is not None:
                    logger.info(f"✅ {label} cache: {len(cached)} records")

        except Exception as e:
            logger.error(f"❌ Caching functionality failed: {e}")
            self.test_results['errors'].append(f"Caching error: {e}")

    def test_file_processing_with_validation(self, extractor, test_filepaths):
        """
        Process every file with the extractor and tally successes and failures.

        Each file is passed to ``extractor.process_html_file``. A truthy result
        increments ``scraping_success_count``; a falsy result or an exception
        increments ``scraping_error_count``. ``total_files_tested`` is
        incremented once per file. Per-file log lines and the data validation
        are limited to the first 10 files; a progress summary is logged every
        100 files.

        Args:
            extractor: Extractor instance under test
            test_filepaths: Paths of the HTML files to process
        """
        logger.info(f"Testing file processing with {len(test_filepaths)} files...")

        # Test hyphen parsing patterns first
        self.validate_hyphen_parsing_patterns(test_filepaths)

        for i, filepath in enumerate(test_filepaths, 1):
            # Per-file detail only for the first 10 files to avoid log spam
            verbose = i <= 10

            if verbose or i % 100 == 0:
                logger.info(f"Processing file {i}/{len(test_filepaths)}: {os.path.basename(filepath)}")

            try:
                success = extractor.process_html_file(filepath)
                if success:
                    self.test_results['scraping_success_count'] += 1
                    if verbose:
                        logger.info(f"✅ Successfully processed: {os.path.basename(filepath)}")
                        # Validate only after a successful run: the validators
                        # inspect the extractor's most recent records, which
                        # would belong to an earlier file after a failure.
                        self.validate_extracted_data(extractor, filepath)
                        self.validate_course_code_extraction(extractor, filepath)
                else:
                    # This branch must pair with `if success`, otherwise
                    # failures go uncounted and later successes count as errors
                    self.test_results['scraping_error_count'] += 1
                    if verbose:
                        logger.warning(f"⚠️ Failed to process: {os.path.basename(filepath)}")

                self.test_results['total_files_tested'] += 1

            except Exception as e:
                self.test_results['scraping_error_count'] += 1
                self.test_results['total_files_tested'] += 1
                error_msg = f"Error processing {os.path.basename(filepath)}: {e}"
                if verbose:
                    logger.error(f"❌ {error_msg}")
                self.test_results['errors'].append(error_msg)

            # Progress update every 100 files
            if i % 100 == 0:
                success_rate = (self.test_results['scraping_success_count'] / self.test_results['total_files_tested']) * 100
                logger.info(f"📊 Progress: {i}/{len(test_filepaths)} files processed ({success_rate:.1f}% success rate)")

    def validate_extracted_data(self, extractor, filepath):
        """Validate that extracted data conforms to schema requirements"""
        try:
            # Validate academic term parsing
            if hasattr(extractor, 'acad_term') and extractor.acad_term:
                latest_term = extractor.acad_term[-1]
                self.validate_acad_term_structure(latest_term, filepath)
            
            # Validate grading basis mapping
            if hasattr(extractor, 'classes_updates') and extractor.classes_updates:
                latest_class = extractor.classes_updates[-1]
                self.validate_grading_basis(latest_class, filepath)
            
            # Validate timing data structure
            if hasattr(extractor, 'class_timing') and extractor.class_timing:
                latest_timing = extractor.class_timing[-1]
                self.validate_timing_structure(latest_timing, filepath)
                
        except Exception as e:
            logger.warning(f"⚠️ Data validation failed for {os.path.basename(filepath)}: {e}")

    def validate_course_code_extraction(self, extractor, filepath):
        """Enhanced validation of course code extraction patterns"""
        try:
            # Check if any recent course codes look incomplete or malformed
            if hasattr(extractor, 'courses_updates') and extractor.courses_updates:
                latest_course = extractor.courses_updates[-1]
                course_id = latest_course.get('id', '')
                
                # Look up the actual course code for this ID
                if hasattr(extractor, 'courses_df') and not extractor.courses_df.empty:
                    matching_course = extractor.courses_df[extractor.courses_df['id'] == course_id]
                    if not matching_course.empty:
                        actual_code = matching_course.iloc[0]['code']
                        logger.debug(f"✅ Course ID {course_id} maps to code: {actual_code}")
                        
                        # Check for suspiciously short course codes (likely parsing errors)
                        if len(actual_code) <= 3:
                            logger.warning(f"⚠️ Suspiciously short course code: '{actual_code}' in {os.path.basename(filepath)}")
                        
                        # Check for course codes that should contain hyphens
                        if actual_code in ['COR', 'LAW', 'SE'] and len(actual_code) <= 3:
                            logger.error(f"❌ Incomplete course code detected: '{actual_code}' - likely parsing error in {os.path.basename(filepath)}")
                            
            # Also check for common parsing error patterns in error logs
            if hasattr(extractor, 'errors') and extractor.errors:
                recent_errors = [err for err in extractor.errors[-5:] if 'Course not found in cache' in err.get('error', '')]
                for error in recent_errors:
                    context = error.get('context', {})
                    course_code = context.get('course_code', '')
                    header = context.get('header', '')
                    
                    # Detect hyphen parsing issues
                    if len(course_code) <= 3 and '-' in header:
                        logger.error(f"❌ Hyphen parsing issue: extracted '{course_code}' from '{header}'")
                        
        except Exception as e:
            logger.debug(f"Course code validation error: {e}")

    def validate_hyphen_parsing_patterns(self, test_filepaths):
        """Log the class-header samples the course-code parser is expected to handle.

        This is a placeholder check: nothing is parsed here and
        ``test_filepaths`` is not consulted. Each sample header is only
        written to the debug log.
        """
        logger.info("🔍 Testing hyphen parsing patterns...")

        # Sample headers: hyphenated and underscored course codes next to plain ones.
        sample_headers = (
            "COR-STAT1202 - G8",
            "COR-OBHR1309 - G2",
            "LAW204_624 - G3",
            "SE401 - G1",
            "CS101 - G2",
        )

        # Ideally these would be fed to the real extractor method; for now
        # they are only listed in the log.
        for header in sample_headers:
            logger.debug(f"Expected to parse: '{header}'")

    def validate_acad_term_structure(self, term_data, filepath):
        """Check that an acad_term record has all required keys and well-formed values.

        Args:
            term_data: Mapping holding the academic-term fields.
            filepath: Source file the record came from; only its base name
                is used, in the success log line.

        Raises:
            ValueError: For the first missing required key, an ``id`` that
                does not look like ``AY202122T1`` / ``AY202122T3A``, or a
                ``term`` other than 1, 2, 3A or 3B.
        """
        expected_keys = ('id', 'acad_year_start', 'acad_year_end', 'term', 'boss_id', 'start_dt', 'end_dt')
        absent = next((key for key in expected_keys if key not in term_data), None)
        if absent is not None:
            raise ValueError(f"Missing required field '{absent}' in acad_term")

        # IDs are "AY" + six digits + "T" + one of 1, 2, 3A, 3B
        term_id = term_data['id']
        if re.match(r'^AY\d{6}T[12]$|^AY\d{6}T3[AB]$', term_id) is None:
            raise ValueError(f"Invalid acad_term ID format: {term_id}")

        term_value = term_data['term']
        if term_value not in ['1', '2', '3A', '3B']:
            raise ValueError(f"Invalid term value: {term_value}")

        logger.debug(f"✅ Academic term validation passed for {os.path.basename(filepath)}")

    def validate_grading_basis(self, class_data, filepath):
        """Reject a class record whose grading_basis is set to an unknown value.

        A record without the key, or with the key set to ``None``, passes.

        Raises:
            ValueError: If the value is not one of Pass/Fail, Graded or NA.
        """
        basis = class_data['grading_basis'] if 'grading_basis' in class_data else None
        if basis is not None:
            valid_values = ['Pass/Fail', 'Graded', 'NA']  # Match @map values from Prisma
            if basis not in valid_values:
                raise ValueError(f"Invalid grading_basis: {basis}. Must be one of {valid_values}")

        logger.debug(f"✅ Grading basis validation passed for {os.path.basename(filepath)}")

    def validate_timing_structure(self, timing_data, filepath):
        """Check that a class_timing record has all required keys and a short day_of_week.

        Raises:
            ValueError: For the first missing required key, or when the
                string form of ``day_of_week`` is longer than 3 characters.
        """
        expected_keys = ('class_id', 'start_date', 'end_date', 'day_of_week', 'start_time', 'end_time', 'venue')
        absent = next((key for key in expected_keys if key not in timing_data), None)
        if absent is not None:
            raise ValueError(f"Missing required field '{absent}' in class_timing")

        # Only an upper bound is enforced: anything longer than 3 characters is rejected.
        day = timing_data['day_of_week']
        if len(str(day)) > 3:
            raise ValueError(f"day_of_week too long: {day}")

        logger.debug(f"✅ Timing structure validation passed for {os.path.basename(filepath)}")

    def test_csv_generation_and_schema(self, extractor):
        """Have the extractor write its CSVs, then check each expected file.

        Files are written to the ``test_output`` directory. Every entry of
        ``self.expected_schemas`` that exists on disk is passed to
        ``validate_csv_schema``; a missing one is recorded as not generated.
        Error logging is validated last. Any exception is logged and appended
        to ``self.test_results['errors']`` instead of propagating.
        """
        logger.info("Testing CSV generation and schema conformance...")
        try:
            output_dir = 'test_output'
            extractor.save_csv_files(output_dir)

            for csv_name, schema in self.expected_schemas.items():
                csv_path = os.path.join(output_dir, csv_name)

                if not os.path.exists(csv_path):
                    logger.warning(f"⚠️ Expected file not generated: {csv_name}")
                    self.test_results['csv_generation'][csv_name] = {'generated': False}
                    continue

                self.validate_csv_schema(csv_path, csv_name, schema)

            # Validate error logging
            self.validate_error_logging(extractor, output_dir)

        except Exception as e:
            error_msg = f"CSV generation test failed: {e}"
            logger.error(f"❌ {error_msg}")
            self.test_results['errors'].append(error_msg)

    def validate_csv_schema(self, filepath, csv_file, schema_info):
        """Validate one generated CSV file against its expected schema.

        Args:
            filepath: Path of the CSV file to read.
            csv_file: File name used as the key in ``self.test_results``.
            schema_info: Mapping with a ``'required_columns'`` list.

        Side effects:
            * ``self.test_results['csv_generation'][csv_file]`` is filled in
              as soon as the file has been read, so a file that exists but
              lacks required columns is still reported as generated.
            * ``self.test_results['schema_validation'][csv_file]`` receives
              either the result of ``validate_data_types`` or a failure
              record. Failure records for missing columns carry both an
              ``'error'`` string and a one-element ``'warnings'`` list so
              report code that counts warnings sees the problem.

        Exceptions (unreadable file, malformed ``schema_info``) are logged
        and stored as a failed schema result rather than raised.
        """
        try:
            df = pd.read_csv(filepath)

            # Record generation before any schema check can return early.
            self.test_results['csv_generation'][csv_file] = {
                'generated': True,
                'record_count': len(df),
                'columns': list(df.columns)
            }

            # Check required columns
            missing_columns = [
                col for col in schema_info['required_columns'] if col not in df.columns
            ]
            if missing_columns:
                error_msg = f"{csv_file}: Missing columns {missing_columns}"
                logger.error(f"❌ {error_msg}")
                self.test_results['schema_validation'][csv_file] = {
                    'valid': False,
                    'error': error_msg,
                    'warnings': [error_msg]
                }
                return

            # Validate data types and formats
            validation_result = self.validate_data_types(df, csv_file)
            self.test_results['schema_validation'][csv_file] = validation_result

            if validation_result['valid']:
                logger.info(f"✅ {csv_file}: {len(df)} records, schema valid")
            else:
                logger.warning(f"⚠️ {csv_file}: Schema validation issues - {validation_result['warnings']}")

        except Exception as e:
            logger.error(f"❌ Error validating {csv_file}: {e}")
            self.test_results['schema_validation'][csv_file] = {
                'valid': False,
                'error': str(e)
            }

    def validate_data_types(self, df, csv_file):
        """Spot-check value formats in the first rows of a generated CSV.

        Which checks run depends on ``csv_file``:

        * ``acad_term.csv`` (first 3 rows): ``id`` must match the
          ``AY<6 digits>T<1|2|3A|3B>`` pattern and ``term`` must be one of
          1, 2, 3A, 3B.
        * ``classes_updates.csv`` (first 10 rows): ``id`` must be present and
          ``grading_basis``, when present, must be Pass/Fail, Graded or NA.
        * ``courses_updates.csv`` (first 10 rows): ``id`` must be present.
        * ``class_timing.csv`` (first 3 rows): ``day_of_week`` must be at
          most 3 characters long.

        Any other file name passes with no checks.

        Args:
            df: DataFrame loaded from the CSV.
            csv_file: File name selecting the rule set.

        Returns:
            ``{'valid': bool, 'warnings': list[str]}``; ``valid`` is true
            exactly when no warning was produced.
        """
        warnings = []

        if csv_file == 'acad_term.csv':
            # Validate acad_term specific formats
            for idx, row in df.head(3).iterrows():
                if not re.match(r'^AY\d{6}T[12]$|^AY\d{6}T3[AB]$', str(row.get('id', ''))):
                    warnings.append(f"Invalid ID format in row {idx}: {row.get('id')}")

                # A column containing only "1"/"2" is read back from CSV as
                # integers (floats if a cell is blank), so compare the string
                # form instead of the raw value.
                term = row.get('term')
                if isinstance(term, float) and term.is_integer():
                    term = int(term)
                if str(term) not in ['1', '2', '3A', '3B']:
                    warnings.append(f"Invalid term value in row {idx}: {row.get('term')}")

        elif csv_file == 'classes_updates.csv':
            for idx, row in df.head(10).iterrows():
                # Validate class ID exists
                class_id = row.get('id')
                if pd.isna(class_id):
                    warnings.append(f"Missing class ID in row {idx}")

                # Validate grading basis values
                grading = row.get('grading_basis')
                if pd.notna(grading) and grading not in ['Pass/Fail', 'Graded', 'NA']:  # Use @map values
                    warnings.append(f"Invalid grading_basis in row {idx}: {grading}")

        elif csv_file == 'courses_updates.csv':
            for idx, row in df.head(10).iterrows():
                # Validate course ID exists
                course_id = row.get('id')
                if pd.isna(course_id):
                    warnings.append(f"Missing course ID in row {idx}")

        elif csv_file == 'class_timing.csv':
            # Validate timing specific formats
            for idx, row in df.head(3).iterrows():
                day_of_week = str(row.get('day_of_week', ''))
                if len(day_of_week) > 3:
                    warnings.append(f"day_of_week too long in row {idx}: {day_of_week}")

        return {
            'valid': len(warnings) == 0,
            'warnings': warnings
        }

    def validate_error_logging(self, extractor, output_dir):
        """Report on the extractor's recorded errors and the exported error CSV.

        Nothing is logged when the extractor has no (or an empty) ``errors``
        attribute. Otherwise the error count is logged and, if
        ``processing_errors.csv`` exists in ``output_dir``, its columns are
        checked, errors are counted per table, and duplicate
        (filepath, error, table) rows are reported. Any exception is
        downgraded to a warning.
        """
        try:
            error_file = os.path.join(output_dir, 'processing_errors.csv')

            if not (hasattr(extractor, 'errors') and extractor.errors):
                return
            logger.info(f"✅ Error logging working: {len(extractor.errors)} errors logged")

            if not os.path.exists(error_file):
                return

            error_df = pd.read_csv(error_file)
            expected_columns = ['filepath', 'error', 'type', 'table']

            missing_error_cols = [col for col in expected_columns if col not in error_df.columns]
            if missing_error_cols:
                logger.warning(f"⚠️ Error CSV missing columns: {missing_error_cols}")
                return

            logger.info(f"✅ Error CSV structure valid: {len(error_df)} error records")

            # 'table' is one of the expected columns, so it is present here.
            table_counts = error_df['table'].value_counts()
            logger.info(f"✅ Errors by table: {dict(table_counts)}")

            # Check for duplicate errors
            group_sizes = error_df.groupby(['filepath', 'error', 'table']).size()
            duplicate_count = len(group_sizes[group_sizes > 1])
            if duplicate_count > 0:
                logger.warning(f"⚠️ Found {duplicate_count} duplicate error entries")
            else:
                logger.info("✅ No duplicate errors found")

        except Exception as e:
            logger.warning(f"⚠️ Error validation failed: {e}")

    def run_all_tests(self):
        """Run the whole suite: database check, file loading, conformance tests, report.

        The report is always produced. The conformance step is skipped, with
        an error logged, when no test files could be loaded.
        """
        logger.info("🚀 Starting Schema Conformance Tests")
        logger.info("=" * 60)

        # Step 1: database connectivity
        logger.info("📊 Testing Database Connectivity...")
        self.test_database_connection()

        # Step 2: pick the HTML files to test against
        logger.info("📁 Loading test HTML files...")
        test_filepaths = self.load_actual_filepaths(100)

        # Step 3: schema conformance, only when there is something to test
        if test_filepaths:
            logger.info("🔧 Testing Schema Conformance...")
            self.test_extractor_functionality(test_filepaths)
        else:
            logger.error("❌ No valid HTML files found for testing")

        self.generate_test_report()

    def generate_test_report(self):
        """Log a human-readable summary of everything in ``self.test_results``.

        Sections, in order: database connectivity and per-table access, file
        processing success rate, per-CSV generation/schema status grouped by
        purpose, collected errors (first three shown) and recommendations.
        The method reads ``self.test_results`` and writes to the logger only.
        """
        logger.info("=" * 60)
        logger.info("📊 SCHEMA CONFORMANCE TEST RESULTS")
        logger.info("=" * 60)

        # Database connectivity
        logger.info("🗄️ DATABASE CONNECTIVITY:")
        db_status = "✅ PASSED" if self.test_results['database_connection'] else "❌ FAILED"
        logger.info(f"  Connection Status: {db_status}")

        for table, result in self.test_results['table_access'].items():
            if result.get('accessible'):
                logger.info(f"  {table}: ✅ {result['record_count']} records")
            else:
                logger.info(f"  {table}: ❌ {result.get('error', 'Access failed')}")

        # File processing
        logger.info("\n🔧 FILE PROCESSING:")
        total_tested = self.test_results['total_files_tested']
        success_count = self.test_results['scraping_success_count']
        error_count = self.test_results['scraping_error_count']

        if total_tested > 0:
            success_rate = (success_count / total_tested) * 100
            logger.info(f"  Processed: {success_count}/{total_tested} files ({success_rate:.1f}% success)")
        else:
            logger.info("  No files were processed")

        # CSV generation and schema validation, grouped by what each file is for
        logger.info("\n📄 CSV GENERATION & SCHEMA VALIDATION:")

        csv_groups = (
            ("  UPDATE CSVs (existing records):",
             ['courses_updates.csv', 'classes_updates.csv']),
            ("  INSERT CSVs (new records with existing class IDs):",
             ['acad_term.csv', 'class_timing.csv', 'class_exam_timing.csv']),
            ("  TEMP CSVs (new classes + related timing):",
             ['temp_classes_new.csv', 'temp_class_timing.csv', 'temp_class_exam_timing.csv']),
            ("  CLEANUP CSVs (for database maintenance):",
             ['existing_classes_for_cleanup.csv']),
            ("  COURSE CSVs (new courses to be created):",
             ['new_courses.csv', 'courses_manual_review.csv']),
        )
        for heading, csv_files in csv_groups:
            logger.info(heading)
            for csv_file in csv_files:
                # Per-file status lines come from the _log_csv_status method.
                self._log_csv_status(csv_file)

        # Error summary
        errors = self.test_results['errors']
        if errors:
            logger.info(f"\n⚠️ ERRORS ENCOUNTERED ({len(errors)}):")
            for i, error in enumerate(errors[:3], 1):
                logger.info(f"  {i}. {error}")
            if len(errors) > 3:
                logger.info(f"  ... and {len(errors) - 3} more errors")
        else:
            logger.info("\n✅ No critical errors encountered!")

        # Recommendations
        logger.info("\n📋 RECOMMENDATIONS:")

        schema_issues = [k for k, v in self.test_results['schema_validation'].items() if not v.get('valid')]
        if schema_issues:
            logger.info(f"  • Fix schema issues in: {', '.join(schema_issues)}")
            logger.info("  • Check column names match Prisma schema exactly")
            logger.info("  • Validate data type conversions and enum mappings")

        if error_count > 0:
            logger.info("  • Review processing_errors.csv for parsing issues")

        if success_count > 0:
            logger.info("  • ✅ Schema conformance testing complete!")
            logger.info("  • Check 'test_output' folder for generated CSV files")

        logger.info("=" * 60)

    def _log_csv_status(self, csv_file):
        """Log one status line for ``csv_file`` from the recorded test results.

        The line is one of three: not generated, generated with a valid
        schema, or generated with schema issues (showing the warning count).
        """
        results = self.test_results
        generation = results['csv_generation'].get(csv_file, {})
        schema = results['schema_validation'].get(csv_file, {})

        if not generation.get('generated'):
            logger.info(f"    {csv_file}: ❌ Not generated")
            return

        record_count = generation.get('record_count', 0)
        if schema.get('valid'):
            logger.info(f"    {csv_file}: ✅ {record_count} records, schema valid")
            return

        warnings = schema.get('warnings', [])
        logger.warning(f"    {csv_file}: ⚠️ {record_count} records, schema issues: {len(warnings)} warnings")

if __name__ == "__main__":
    # Entry point for this cell: build the test runner and execute the full
    # schema conformance suite (database check, file loading, conformance
    # tests, report). The runner is kept in a module-level name.
    test_runner = SchemaConformanceTestRunner()
    test_runner.run_all_tests()

2025-05-26 18:39:02,182 - INFO - 🚀 Starting Schema Conformance Tests
2025-05-26 18:39:02,183 - INFO - 📊 Testing Database Connectivity...
2025-05-26 18:39:02,185 - INFO - Testing database connection...
2025-05-26 18:39:02,254 - INFO - ✅ Database connection successful
2025-05-26 18:39:02,273 - INFO - ✅ Table 'courses': 1861 records accessible
2025-05-26 18:39:02,290 - INFO - ✅ Table 'classes': 3437 records accessible
2025-05-26 18:39:02,304 - INFO - ✅ Table 'acad_term': 0 records accessible
2025-05-26 18:39:02,319 - INFO - ✅ Table 'class_timing': 0 records accessible
2025-05-26 18:39:02,334 - INFO - ✅ Table 'class_exam_timing': 0 records accessible
2025-05-26 18:39:02,335 - INFO - 📁 Loading test HTML files...
2025-05-26 18:39:03,053 - INFO - Found 12976 existing files
2025-05-26 18:39:03,054 - INFO - Selected 1000 files for testing
2025-05-26 18:39:03,054 - INFO - 🔧 Testing Schema Conformance...
2025-05-26 18:39:03,055 - INFO - Testing AfterClassDataExtractor functionality...
2025-05-26 

### 3.2.2 Actual Script

In [4]:
# Set up logging: INFO level, "timestamp - level - message" lines.
# NOTE: logging.basicConfig does nothing if the root logger already has
# handlers (e.g. configured by an earlier cell).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the code in this cell.
logger = logging.getLogger(__name__)

class AfterClassDataExtractor:
    def __init__(self, db_config: Dict[str, str]):
        """Initialize with database configuration for Supabase"""
        self.db_config = db_config
        self.connection = None
        self.driver = None
        
        # Local cache for courses and classes
        self.courses_df = None
        self.classes_df = None
        self.courses_cache = {}  # Cache for course code to UUID mapping
        self.classes_cache = {}  # Cache for class lookups
        
        # CSV data storage
        self.courses_updates = []
        self.classes_updates = []
        self.acad_term = []
        self.class_timing = []
        self.class_exam_timing = []
        self.errors = []
        self.error_context = {}  # Track which operations are affected per file

        # New temporary data structures for missing class IDs
        self.temp_classes = []  # New classes to be created
        self.temp_class_timing = []  # Timing records with temp_class_id
        self.temp_class_exam_timing = []  # Exam timing records with temp_class_id
        self.course_class_mapping = []  # Existing class IDs for courses (for cleanup)
        self.temp_class_id_counter = 1  # Incremental counter for temp class IDs

        # New data structures for missing courses
        self.new_courses = []  # New courses to be created
        self.manual_review_courses = []  # Courses needing manual faculty assignment

    def setup_selenium_driver(self):
        """Create a Chrome WebDriver and store it on ``self.driver``.

        Chrome is started with the no-sandbox, disable-dev-shm-usage,
        headless and disable-gpu flags. The driver binary path comes from
        ``ChromeDriverManager().install()``.

        Raises:
            Exception: Whatever the driver setup raised, after it is logged.
        """
        chrome_flags = (
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--headless',  # Run in headless mode for efficiency
            '--disable-gpu',
        )
        try:
            options = Options()
            for flag in chrome_flags:
                options.add_argument(flag)

            driver_service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=driver_service, options=options)
            logger.info("Selenium WebDriver initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize Selenium WebDriver: {e}")
            raise

    def connect_database(self):
        """Open a psycopg2 connection from ``self.db_config`` and store it.

        ``host``, ``database``, ``user`` and ``password`` are required keys;
        ``port`` falls back to 5432.

        Raises:
            Exception: Whatever went wrong (missing key, connection failure),
                after it is logged.
        """
        try:
            cfg = self.db_config
            conn_params = {
                'host': cfg['host'],
                'database': cfg['database'],
                'user': cfg['user'],
                'password': cfg['password'],
                'port': cfg.get('port', 5432),
            }
            self.connection = psycopg2.connect(**conn_params)
            logger.info("Database connection established")
        except Exception as e:
            logger.error(f"Database connection failed: {e}")
            raise

    def download_and_cache_tables(self, cache_dir: str = 'db_cache'):
        """Download entire courses and classes tables and cache locally"""
        try:
            os.makedirs(cache_dir, exist_ok=True)
            
            # Download courses table - USE ACTUAL DATABASE COLUMN NAMES from @map
            logger.info("Downloading courses table...")
            courses_query = """
                SELECT id, code, name, description, credit_units, 
                       belong_to_university, belong_to_faculty, 
                       course_area, enrolment_requirements, created_at, updated_at
                FROM courses
            """
            self.courses_df = pd.read_sql_query(courses_query, self.connection)
            
            # Save to cache files
            courses_cache_file = os.path.join(cache_dir, 'courses_cache.pkl')
            self.courses_df.to_pickle(courses_cache_file)
            self.courses_df.to_csv(os.path.join(cache_dir, 'courses_cache.csv'), index=False)
            
            logger.info(f"Downloaded {len(self.courses_df)} courses")
            
            # Download classes table - USE ACTUAL DATABASE COLUMN NAMES from @map
            logger.info("Downloading classes table...")
            classes_query = """
                SELECT id, section, course_id, professor_id, acad_term_id,
                       grading_basis, course_outline_url, boss_id, created_at, updated_at
                FROM classes
            """
            self.classes_df = pd.read_sql_query(classes_query, self.connection)
            
            # Save to cache files
            classes_cache_file = os.path.join(cache_dir, 'classes_cache.pkl')
            self.classes_df.to_pickle(classes_cache_file)
            self.classes_df.to_csv(os.path.join(cache_dir, 'classes_cache.csv'), index=False)
            
            logger.info(f"Downloaded {len(self.classes_df)} classes")
            
            # Build lookup caches
            self._build_lookup_caches()
            
            logger.info("Database tables cached successfully")
            
        except Exception as e:
            logger.error(f"Error downloading and caching tables: {e}")
            raise

    def load_cached_tables(self, cache_dir: str = 'db_cache'):
        """Load cached tables from local files"""
        try:
            courses_cache_file = os.path.join(cache_dir, 'courses_cache.pkl')
            classes_cache_file = os.path.join(cache_dir, 'classes_cache.pkl')
            
            if os.path.exists(courses_cache_file) and os.path.exists(classes_cache_file):
                self.courses_df = pd.read_pickle(courses_cache_file)
                self.classes_df = pd.read_pickle(classes_cache_file)
                
                self._build_lookup_caches()
                
                # ✅ CHECK FOR EMPTY CACHE
                if len(self.courses_df) == 0 or len(self.classes_df) == 0:
                    logger.warning("Cache files exist but are empty, will re-download")
                    return False  # Force re-download
                
                logger.info(f"Loaded cached tables: {len(self.courses_df)} courses, {len(self.classes_df)} classes")
                return True
            else:
                logger.info("Cache files not found, will download from database")
                return False
        except Exception as e:
            logger.error(f"Error loading cached tables: {e}")
            return False

    def _build_lookup_caches(self):
        """Build lookup caches from DataFrames"""
        # Build course code to ID mapping
        for _, row in self.courses_df.iterrows():
            self.courses_cache[row['code']] = row['id']
        
        # Build class lookup cache (course_id + section + acad_term_id -> class_id)
        for _, row in self.classes_df.iterrows():
            if pd.notna(row['acad_term_id']) and pd.notna(row['section']):
                cache_key = f"{row['course_id']}_{row['section']}_{row['acad_term_id']}"
                self.classes_cache[cache_key] = row['id']

    def get_course_id_by_code(self, course_code: str) -> Optional[str]:
        """Return the cached course id for ``course_code``, or None if unknown."""
        known_courses = self.courses_cache
        return known_courses[course_code] if course_code in known_courses else None

    def get_class_id(self, course_id: str, section: str, acad_term_id: str) -> Optional[int]:
        """Return the cached class id for a course/section/term triple, or None.

        The lookup key is the three values joined with underscores.
        """
        lookup_key = "{}_{}_{}".format(course_id, section, acad_term_id)
        known_classes = self.classes_cache
        return known_classes[lookup_key] if lookup_key in known_classes else None

    def get_all_class_ids_for_course(self, course_id: str) -> List[Dict]:
        """List every cached class row belonging to ``course_id``.

        Each result is a dict with ``course_id``, ``class_id`` and the row's
        section, academic term, professor, grading basis, outline URL and
        BOSS id. The results are intended for cleanup.

        Returns:
            The matching rows in table order; an empty list when nothing
            matches or when reading the cached table fails (the failure is
            logged).
        """
        copied_columns = ('section', 'acad_term_id', 'professor_id',
                          'grading_basis', 'course_outline_url', 'boss_id')
        try:
            return [
                {
                    'course_id': course_id,
                    'class_id': class_row['id'],
                    **{column: class_row[column] for column in copied_columns},
                }
                for _, class_row in self.classes_df.iterrows()
                if class_row['course_id'] == course_id
            ]
        except Exception as e:
            logger.error(f"Error querying existing classes for course {course_id}: {e}")
            return []

    def determine_faculty_from_course_code(self, course_code: str) -> Optional[int]:
        """Infer a faculty id from the courses that share this code's prefix.

        The prefix is the leading run of uppercase letters and hyphens. Every
        cached course whose code starts with that prefix contributes its
        ``belong_to_faculty`` value.

        Returns:
            The faculty id when all matching courses agree on exactly one
            value; None when the code has no such prefix, when there are no
            or several distinct values, or when the lookup fails (logged).
        """
        try:
            prefix_match = re.match(r'^([A-Z-]+)', course_code)
            if prefix_match is None:
                return None
            course_prefix = prefix_match.group(1)

            # Distinct faculties of all cached courses sharing the prefix
            matching_faculties = {
                course_row['belong_to_faculty']
                for _, course_row in self.courses_df.iterrows()
                if course_row['code'].startswith(course_prefix)
            }

            if len(matching_faculties) != 1:
                logger.debug(f"Multiple or no faculties found for prefix '{course_prefix}': {matching_faculties}")
                return None

            # Unambiguous: the single value in the set
            return next(iter(matching_faculties))

        except Exception as e:
            logger.error(f"Error determining faculty for {course_code}: {e}")
            return None

    def extract_course_information(self, filepath: str) -> Optional[Dict]:
        """Build a new-course record from the currently loaded class page.

        Page elements are read by id through ``safe_find_element_text``; the
        course code is taken from the class header via
        ``parse_course_and_section``. The record gets a freshly generated
        UUID and a faculty id inferred from the course code (None when it
        cannot be determined).

        Args:
            filepath: Path of the source HTML file; stored in the record for
                tracking and used in the error log.

        Returns:
            The course record, or None when the header, course name or
            course code is unavailable, or when extraction fails (logged).
        """
        try:
            import uuid

            # Header carries the course code; the section label carries the name
            header_text = self.safe_find_element_text(By.ID, 'lblClassInfoHeader')
            course_name = self.safe_find_element_text(By.ID, 'lblClassSection')
            if not (header_text and course_name):
                return None

            course_code, _section = self.parse_course_and_section(header_text)
            if not course_code:
                return None

            description = (
                self.safe_find_element_text(By.ID, 'lblCourseDescription')
                or "Course description not available."
            )

            # Credit units default to 1.0 when absent or not numeric
            units_text = self.safe_find_element_text(By.ID, 'lblUnits')
            credit_units = 1.0
            if units_text:
                try:
                    credit_units = float(units_text)
                except (ValueError, TypeError):
                    credit_units = 1.0

            course_areas = self.safe_find_element_text(By.ID, 'lblCourseAreas')
            if course_areas:
                # Strip any HTML tags
                course_areas = re.sub(r'<[^>]+>', '', course_areas)

            enrolment_requirements = self.safe_find_element_text(By.ID, 'lblEnrolmentRequirements')
            faculty_id = self.determine_faculty_from_course_code(course_code)

            return {
                'id': str(uuid.uuid4()),  # Generate UUID
                'code': course_code,
                'name': course_name.strip(),
                'description': description.strip(),
                'credit_units': credit_units,
                'belong_to_university': 1,  # SMU
                'belong_to_faculty': faculty_id,  # Will be None if needs manual review
                'course_area': course_areas,
                'enrolment_requirements': enrolment_requirements,
                'filepath': filepath  # For tracking
            }

        except Exception as e:
            logger.error(f"Error extracting course information from {filepath}: {e}")
            return None


    def load_html_file(self, filepath: str) -> bool:
        """Open a local HTML file in the WebDriver.

        The path is made absolute and converted to a ``file://`` URL before
        it is handed to ``self.driver.get``.

        Returns:
            True when the page was requested without error, False otherwise
            (the error is logged).
        """
        try:
            self.driver.get(Path(filepath).resolve().as_uri())
            logger.debug(f"Loaded HTML file: {filepath}")
            return True
        except Exception as e:
            logger.error(f"Error loading HTML file {filepath}: {e}")
            return False

    def parse_acad_term(self, term_text: str) -> Optional[Dict[str, object]]:
        """Parse academic term text such as "2021-22 August Term" into structured data.

        The text must contain ``YYYY-YY`` followed by a description. The
        description selects the term code (case-insensitive, first match
        wins): august / session 1 / term 1 -> "1"; january / session 2 /
        term 2 -> "2"; 3a -> "3A"; 3b -> "3B".

        Args:
            term_text: Raw term label, e.g. "2021-22 Session 2".

        Returns:
            A dict with ``id`` (e.g. ``AY202122T1``), ``acad_year_start``,
            ``acad_year_end`` (four-digit years), ``term`` and the original
            ``term_text``; or None when the text cannot be parsed (the
            reason is logged).
        """
        # Keywords checked in order; the first group with a hit decides the code.
        term_keywords = (
            ('1', ('august', 'session 1', 'term 1')),
            ('2', ('january', 'session 2', 'term 2')),
            ('3A', ('3a',)),
            ('3B', ('3b',)),
        )
        try:
            # Examples: "2021-22 August Term", "2021-22 Session 1", "2021-22 Session 2"
            match = re.search(r'(\d{4})-(\d{2})\s+(.*)', term_text)
            if not match:
                raise ValueError(f"Cannot parse term: {term_text}")

            start_year = int(match.group(1))
            end_year_short = int(match.group(2))
            term_desc = match.group(3).lower()

            # Two-digit end year: below 50 is read as 20xx, otherwise 19xx
            end_year = (2000 if end_year_short < 50 else 1900) + end_year_short

            term_code = next(
                (code for code, keywords in term_keywords
                 if any(keyword in term_desc for keyword in keywords)),
                None,
            )
            if term_code is None:
                raise ValueError(f"Cannot determine term code from: {term_desc}")

            return {
                'id': f"AY{start_year}{end_year_short:02d}T{term_code}",
                'acad_year_start': start_year,
                'acad_year_end': end_year,
                'term': term_code,
                'term_text': term_text
            }
        except Exception as e:
            logger.error(f"Error parsing academic term '{term_text}': {e}")
            return None

    def parse_date_range(self, date_text: str) -> Tuple[Optional[datetime], Optional[datetime]]:
        """Parse text like "23-Aug-2021 to 14-Nov-2021" into two datetimes.

        Returns:
            ``(start, end)``; ``(None, None)`` when the text does not contain
            such a range or a date fails to parse (the error is logged).
        """
        try:
            # Two "D-Mon-YYYY" dates separated by the word "to"
            found = re.search(r'(\d{1,2}-\w{3}-\d{4})\s+to\s+(\d{1,2}-\w{3}-\d{4})', date_text)
            if found is None:
                raise ValueError(f"Cannot parse date range: {date_text}")

            start_date, end_date = (
                datetime.strptime(part, '%d-%b-%Y') for part in found.groups()
            )
            return start_date, end_date
        except Exception as e:
            logger.error(f"Error parsing date range '{date_text}': {e}")
            return None, None

    def parse_single_date(self, date_text: str) -> Optional[datetime]:
        """Convert a "DD-Mon-YYYY" string (e.g. "23-Aug-2021") to a datetime.

        Returns:
            The parsed datetime, or None when the text does not match the
            format (the failure is logged).
        """
        try:
            parsed = datetime.strptime(date_text, '%d-%b-%Y')
        except Exception as e:
            logger.error(f"Error parsing date '{date_text}': {e}")
            return None
        return parsed

    def extract_course_outline_url(self) -> Optional[str]:
        """Read the course outline link from the currently loaded page.

        The link lives in the ``onclick`` handler of the element with id
        ``imgCourseOutline``, in the form
        ``window.open('URL','','toolbar=no, width=700, resizable=yes')``.

        Returns:
            The first argument of ``window.open`` or None when the element,
            the handler, or the URL is absent.
        """
        try:
            outline_icon = self.driver.find_element(By.ID, 'imgCourseOutline')
            handler_js = outline_icon.get_attribute('onclick')
            if not handler_js:
                return None

            found = re.search(r"window\.open\('([^']+)'", handler_js)
            return found.group(1) if found else None
        except Exception as e:
            # A missing element is expected for classes without an outline.
            logger.debug(f"Course outline URL not found or error: {e}")
            return None

    def extract_boss_id_from_filepath(self, filepath: str) -> Optional[int]:
        """Read the ``SelectedClassNumber`` value from the file name.

        Example file name: "SelectedAcadTerm=2110&SelectedClassNumber=1002.html"
        yields 1002. Only the base name of ``filepath`` is inspected.

        Returns:
            The class number as an int, or None when the name carries no
            ``SelectedClassNumber=<digits>`` part or an error occurs.
        """
        try:
            found = re.search(r'SelectedClassNumber=(\d+)', os.path.basename(filepath))
            return int(found.group(1)) if found else None
        except Exception as e:
            logger.error(f"Error extracting BOSS ID from '{filepath}': {e}")
            return None

    def parse_course_and_section(self, class_header_text: str) -> Tuple[Optional[str], Optional[str]]:
        """Split a class header such as "COR-STAT1202 - G8" into course code and section.

        The header is stripped of HTML tags and collapsed to single spaces, then
        matched (case-insensitively, anchored at the start) against patterns in
        priority order. A dash surrounded by whitespace - hyphen, en dash or em
        dash - is always preferred as the separator, so hyphens inside the
        course code are never mistaken for it.

        Args:
            class_header_text: Raw header text, possibly containing tags or
                line breaks.

        Returns:
            ``(course_code, section)`` with the code upper-cased and free of
            whitespace, or ``(None, None)`` when no pattern matches or an
            error occurs (both cases are logged).
        """
        try:
            # Drop HTML tags, then collapse every whitespace run (newlines
            # included) into a single space.
            clean_text = re.sub(r'<[^>]+>', '', class_header_text)
            clean_text = re.sub(r'\s+', ' ', clean_text.strip())

            logger.debug(f"Original header: '{class_header_text}' -> Cleaned: '{clean_text}'")

            # (regex, code_is_split): when code_is_split is True the course code
            # arrives as two groups ("COR" + "2100") that must be joined.
            patterns = [
                # PRIORITY 1: single-token code, spaced dash separator:
                # "COR-STAT1202 - G8", "LAW204_624 - G3", "CS101 - G1"
                (r'([A-Z0-9_-]+)\s+[-–—]\s+(.+)', False),

                # PRIORITY 2: code split by whitespace: "COR 2100 - G1"
                (r'([A-Z]+)\s+(\d+[A-Z0-9_]*)\s+[-–—]\s+(.+)', True),

                # PRIORITY 3: fallback for unspaced separators ("CS101-G1").
                # The lazy code group stops at the first dash, so this must
                # stay last or hyphenated codes would be cut short.
                (r'([A-Z0-9_\s-]+?)\s*[-–—]\s*(.+)', False),
            ]

            for number, (pattern, code_is_split) in enumerate(patterns, start=1):
                match = re.match(pattern, clean_text, re.IGNORECASE)
                if not match:
                    continue

                if code_is_split:
                    course_code = match.group(1) + match.group(2)
                    section = match.group(3)
                else:
                    course_code = match.group(1)
                    section = match.group(2)

                # Normalise the code: upper-case, no whitespace, hyphens kept.
                course_code = re.sub(r'\s+', '', course_code.upper().strip())
                section = section.strip()

                logger.debug(f"Pattern {number} matched: '{clean_text}' -> course='{course_code}', section='{section}'")
                return course_code, section

            # If no patterns match, log for debugging
            logger.warning(f"Could not parse course header: '{class_header_text}' (cleaned: '{clean_text}')")
            return None, None

        except Exception as e:
            logger.error(f"Error parsing course header '{class_header_text}': {e}")
            return None, None

    def log_database_error(self, filepath: str, error_msg: str, affected_table: str, context: Dict = None):
        """Record a database-related error together with its table and context.

        Every call appends a record to ``self.errors`` and logs it; in
        addition, the (table, message) pair is remembered per file name so the
        caller can tell first occurrences from repeats.

        Args:
            filepath: File being processed when the error occurred.
            error_msg: Human-readable description of the problem.
            affected_table: Name of the table the error relates to.
            context: Optional extra details stored with the record.

        Returns:
            True when this (table, message) pair is new for the file,
            False when it was already seen.
        """
        self.errors.append({
            'filepath': filepath,
            'error': error_msg,
            'type': 'database_error',
            'table': affected_table,
            'timestamp': datetime.now().isoformat(),
            'context': context or {}
        })

        file_key = os.path.basename(filepath)
        logger.error(f"❌ [{affected_table}] {error_msg} in {file_key}")

        # One set of signatures per file name; created on first use.
        seen_signatures = self.error_context.setdefault(file_key, set())
        error_signature = f"{affected_table}:{error_msg}"
        if error_signature in seen_signatures:
            return False  # Duplicate error

        seen_signatures.add(error_signature)
        return True  # New error

    def log_parse_error(self, filepath: str, error_msg: str, affected_element: str):
        """Record a parsing failure for ``filepath`` and log it.

        The record is appended to ``self.errors`` with type ``parse_error``,
        table ``N/A``, the affected element and the current timestamp.
        """
        parse_failure = {
            'filepath': filepath,
            'error': error_msg,
            'type': 'parse_error',
            'table': 'N/A',
            'element': affected_element,
            'timestamp': datetime.now().isoformat()
        }
        self.errors.append(parse_failure)

        short_name = os.path.basename(filepath)
        logger.error(f"❌ [PARSE] {error_msg} in {short_name}")

    def safe_find_element_text(self, by: By, value: str) -> Optional[str]:
        """Return the stripped text of the first element matching the locator.

        Any exception raised while locating the element or reading its text is
        swallowed on purpose, so a missing element simply yields None.
        """
        try:
            node = self.driver.find_element(by, value)
            if not node:
                return None
            return node.text.strip()
        except Exception:
            return None

    def process_html_file(self, filepath: str) -> bool:
        """Process a single HTML file and extract all data using Selenium.

        Steps, in order: load the page, parse the class header into course
        code and section, resolve the course ID (queuing a new course record
        when the code is unknown), parse the academic term, read the course
        and class attributes, then queue update records - or temp-class
        records when no existing class matches - together with their meeting
        rows on the instance lists.

        Args:
            filepath: Path of the saved class-detail HTML page.

        Returns:
            True when the page was processed and its records were queued.
            False when the page could not be loaded or a required parse or
            lookup step failed; parse, lookup and unexpected failures are
            recorded in ``self.errors``.
        """
        try:
            # Load HTML file
            if not self.load_html_file(filepath):
                return False

            # Extract basic class information
            class_header_text = self.safe_find_element_text(By.ID, 'lblClassInfoHeader')
            if not class_header_text:
                self.errors.append({
                    'filepath': filepath,
                    'error': 'Missing class header',
                    'type': 'parse_error'
                })
                return False

            # Parse course code and section with enhanced pattern matching
            course_code, section = self.parse_course_and_section(class_header_text)
            if not course_code or not section:
                self.errors.append({
                    'filepath': filepath,
                    'error': f'Cannot parse course code/section from: {class_header_text}',
                    'type': 'parse_error'
                })
                return False

            # Debug logging to see what's being extracted
            logger.debug(f"Extracted course_code: '{course_code}' from header: '{class_header_text}'")

            # Get course ID from local cache
            course_id = self.get_course_id_by_code(course_code)
            if not course_id:
                # Course not found - extract course information for creation
                logger.info(f"Course not found in cache: {course_code}, extracting course information...")

                course_record = self.extract_course_information(filepath)
                if course_record:
                    if course_record['belong_to_faculty'] is not None:
                        # Faculty determined automatically
                        self.new_courses.append(course_record)
                        course_id = course_record['id']  # Use the generated UUID
                        logger.info(f"✅ Created new course record: {course_code} -> Faculty {course_record['belong_to_faculty']}")
                    else:
                        # Faculty needs manual review
                        self.manual_review_courses.append(course_record)
                        logger.warning(f"⚠️ Course {course_code} needs manual faculty assignment")
                        # Record the issue and stop: without a faculty the
                        # course cannot be created, so nothing else is queued.
                        self.log_database_error(filepath, f'Course {course_code} needs manual faculty assignment', 'courses_manual_review', {
                            'course_code': course_code,
                            'header': class_header_text,
                            'reason': 'multiple_or_no_faculties_found'
                        })
                        return False
                else:
                    self.log_database_error(filepath, f'Failed to extract course information for: {course_code}', 'courses', {
                        'course_code': course_code,
                        'header': class_header_text,
                        'reason': 'extraction_failed'
                    })
                    return False

            # Extract academic term
            term_text = self.safe_find_element_text(By.ID, 'lblClassInfoSubHeader')
            if not term_text:
                self.errors.append({
                    'filepath': filepath,
                    'error': 'Missing academic term',
                    'type': 'parse_error'
                })
                return False

            term_data = self.parse_acad_term(term_text)
            if not term_data:
                self.errors.append({
                    'filepath': filepath,
                    'error': f'Cannot parse academic term: {term_text}',
                    'type': 'parse_error'
                })
                return False

            # Extract course areas
            course_areas = self.safe_find_element_text(By.ID, 'lblCourseAreas')
            if course_areas:
                # Clean up HTML tags if any
                course_areas = re.sub(r'<[^>]+>', '', course_areas)

            # Extract enrollment requirements
            enrolment_req = self.safe_find_element_text(By.ID, 'lblEnrolmentRequirements')

            # Extract grading basis - match Prisma enum exactly.
            # safe_find_element_text returns None when the label is missing;
            # treat that like any unrecognised value and fall back to 'NA'
            # instead of failing the whole file.
            grading_text = self.safe_find_element_text(By.ID, 'lblGradingBasis')
            grading_key = (grading_text or '').strip().lower()
            if grading_key == 'graded':
                grading_basis = 'Graded'  # @map("Graded")
            elif grading_key in ('pass/fail', 'pass fail'):
                grading_basis = 'Pass/Fail'  # @map("Pass/Fail")
            else:
                grading_basis = 'NA'  # @map("NA")

            # Extract course outline URL
            course_outline_url = self.extract_course_outline_url()

            # Extract period dates
            period_text = self.safe_find_element_text(By.ID, 'lblDates')
            start_dt, end_dt = None, None
            if period_text:
                start_dt, end_dt = self.parse_date_range(period_text)

            # Extract BOSS ID
            boss_id = self.extract_boss_id_from_filepath(filepath)

            # Add course update record - match database column names exactly
            if course_id:
                self.courses_updates.append({
                    'id': course_id,  # Resolved database ID
                    'course_area': course_areas,
                    'enrolment_requirements': enrolment_req
                })
            else:
                # Recorded once here; the class section below is skipped for
                # the same reason and must not report it a second time.
                self.log_database_error(filepath, f'Course ID not found for code: {course_code}','courses_updates', {'course_code': course_code, 'operation': 'update'})

            # Add academic term record - match database column names exactly
            acad_term_record = {
                'id': term_data['id'],
                'acad_year_start': term_data['acad_year_start'],
                'acad_year_end': term_data['acad_year_end'],
                'term': term_data['term'],
                'boss_id': boss_id,
                'start_dt': start_dt.isoformat() if start_dt else None,
                'end_dt': end_dt.isoformat() if end_dt else None
            }
            self.acad_term.append(acad_term_record)

            # Handle class records - try to find existing, create temp if not found
            if course_id:
                # Try with acad_term_id first, fallback to just course_id + section
                class_id = self.get_class_id(course_id, section, term_data['id'])
                if not class_id:
                    # Fallback: find class by course_id + section only (since acad_term might not be set)
                    class_id = self.get_class_id_by_course_and_section(course_id, section)

                if class_id:
                    # Found existing class - add to updates
                    self.classes_updates.append({
                        'id': class_id,  # Resolved database ID
                        'grading_basis': grading_basis,
                        'course_outline_url': course_outline_url
                    })

                    # Extract meeting information with existing class_id
                    self.extract_meeting_information(class_id, filepath)

                else:
                    # Class ID not found - create temporary records
                    temp_class_id = self.temp_class_id_counter
                    self.temp_class_id_counter += 1

                    # Store existing class IDs for this course for cleanup
                    existing_classes = self.get_all_class_ids_for_course(course_id)
                    for existing_class in existing_classes:
                        # Add to mapping for cleanup, avoid duplicates
                        if existing_class not in self.course_class_mapping:
                            self.course_class_mapping.append(existing_class)

                    # Create new temp class record with all required fields from schema
                    temp_class_record = {
                        'temp_class_id': temp_class_id,
                        'section': section,
                        'course_id': course_id,
                        'professor_id': None,  # Will need to be set manually or defaulted
                        'acad_term_id': term_data['id'],
                        'grading_basis': grading_basis,
                        'course_outline_url': course_outline_url,
                        'boss_id': boss_id
                    }
                    self.temp_classes.append(temp_class_record)

                    # Extract meeting information with temp_class_id
                    self.extract_meeting_information_temp(temp_class_id, filepath)

                    logger.info(f"✅ Created temp class record: temp_class_id={temp_class_id} for {course_code}-{section}")

            return True

        except Exception as e:
            self.errors.append({
                'filepath': filepath,
                'error': str(e),
                'type': 'processing_error'
            })
            logger.error(f"Error processing file {filepath}: {e}")
            return False

    def get_class_id_by_course_and_section(self, course_id: str, section: str) -> Optional[int]:
        """Get class ID using only course_id and section (fallback when acad_term_id not set).

        Looks the class up in ``self.classes_df`` with a vectorized filter
        rather than a row-by-row scan, since this runs once per processed file.
        Sections are compared as stripped strings.

        Args:
            course_id: Course identifier to match against the ``course_id`` column.
            section: Section label to match against the ``section`` column.

        Returns:
            The ``id`` of the matching class (the first one, with a warning,
            when several match), or None when nothing matches or the lookup
            fails.
        """
        try:
            classes = self.classes_df
            wanted_section = str(section).strip()

            same_course = classes['course_id'] == course_id
            same_section = classes['section'].astype(str).str.strip() == wanted_section
            matching_classes = classes.loc[same_course & same_section, 'id'].tolist()

            if not matching_classes:
                return None
            if len(matching_classes) > 1:
                logger.warning(f"Multiple classes found for {course_id}-{section}, using first: {matching_classes}")
            return matching_classes[0]

        except Exception as e:
            logger.error(f"Error in fallback class lookup for {course_id}-{section}: {e}")
            return None

    def extract_meeting_information(self, class_id: Optional[int], filepath: str):
        """Collect CLASS and EXAM rows from the meeting grid for an existing class.

        Reads the table with id ``RadGrid_MeetingInfo_ctl00`` on the currently
        loaded page. CLASS rows are appended to ``self.class_timing`` and EXAM
        rows to ``self.class_exam_timing``, both keyed by ``class_id``. Rows
        with fewer than eight cells are ignored. Any failure is recorded in
        ``self.errors`` as a parse error for ``filepath``.
        """
        def iso_or_none(moment):
            return moment.isoformat() if moment else None

        try:
            grid = self.driver.find_element(By.ID, 'RadGrid_MeetingInfo_ctl00')

            # Data rows live in the tbody; the header is outside it.
            grid_body = grid.find_element(By.TAG_NAME, 'tbody')

            for table_row in grid_body.find_elements(By.TAG_NAME, 'tr'):
                cells = table_row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 8:
                    continue

                # First seven cells: type, start date, end date, weekday,
                # start time, end time, venue.
                (meeting_type, start_date_text, end_date_text, day_of_week,
                 start_time, end_time, venue) = [cell.text.strip() for cell in cells[:7]]

                # Fields shared by both record kinds, in schema column order.
                slot = {
                    'day_of_week': day_of_week,
                    'start_time': start_time,
                    'end_time': end_time,
                }

                if meeting_type == 'CLASS':
                    start_date = self.parse_single_date(start_date_text)
                    end_date = self.parse_single_date(end_date_text)

                    # Use database column names exactly as per schema @map
                    self.class_timing.append({
                        'class_id': class_id,
                        'start_date': iso_or_none(start_date),
                        'end_date': iso_or_none(end_date),
                        **slot,
                        'venue': venue
                    })

                elif meeting_type == 'EXAM':
                    exam_date = self.parse_single_date(start_date_text)

                    # Use database column names exactly as per schema @map
                    self.class_exam_timing.append({
                        'class_id': class_id,
                        'date': iso_or_none(exam_date),
                        **slot,
                        'venue': ''  # Leave empty as specified in schema
                    })

        except Exception as e:
            self.errors.append({
                'filepath': filepath,
                'error': f'Error extracting meeting information: {str(e)}',
                'type': 'parse_error'
            })

    def extract_meeting_information_temp(self, temp_class_id: int, filepath: str):
        """Collect CLASS and EXAM rows from the meeting grid for a temp class.

        Reads the table with id ``RadGrid_MeetingInfo_ctl00`` on the currently
        loaded page. CLASS rows are appended to ``self.temp_class_timing`` and
        EXAM rows to ``self.temp_class_exam_timing``, both keyed by
        ``temp_class_id``. Rows with fewer than eight cells are ignored. Any
        failure is recorded in ``self.errors`` as a parse error for
        ``filepath``.
        """
        def iso_or_none(moment):
            return moment.isoformat() if moment else None

        try:
            grid = self.driver.find_element(By.ID, 'RadGrid_MeetingInfo_ctl00')

            # Data rows live in the tbody; the header is outside it.
            grid_body = grid.find_element(By.TAG_NAME, 'tbody')

            for table_row in grid_body.find_elements(By.TAG_NAME, 'tr'):
                cells = table_row.find_elements(By.TAG_NAME, 'td')
                if len(cells) < 8:
                    continue

                # First seven cells: type, start date, end date, weekday,
                # start time, end time, venue.
                (meeting_type, start_date_text, end_date_text, day_of_week,
                 start_time, end_time, venue) = [cell.text.strip() for cell in cells[:7]]

                # Fields shared by both record kinds, in schema column order.
                slot = {
                    'day_of_week': day_of_week,
                    'start_time': start_time,
                    'end_time': end_time,
                }

                if meeting_type == 'CLASS':
                    start_date = self.parse_single_date(start_date_text)
                    end_date = self.parse_single_date(end_date_text)

                    # Use temp_class_id for timing records
                    self.temp_class_timing.append({
                        'temp_class_id': temp_class_id,
                        'start_date': iso_or_none(start_date),
                        'end_date': iso_or_none(end_date),
                        **slot,
                        'venue': venue
                    })

                elif meeting_type == 'EXAM':
                    exam_date = self.parse_single_date(start_date_text)

                    # Use temp_class_id for exam timing records
                    self.temp_class_exam_timing.append({
                        'temp_class_id': temp_class_id,
                        'date': iso_or_none(exam_date),
                        **slot,
                        'venue': ''  # Leave empty as specified in schema
                    })

        except Exception as e:
            self.errors.append({
                'filepath': filepath,
                'error': f'Error extracting temp meeting information: {str(e)}',
                'type': 'parse_error'
            })

    def save_csv_files(self, output_dir: str = 'extracted_data'):
        """Save all extracted data to CSV files.

        Each non-empty record list on the instance is written to its own CSV
        in ``output_dir`` (created if needed). Academic terms are
        de-duplicated by ``id``, the cleanup mapping by whole row, and the
        ``filepath`` column is dropped from new courses. Error records are
        written last, followed by the per-table error summary.

        Error records come from several code paths and do not all carry
        ``table`` / ``timestamp``; missing values are filled per row so the
        error CSV never contains blank cells in those columns.

        Args:
            output_dir: Directory that receives the CSV files.
        """
        os.makedirs(output_dir, exist_ok=True)

        def write_csv(frame, filename):
            # All exports share the target directory and omit the index.
            frame.to_csv(os.path.join(output_dir, filename), index=False)

        # Save courses updates
        if self.courses_updates:
            write_csv(pd.DataFrame(self.courses_updates), 'courses_updates.csv')
            logger.info(f"Saved {len(self.courses_updates)} course update records")

        # Save classes updates
        if self.classes_updates:
            write_csv(pd.DataFrame(self.classes_updates), 'classes_updates.csv')
            logger.info(f"Saved {len(self.classes_updates)} class update records")

        # Save academic terms - use database table name exactly
        if self.acad_term:
            # One record is queued per processed file; keep one row per term ID.
            df = pd.DataFrame(self.acad_term).drop_duplicates(subset=['id'])
            write_csv(df, 'acad_term.csv')
            logger.info(f"Saved {len(df)} academic term records")

        # Save class timings - use database table name exactly
        if self.class_timing:
            write_csv(pd.DataFrame(self.class_timing), 'class_timing.csv')
            logger.info(f"Saved {len(self.class_timing)} class timing records")

        # Save exam timings - use database table name exactly
        if self.class_exam_timing:
            write_csv(pd.DataFrame(self.class_exam_timing), 'class_exam_timing.csv')
            logger.info(f"Saved {len(self.class_exam_timing)} exam timing records")

        # Save temp classes for new records to be created
        if self.temp_classes:
            write_csv(pd.DataFrame(self.temp_classes), 'temp_classes_new.csv')
            logger.info(f"Saved {len(self.temp_classes)} temp class records for creation")

        # Save temp class timings
        if self.temp_class_timing:
            write_csv(pd.DataFrame(self.temp_class_timing), 'temp_class_timing.csv')
            logger.info(f"Saved {len(self.temp_class_timing)} temp class timing records")

        # Save temp exam timings
        if self.temp_class_exam_timing:
            write_csv(pd.DataFrame(self.temp_class_exam_timing), 'temp_class_exam_timing.csv')
            logger.info(f"Saved {len(self.temp_class_exam_timing)} temp exam timing records")

        # Save course-class mapping for cleanup
        if self.course_class_mapping:
            df = pd.DataFrame(self.course_class_mapping).drop_duplicates()
            write_csv(df, 'existing_classes_for_cleanup.csv')
            logger.info(f"Saved {len(df)} existing class records for cleanup")

        # Save new courses for creation
        if self.new_courses:
            # Remove filepath column for database insertion
            df_for_db = pd.DataFrame(self.new_courses).drop(columns=['filepath'], errors='ignore')
            write_csv(df_for_db, 'new_courses.csv')
            logger.info(f"Saved {len(self.new_courses)} new course records for creation")

        # Save courses needing manual faculty assignment
        if self.manual_review_courses:
            write_csv(pd.DataFrame(self.manual_review_courses), 'courses_manual_review.csv')
            logger.info(f"Saved {len(self.manual_review_courses)} course record needing manual faculty assignment".replace("course record needing", "course records needing"))

        # Save errors
        if self.errors:
            df = pd.DataFrame(self.errors)

            # Fill the defaults per row: the column may exist (because some
            # records carry it) while other records leave it empty.
            defaults = {'table': 'N/A', 'timestamp': datetime.now().isoformat()}
            for column, fallback in defaults.items():
                if column in df.columns:
                    df[column] = df[column].fillna(fallback)
                else:
                    df[column] = fallback

            # Reorder columns for better readability
            column_order = ['filepath', 'error', 'type', 'table', 'timestamp']
            existing_columns = [col for col in column_order if col in df.columns]
            df = df[existing_columns + [col for col in df.columns if col not in existing_columns]]

            write_csv(df, 'processing_errors.csv')
            logger.info(f"Saved {len(self.errors)} error records with table information")

            # Generate error summary by table
            self.generate_error_summary(output_dir)

    def generate_error_summary(self, output_dir: str):
        """Generate error summary by table and type.

        Counts the records in ``self.errors`` per (table, type) pair, writes
        the counts to ``error_summary_by_table.csv`` in ``output_dir`` and
        logs them. Records without a ``table`` value are counted under
        ``'N/A'``: not every code path sets that key, and a missing value
        would otherwise either be dropped from the grouping or, when no
        record has the key at all, make the grouping fail.

        Args:
            output_dir: Existing directory that receives the summary CSV.
        """
        if not self.errors:
            return

        df = pd.DataFrame(self.errors)

        # Guarantee a fully populated 'table' column before grouping.
        if 'table' in df.columns:
            df['table'] = df['table'].fillna('N/A')
        else:
            df['table'] = 'N/A'

        # Summary by table
        table_summary = df.groupby(['table', 'type']).size().reset_index(name='count')
        table_summary.to_csv(os.path.join(output_dir, 'error_summary_by_table.csv'), index=False)

        # Log summary to console
        logger.info("\n📊 ERROR SUMMARY BY TABLE:")
        for _, row in table_summary.iterrows():
            logger.info(f"  {row['table']}: {row['count']} {row['type']} errors")

    def process_all_files(self, scraped_filepaths_csv: str, output_dir: str = 'extracted_data'):
        """Run ``process_html_file`` over every path listed in a CSV, then save.

        The CSV must have a ``Filepath`` (preferred) or ``filepath`` column.
        Paths that do not exist on disk are recorded in ``self.errors`` as
        ``file_error`` and skipped. After the loop all collected records are
        written via ``save_csv_files``.

        Args:
            scraped_filepaths_csv: CSV file listing the HTML files to process.
            output_dir: Directory passed on to ``save_csv_files``.

        Raises:
            Exception: Any error while reading the CSV, iterating or saving is
                logged and re-raised.
        """
        try:
            listing = pd.read_csv(scraped_filepaths_csv)

            # Accept either capitalisation of the path column.
            filepath_column = 'filepath'
            if 'Filepath' in listing.columns:
                filepath_column = 'Filepath'

            total_files = len(listing)
            attempted = 0
            succeeded = 0

            logger.info(f"Starting to process {total_files} files")

            for _, entry in listing.iterrows():
                filepath = entry[filepath_column]

                if not os.path.exists(filepath):
                    self.errors.append({
                        'filepath': filepath,
                        'error': 'File not found',
                        'type': 'file_error'
                    })
                    continue

                if self.process_html_file(filepath):
                    succeeded += 1
                attempted += 1

                # Progress heartbeat every 100 attempted files.
                if attempted % 100 == 0:
                    logger.info(f"Processed {attempted}/{total_files} files")

            logger.info(f"Processing complete: {succeeded}/{attempted} files successful")

            # Save all CSV files
            self.save_csv_files(output_dir)

        except Exception as e:
            logger.error(f"Error in process_all_files: {e}")
            raise

    def cleanup(self):
        """Clean up resources.

        Quits the Selenium WebDriver and closes the database connection when
        they are set. The connection is closed in a ``finally`` clause so that
        a failure while quitting the browser cannot leave it open; such a
        failure still propagates to the caller afterwards.
        """
        try:
            if self.driver:
                self.driver.quit()
                logger.info("Selenium WebDriver closed")
        finally:
            if self.connection:
                self.connection.close()
                logger.info("Database connection closed")

def main():
    """Run the full data extraction from the scraped HTML files.

    Steps, in order: load ``.env`` into the environment, build the database
    configuration from ``DB_*`` variables, start Selenium, connect to the
    database, load the cached tables (downloading them when no cache is
    available), process every file listed in ``scraped_filepaths.csv`` and
    print a summary of the generated CSV files.

    Raises:
        Exception: Any failure is logged and re-raised; ``extractor.cleanup()``
            always runs first.
    """
    # Database configuration - fill in your Supabase credentials in .env
    load_dotenv()

    db_config = {
        'host': os.getenv('DB_HOST'),
        'database': os.getenv('DB_NAME'),
        'user': os.getenv('DB_USER'),
        'password': os.getenv('DB_PASSWORD'),
        # int(None) raises an opaque TypeError when DB_PORT is unset, so fall
        # back to 5432 (the standard PostgreSQL port; `gssencmode` below is a
        # libpq option, so the target is presumably PostgreSQL).
        'port': int(os.getenv('DB_PORT') or 5432),
        'gssencmode': 'disable'  # works around a GSSAPI connection issue
    }

    filepaths_csv = 'scraped_filepaths.csv'
    output_dir = 'extracted_data'

    # Initialize extractor
    extractor = AfterClassDataExtractor(db_config)

    try:
        # Set up Selenium WebDriver
        extractor.setup_selenium_driver()

        # Connect to database
        extractor.connect_database()

        # Prefer the local cache; only hit the database when it is missing
        if not extractor.load_cached_tables():
            logger.info("Downloading fresh data from database...")
            extractor.download_and_cache_tables()

        # Process all files
        extractor.process_all_files(filepaths_csv, output_dir)

        print("Data extraction completed successfully!")
        print(f"Check the '{output_dir}' folder for CSV files:")
        print("- courses_updates.csv: Course area and enrollment requirements updates")
        print("- classes_updates.csv: Class grading basis, term, and outline URL updates")
        print("- acad_term.csv: Academic term records")
        print("- class_timing.csv: Class timing records")
        print("- class_exam_timing.csv: Exam timing records")
        print("- processing_errors.csv: Any errors encountered during processing")
        print("\nDatabase cache stored in 'db_cache' folder for future runs")

    except Exception as e:
        logger.error(f"Main process failed: {e}")
        raise
    finally:
        # Always release the browser and the DB connection
        extractor.cleanup()

In [5]:
# Start the extraction only when this code runs as the top-level script or
# notebook cell, not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-05-26 19:14:14,727 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-26 19:14:14,959 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-26 19:14:15,190 - INFO - Driver [C:\Users\tanzh\.wdm\drivers\chromedriver\win64\136.0.7103.113\chromedriver-win32/chromedriver.exe] found in cache
2025-05-26 19:14:16,233 - INFO - Selenium WebDriver initialized successfully
2025-05-26 19:14:16,275 - INFO - Database connection established
2025-05-26 19:14:16,513 - INFO - Loaded cached tables: 1861 courses, 3437 classes
2025-05-26 19:14:16,531 - INFO - Starting to process 12976 files
2025-05-26 19:14:17,172 - INFO - ✅ Created temp class record: temp_class_id=1 for MGMT715-G1
2025-05-26 19:14:17,603 - INFO - ✅ Created temp class record: temp_class_id=2 for LGST700A-G1
2025-05-26 19:14:18,025 - INFO - ✅ Created temp class record: temp_class_id=3 for STAT701A-G1
2025-05-26 19:14:18,518 - INFO - ✅ Created temp class record: temp_class_id=4 for ACCT666-G1
2025-05-26 19:14

Data extraction completed successfully!
Check the 'extracted_data' folder for CSV files:
- courses_updates.csv: Course area and enrollment requirements updates
- classes_updates.csv: Class grading basis, term, and outline URL updates
- acad_term.csv: Academic term records
- class_timing.csv: Class timing records
- class_exam_timing.csv: Exam timing records
- processing_errors.csv: Any errors encountered during processing

Database cache stored in 'db_cache' folder for future runs


2025-05-26 20:36:47,360 - INFO - Selenium WebDriver closed
2025-05-26 20:36:47,361 - INFO - Database connection closed


### 3.2.3 Correcting faculty for courses

In [8]:
import pandas as pd
import os
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

class CourseManualReviewTool:
    """Interactive helper for manually assigning a faculty to each course.

    Loads ``extracted_data/courses_manual_review.csv``, opens the saved HTML
    page of every course whose ``belong_to_faculty`` cell is empty in a Chrome
    window, asks on stdin for a faculty ID, and writes the answers back to the
    same CSV (after copying the previous version to a ``_backup`` file).

    Columns read from the CSV: ``belong_to_faculty``, ``filepath``, ``code``,
    ``name``, ``description`` and ``course_area``.
    """

    def __init__(self):
        self.csv_path = 'extracted_data/courses_manual_review.csv'
        self.driver = None  # Selenium WebDriver, created by setup_browser()
        self.df = None      # DataFrame of courses, filled by load_csv()

        # Faculty reference mapping: faculty ID -> display name
        self.faculty_reference = {
            1: "Lee Kong Chian School of Business (LKCSB)",
            2: "Yong Pung How School of Law (YPHSL)",
            3: "School of Economics (SOE)",
            4: "School of Computing and Information Systems (SCIS)",
            5: "School of Social Sciences (SOSS)",
            6: "School of Accountancy (SOA)",
            7: "College of Integrative Studies (CIS)",
            8: "Center for English Communication (CEC)"
        }

    @staticmethod
    def _percent(done, total):
        """Return ``done`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
        return done / total * 100 if total else 0.0

    def setup_browser(self):
        """Start a visible (non-headless) Chrome window via Selenium.

        Raises:
            Exception: Re-raised after printing when the driver cannot start.
        """
        try:
            options = Options()
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            # Headless mode is deliberately not enabled so the reviewer can
            # read the course page.
            options.add_argument('--disable-gpu')
            options.add_argument('--window-size=1200,800')

            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=options)
            print("✅ Browser initialized successfully")
        except Exception as e:
            print(f"❌ Failed to initialize browser: {e}")
            raise

    def load_csv(self):
        """Load the manual-review CSV into ``self.df``.

        Returns:
            True when the file was read, False when it is missing or could
            not be loaded (the reason is printed).
        """
        try:
            if not os.path.exists(self.csv_path):
                print(f"❌ File not found: {self.csv_path}")
                return False

            self.df = pd.read_csv(self.csv_path)
            print(f"✅ Loaded CSV with {len(self.df)} courses")

            # Courses still needing review have an empty belong_to_faculty
            needs_review = self.df['belong_to_faculty'].isna().sum()
            print(f"📋 {needs_review} courses need faculty assignment")

            return True
        except Exception as e:
            print(f"❌ Error loading CSV: {e}")
            return False

    def display_faculty_reference(self):
        """Print the faculty ID table and the available commands."""
        print("\n" + "="*60)
        print("🏫 FACULTY REFERENCE:")
        print("="*60)
        for fid, fname in self.faculty_reference.items():
            print(f"  {fid}: {fname}")
        print("="*60)
        print("💡 Enter the faculty ID number (1-8) or 'skip' to skip this course")
        print("💡 Enter 'quit' to save progress and exit")
        print("="*60 + "\n")

    def open_html_file(self, filepath):
        """Open a saved course HTML file in the browser.

        Returns:
            True when the page was opened, False when the file is missing or
            any error occurred (the reason is printed).
        """
        try:
            # The browser needs an absolute file:// URL
            html_file = Path(filepath).resolve()

            if not html_file.exists():
                print(f"⚠️ HTML file not found: {filepath}")
                return False

            file_url = html_file.as_uri()
            self.driver.get(file_url)
            print(f"🌐 Opened: {html_file.name}")
            return True
        except Exception as e:
            print(f"❌ Error opening HTML file: {e}")
            return False

    def get_faculty_input(self, course_info):
        """Prompt until the user picks a faculty, skips, or quits.

        Args:
            course_info: Mapping (e.g. a DataFrame row) with ``code``,
                ``name``, ``description`` and ``course_area`` entries.

        Returns:
            The confirmed faculty ID as an ``int``, or the string ``'skip'``
            or ``'quit'``.
        """
        print(f"\n📚 Course: {course_info['code']} - {course_info['name']}")
        description = course_info['description']
        if isinstance(description, str):
            print(f"📖 Description: {description[:100]}...")
        else:
            # An empty CSV cell is loaded as NaN (a float), which cannot be
            # sliced; show a placeholder instead of crashing the session.
            print("📖 Description: (none)")
        print(f"📂 Course Areas: {course_info['course_area']}")

        while True:
            user_input = input("\n👉 Enter faculty ID (1-8), 'skip', or 'quit': ").strip().lower()

            if user_input in ('quit', 'skip'):
                return user_input

            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so input such as '²' is rejected here instead
            # of raising ValueError.
            if not user_input.isdecimal():
                print("❌ Invalid input. Please enter 1-8, 'skip', or 'quit'.")
                continue

            faculty_id = int(user_input)
            # Validate against the mapping itself so the two cannot drift apart
            if faculty_id not in self.faculty_reference:
                print("❌ Invalid faculty ID. Please enter 1-8.")
                continue

            faculty_name = self.faculty_reference[faculty_id]
            confirm = input(f"✅ Assign to {faculty_name}? (y/n): ").strip().lower()
            if confirm in ['y', 'yes']:
                return faculty_id
            # Anything else: ask for the faculty ID again

    def save_progress(self):
        """Back up the existing CSV, then write ``self.df`` over it.

        Errors are printed rather than raised.
        """
        try:
            import shutil

            # Insert "_backup" before the extension only; a plain
            # str.replace('.csv', ...) would also rewrite any other '.csv'
            # occurring in the path.
            root, ext = os.path.splitext(self.csv_path)
            backup_path = f"{root}_backup{ext}"
            if os.path.exists(self.csv_path):
                shutil.copy2(self.csv_path, backup_path)
                print(f"💾 Backup saved: {backup_path}")

            # Save updated CSV
            self.df.to_csv(self.csv_path, index=False)
            print(f"💾 Progress saved: {self.csv_path}")

            # Show completion stats
            completed = self.df['belong_to_faculty'].notna().sum()
            total = len(self.df)
            print(f"📊 Progress: {completed}/{total} courses assigned ({self._percent(completed, total):.1f}%)")

        except Exception as e:
            print(f"❌ Error saving progress: {e}")

    def run(self):
        """Walk through every unassigned course and record the user's choice.

        Progress is saved and the browser is closed in all cases, including
        Ctrl+C and unexpected errors inside the review loop.
        """
        print("🚀 Starting Course Manual Review Tool")
        print("="*50)

        # Load CSV
        if not self.load_csv():
            return

        # Setup browser
        self.setup_browser()

        try:
            self.display_faculty_reference()

            courses_processed = 0
            courses_assigned = 0

            for index, row in self.df.iterrows():
                # Skip if already has faculty assigned
                if pd.notna(row['belong_to_faculty']):
                    continue

                courses_processed += 1
                print(f"\n{'='*20} Course {courses_processed} {'='*20}")

                # Show the course page; move on if it cannot be opened
                if not self.open_html_file(row['filepath']):
                    continue

                result = self.get_faculty_input(row)

                if result == 'quit':
                    print("\n🛑 User requested to quit. Saving progress...")
                    break
                elif result == 'skip':
                    print("⏭️ Skipping this course")
                    continue
                elif isinstance(result, int):
                    # Record the assignment in the dataframe
                    self.df.at[index, 'belong_to_faculty'] = result
                    faculty_name = self.faculty_reference[result]
                    print(f"✅ Assigned {row['code']} to {faculty_name}")
                    courses_assigned += 1

                # The same browser window is reused for the next course.

            print("\n🎉 Session complete!")
            print(f"📊 Processed: {courses_processed} courses")
            print(f"✅ Assigned: {courses_assigned} faculties")

        except KeyboardInterrupt:
            print("\n⌨️ Interrupted by user. Saving progress...")
        except Exception as e:
            print(f"\n❌ Unexpected error: {e}")
        finally:
            # Save progress
            self.save_progress()

            # Cleanup
            if self.driver:
                self.driver.quit()
                print("🔒 Browser closed")

    def show_completion_status(self):
        """Print assigned/remaining counts; does nothing if no CSV is loaded."""
        if self.df is not None:
            total = len(self.df)
            assigned = self.df['belong_to_faculty'].notna().sum()
            remaining = total - assigned

            print("\n📈 COMPLETION STATUS:")
            print(f"  Total courses: {total}")
            print(f"  Assigned: {assigned}")
            print(f"  Remaining: {remaining}")
            # _percent guards against division by zero for an empty CSV
            print(f"  Progress: {self._percent(assigned, total):.1f}%")

            if remaining > 0:
                print(f"\n🔄 Run the script again to continue with remaining {remaining} courses")

def main():
    """Show the review status and, on confirmation, start the review session.

    Returns early when the CSV is missing or cannot be loaded, when every
    course already has a faculty, or when the user declines to proceed.
    """
    tool = CourseManualReviewTool()

    # Check if CSV exists (gives a more helpful hint than load_csv alone)
    if not os.path.exists(tool.csv_path):
        print(f"❌ File not found: {tool.csv_path}")
        print("💡 Make sure you've run the extraction script first to generate this file")
        return

    # load_csv prints its own error and returns False on failure; stop here
    # rather than dereferencing tool.df while it is still None.
    if not tool.load_csv():
        return
    tool.show_completion_status()

    # Nothing to do when every course already has a faculty
    if tool.df['belong_to_faculty'].notna().all():
        print("🎉 All courses already have faculty assignments!")
        return

    proceed = input("\n❓ Start/continue manual faculty assignment? (y/n): ").strip().lower()
    if proceed in ['y', 'yes']:
        tool.run()
    else:
        print("👋 Exiting...")

# Launch the interactive review only when run as the top-level script or
# notebook cell, not on import.
if __name__ == "__main__":
    main()


✅ Loaded CSV with 364 courses
📋 359 courses need faculty assignment

📈 COMPLETION STATUS:
  Total courses: 364
  Assigned: 5
  Remaining: 359
  Progress: 1.4%

🔄 Run the script again to continue with remaining 359 courses




🚀 Starting Course Manual Review Tool
✅ Loaded CSV with 364 courses
📋 359 courses need faculty assignment


2025-05-26 21:08:25,055 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-26 21:08:25,074 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-26 21:08:25,091 - INFO - Driver [C:\Users\tanzh\.wdm\drivers\chromedriver\win64\136.0.7103.113\chromedriver-win32/chromedriver.exe] found in cache


✅ Browser initialized successfully

🏫 FACULTY REFERENCE:
  1: Lee Kong Chian School of Business (LKCSB)
  2: Yong Pung How School of Law (YPHSL)
  3: School of Economics (SOE)
  4: School of Computing and Information Systems (SCIS)
  5: School of Social Sciences (SOSS)
  6: School of Accountancy (SOA)
  7: College of Integrative Studies (CIS)
  8: Center for English Communication (CEC)
💡 Enter the faculty ID number (1-8) or 'skip' to skip this course
💡 Enter 'quit' to save progress and exit


🌐 Opened: SelectedAcadTerm=2110&SelectedClassNumber=1480.html

📚 Course: COMM120 - Intercultural Communication
📖 Description: The course provides strategies on how to read a person’s culture as well as corporate culture. Featu...
📂 Course Areas: General Education - Arts
Arts & Culture Mgmt Electives
Communication Management Electives
Business Options
Econ Major Rel/Econ Options
Business-Oriented Electives
Culture, Org & Behaviour Track (Business)
Social Sciences/PLE Major-related
✅ Assigned COMM12

In [13]:
# # Function to test scraping and CSV saving
# def test_scrape_class_details():
#     test_url = "https://boss.intranet.smu.edu.sg/ClassDetails.aspx?SelectedAcadTerm=2420&SelectedClassNumber=1580"
#     csv_filename = "TestClassDetails.csv"
#     print("Starting test scrape...")

#     # Open CSV file for writing
#     with open(csv_filename, "w", newline="", encoding="utf-8") as file:
#         writer = csv.writer(file)
#         headers = ["Term", "Course Code", "Section", "Description", "Grading Basis"]
#         for i in range(1, 4):  # Dynamic columns for up to 3 classes
#             headers.extend([f"class{i}_day", f"class{i}_starttime", f"class{i}_venue"])
#         writer.writerow(headers)

#         driver.get(test_url)
#         time.sleep(2)  # Allow time for page load

#         try:
#             # Extract key elements
#             wait = WebDriverWait(driver, 10)
#             course_header = wait.until(EC.presence_of_element_located((By.ID, "lblClassInfoHeader"))).text
#             description = driver.find_element(By.ID, "lblClassSection").text
#             term = driver.find_element(By.ID, "lblClassInfoSubHeader").text
#             grading_basis = driver.find_element(By.ID, "lblGradingBasis").text

#             # Split course code and section
#             course_code, section = [item.strip() for item in course_header.split('-')]

#             # Extract meeting details
#             class_details = []
#             rows = driver.find_elements(By.CSS_SELECTOR, "#RadGrid_MeetingInfo_ctl00 tr.rgRow, #RadGrid_MeetingInfo_ctl00 tr.rgAltRow")
#             for row in rows:
#                 cells = row.find_elements(By.TAG_NAME, "td")
#                 if cells and cells[0].text == "CLASS":
#                     class_details.append({
#                         "day": cells[3].text,
#                         "start_time": cells[4].text,
#                         "venue": cells[6].text
#                     })

#             # Prepare row data
#             row_data = [term, course_code, section, description, grading_basis]
#             for detail in class_details[:3]:  # Include up to 3 classes
#                 row_data.extend([detail["day"], detail["start_time"], detail["venue"]])

#             # Pad missing columns
#             for _ in range(len(class_details), 3):
#                 row_data.extend(["", "", ""])

#             # Write to CSV
#             writer.writerow(row_data)
#             print(f"Test data successfully written to {csv_filename}!")

#         except Exception as e:
#             print(f"Error occurred: {e}")

In [14]:
# # Main Execution
# try:
#     # Step 1: Navigate and wait for manual login
#     driver.get("https://boss.intranet.smu.edu.sg/OverallResults.aspx")
#     wait_for_manual_login()

#     # Step 2: Run the test scrape function
#     test_scrape_class_details()

# finally:
#     driver.quit()
#     print("Test completed!")

### **3.2 Scrape Class Details**

In [15]:
# def scrape_class_details(ay, term_code, class_number, csv_writer):
#     url = f"https://boss.intranet.smu.edu.sg/ClassDetails.aspx?SelectedClassNumber={class_number:04}&SelectedAcadTerm={ay}{term_code}&SelectedAcadCareer=UGRD"
#     driver.get(url)

#     # Immediately check for "No record found" in the raw page source
#     if "No record found" in driver.page_source:
#         return  # Exit early

#     try:
#         # Extract course details
#         course_header = driver.find_element(By.ID, "lblClassInfoHeader").text
#         description = driver.find_element(By.ID, "lblClassSection").text
#         term = driver.find_element(By.ID, "lblClassInfoSubHeader").text
#         grading_basis = driver.find_element(By.ID, "lblGradingBasis").text

#         # Split course header into Course Code and Section
#         course_code, section = [item.strip() for item in course_header.split('-')]

#         # Extract meeting and exam details
#         class_details = []
#         exam_details = {"exam_startdate": "", "exam_day": "", "exam_starttime": ""}

#         rows = driver.find_elements(By.CSS_SELECTOR, "#RadGrid_MeetingInfo_ctl00 tr.rgRow, #RadGrid_MeetingInfo_ctl00 tr.rgAltRow")
#         for row in rows:
#             cells = row.find_elements(By.TAG_NAME, "td")
#             if cells:
#                 if cells[0].text == "CLASS":  # CLASS rows
#                     class_details.append({
#                         "day": cells[3].text,
#                         "start_time": cells[4].text,
#                         "venue": cells[6].text
#                     })
#                 elif cells[0].text == "EXAM":  # EXAM row
#                     exam_details["exam_startdate"] = cells[1].text
#                     exam_details["exam_day"] = cells[3].text
#                     exam_details["exam_starttime"] = cells[4].text

#         # Prepare row data with SelectedClassNumber and SelectedAcadTerm
#         row_data = [class_number, f"{ay}{term_code}", term, course_code, section, description, grading_basis]

#         # Add class details (up to 3 classes)
#         for detail in class_details[:3]:
#             row_data.extend([detail["day"], detail["start_time"], detail["venue"]])
#         for _ in range(len(class_details), 3):  # Pad missing class details
#             row_data.extend(["", "", ""])

#         # Add exam details
#         row_data.extend([
#             exam_details["exam_startdate"],
#             exam_details["exam_day"],
#             exam_details["exam_starttime"]
#         ])

#         # Write to CSV
#         csv_writer.writerow(row_data)
#         print(f"Scraped: AY{ay}, Term {term_code}, Class Number {class_number:04}")
#         return True

#     except Exception as e:
#         print(f"Error scraping Class Number {class_number:04}, AY{ay}, Term {term_code}: {e}")
#         return False


---
### **3.3 Main Scraping Loop**

In [16]:
# def main():
#     ay_list = range(21, 25)  # AY2021 to AY2024
#     term_mapping = {"10": "T1", "20": "T2", "31": "T3A", "32": "T3B"}

#     for ay in ay_list:
#         for term_code, term_name in term_mapping.items():

#             # # Use this if your code suddenly stops.
#             # # Skip AY 2021 Term 1
#             # if ay == 21 and term_code == "10":
#             #     print(f"Skipping AY{ay}, Term {term_code} as it has already been scraped.")
#             #     continue  # Skip this iteration

#             filename = f"20{ay}-20{ay+1}_{term_name}AddedInfo.csv"
#             print(f"Starting scraping for file: {filename}")

#             with open(filename, "w", newline="", encoding="utf-8") as file:
#                 writer = csv.writer(file)
#                 headers = ["SelectedClassNumber", "SelectedAcadTerm", "Term", "Course Code", "Section", "Description", "Grading Basis"]
#                 for i in range(1, 4):
#                     headers.extend([f"class{i}_day", f"class{i}_starttime", f"class{i}_venue"])
#                 headers.extend(["exam_startdate", "exam_day", "exam_starttime"])
#                 writer.writerow(headers)

#                 class_number = 1000  # Start from class number 1000
#                 no_record_count = 0  # Track consecutive "No record found"

#                 while True:
#                     success = scrape_class_details(ay, term_code, class_number, writer)

#                     if not success:  # If no record is found
#                         no_record_count += 1
#                     else:
#                         no_record_count = 0  # Reset the counter if a record is found

#                     # Stop if no record is found 300 times in a row
#                     if no_record_count >= 300:
#                         print(f"300 consecutive 'No record found' reached. Moving to next term.")
#                         break

#                     class_number += 1  # Increment to next class number

#     driver.quit()
#     print("Scraping completed!")


---
## **4. Execution**

In [17]:
# if __name__ == "__main__":
#     driver.get("https://boss.intranet.smu.edu.sg/OverallResults.aspx")
#     wait_for_manual_login()
#     main()