<a href="https://colab.research.google.com/github/victorvalente/SolvePrep/blob/main/SolverPrep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
pip install airportsdata



In [1]:
# solve_prep_utils.py

import json
from enum import Enum
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass, field
import random
import re

# External Libraries (ensure these are installed: geopy, requests, pandas, numpy, airportsdata)
try:
    import geopy
    import geopy.distance
    import requests
    import pandas as pd
    import numpy as np
    import airportsdata
except ImportError as e:
    print(f"Error importing libraries in solve_prep_utils: {e}")
    print("Please ensure pandas, numpy, geopy, requests, and airportsdata are installed.")
    # Or raise the error: raise

# --- Problem Definitions ---
class ProblemType(Enum):
    TSP_FLIGHTS = 1
    TSP_DRIVING_FUEL = 2
    KNAPSACK_MOVING = 3
    VRP_MANHATTAN = 4
    FACILITY_LOCATION_SEATTLE = 5
    NURSE_SCHEDULING_MGH = 6
    PORTFOLIO_OPTIMIZATION = 7
    TIMETABLING_CONFERENCE = 8
    PROJECT_SCHEDULING_CONSTRUCTION = 9
    NETWORK_DESIGN_WATER = 10
    OTHER_HEURISTIC = 99 # Catch-all for recognized heuristic problems
    UNKNOWN = 0

# --- Data Structures ---
@dataclass
class Location:
    name: str
    address: Optional[str] = None
    coords: Optional[Tuple[float, float]] = None
    iata_code: Optional[str] = None

@dataclass
class ProblemContext:
    """Holds the state of the problem preparation."""
    original_description: str
    identified_type: ProblemType = ProblemType.UNKNOWN
    extracted_data: Dict[str, Any] = field(default_factory=dict)
    missing_info: List[str] = field(default_factory=list)
    user_questions: List[str] = field(default_factory=list)
    is_confirmed: bool = False
    requires_manual_data: bool = True

# --- LLM Interaction Placeholders (Improved Logic) ---
# --- LLM Interaction Placeholders (Improved Logic) ---

def call_llm_categorize(description: str, possible_types: List[ProblemType]) -> ProblemType:
    """Placeholder: Calls LLM to categorize the problem description."""
    print("\nStep 1.1: Entering LLM Categorization...")
    print(f"  Analyzing Full Input Description (Length: {len(description)}):")
    print(f"  '{description}'")
    desc_lower = description.lower()
    result_type = ProblemType.OTHER_HEURISTIC # Default

    # --- Improved Placeholder Rules ---
    if "visit" in desc_lower and ("cities" in desc_lower or "european cities" in desc_lower or re.search(r'\b(london|paris|berlin|rome|madrid|amsterdam|prague|vienna|budapest|barcelona)\b', desc_lower)) and ("fly" in desc_lower or "flight" in desc_lower or "airfare" in desc_lower):
        print("  [LLM Placeholder Rule Match] TSP_FLIGHTS keywords matched.")
        result_type = ProblemType.TSP_FLIGHTS
    elif ("road trip" in desc_lower or "driving distances" in desc_lower) and ("national parks" in desc_lower or "yellowstone" in desc_lower):
        print("  [LLM Placeholder Rule Match] TSP_DRIVING_FUEL keywords matched.")
        result_type = ProblemType.TSP_DRIVING_FUEL
    elif ("move items" in desc_lower or "knapsack" in desc_lower or "bin packing" in desc_lower or "furniture" in desc_lower) and ("truck" in desc_lower or "container" in desc_lower):
        print("  [LLM Placeholder Rule Match] KNAPSACK_MOVING keywords matched.")
        result_type = ProblemType.KNAPSACK_MOVING
    elif ("delivery service" in desc_lower or "vehicle routing" in desc_lower) and ("addresses" in desc_lower or "locations" in desc_lower) and ("drivers" in desc_lower or "trucks" in desc_lower or "vehicles" in desc_lower):
        print("  [LLM Placeholder Rule Match] VRP_MANHATTAN keywords matched.")
        result_type = ProblemType.VRP_MANHATTAN
    elif ("optimal locations" in desc_lower or "facility location" in desc_lower) and ("shops" in desc_lower or "stores" in desc_lower or "facilities" in desc_lower):
        print("  [LLM Placeholder Rule Match] FACILITY_LOCATION_SEATTLE keywords matched.")
        result_type = ProblemType.FACILITY_LOCATION_SEATTLE
    elif ("schedule" in desc_lower or "scheduling" in desc_lower) and "nurses" in desc_lower and ("shifts" in desc_lower or "ward" in desc_lower):
        print("  [LLM Placeholder Rule Match] NURSE_SCHEDULING_MGH keywords matched.")
        result_type = ProblemType.NURSE_SCHEDULING_MGH
    elif ("invest" in desc_lower or "portfolio" in desc_lower) and ("stocks" in desc_lower or "assets" in desc_lower or "etfs" in desc_lower or "bonds" in desc_lower) and ("returns" in desc_lower or "risk" in desc_lower):
        print("  [LLM Placeholder Rule Match] PORTFOLIO_OPTIMIZATION keywords matched.")
        result_type = ProblemType.PORTFOLIO_OPTIMIZATION
    elif ("conference" in desc_lower or "timetabling" in desc_lower) and ("sessions" in desc_lower or "courses" in desc_lower or "events" in desc_lower) and ("rooms" in desc_lower or "timeslots" in desc_lower):
        print("  [LLM Placeholder Rule Match] TIMETABLING_CONFERENCE keywords matched.")
        result_type = ProblemType.TIMETABLING_CONFERENCE
    elif ("construction sequence" in desc_lower or "project scheduling" in desc_lower or "building" in desc_lower) and ("tasks" in desc_lower or "activities" in desc_lower):
        print("  [LLM Placeholder Rule Match] PROJECT_SCHEDULING_CONSTRUCTION keywords matched.")
        result_type = ProblemType.PROJECT_SCHEDULING_CONSTRUCTION
    elif ("water distribution network" in desc_lower or "network design" in desc_lower) and ("pipe" in desc_lower or "pump" in desc_lower or "pressure" in desc_lower):
        print("  [LLM Placeholder Rule Match] NETWORK_DESIGN_WATER keywords matched.")
        result_type = ProblemType.NETWORK_DESIGN_WATER
    else:
        print("  [LLM Placeholder Rule Match] No specific rule matched. Defaulting to OTHER_HEURISTIC.")

    print(f"  [LLM Placeholder] Final Categorization: {result_type.name}")
    print("Step 1.1: Exiting LLM Categorization.")
    return result_type

def call_llm_extract_initial_data(problem_type: ProblemType, description: str) -> Dict:
    """Placeholder: Calls LLM to extract only the initial key parameters."""
    print("\nStep 2.1: Entering LLM Initial Data Extraction...")
    print(f"  Problem Type: {problem_type.name}")
    extracted_data = {}
    # --- Improved Placeholder Logic ---
    if problem_type == ProblemType.TSP_FLIGHTS:
        # More robust city extraction, handling potential multi-word cities
        cities = re.findall(r'\b[A-Z][a-zA-Z]+\b(?: \b[A-Z][a-zA-Z]+\b)*', description)
        common_words = {"I", "Find", "The", "My", "A", "And", "Between", "Order", "Fly", "Flying", "Them", "European", "Cities", "Efficient", "Total", "Airfare", "Travel", "Time"}
        cities = [city.strip(',.:;') for city in cities if city not in common_words and len(city)>2]
        # Filter against known cities from example if possible
        example_cities = ["London", "Paris", "Berlin", "Rome", "Madrid", "Amsterdam", "Prague", "Vienna", "Budapest", "Barcelona"]
        found_cities = [c for c in cities if c in example_cities]
        if found_cities: cities = found_cities
        elif not cities: cities = ["London", "Paris", "Berlin"] # Fallback
        print(f"  [LLM Placeholder] Extracted/Defaulted Cities: {cities}")
        extracted_data['list_of_cities'] = list(dict.fromkeys(cities))

    elif problem_type == ProblemType.TSP_DRIVING_FUEL:
         # Extract park names more reliably
         parks = re.findall(r'\b[A-Z][a-zA-Z]*(?: [A-Z][a-zA-Z]*)*\b(?=\s*(?:National Park|Mountains|Canyon))|\b(Yellowstone|Yosemite|Zion|Olympic|Glacier|Acadia|Teton)\b', description)
         parks = list(dict.fromkeys([p.strip() for p in parks if p and len(p) > 3])) # Unique and clean up
         if not parks: parks = ["Yellowstone", "Grand Canyon", "Yosemite"] # Fallback
         print(f"  [LLM Placeholder] Extracted Parks/Locations: {parks}")
         extracted_data['list_of_locations'] = parks
         mpg_match = re.search(r'(\d+(?:\.\d+)?)\s*MPG', description, re.IGNORECASE) # Allow decimals, ignore case
         if mpg_match: extracted_data['vehicle_mpg'] = float(mpg_match.group(1)); print(f"  [LLM Placeholder] Extracted MPG: {extracted_data['vehicle_mpg']}")
         else: print("  [LLM Placeholder] Could not extract MPG.")

    elif problem_type == ProblemType.KNAPSACK_MOVING:
         # Try to extract truck size if mentioned
         truck_match = re.search(r'(\d+)-foot U-Haul truck', description)
         if truck_match: extracted_data['truck_info'] = f"{truck_match.group(1)}-foot U-Haul"; print(f"  [LLM Placeholder] Extracted Truck Info: {extracted_data['truck_info']}")
         else: print("  [LLM Placeholder] No specific initial parameters extracted for Knapsack.")

    elif problem_type == ProblemType.VRP_MANHATTAN:
         drivers_match = re.search(r'(\d+)\s*drivers', description); addresses_match = re.search(r'(\d+)\s*addresses', description)
         if drivers_match: extracted_data['num_drivers'] = int(drivers_match.group(1)); print(f"  [LLM Placeholder] Extracted Drivers: {extracted_data['num_drivers']}")
         if addresses_match: extracted_data['num_addresses_expected'] = int(addresses_match.group(1)); print(f"  [LLM Placeholder] Expected Addresses: {extracted_data['num_addresses_expected']}")

    elif problem_type == ProblemType.FACILITY_LOCATION_SEATTLE:
         shops_match = re.search(r'(\d+)\s*new\s*(?:coffee shops|stores|facilities)', description)
         distance_match = re.search(r'(\d+(?:\.\d+)?)\s*miles\s*apart', description)
         if shops_match: extracted_data['num_new_shops'] = int(shops_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Shops: {extracted_data['num_new_shops']}")
         if distance_match: extracted_data['min_distance_miles'] = float(distance_match.group(1)); print(f"  [LLM Placeholder] Extracted Min Distance (miles): {extracted_data['min_distance_miles']}")

    elif problem_type == ProblemType.NURSE_SCHEDULING_MGH:
         nurses_match = re.search(r'(\d+)\s*nurses', description); shifts_match = re.search(r'(\d+)\s*shifts', description)
         if nurses_match: extracted_data['num_nurses'] = int(nurses_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Nurses: {extracted_data['num_nurses']}")
         if shifts_match: extracted_data['num_shifts'] = int(shifts_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Shifts: {extracted_data['num_shifts']}")

    elif problem_type == ProblemType.PORTFOLIO_OPTIMIZATION:
         amount_match = re.search(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d+)?)', description) # Match dollar amounts
         if amount_match: extracted_data['investment_amount'] = float(amount_match.group(1).replace(',', '')); print(f"  [LLM Placeholder] Extracted Investment Amount: {extracted_data['investment_amount']}")

    elif problem_type == ProblemType.TIMETABLING_CONFERENCE:
         sessions_match = re.search(r'(\d+)\s*sessions', description); rooms_match = re.search(r'(\d+)\s*rooms', description); days_match = re.search(r'(\d+)\s*days', description)
         if sessions_match: extracted_data['num_sessions'] = int(sessions_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Sessions: {extracted_data['num_sessions']}")
         if rooms_match: extracted_data['num_rooms'] = int(rooms_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Rooms: {extracted_data['num_rooms']}")
         if days_match: extracted_data['num_days'] = int(days_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Days: {extracted_data['num_days']}")

    elif problem_type == ProblemType.PROJECT_SCHEDULING_CONSTRUCTION:
         story_match = re.search(r'(\d+)-story building', description)
         if story_match: extracted_data['building_stories'] = int(story_match.group(1)); print(f"  [LLM Placeholder] Extracted Building Stories: {extracted_data['building_stories']}")

    elif problem_type == ProblemType.NETWORK_DESIGN_WATER:
         homes_match = re.search(r'(\d+)\s*homes', description)
         if homes_match: extracted_data['num_homes'] = int(homes_match.group(1)); print(f"  [LLM Placeholder] Extracted Num Homes: {extracted_data['num_homes']}")

    else: print("  [LLM Placeholder] No specific initial data extraction rules for this type.")
    print(f"  Output Extracted Data: {extracted_data}")
    print("Step 2.1: Exiting LLM Initial Data Extraction.")
    return extracted_data

def call_llm_identify_missing_manual(problem_type: ProblemType, current_data: Dict, auto_fetched_keys: List[str] = []) -> List[str]:
    """Placeholder: Identifies missing manual info based on problem type and current data."""
    print("\nStep 3.1 / 4.1 (Re-check): Entering LLM Identify Missing Manual Info...")
    print(f"  Problem Type: {problem_type.name}"); print(f"  Current Keys: {list(current_data.keys())}"); print(f"  Auto Keys: {auto_fetched_keys}")
    missing_info = []
    # --- Improved Placeholder Logic ---
    if problem_type == ProblemType.TSP_FLIGHTS:
        if 'flight_cost_matrix' not in current_data: missing_info.append("Flight Costs between City Pairs") # Should have been auto-fetched
        if 'flight_duration_matrix' not in current_data: missing_info.append("Flight Durations between City Pairs") # Should have been auto-fetched
        if 'airport_transfer_times_hours' not in current_data: missing_info.append("Airport Transfer Times per City")
        # Optional ones last
        if 'travel_date_range' not in current_data: missing_info.append("Preferred travel date range (optional)")
        if 'airline_preferences' not in current_data: missing_info.append("Airline preferences (optional)")

    elif problem_type == ProblemType.TSP_DRIVING_FUEL:
         if 'driving_distance_matrix_miles' not in current_data: missing_info.append("Driving Distances between Park Entrances/Locations")
         if 'route_elevation_data_source' not in current_data: missing_info.append("Elevation Data along Routes")
         if 'park_closure_info_source' not in current_data: missing_info.append("Seasonal Park Closures/Road Status")

    elif problem_type == ProblemType.KNAPSACK_MOVING:
        if not current_data.get("item_list_dimensions_values"): missing_info.append("List of items with dimensions (width, height, depth) and value")
        if not current_data.get("truck_dimensions"): missing_info.append("Truck cargo dimensions (width, height, depth)")

    elif problem_type == ProblemType.VRP_MANHATTAN:
         if not current_data.get("delivery_addresses_list"): missing_info.append("List of Delivery Addresses")
         if not current_data.get("customer_delivery_time_windows"): missing_info.append("Customer Delivery Time Windows")
         if 'real_time_traffic_data_source' not in current_data: missing_info.append("Real-time Traffic Data Source")

    elif problem_type == ProblemType.FACILITY_LOCATION_SEATTLE:
         if 'population_density_data' not in current_data: missing_info.append("Population Density Data")
         if 'competitor_locations' not in current_data: missing_info.append("Competitor Locations")
         if 'commercial_real_estate_cost_data' not in current_data: missing_info.append("Commercial Real Estate Cost Data")
         if 'traffic_pattern_data' not in current_data: missing_info.append("Traffic Pattern Data")
         if 'target_geographic_area' not in current_data: missing_info.append("Target Geographic Area Definition (e.g., Seattle boundary)") # Added

    elif problem_type == ProblemType.NURSE_SCHEDULING_MGH:
         if not current_data.get("nurse_list_qualifications_preferences"): missing_info.append("List of Nurses with Qualifications/Preferences")
         if not current_data.get("ward_staffing_requirements_per_shift"): missing_info.append("Ward Staffing Requirements per Shift")
         if not current_data.get("labor_regulations_consecutive_days"): missing_info.append("Labor Regulations (Consecutive days, hours/week)")

    elif problem_type == ProblemType.PORTFOLIO_OPTIMIZATION:
         if not current_data.get("list_of_potential_assets"): missing_info.append("List of Potential Assets (Stocks, Bonds, ETFs)")
         if 'risk_level_preference' not in current_data: missing_info.append("Risk Level Preference")
         if 'diversification_rules' not in current_data: missing_info.append("Diversification Rules")
         if 'historical_asset_performance_data' not in current_data: missing_info.append("Historical Asset Performance Data (Prices/Returns)")
         if 'asset_sector_classifications' not in current_data: missing_info.append("Asset Sector Classifications")
         if 'asset_volatility_metrics' not in current_data: missing_info.append("Asset Volatility Metrics")
         if 'asset_correlation_data' not in current_data: missing_info.append("Asset Correlation Data")

    elif problem_type == ProblemType.TIMETABLING_CONFERENCE:
         if not current_data.get("list_of_sessions_with_topics_speakers"): missing_info.append("List of Sessions with Topics/Speakers")
         if not current_data.get("list_of_rooms_with_capacities"): missing_info.append("List of Rooms with Capacities")
         if 'timeslots_per_day' not in current_data: missing_info.append("Timeslots per Day")
         if 'speaker_availability_constraints' not in current_data: missing_info.append("Speaker Availability Constraints")
         if 'topic_relationships_minimize_distance_conflict' not in current_data: missing_info.append("Topic Relationships (Minimize distance/conflict)")
         if 'predicted_attendance_per_session_optional' not in current_data: missing_info.append("Predicted Attendance per Session (Optional)") # Added optional tag

    elif problem_type == ProblemType.PROJECT_SCHEDULING_CONSTRUCTION:
         if not current_data.get("list_of_tasks_with_durations_and_dependencies"): missing_info.append("List of Tasks with Durations and Dependencies")
         if not current_data.get("crew_availability_type_and_count_per_period"): missing_info.append("Crew Availability (Type and Count per Period)")
         if not current_data.get("material_delivery_lead_times"): missing_info.append("Material Delivery Lead Times")
         if 'weather_forecast_source_data' not in current_data: missing_info.append("Weather Forecast Source/Data")

    elif problem_type == ProblemType.NETWORK_DESIGN_WATER:
         if 'development_location_area_definition' not in current_data: missing_info.append("Development Location/Area Definition")
         if 'elevation_data_for_area' not in current_data: missing_info.append("Elevation Data for Area")
         if 'water_demand_patterns_per_home_area_peak_avg' not in current_data: missing_info.append("Water Demand Patterns (Per Home/Area, Peak/Avg)")
         if not current_data.get("pipe_types_and_costs_per_unit_length_per_diameter"): missing_info.append("Pipe Types and Costs (Per unit length per diameter)")
         if not current_data.get("pump_types_and_costs_based_on_head_flow_capacity"): missing_info.append("Pump Types and Costs (Based on head/flow capacity)")
         if 'minimum_pressure_requirements_at_nodes' not in current_data: missing_info.append("Minimum Pressure Requirements at Nodes")
         if 'hydraulic_simulation_library_tool' not in current_data: missing_info.append("Hydraulic Simulation Library/Tool")

    else: # OTHER_HEURISTIC or UNKNOWN
        if not any(k not in ['list_of_cities', 'city_to_code', 'geocoded_locations'] + auto_fetched_keys for k in current_data):
             missing_info.append("Specific problem parameters needing manual input")

    print(f"  [LLM Placeholder] Identified missing manual info: {missing_info} (using improved rules)")
    print("Step 3.1 / 4.1 (Re-check): Exiting LLM Identify Missing Manual Info.")
    return missing_info

def call_llm_generate_questions(missing_info: List[str]) -> List[str]:
    """Placeholder: Calls LLM to generate user-friendly questions."""
    print("\nStep 4.2: Entering LLM Generate Questions...")
    print(f"  Input Missing Info: {missing_info}")
    questions = []
    for info in missing_info:
        questions.append(f"Could you please provide the '{info}'?")
    print(f"  [LLM Placeholder] Generated questions: {questions} (using simple logic)")
    print("Step 4.2: Exiting LLM Generate Questions.")
    return questions

# --- SolvePrep Class ---
class SolvePrep:
    """Handles problem preparation using LLM and automatic data fetching where applicable."""
    # (Class definition remains exactly the same as the previous version)
    def __init__(self, gemini_api_key: Optional[str] = None, flight_api_key: Optional[str] = None):
        self.geolocator = None; self.airports_db = None
        try: self.geolocator = geopy.Nominatim(user_agent="heuristic_solver_util_v1")
        except Exception as e: print(f"  Warning: Failed to initialize geolocator: {e}")
        self.gemini_api_key = gemini_api_key; self.flight_api_key = flight_api_key
        try: self.airports_db = airportsdata.load('IATA')
        except Exception as e: print(f"  Warning: Could not load airports database: {e}.")

    def _get_airport_codes(self, cities: List[str]) -> Dict[str, Optional[str]]:
        print("\nStep 2.2.1: Entering Airport Code Lookup..."); print(f"  Input Cities: {cities}")
        if not self.airports_db: print("  Error: Airports database not loaded."); return {c: None for c in cities}
        city_to_code = {}
        for city_name in cities:
            found_code = None; print(f"  Searching for city: '{city_name}'")
            try:
                matches = [code for code, data in self.airports_db.items() if data.get('city', '').lower() == city_name.lower()]
                if matches:
                    if city_name == "London" and "LHR" in matches: found_code = "LHR"
                    elif city_name == "Paris" and "CDG" in matches: found_code = "CDG"
                    elif city_name == "Berlin" and "BER" in matches: found_code = "BER"
                    elif city_name == "Rome" and "FCO" in matches: found_code = "FCO"
                    elif city_name == "Madrid" and "MAD" in matches: found_code = "MAD"
                    elif city_name == "Amsterdam" and "AMS" in matches: found_code = "AMS"
                    elif city_name == "Prague" and "PRG" in matches: found_code = "PRG"
                    elif city_name == "Vienna" and "VIE" in matches: found_code = "VIE"
                    elif city_name == "Budapest" and "BUD" in matches: found_code = "BUD"
                    elif city_name == "Barcelona" and "BCN" in matches: found_code = "BCN"
                    else: found_code = matches[0]
                    print(f"    Found code(s): {matches} -> Selected: {found_code}")
                else: print(f"    Code not found for city: '{city_name}'")
            except Exception as e: print(f"    Error looking up code for '{city_name}': {e}")
            city_to_code[city_name] = found_code
        print(f"  Output City-to-Code Map: {city_to_code}"); print("Step 2.2.1: Exiting Airport Code Lookup.")
        return city_to_code

    def _fetch_flight_data(self, city_to_code: Dict[str, Optional[str]]) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        print("\nStep 2.2.2: Entering Flight Data Fetching (Placeholder)..."); cities = list(city_to_code.keys()); codes = [city_to_code[city] for city in cities]; num_cities = len(cities)
        print(f"  Attempting to fetch data for {num_cities} cities with codes: {codes}"); cost_matrix = np.full((num_cities, num_cities), np.inf); duration_matrix = np.full((num_cities, num_cities), np.inf)
        np.fill_diagonal(cost_matrix, 0); np.fill_diagonal(duration_matrix, 0)
        if not self.flight_api_key:
            print("  Warning: Flight API key not provided."); print("  Generating dummy flight data instead.")
            for i in range(num_cities):
                for j in range(i + 1, num_cities):
                     cost = random.uniform(100, 1000); duration = random.uniform(1, 10)
                     cost_matrix[i, j] = cost_matrix[j, i] = cost; duration_matrix[i, j] = duration_matrix[j, i] = duration
                     print(f"    Generated dummy data for route {cities[i]}-{cities[j]}: Cost={cost:.0f}, Dur={duration:.1f}h")
            print("Step 2.2.2: Exiting Flight Data Fetching (Dummy Data)."); return cost_matrix, duration_matrix
        valid_codes = [code for code in codes if code is not None]
        if len(valid_codes) < 2: print("  Error: Need at least two valid airport codes."); print("Step 2.2.2: Exiting Flight Data Fetching (Error)."); return None, None
        print(f"  [API Placeholder] Simulating API calls for {len(valid_codes)} airports...")
        for i in range(num_cities):
            for j in range(i + 1, num_cities):
                origin_code = codes[i]; dest_code = codes[j]
                if origin_code and dest_code:
                    print(f"    [API Placeholder] Simulating API call for route: {origin_code} -> {dest_code}")
                    simulated_cost = random.uniform(100, 1000); simulated_duration = random.uniform(1, 10)
                    cost_matrix[i, j] = cost_matrix[j, i] = simulated_cost; duration_matrix[i, j] = duration_matrix[j, i] = simulated_duration
                    print(f"      [API Placeholder] Simulated data: Cost={simulated_cost:.0f}, Dur={simulated_duration:.1f}h")
                else: print(f"    Skipping route involving missing codes: {cities[i]} / {cities[j]}")
        print("  [API Placeholder] Flight data fetching simulation complete."); print("Step 2.2.2: Exiting Flight Data Fetching (Simulated API).")
        return cost_matrix, duration_matrix

    def _update_data_based_on_answers(self, current_data: Dict, questions: List[str], answers: List[str]) -> Dict:
        print("\nStep 4.4: Entering Update Data Based on Answers..."); print(f"  Input Questions: {questions}"); print(f"  Input Answers: {answers}")
        for i, answer in enumerate(answers):
            if i < len(questions):
                question = questions[i].lower(); key_guess = f"user_provided_{i}"
                match = re.search(r"provide the '(.+?)'", question)
                if match:
                    info_requested = match.group(1).lower()
                    # --- Key Guessing Logic ---
                    if "list of items" in info_requested: key_guess = "item_list_dimensions_values"
                    elif "truck cargo dimensions" in info_requested: key_guess = "truck_dimensions"
                    elif "date range" in info_requested: key_guess = "travel_date_range"
                    elif "airline preferences" in info_requested: key_guess = "airline_preferences"
                    elif "airport transfer times" in info_requested: key_guess = "airport_transfer_times_hours"
                    elif "driving distances" in info_requested: key_guess = "driving_distance_matrix_miles"
                    elif "elevation data" in info_requested: key_guess = "route_elevation_data_source"
                    elif "park closures" in info_requested: key_guess = "park_closure_info_source"
                    elif "delivery addresses" in info_requested: key_guess = "delivery_addresses_list"
                    elif "customer delivery time windows" in info_requested: key_guess = "customer_delivery_time_windows"
                    elif "traffic data source" in info_requested: key_guess = "real_time_traffic_data_source"
                    elif "population density" in info_requested: key_guess = "population_density_data"
                    elif "competitor locations" in info_requested: key_guess = "competitor_locations"
                    elif "real estate cost" in info_requested: key_guess = "commercial_real_estate_cost_data"
                    elif "traffic pattern" in info_requested: key_guess = "traffic_pattern_data"
                    elif "nurse list" in info_requested: key_guess = "nurse_list_qualifications_preferences"
                    elif "ward staffing" in info_requested: key_guess = "ward_staffing_requirements_per_shift"
                    elif "labor regulations" in info_requested: key_guess = "labor_regulations_consecutive_days"
                    elif "list of potential assets" in info_requested: key_guess = "list_of_potential_assets"
                    elif "risk level preference" in info_requested: key_guess = "risk_level_preference"
                    elif "diversification rules" in info_requested: key_guess = "diversification_rules"
                    elif "historical asset performance" in info_requested: key_guess = "historical_asset_performance_data"
                    elif "sector classifications" in info_requested: key_guess = "asset_sector_classifications"
                    elif "volatility metrics" in info_requested: key_guess = "asset_volatility_metrics"
                    elif "correlation data" in info_requested: key_guess = "asset_correlation_data"
                    elif "list of sessions" in info_requested: key_guess = "list_of_sessions_with_topics_speakers"
                    elif "list of rooms" in info_requested: key_guess = "list_of_rooms_with_capacities"
                    elif "timeslots per day" in info_requested: key_guess = "timeslots_per_day"
                    elif "speaker availability" in info_requested: key_guess = "speaker_availability_constraints"
                    elif "topic relationships" in info_requested: key_guess = "topic_relationships_minimize_distance_conflict"
                    elif "predicted attendance" in info_requested: key_guess = "predicted_attendance_per_session_optional"
                    elif "list of tasks" in info_requested: key_guess = "list_of_tasks_with_durations_and_dependencies"
                    elif "crew availability" in info_requested: key_guess = "crew_availability_type_and_count_per_period"
                    elif "material delivery" in info_requested: key_guess = "material_delivery_lead_times"
                    elif "weather forecast" in info_requested: key_guess = "weather_forecast_source_data"
                    elif "location/area definition" in info_requested: key_guess = "development_location_area_definition"
                    elif "elevation data for area" in info_requested: key_guess = "elevation_data_for_area"
                    elif "water demand patterns" in info_requested: key_guess = "water_demand_patterns_per_home_area_peak_avg"
                    elif "pipe types and costs" in info_requested: key_guess = "pipe_types_and_costs_per_unit_length_per_diameter"
                    elif "pump types and costs" in info_requested: key_guess = "pump_types_and_costs_based_on_head_flow_capacity"
                    elif "minimum pressure requirements" in info_requested: key_guess = "minimum_pressure_requirements_at_nodes"
                    elif "hydraulic simulation library" in info_requested: key_guess = "hydraulic_simulation_library_tool"
                    else: key_guess = info_requested.replace('(optional)', '').strip().replace(' ', '_').lower() # Fallback
                    # --- End Key Guessing ---
                print(f"    Updating/Adding key '{key_guess}' with value '{answer}'")
                # Basic type conversion attempt
                if ("list" in key_guess or "constraints" in key_guess or "dimensions" in key_guess or "requirements" in key_guess or "preferences" in key_guess or "relationships" in key_guess) and isinstance(answer, str) and '[' in answer and ']' in answer:
                    try: current_data[key_guess] = json.loads(answer.replace("'", '"')); print(f"      (Parsed as list/dict)") ; continue
                    except json.JSONDecodeError: print(f"      (Could not parse answer as JSON list/dict, storing as string)")
                elif ("matrix" in key_guess or "_data" in key_guess or "_source" in key_guess) and isinstance(answer, str): print(f"      (Storing potential file path/source as string)") # Avoid parsing matrices
                elif key_guess in ["number_of_drivers_vehicles", "vehicle_mpg", "num_new_shops", "num_nurses", "num_shifts", "num_sessions", "num_rooms", "num_days", "building_stories", "num_homes"] and isinstance(answer, str) and answer.isdigit():
                     try: current_data[key_guess] = int(answer); print(f"      (Parsed as int)"); continue
                     except ValueError: print(f"      (Could not parse answer as int, storing as string)")
                elif isinstance(answer, str) and answer.replace('.','',1).isdigit():
                     try: current_data[key_guess] = float(answer); print(f"      (Parsed as float)"); continue
                     except ValueError: print(f"      (Could not parse answer as float, storing as string)")
                current_data[key_guess] = answer
            else: print(f"    Warning: More answers ({len(answers)}) than questions ({len(questions)}).")
        print(f"  Output Updated Data Keys: {list(current_data.keys())}"); print("Step 4.4: Exiting Update Data Based on Answers.")
        return current_data

    def _perform_geocoding_if_needed(self, problem_context: ProblemContext) -> None:
        print("\nStep 5: Entering Geocoding (if needed)...")
        if not self.geolocator: print("  Skipping geocoding, geolocator not initialized."); return
        data = problem_context.extracted_data; loc_key = None
        if "delivery_addresses_list" in data: loc_key = "delivery_addresses_list"
        elif "list_of_cities" in data: loc_key = "list_of_cities"
        elif "list_of_locations" in data: loc_key = "list_of_locations"
        elif "competitor_locations" in data: loc_key = "competitor_locations" # Geocode competitor locs too?
        elif "development_location_area_definition" in data: loc_key = "development_location_area_definition" # Maybe geocode single point?
        elif "user_provided_locations" in data: loc_key = "user_provided_locations"
        if loc_key and isinstance(data.get(loc_key), list):
            if 'geocoded_locations' in data: print("  Skipping geocoding, already present.")
            else:
                print(f"  Attempting geocoding for locations in key: '{loc_key}'"); geocoded_locations = []
                locations_to_geocode = data[loc_key]; location_names = []
                if locations_to_geocode:
                     if isinstance(locations_to_geocode[0], dict) and 'name' in locations_to_geocode[0]: location_names = [loc.get('name', '') for loc in locations_to_geocode]; print("    (Extracting names from list of dicts)")
                     elif isinstance(locations_to_geocode[0], str): location_names = locations_to_geocode
                     else: print(f"    Warning: Cannot determine location names from format: {type(locations_to_geocode[0])}")
                else: print("    Warning: Location list is empty.")
                for loc_name in location_names:
                    if isinstance(loc_name, str) and loc_name:
                        print(f"    Geocoding '{loc_name}'...")
                        try:
                            location = self.geolocator.geocode(loc_name, timeout=10)
                            if location: iata_code = data.get('city_to_code', {}).get(loc_name); geo_loc = Location(name=loc_name, address=location.address, coords=(location.latitude, location.longitude), iata_code=iata_code); geocoded_locations.append(geo_loc); print(f"      Success: {geo_loc.coords}" + (f" (IATA: {iata_code})" if iata_code else ""))
                            else: print(f"      Failed: Could not geocode."); geocoded_locations.append(Location(name=loc_name))
                        except Exception as e: print(f"      Error: {e}"); geocoded_locations.append(Location(name=loc_name))
                    else: print(f"    Skipping invalid/empty location name: '{loc_name}'")
                data['geocoded_locations'] = geocoded_locations; print("  Geocoding complete.")
        # Handle single location geocoding if needed
        elif loc_key and isinstance(data.get(loc_key), str):
             loc_name = data[loc_key]
             print(f"    Geocoding single location '{loc_name}'...")
             # Similar try/except block as above for single location
             try:
                 location = self.geolocator.geocode(loc_name, timeout=10)
                 if location: data['geocoded_location'] = Location(name=loc_name, address=location.address, coords=(location.latitude, location.longitude)); print(f"      Success: {location.latitude, location.longitude}")
                 else: print(f"      Failed: Could not geocode.")
             except Exception as e: print(f"      Error: {e}")

        else: print("  No suitable location list/string found for geocoding or data type incorrect.")
        print("Step 5: Exiting Geocoding.")

    def present_data_for_confirmation(self, problem_context: ProblemContext, simulate: bool = False) -> bool:
        print("\nStep 6: Entering Present Data for Confirmation..."); print(f"  Identified Problem Type: {problem_context.identified_type.name}"); print("  Collected & Prepared Data:")
        try: print(json.dumps(problem_context.extracted_data, indent=2, default=lambda o: repr(o)))
        except Exception as e: print(f"    Error converting data to JSON: {e}"); print(f"    Raw Data: {problem_context.extracted_data}")
        if simulate: print("  > Is the above problem formulation correct...?: yes (Simulated)"); problem_context.is_confirmed = True
        else: confirmation = input("  > Is the above problem formulation correct...? (yes/no): "); problem_context.is_confirmed = confirmation.lower().strip() == 'yes'
        print(f"  User confirmation status: {problem_context.is_confirmed}"); print("Step 6: Exiting Present Data for Confirmation.")
        return problem_context.is_confirmed

    def run_preparation_pipeline(self, description: str, simulation_data: Optional[pd.DataFrame] = None) -> Optional[ProblemContext]:
        print("\nStarting Preparation Pipeline...")
        context = ProblemContext(original_description=description); is_simulation = simulation_data is not None
        problem_type_map = {
             ProblemType.TSP_FLIGHTS: "Traveling Salesman Problem", ProblemType.TSP_DRIVING_FUEL: "TSP with constraints",
             ProblemType.KNAPSACK_MOVING: "Knapsack/Bin Packing Problem", ProblemType.VRP_MANHATTAN: "Vehicle Routing Problem with Time Windows",
             ProblemType.FACILITY_LOCATION_SEATTLE: "Facility Location Problem", ProblemType.NURSE_SCHEDULING_MGH: "Nurse Scheduling Problem",
             ProblemType.PORTFOLIO_OPTIMIZATION: "Portfolio Optimization", ProblemType.TIMETABLING_CONFERENCE: "Timetabling Problem",
             ProblemType.PROJECT_SCHEDULING_CONSTRUCTION: "Project Scheduling Problem", ProblemType.NETWORK_DESIGN_WATER: "Network Design Problem",
        }
        print("\n=== Step 1: Problem Categorization ==="); context.identified_type = call_llm_categorize(description, list(ProblemType))
        if context.identified_type == ProblemType.UNKNOWN: print("Pipeline Error: Could not identify problem type."); return None
        print(f"Pipeline Update: Problem categorized as {context.identified_type.name}"); problem_type_str_for_sim = problem_type_map.get(context.identified_type, context.identified_type.name)

        print("\n=== Step 2: Initial Extraction / Automatic Data Fetching ==="); context.extracted_data = call_llm_extract_initial_data(context.identified_type, description)
        print(f"Pipeline Update: Initial data extracted: {list(context.extracted_data.keys())}"); auto_fetched_keys = []
        if context.identified_type == ProblemType.TSP_FLIGHTS:
            print("\n--- Starting Automatic Flight Data Fetching ---"); context.requires_manual_data = False; cities = context.extracted_data.get("list_of_cities", [])
            if not cities: print("Pipeline Error: City list needed for TSP_FLIGHTS."); return None
            city_to_code = self._get_airport_codes(cities); context.extracted_data['city_to_code'] = city_to_code; print(f"Pipeline Update: Stored city-to-code mapping.")
            valid_codes = [code for code in city_to_code.values() if code is not None]
            if len(valid_codes) < len(cities): print(f"Pipeline Warning: Found codes for {len(valid_codes)}/{len(cities)} cities.");
            if len(valid_codes) < 2: print("Pipeline Error: Need >= 2 valid codes."); return None
            cost_matrix, duration_matrix = self._fetch_flight_data(city_to_code)
            if cost_matrix is not None and duration_matrix is not None: print("Pipeline Update: Successfully fetched/simulated flight data."); context.extracted_data['flight_cost_matrix'] = cost_matrix; context.extracted_data['flight_duration_matrix'] = duration_matrix; auto_fetched_keys.extend(['flight_cost_matrix', 'flight_duration_matrix'])
            else: print("Pipeline Error: Failed to fetch flight data."); return None
            print("--- End Automatic Flight Data Fetching ---")
        elif context.identified_type == ProblemType.VRP_MANHATTAN: print("\n--- Deferring Geocoding until after potential address list update ---")
        else: print("Pipeline Info: No automatic data fetching configured for this problem type in Step 2.")

        print("\n=== Step 3 & 4: Manual Data Refinement / Simulation ==="); context.missing_info = call_llm_identify_missing_manual(context.identified_type, context.extracted_data, auto_fetched_keys)
        if context.missing_info: context.requires_manual_data = True; print(f"Pipeline Info: Manual data required for: {context.missing_info}")
        else: context.requires_manual_data = False; print("Pipeline Info: No essential manual information identified as missing.")

        if context.requires_manual_data:
            loop_name = "Simulation" if is_simulation else "Manual Data Refinement"; print(f"\n--- Starting {loop_name} Loop ---")
            for attempt in range(1):
                print(f"--- {loop_name} Attempt {attempt + 1} ---"); context.user_questions = call_llm_generate_questions(context.missing_info)
                if not context.user_questions: print("Pipeline Error: LLM failed to generate questions."); return None
                user_answers = []
                if is_simulation:
                    print("Pipeline Action: Simulating answers based on requirements CSV..."); current_missing_info = context.missing_info[:]
                    for missing_item_desc in current_missing_info:
                        sim_answer = f"SimulatedData_NotFound_For_{missing_item_desc[:20]}"; search_term = missing_item_desc.replace('(optional)','').strip().lower()
                        matched_rows = simulation_data[(simulation_data['ProblemType'].str.lower() == problem_type_str_for_sim.lower()) & (simulation_data['RequiredInfoDescription'].str.lower().str.contains(search_term, na=False))]
                        if not matched_rows.empty: sim_answer = matched_rows.iloc[0]['Format_Example']; print(f"    Found sim data for '{missing_item_desc}': Using -> '{sim_answer}'")
                        else: print(f"    Warning: No sim data found matching '{missing_item_desc}' for '{problem_type_str_for_sim}'.")
                        user_answers.append(sim_answer)
                    print(f"Pipeline Info: Simulated answers obtained: {user_answers}")
                else: print("Error: Manual input function (_get_user_input) is commented out."); return None
                context.extracted_data = self._update_data_based_on_answers(context.extracted_data, context.user_questions, user_answers)
                print("Pipeline Update: Data updated with answers."); context.missing_info = call_llm_identify_missing_manual(context.identified_type, context.extracted_data, auto_fetched_keys)
                if not context.missing_info: print("Pipeline Info: All essential info seems gathered/simulated."); break
            if context.missing_info: print(f"Pipeline Error: Could not gather/simulate all required {loop_name} info."); print(f"  Remaining: {context.missing_info}"); return None
            print(f"--- End {loop_name} Loop ---")
        else: print("Pipeline Info: Skipping manual data refinement loop.")

        print("\n=== Step 5: Post-Processing ==="); self._perform_geocoding_if_needed(context)
        print("\n=== Step 6: Final Confirmation ===");
        if self.present_data_for_confirmation(context, simulate=is_simulation): print("\nPreparation Pipeline Completed Successfully."); return context
        else: print("\nPreparation Pipeline Halted: Confirmation Failed."); return None

print("SolvePrep Utils Defined.")
# --- End of solve_prep_utils.py ---

SolvePrep Utils Defined.


# New Section

In [2]:
# run_simulation.py

import pandas as pd
import json
import os
from typing import Optional, List, Dict
from tabulate import tabulate # Import tabulate for table generation

# Import components from the utility file
try:
    from solve_prep_utils import SolvePrep, ProblemType, ProblemContext # Only import needed items
    print("Successfully imported components from solve_prep_utils.py")
except ImportError as e:
    print(f"Error importing from solve_prep_utils.py: {e}")
    print("Ensure solve_prep_utils.py is in the same directory or Python path.")
    exit()
except Exception as e: # Catch other potential errors during import
    print(f"An unexpected error occurred during import: {e}")
    exit()

# --- Helper Functions for File IO (Moved Here) ---
def read_problems_df_from_csv(filepath: str) -> Optional[pd.DataFrame]:
    """Reads all problem descriptions from a CSV file."""
    print(f"\nReading all problems from '{filepath}'...")
    try:
        df = pd.read_csv(filepath)
        if 'ProblemDescription' in df.columns:
            print(f"  Successfully read {len(df)} problems.")
            return df
        else:
            print(f"  Error: CSV file must contain a 'ProblemDescription' column.")
            return None
    except FileNotFoundError:
        print(f"  Error: CSV file not found at '{filepath}'.")
        return None
    except Exception as e:
        print(f"  Error reading CSV file: {e}")
        return None

def load_simulation_data(filepath: str) -> Optional[pd.DataFrame]:
     """Loads the required info specifications for simulation."""
     print(f"\nLoading simulation answers/requirements from '{filepath}'...")
     try:
          df = pd.read_csv(filepath)
          required_cols = ['ProblemID', 'ProblemType', 'RequiredInfoDescription', 'Format_Example']
          if all(col in df.columns for col in required_cols):
               print(f"  Successfully loaded simulation data ({len(df)} rows).")
               return df
          else:
               print(f"  Error: Simulation CSV missing required columns. Expected: {required_cols}")
               return None
     except FileNotFoundError:
          print(f"  Error: Simulation CSV file not found at '{filepath}'.")
          return None
     except Exception as e:
          print(f"  Error reading simulation CSV file: {e}")
          return None

# --- Function to Generate Analysis Table ---
def generate_analysis_table(results: List[Dict]) -> str:
     """Formats the results list into a Markdown table."""
     headers = ["Index", "Status", "Detected Type", "Issues/Notes"]
     table_data = []
     for r in results:
          index = r.get("index", "N/A")
          status = r.get("status", "Unknown")
          detected_type = r.get("type", "Unknown")
          # Extract key issues or summary points
          notes = []
          if status == "FailedPreparation":
               notes.append("Pipeline failed or was not confirmed.")
          elif isinstance(r.get("data"), dict): # Check if data exists for successful runs
               data = r["data"]
               if detected_type == "OTHER_HEURISTIC":
                    notes.append("Miscategorized by placeholder.")
               if data.get("travel_date_range", "").startswith("SimulatedData_NotFound"):
                    notes.append("Optional info sim data missing.")
               if "flight_cost_matrix" not in data and detected_type == "TSP_FLIGHTS":
                    notes.append("Flight matrix missing (API Error?).")
               if not data.get("item_list_dimensions_values") and detected_type == "KNAPSACK_MOVING":
                   notes.append("Knapsack items missing.")
               # Add more specific checks based on expected data for each type
          elif status == "DisplayError":
                notes.append("Error displaying final data.")

          if not notes:
               notes.append("Completed successfully.")

          table_data.append([index, status, detected_type, "; ".join(notes)])

     # Use tabulate to create the Markdown table
     try:
        return tabulate(table_data, headers=headers, tablefmt="pipe")
     except ImportError:
          print("\nError: 'tabulate' library not found. Cannot generate table.")
          print("Install it using: pip install tabulate")
          # Fallback to simpler format if tabulate fails
          table_str = "| " + " | ".join(headers) + " |\n"
          table_str += "| " + " | ".join(["---"] * len(headers)) + " |\n"
          for row in table_data:
              table_str += "| " + " | ".join(map(str, row)) + " |\n"
          return table_str
     except Exception as e:
         print(f"\nError generating table: {e}")
         return "Table generation failed."


# --- Main Execution Block ---
if __name__ == "__main__":
    print("\n--- Starting Main Execution Block (run_simulation.py) ---")

    # --- Configuration ---
    PROBLEMS_CSV_PATH = 'problems.csv'
    SIMULATION_CSV_PATH = 'problem_info_reqs.csv'
    GEMINI_API_KEY = None
    FLIGHT_API_KEY = None

    print("\nConfiguration:")
    print(f"  Problem Descriptions CSV: {PROBLEMS_CSV_PATH}")
    print(f"  Simulation Requirements CSV: {SIMULATION_CSV_PATH}")
    print(f"  Gemini API Key Provided: {bool(GEMINI_API_KEY)}")
    print(f"  Flight API Key Provided: {bool(FLIGHT_API_KEY)}")

    # --- Create/Ensure Dummy Files Exist ---
    print("\nEnsuring Input Files Exist...")
    # Create problems.csv if needed - *USE THE FULL 10 DESCRIPTIONS HERE FOR FULL ANALYSIS*
    if not os.path.exists(PROBLEMS_CSV_PATH):
        print(f"  Creating dummy problem description CSV: {PROBLEMS_CSV_PATH}")
        # **IMPORTANT**: For full analysis, replace this list with the 10 problem descriptions
        dummy_problems_data = {'ProblemDescription': [
            "I need to visit all the following European cities in the most efficient order: London, Paris, Berlin, Rome, Madrid, Amsterdam, Prague, Vienna, Budapest, and Barcelona. I'll fly between them and want to minimize my total airfare and travel time.", # Problem 1
            "I'm planning a road trip through the US national parks. I want to visit Yellowstone, Grand Canyon, Yosemite, Zion, Olympic, Glacier, Acadia, Great Smoky Mountains, Grand Teton, and Rocky Mountain. I need to find the most fuel-efficient route based on my car that gets 25 MPG.", # Problem 2
            "I need to move items from my 3-bedroom apartment in Boston to my new place in Chicago. I have furniture pieces of different sizes and values, and I need to determine which items to take in a 26-foot U-Haul truck to maximize the value of what I bring.", # Problem 3
            "Our delivery service needs to distribute packages to 45 addresses across Manhattan using 5 drivers. We need routes that account for real-time traffic conditions and ensure all deliveries happen within promised time windows.", # Problem 4
            "I need to find the optimal locations for 7 new coffee shops in Seattle to maximize potential customers while ensuring shops are at least 0.5 miles apart and accounting for competitor locations.", # Problem 5
            "I need to schedule 25 nurses across 3 shifts at Massachusetts General Hospital, considering their shift preferences, required skill levels for each ward, and ensuring no one works more than 5 consecutive days.", # Problem 6
            "I need to invest $50,000 across stocks from the S&P 500, bonds, and ETFs to maximize returns with a risk level I'm comfortable with and proper diversification across sectors.", # Problem 7
            "I'm organizing a conference at the Hilton Chicago with 35 sessions across 8 rooms over 3 days. I need to schedule them to minimize room changes for topic tracks and avoid scheduling similar topics simultaneously.", # Problem 8
            "I need to plan the construction sequence for our 50-story building in downtown Miami, determining the optimal order of tasks considering crew availability, material delivery times, and weather forecasts to minimize the project timeline.", # Problem 9
            "I need to design a water distribution network for a new development in Phoenix with 120 homes, determining pipe diameters and pump capacities to ensure adequate pressure while minimizing infrastructure costs." # Problem 10
        ]}
        try: pd.DataFrame(dummy_problems_data).to_csv(PROBLEMS_CSV_PATH, index=False); print(f"  Successfully created {PROBLEMS_CSV_PATH} with 10 problems.")
        except Exception as e: print(f"  Error creating dummy {PROBLEMS_CSV_PATH}: {e}")
    else: print(f"  Using existing problem description CSV: {PROBLEMS_CSV_PATH}")

    # Ensure simulation requirements file exists
    if not os.path.exists(SIMULATION_CSV_PATH):
         print(f"  ERROR: {SIMULATION_CSV_PATH} not found. Please create this file with the full requirements.")
         # Create basic dummy file (won't provide good simulation answers for all types)
         print(f"  Creating basic dummy requirements CSV: {SIMULATION_CSV_PATH}")
         dummy_reqs_data = {
             'ProblemID': [1, 3, 3], 'ProblemType': ["Traveling Salesman Problem", "Knapsack/Bin Packing Problem", "Knapsack/Bin Packing Problem"],
             'RequiredInfoDescription': ["Airport Transfer Times per City", "List of items with dimensions (width, height, depth) and value", "Truck cargo dimensions (width, height, depth)"],
             'Format_Example': ["1.5, 1.0, 1.2", "[{'name':'Painting', 'width_cm':50, 'height_cm':50, 'depth_cm':10, 'value_usd':10000}, {'name':'Sculpture', 'width_cm':30, 'height_cm':30, 'depth_cm':80, 'value_usd':5000}]", "100, 100, 100"],
             'AutomationNotes': ["Estimate or User Input", "User Input File", "User Input"]
         }
         try: pd.DataFrame(dummy_reqs_data).to_csv(SIMULATION_CSV_PATH, index=False); print(f"  Successfully created basic dummy {SIMULATION_CSV_PATH}.")
         except Exception as e: print(f"  Error creating dummy {SIMULATION_CSV_PATH}: {e}")
    else: print(f"  Using existing simulation requirements CSV: {SIMULATION_CSV_PATH}")


    # --- Load Data ---
    print("\nLoading Data...")
    problems_df = read_problems_df_from_csv(PROBLEMS_CSV_PATH)
    simulation_reqs_df = load_simulation_data(SIMULATION_CSV_PATH)

    # --- Instantiate Solver Prep ---
    print("\nInstantiating SolvePrep...")
    prep = SolvePrep(gemini_api_key=GEMINI_API_KEY, flight_api_key=FLIGHT_API_KEY)
    print("SolvePrep Instantiated.")

    # --- Process Each Problem ---
    all_results_summary = [] # Store summary results for final table
    if problems_df is not None and simulation_reqs_df is not None:
        print(f"\n--- Starting to Process {len(problems_df)} Problems ---")

        for index, row in problems_df.iterrows():
            problem_status = "Unknown"
            problem_type_name = "Unknown"
            final_data = None

            if 'ProblemDescription' not in row:
                print(f"\nSkipping row {index}: 'ProblemDescription' column missing.")
                all_results_summary.append({"index": index, "status": "Skipped", "type": "N/A", "data": None})
                continue

            problem_desc = row['ProblemDescription']
            print(f"\n\n<<<<<<<<<< Processing Problem Index {index} >>>>>>>>>>")
            print(f"Description: '{problem_desc[:100]}...'")

            try:
                prepared_context = prep.run_preparation_pipeline(problem_desc, simulation_data=simulation_reqs_df)

                if prepared_context and prepared_context.is_confirmed:
                    print(f"\n--- Problem {index} Preparation Complete ---")
                    problem_status = "Success"
                    problem_type_name = prepared_context.identified_type.name
                    final_data = prepared_context.extracted_data # Store for analysis table notes
                    print(f"  Type: {problem_type_name}")
                    print("  Final Prepared Data (JSON):")
                    try: print(json.dumps(final_data, indent=2, default=repr))
                    except Exception as e: print(f"  Error displaying final data as JSON: {e}")
                    print("\n[Placeholder] Would proceed to EvoMoE stage for this problem now...")
                else:
                    print(f"\n--- Problem {index} Preparation Failed or Not Confirmed ---")
                    problem_status = "FailedPreparation"
                    # Try to get type if categorization happened before failure
                    if prepared_context: problem_type_name = prepared_context.identified_type.name

            except Exception as e:
                 print(f"\n--- CRITICAL ERROR during processing Problem Index {index} ---")
                 print(f"  Error: {e}")
                 problem_status = "CriticalError"
                 # You might want to add more detailed error logging here

            all_results_summary.append({"index": index, "status": problem_status, "type": problem_type_name, "data": final_data}) # Store result summary
            print(f"<<<<<<<<<< Finished Problem Index {index} >>>>>>>>>>")

        # --- Summary Table Generation ---
        print("\n\n--- All Problems Processed ---")
        if all_results_summary:
             print("\n--- Final Analysis Table ---")
             # Pass the collected summary data to the table generator
             analysis_table = generate_analysis_table(all_results_summary)
             print(analysis_table)
             print("--- End of Table ---")
        else:
             print("No problems were processed or results collected.")


    elif simulation_reqs_df is None:
         print("\n--- Solver exiting: Could not load simulation requirements data. ---")
    else: # problems_df is None
        print("\n--- Solver exiting: Could not read problems from CSV. ---")

    print("\n--- Main Execution Block Finished ---")

# --- End of run_simulation.py ---

Error importing from solve_prep_utils.py: No module named 'solve_prep_utils'
Ensure solve_prep_utils.py is in the same directory or Python path.

--- Starting Main Execution Block (run_simulation.py) ---

Configuration:
  Problem Descriptions CSV: problems.csv
  Simulation Requirements CSV: problem_info_reqs.csv
  Gemini API Key Provided: False
  Flight API Key Provided: False

Ensuring Input Files Exist...
  Using existing problem description CSV: problems.csv
  Using existing simulation requirements CSV: problem_info_reqs.csv

Loading Data...

Reading all problems from 'problems.csv'...
  Successfully read 10 problems.

Loading simulation answers/requirements from 'problem_info_reqs.csv'...
  Successfully loaded simulation data (54 rows).

Instantiating SolvePrep...
SolvePrep Instantiated.

--- Starting to Process 10 Problems ---


<<<<<<<<<< Processing Problem Index 0 >>>>>>>>>>
Description: 'I need to visit all the following European cities in the most efficient order: London, Pari

  matched_rows = simulation_data[(simulation_data['ProblemType'].str.lower() == problem_type_str_for_sim.lower()) & (simulation_data['RequiredInfoDescription'].str.lower().str.contains(search_term, na=False))]


      Failed: Could not geocode.
Step 5: Exiting Geocoding.

=== Step 6: Final Confirmation ===

Step 6: Entering Present Data for Confirmation...
  Identified Problem Type: VRP_MANHATTAN
  Collected & Prepared Data:
{
  "num_drivers": 5,
  "num_addresses_expected": 45,
  "delivery_addresses_list": "List of full street addresses: ['123 Main St, New York, NY 10001', ...]",
  "customer_delivery_time_windows": "List per address: [{'address_idx': 0, 'start_time': '09:00', 'end_time': '11:00'}, ...]",
  "real_time_traffic_data_source": "API endpoint/key for traffic conditions"
}
  > Is the above problem formulation correct...?: yes (Simulated)
  User confirmation status: True
Step 6: Exiting Present Data for Confirmation.

Preparation Pipeline Completed Successfully.

--- Problem 3 Preparation Complete ---
  Type: VRP_MANHATTAN
  Final Prepared Data (JSON):
{
  "num_drivers": 5,
  "num_addresses_expected": 45,
  "delivery_addresses_list": "List of full street addresses: ['123 Main St, New Y