In [6]:
# data_generation.py
import pandas as pd
import numpy as np
from faker import Faker
import random

def generate_building_dataset(n_samples=5000):
    """
    Generate synthetic building energy dataset with all 34 features
    """
    np.random.seed(42)
    random.seed(42)
    fake = Faker()
    
    data = {}
    
    # === BASIC BUILDING CHARACTERISTICS ===
    data['total_floor_area'] = np.random.uniform(500, 50000, n_samples)  # m¬≤
    data['number_of_floors'] = np.random.randint(1, 50, n_samples)
    data['occupant_count'] = np.random.randint(10, 2000, n_samples)
    data['building_age'] = np.random.randint(1, 100, n_samples)
    
    # === SYSTEM EFFICIENCIES ===
    data['hvac_efficiency'] = np.random.uniform(0.6, 0.95, n_samples)  # COP ratio (0.6-0.95)
    data['lighting_power_density'] = np.random.uniform(5, 20, n_samples)  # W/m¬≤
    data['equipment_power_density'] = np.random.uniform(3, 15, n_samples)  # W/m¬≤
    
    # === ENVIRONMENTAL FACTORS ===
    # For variety, use different climate distributions
    climate_zones = np.random.choice([1, 2, 3, 4, 5, 6, 7], n_samples, p=[0.1, 0.15, 0.2, 0.25, 0.15, 0.1, 0.05])
    
    # Generate temperatures based on climate zone
    temperatures = []
    for zone in climate_zones:
        if zone in [1, 2]:  # Hot climates
            temperatures.append(np.random.uniform(25, 40))
        elif zone in [3, 4]:  # Temperate
            temperatures.append(np.random.uniform(15, 30))
        else:  # Cold climates
            temperatures.append(np.random.uniform(-5, 20))
    
    data['outdoor_temperature'] = np.array(temperatures)
    data['humidity'] = np.random.uniform(20, 90, n_samples)  # Percentage
    data['climate_zone_numeric'] = climate_zones
    
    # === OPERATIONAL FACTORS ===
    data['occupancy_hours'] = np.random.uniform(8, 24, n_samples)
    data['weekday_numeric'] = np.random.randint(0, 7, n_samples)  # 0=Monday, 6=Sunday
    data['has_elevator_numeric'] = np.random.choice([0, 1], n_samples, p=[0.3, 0.7])
    data['floor_height'] = np.random.uniform(3, 5, n_samples)  # meters
    
    # === BUILDING TYPE (one-hot encoded) ===
    building_types = np.random.choice([0, 1, 2, 3, 4], n_samples, p=[0.4, 0.2, 0.2, 0.1, 0.1])
    data['building_type_numeric'] = building_types
    
    # === BUILDING HEIGHT CATEGORIES ===
    data['is_single_story'] = np.where(data['number_of_floors'] == 1, 1, 0)
    data['is_low_rise'] = np.where((data['number_of_floors'] >= 2) & (data['number_of_floors'] <= 4), 1, 0)
    data['is_mid_rise'] = np.where((data['number_of_floors'] >= 5) & (data['number_of_floors'] <= 9), 1, 0)
    data['is_high_rise'] = np.where(data['number_of_floors'] >= 10, 1, 0)
    
    # === CALCULATE DERIVED FEATURES ===
    data['floor_area_per_floor'] = data['total_floor_area'] / data['number_of_floors']
    data['occupant_density'] = data['occupant_count'] / data['total_floor_area']
    
    # === ENGINEERED FEATURES (based on research insights) ===
    # 1. HVAC floor factor (from sensitivity analysis - 38.18% impact)
    data['hvac_floor_factor'] = data['hvac_efficiency'] * (data['number_of_floors'] / 10)
    
    # 2. Vertical lighting factor (15.07% impact)
    data['vertical_lighting_factor'] = data['lighting_power_density'] * np.sqrt(data['number_of_floors'])
    
    # 3. Vertical occupancy impact (14.48% impact)
    data['vertical_occupancy_impact'] = data['occupant_density'] * (data['number_of_floors'] ** 0.5)
    
    # 4. Elevator energy estimate
    data['elevator_energy_estimate'] = data['has_elevator_numeric'] * data['number_of_floors'] * 100
    
    # 5. Stack effect intensity (buoyancy-driven airflow)
    data['stack_effect_intensity'] = (data['number_of_floors'] * data['floor_height'] * 
                                     np.abs(data['outdoor_temperature'] - 22)) / 100
    
    # 6. Envelope efficiency score (combining age and climate)
    data['envelope_efficiency_score'] = ((100 - data['building_age']) / 100) * (1 - data['climate_zone_numeric'] / 10)
    data['envelope_efficiency_score'] = np.clip(data['envelope_efficiency_score'], 0.1, 1.0)
    
    # 7. Floor compactness (shape efficiency)
    data['floor_compactness'] = data['total_floor_area'] / (data['number_of_floors'] * data['floor_area_per_floor'])
    data['floor_compactness'] = np.clip(data['floor_compactness'], 0.5, 2.0)
    
    # 8. Thermal mass effect (based on construction type and age)
    # Older buildings often have better thermal mass
    data['thermal_mass_effect'] = 0.5 + (data['building_age'] / 200) + np.random.uniform(-0.1, 0.1, n_samples)
    data['thermal_mass_effect'] = np.clip(data['thermal_mass_effect'], 0.3, 2.0)
    
    # 9. Comprehensive efficiency score (weighted sum from research)
    data['comprehensive_efficiency_score'] = (
        data['hvac_efficiency'] * 0.3818 +  # Weight from sensitivity analysis
        data['envelope_efficiency_score'] * 0.3 +
        (1 - data['lighting_power_density'] / 30) * 0.1507
    )
    
    # 10. Degree days calculations
    data['cooling_degree_hours'] = np.maximum(data['outdoor_temperature'] - 18, 0) * 24
    data['heating_degree_hours'] = np.maximum(18 - data['outdoor_temperature'], 0) * 24
    
    # 11. Humidity comfort penalty (optimal humidity ~50%)
    data['humidity_comfort_penalty'] = np.abs(data['humidity'] - 50) / 100
    
    # 12. Usage intensity (occupancy hours normalized by area)
    data['usage_intensity'] = (data['occupant_count'] * data['occupancy_hours']) / data['total_floor_area']
    
    # === CREATE DATAFRAME ===
    df = pd.DataFrame(data)
    
    # === CALCULATE ENERGY CONSUMPTION (TARGET VARIABLE) ===
    # Base formula incorporating key factors from research sensitivity analysis
    base_energy = (
        df['total_floor_area'] * 0.8 +           # Area impact (kWh/m¬≤)
        df['occupant_count'] * 120 +             # Occupant impact (kWh/person)
        df['hvac_efficiency'] * -5000 +          # Efficiency impact (negative: higher efficiency = lower energy)
        df['lighting_power_density'] * 200 +     # Lighting impact
        df['outdoor_temperature'] * 50 +         # Temperature impact
        df['cooling_degree_hours'] * 0.5 +       # Cooling needs
        df['heating_degree_hours'] * 0.3 +       # Heating needs
        df['building_age'] * 10 +                # Age penalty
        df['stack_effect_intensity'] * 100 +     # Stack effect penalty
        df['vertical_lighting_factor'] * 50 +    # Vertical lighting impact
        df['vertical_occupancy_impact'] * 1000 + # Vertical occupancy impact
        df['humidity_comfort_penalty'] * 1000 +  # Humidity penalty
        df['usage_intensity'] * 1000             # Usage intensity impact
    )
    
    # Add noise (realistic variation)
    noise = np.random.normal(0, 0.2, n_samples) * base_energy
    
    # Ensure positive values and add some outliers (5% chance)
    outliers = np.random.choice([0, 1], n_samples, p=[0.95, 0.05])
    outlier_factor = np.where(outliers == 1, np.random.uniform(1.5, 3.0, n_samples), 1.0)
    
    # Final energy consumption calculation
    df['energy_consumption_kwh'] = np.maximum(
        (base_energy + noise) * outlier_factor, 
        1000  # Minimum consumption
    )
    
    # Add building ID and name for realism
    df['building_id'] = [f'B{str(i).zfill(5)}' for i in range(n_samples)]
    df['building_name'] = [f'{fake.company()} Building' for _ in range(n_samples)]
    df['location'] = [fake.city() for _ in range(n_samples)]
    
    # Add some realistic correlations
    # Older buildings tend to have lower HVAC efficiency
    age_efficiency_corr = np.random.choice([0, 1], n_samples, p=[0.3, 0.7])
    df.loc[age_efficiency_corr == 1, 'hvac_efficiency'] *= (1 - df.loc[age_efficiency_corr == 1, 'building_age'] / 200)
    
    # High-rise buildings more likely to have elevators
    high_rise_mask = df['is_high_rise'] == 1
    df.loc[high_rise_mask, 'has_elevator_numeric'] = 1
    
    return df

def save_dataset(df, filename='building_energy_dataset.csv'):
    """Save dataset to CSV"""
    # Reorder columns for better readability
    feature_columns = [
        'building_id', 'building_name', 'location',
        'total_floor_area', 'number_of_floors', 'occupant_count', 
        'building_age', 'floor_area_per_floor', 'occupant_density',
        'hvac_efficiency', 'lighting_power_density', 'equipment_power_density',
        'outdoor_temperature', 'humidity', 'occupancy_hours', 
        'weekday_numeric', 'has_elevator_numeric', 'floor_height',
        'building_type_numeric', 'climate_zone_numeric',
        'is_single_story', 'is_low_rise', 'is_mid_rise', 'is_high_rise',
        'hvac_floor_factor', 'vertical_lighting_factor', 'vertical_occupancy_impact',
        'elevator_energy_estimate', 'stack_effect_intensity', 'envelope_efficiency_score',
        'floor_compactness', 'thermal_mass_effect', 'comprehensive_efficiency_score',
        'cooling_degree_hours', 'heating_degree_hours', 'humidity_comfort_penalty',
        'usage_intensity', 'energy_consumption_kwh'
    ]
    
    df = df[feature_columns]
    df.to_csv(filename, index=False)
    print(f"Dataset saved to {filename}")
    print(f"Shape: {df.shape}")
    print(f"Features: {len(feature_columns) - 1} (plus target variable)")
    return df

# Create and explore dataset
if __name__ == "__main__":
    # Generate dataset
    print("Generating synthetic building energy dataset...")
    df = generate_building_dataset(n_samples=5000)
    
    # Save dataset
    df = save_dataset(df, 'building_energy_dataset.csv')
    
    # Display dataset info
    print("\n=== Dataset Information ===")
    print(f"Total samples: {len(df)}")
    print(f"Features: {len(df.columns) - 1}")  # Excluding target
    
    # Show basic statistics
    print("\n=== Feature Statistics ===")
    stats_df = pd.DataFrame({
        'Feature': df.columns,
        'Type': df.dtypes,
        'Non-Null': df.count(),
        'Null': df.isnull().sum(),
        'Mean': df.mean(numeric_only=True),
        'Std': df.std(numeric_only=True),
        'Min': df.min(numeric_only=True),
        'Max': df.max(numeric_only=True)
    })
    
    # Display for key features
    key_features = [
        'total_floor_area', 'number_of_floors', 'occupant_count',
        'building_age', 'energy_consumption_kwh', 'hvac_efficiency',
        'lighting_power_density', 'outdoor_temperature'
    ]
    
    print("\n=== Key Features Statistics ===")
    for feature in key_features:
        if feature in df.columns:
            print(f"\n{feature}:")
            print(f"  Mean: {df[feature].mean():.2f}")
            print(f"  Std:  {df[feature].std():.2f}")
            print(f"  Min:  {df[feature].min():.2f}")
            print(f"  Max:  {df[feature].max():.2f}")
    
    # Display correlation with target
    print("\n=== Top Correlations with Energy Consumption ===")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    correlations = df[numeric_cols].corr()['energy_consumption_kwh'].abs().sort_values(ascending=False)
    print(correlations.head(15))
    
    # Save a smaller sample for testing
    df_sample = df.head(100)
    df_sample.to_csv('building_energy_sample.csv', index=False)
    print("\nSample dataset saved to 'building_energy_sample.csv'")
    
    # Create a metadata file - FIXED VERSION
    metadata = {
        'feature': [
            'total_floor_area', 'number_of_floors', 'occupant_count', 
            'building_age', 'floor_area_per_floor', 'occupant_density',
            'hvac_efficiency', 'lighting_power_density', 'equipment_power_density',
            'outdoor_temperature', 'humidity', 'occupancy_hours',
            'weekday_numeric', 'has_elevator_numeric', 'floor_height',
            'building_type_numeric', 'climate_zone_numeric', 'is_single_story',
            'is_low_rise', 'is_mid_rise', 'is_high_rise', 'hvac_floor_factor',
            'vertical_lighting_factor', 'vertical_occupancy_impact', 'elevator_energy_estimate',
            'stack_effect_intensity', 'envelope_efficiency_score', 'floor_compactness',
            'thermal_mass_effect', 'comprehensive_efficiency_score', 'cooling_degree_hours',
            'heating_degree_hours', 'humidity_comfort_penalty', 'usage_intensity',
            'energy_consumption_kwh'  # Added target variable
        ],
        'description': [
            'Total floor area in square meters',
            'Number of floors in the building',
            'Number of regular occupants',
            'Age of building in years',
            'Average area per floor (m¬≤/floor)',
            'Occupants per square meter',
            'HVAC system efficiency ratio (0-1, higher is better)',
            'Lighting power density in W/m¬≤',
            'Equipment power density in W/m¬≤',
            'Average outdoor temperature in ¬∞C',
            'Average relative humidity in %',
            'Average daily occupancy hours',
            'Weekday numeric (0=Monday, 6=Sunday)',
            'Binary indicator for elevator presence',
            'Average floor height in meters',
            'Building type encoded as integer',
            'Climate zone code (1-7)',
            'Binary: 1 if single story building',
            'Binary: 1 if low-rise (2-4 floors)',
            'Binary: 1 if mid-rise (5-9 floors)',
            'Binary: 1 if high-rise (10+ floors)',
            'HVAC efficiency scaled by floor count',
            'Lighting impact considering vertical distribution',
            'Occupancy impact considering vertical distribution',
            'Estimated elevator energy consumption',
            'Stack effect intensity from temperature difference',
            'Building envelope thermal performance score',
            'Floor plan compactness metric',
            'Thermal mass effect on energy performance',
            'Weighted comprehensive efficiency score',
            'Cooling degree hours based on temperature',
            'Heating degree hours based on temperature',
            'Penalty for humidity deviation from comfort zone',
            'Intensity of building usage',
            'Annual energy consumption (target variable)'
        ],
        'unit': [
            'm¬≤', 'count', 'count', 'years', 'm¬≤/floor', 'person/m¬≤',
            'ratio', 'W/m¬≤', 'W/m¬≤', '¬∞C', '%', 'hours',
            '0-6', '0/1', 'm', '0-4', '1-7',
            '0/1', '0/1', '0/1', '0/1', 'dimensionless',
            'dimensionless', 'dimensionless', 'kWh',
            'dimensionless', '0-1', 'ratio',
            'dimensionless', '0-1', '¬∞C-hours',
            '¬∞C-hours', 'ratio', 'dimensionless',
            'kWh'
        ]
    }
    
    metadata_df = pd.DataFrame(metadata)
    metadata_df.to_csv('dataset_metadata.csv', index=False)
    print("Metadata saved to 'dataset_metadata.csv'")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.




A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start
    self.io_loop.start()
  File "/opt/anaconda3/lib/python3.12/site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



Generating synthetic building energy dataset...
Dataset saved to building_energy_dataset.csv
Shape: (5000, 38)
Features: 37 (plus target variable)

=== Dataset Information ===
Total samples: 5000
Features: 37

=== Feature Statistics ===

=== Key Features Statistics ===

total_floor_area:
  Mean: 25093.18
  Std:  14336.87
  Min:  500.58
  Max:  49986.02

number_of_floors:
  Mean: 24.65
  Std:  14.05
  Min:  1.00
  Max:  49.00

occupant_count:
  Mean: 1019.94
  Std:  568.75
  Min:  10.00
  Max:  1998.00

building_age:
  Mean: 49.34
  Std:  28.61
  Min:  1.00
  Max:  99.00

energy_consumption_kwh:
  Mean: 158430.44
  Std:  97824.62
  Min:  5703.35
  Max:  913660.81

hvac_efficiency:
  Mean: 0.65
  Std:  0.15
  Min:  0.30
  Max:  0.95

lighting_power_density:
  Mean: 12.66
  Std:  4.34
  Min:  5.00
  Max:  20.00

outdoor_temperature:
  Mean: 20.60
  Std:  10.76
  Min:  -5.00
  Max:  39.99

=== Top Correlations with Energy Consumption ===
energy_consumption_kwh       1.000000
occupant_count

In [8]:
# test_imports.py
import sys
print(f"Python version: {sys.version}")

try:
    import numpy as np
    print(f"‚úÖ NumPy version: {np.__version__}")
except ImportError as e:
    print(f"‚ùå NumPy import error: {e}")

try:
    import pandas as pd
    print(f"‚úÖ Pandas version: {pd.__version__}")
except ImportError as e:
    print(f"‚ùå Pandas import error: {e}")

try:
    import scipy
    print(f"‚úÖ SciPy version: {scipy.__version__}")
except ImportError as e:
    print(f"‚ùå SciPy import error: {e}")

try:
    from faker import Faker
    print("‚úÖ Faker imported successfully")
except ImportError as e:
    print(f"‚ùå Faker import error: {e}")

# Test dataset loading
try:
    df = pd.read_csv('building_energy_dataset.csv')
    print(f"‚úÖ Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
    print(f"   Target variable range: {df['energy_consumption_kwh'].min():.0f} to {df['energy_consumption_kwh'].max():.0f} kWh")
except Exception as e:
    print(f"‚ùå Dataset loading error: {e}")

Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
‚úÖ NumPy version: 2.3.5
‚úÖ Pandas version: 2.3.3
‚úÖ SciPy version: 1.13.1
‚úÖ Faker imported successfully


  import scipy


‚úÖ Dataset loaded: 5000 rows, 38 columns
   Target variable range: 5703 to 913661 kWh


In [14]:
# test_installation.py
import sys
print(f"Python: {sys.version}")

try:
    import numpy as np
    print(f"‚úÖ NumPy: {np.__version__}")
    print(f"   Has numpy.char: {hasattr(np, 'char')}")
except Exception as e:
    print(f"‚ùå NumPy error: {e}")

try:
    import pandas as pd
    print(f"‚úÖ Pandas: {pd.__version__}")
except Exception as e:
    print(f"‚ùå Pandas error: {e}")

try:
    import sklearn
    print(f"‚úÖ Scikit-learn: {sklearn.__version__}")
except Exception as e:
    print(f"‚ùå Scikit-learn error: {e}")

Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
‚úÖ NumPy: 2.3.5
‚ùå NumPy error: No module named 'numpy.char'
‚úÖ Pandas: 2.3.3
‚ùå Scikit-learn error: No module named 'sklearn'


In [16]:
# test_new_env.py
import sys
print(f"Python: {sys.version}")
print(f"Executable: {sys.executable}")

try:
    import numpy as np
    print(f"‚úÖ NumPy: {np.__version__}")
except Exception as e:
    print(f"‚ùå NumPy error: {e}")

try:
    import pandas as pd
    print(f"‚úÖ Pandas: {pd.__version__}")
except Exception as e:
    print(f"‚ùå Pandas error: {e}")

try:
    import sklearn
    print(f"‚úÖ Scikit-learn: {sklearn.__version__}")
except Exception as e:
    print(f"‚ùå Scikit-learn error: {e}")

Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
Executable: /opt/anaconda3/bin/python
‚úÖ NumPy: 2.3.5
‚úÖ Pandas: 2.3.3
‚ùå Scikit-learn error: No module named 'sklearn'


In [18]:
# test_env_final.py
import sys
print(f"Python path: {sys.executable}")
print(f"Python version: {sys.version}")

# Try imports
try:
    import numpy as np
    print(f"‚úÖ NumPy: {np.__version__}")
    print(f"   Path: {np.__file__}")
except Exception as e:
    print(f"‚ùå NumPy error: {e}")

try:
    import sklearn
    print(f"‚úÖ Scikit-learn: {sklearn.__version__}")
except Exception as e:
    print(f"‚ùå Scikit-learn error: {e}")
    
    # Install it if missing
    import subprocess
    import sys
    print("Installing scikit-learn...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])

Python path: /opt/anaconda3/bin/python
Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
‚úÖ NumPy: 2.3.5
   Path: /opt/anaconda3/lib/python3.12/site-packages/numpy/__init__.py
‚ùå Scikit-learn error: No module named 'sklearn'
Installing scikit-learn...
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting numpy>=1.22.0 (from scikit-learn)
  Using cached numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Using cached scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
Using cached numpy-2.3.5-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
Using cached scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl (20.9 MB)
Installing collected packages: numpy, scipy, scikit-learn
Successfully installed numpy-2.3.5 scikit-learn-1.7.2 scipy-1.16.3


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
hvplot 0.11.0 requires pandas>=1.3, which is not installed.
datashader 0.16.3 requires pandas, which is not installed.
seaborn 0.13.2 requires pandas>=1.2, which is not installed.
bokeh 3.6.0 requires pandas>=1.2, which is not installed.
holoviews 1.19.1 requires pandas>=1.3, which is not installed.
statsmodels 0.14.2 requires pandas!=2.1.0,>=1.4, which is not installed.
altair 5.0.1 requires pandas>=0.18, which is not installed.
streamlit 1.37.1 requires pandas<3,>=1.3.0, which is not installed.
catboost 1.2.8 requires pandas>=0.24, which is not installed.
xarray 2023.6.0 requires pandas>=1.4, which is not installed.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.5 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.3 which is incompatible.
numba 0.60.

In [20]:
# quick_start.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

print("=" * 60)
print("üè¢ BUILDING ENERGY ML - QUICK START")
print("=" * 60)

# Load your dataset
try:
    df = pd.read_csv('building_energy_dataset.csv')
    print(f"‚úÖ Dataset loaded: {len(df)} samples")
except:
    print("‚ùå Dataset not found. Using sample data...")
    # Create sample data
    df = pd.DataFrame({
        'total_floor_area': np.random.uniform(500, 5000, 100),
        'occupant_count': np.random.randint(10, 500, 100),
        'building_age': np.random.randint(1, 50, 100),
        'energy_consumption_kwh': np.random.uniform(10000, 100000, 100)
    })

# Select features
features = ['total_floor_area', 'occupant_count', 'building_age']
X = df[features]
y = df['energy_consumption_kwh']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nüìä Model Performance:")
print(f"   MAE: {mae:,.0f} kWh")
print(f"   R¬≤: {r2:.4f}")

# Feature importance
print(f"\nüîù Feature Importance:")
for feature, importance in zip(features, model.feature_importances_):
    print(f"   {feature}: {importance:.3f}")

print("\n" + "=" * 60)
print("‚úÖ READY FOR FULL IMPLEMENTATION!")
print("=" * 60)

ImportError: cannot import name 'is_cupy_namespace' from 'scipy._lib.array_api_compat.common._helpers' (/opt/anaconda3/lib/python3.12/site-packages/scipy/_lib/array_api_compat/common/_helpers.py)

In [22]:
# test_environment.py
import sys
import os

print("=" * 60)
print("ENVIRONMENT CHECK")
print("=" * 60)

# Check Python path
python_path = sys.executable
print(f"Python executable: {python_path}")

# Check if we're using environment Python
if "envs/building_energy" in python_path:
    print("‚úÖ USING CORRECT ENVIRONMENT: building_energy")
else:
    print("‚ùå WRONG ENVIRONMENT! Using system Python")
    print("   Run: conda activate building_energy")

print(f"\nPython version: {sys.version}")

# Test imports
try:
    import numpy as np
    print(f"‚úÖ NumPy: {np.__version__}")
except Exception as e:
    print(f"‚ùå NumPy: {e}")

try:
    import pandas as pd
    print(f"‚úÖ Pandas: {pd.__version__}")
except Exception as e:
    print(f"‚ùå Pandas: {e}")

try:
    import sklearn
    print(f"‚úÖ Scikit-learn: {sklearn.__version__}")
except Exception as e:
    print(f"‚ùå Scikit-learn: {e}")

print("\n" + "=" * 60)

ENVIRONMENT CHECK
Python executable: /opt/anaconda3/bin/python
‚ùå WRONG ENVIRONMENT! Using system Python
   Run: conda activate building_energy

Python version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
‚úÖ NumPy: 2.3.5
‚úÖ Pandas: 2.3.3
‚ùå Scikit-learn: cannot import name 'is_cupy_namespace' from 'scipy._lib.array_api_compat.common._helpers' (/opt/anaconda3/lib/python3.12/site-packages/scipy/_lib/array_api_compat/common/_helpers.py)

