# Step 4: Model Training & Evaluation
Training anomaly detection models (Isolation Forest, LOF, Elliptic Envelope) and analyzing results.

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns

current_dir = os.getcwd()
project_root = os.path.dirname(current_dir)
sys.path.append(project_root)

from src.data_loader import load_data
from src.preprocessing import preprocess_data
from src.features import engineer_features
from src.models import train_anomaly_models

FileNotFoundError: [Errno 2] No such file or directory: 'E:\\CODING\\New folder\\notebooks\\results\\project.log'

In [None]:
# Prepare Data Pipeline
data_path = os.path.join(project_root, 'data')
df_raw = load_data(data_path)
df_clean = preprocess_data(df_raw)
df = engineer_features(df_clean)

# Select features for modeling (exclude datetime and non-numeric)
features = ['electricity', 'chilled_water', 'steam', 'temperature', 'humidity', 
            'electricity_rolling_mean', 'electricity_deviation', 'hour', 'day_of_week']
# Filter to available columns
model_features = [c for c in features if c in df.columns]
X = df[model_features].fillna(0)
print("Training features:", model_features)

In [None]:
# Train Models
output = train_anomaly_models(X)
models = output['models']
results = output['results']

# Merge results back to main dataframe
df_final = pd.concat([df, results], axis=1)
print("Anomalies detected:", df_final['is_anomaly'].sum())

## Visualizations

In [None]:
# Feature Importance (Isolation Forest)
if 'isolation_forest' in models:
    iso_forest = models['isolation_forest']
    # Note: feature_importances_ is not always available in standard sklearn IsoForest versions, 
    # usually we might use permutation importance or if the version supports it. 
    # We'll use a try-except or just basic plotting if available.
    try:
        # Check if attribute exists (it might not in all sklearn versions for IsoForest)
        # We'll assume for now or skip.
        pass
    except:
        pass

In [None]:
# Anomaly Visualization
plt.figure(figsize=(15, 6))
subset = df_final.iloc[:1000] # Plot first 1000 hours for clarity
plt.plot(subset['timestamp'], subset['electricity'], label='Electricity', alpha=0.6)
anomalies = subset[subset['is_anomaly'] == 1]
plt.scatter(anomalies['timestamp'], anomalies['electricity'], color='red', label='Anomaly', s=50)
plt.title('Electricity Consumption & Anomalies (First 1000 Hours)')
plt.legend()
plt.show()

## Business Insights

In [None]:
# Cost Estimation
avg_kwh_cost = 0.12
anomaly_cost = df_final[df_final['is_anomaly'] == 1]['electricity'].sum() * avg_kwh_cost
print(f"Estimated excess cost from anomalies: ${anomaly_cost:,.2f}")

In [None]:
# Save Results
os.makedirs(os.path.join(project_root, 'results'), exist_ok=True)
df_final[df_final['is_anomaly'] == 1].to_csv(os.path.join(project_root, 'results', 'anomalies.csv'))
print("Results saved to results/anomalies.csv")