In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from sklearn.model_selection import cross_val_score, KFold, learning_curve, train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Load the saved models
def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling runs arbitrary code, so these files must be trusted
# artifacts produced by this project.
regressor_lr = _load_pickle('linear_regression_model.pkl')
regressor_rf = _load_pickle('random_forest_model.pkl')
scaler = _load_pickle('scaler.pkl')

# Load the preprocessed dataset
data = pd.read_csv("energy_consumption_preprocessed.csv")

# Feature columns fed to the models, and the column being predicted
features = [
    'Temperature',
    'Humidity',
    'Wind Speed',
    'Hour',
    'Day',
    'Weekday',
    'Month',
    'Holiday_1',
    'Season_Summer',
    'Season_Winter',
]
target = 'Consumption'

X = data[features]
y = data[target]

# Transform the features with the already-fitted scaler loaded above
# (transform only -- the scaler is not refitted here)
X_scaled = scaler.transform(X)

# Shuffled 10-fold splitter with a fixed seed so the folds are reproducible
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate performance using K-fold Cross Validation
def evaluate_model_kfold(model, X, y, model_name="Model"):
    """Cross-validate *model* and report its mean MSE and the matching RMSE.

    Scoring uses ``neg_mean_squared_error`` with the module-level ``kf``
    splitter, so the averaged score is negated to obtain a positive MSE.
    The RMSE is the square root of that mean MSE (not the mean of the
    per-fold RMSEs).

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        model_name: Label used in the printed header.

    Returns:
        tuple: ``(mean_mse, rmse)``.
    """
    fold_scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
    mean_mse = -np.mean(fold_scores)
    rmse = np.sqrt(mean_mse)

    print(f"\nK-fold Cross-Validation for {model_name} Model:")
    print(f"Mean MSE: {mean_mse:.4f}")
    print(f"Mean RMSE: {rmse:.4f}")
    return mean_mse, rmse

# Ridge and Lasso Regression (with hyperparameter tuning)
def evaluate_model(y_true, y_pred, model_name="Model"):
    """Print and return regression metrics for one set of predictions.

    This helper is called throughout the script but had no definition in it;
    it is built from the metric functions imported at the top of the file.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with *y_true*.
        model_name: Label used in the printed header.

    Returns:
        dict: Metric name -> value for MSE, RMSE, MAE, Median AE and R2.
    """
    mse = mean_squared_error(y_true, y_pred)
    metrics = {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_true, y_pred),
        'Median AE': median_absolute_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
    print(f"\nEvaluation for {model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")
    return metrics


# Ridge and Lasso Regression (with hyperparameter tuning)
def evaluate_ridge_lasso(X, y):
    """Fit Ridge and Lasso over a sweep of alphas and report their metrics.

    Every model is fitted on ``(X, y)`` and scored on the same data, so the
    printed metrics are in-sample (training) metrics.

    Args:
        X: Feature matrix.
        y: Target values.

    Returns:
        tuple[dict, dict]: ``(ridge_predictions, lasso_predictions)``, each
        mapping an alpha value to that model's predictions on *X*.
    """
    alpha_values = [0.1, 0.5, 1.0, 5.0, 10.0]  # Regularization strengths
    ridge_predictions = {}
    lasso_predictions = {}
    for alpha in alpha_values:
        candidates = (
            ("Ridge", Ridge(alpha=alpha), ridge_predictions),
            ("Lasso", Lasso(alpha=alpha), lasso_predictions),
        )
        for label, model, store in candidates:
            print(f"Evaluating {label} Regression with alpha={alpha}...")
            model.fit(X, y)
            store[alpha] = model.predict(X)
            evaluate_model(y, store[alpha], f"{label} (alpha={alpha})")
    return ridge_predictions, lasso_predictions


# In-sample evaluation of Ridge and Lasso across the alpha sweep
ridge_predictions, lasso_predictions = evaluate_ridge_lasso(X_scaled, y)

# The residual analysis and the CSV export further down read one prediction
# vector per model from module scope. Locals of evaluate_ridge_lasso are not
# visible there, so bind them explicitly. The largest alpha is the last one
# in the sweep, i.e. the value the loop variables held when the loop ended.
reported_alpha = max(ridge_predictions)
y_pred_ridge = ridge_predictions[reported_alpha]
y_pred_lasso = lasso_predictions[reported_alpha]

# Hyperparameter tuning for Random Forest using Grid Search
from sklearn.model_selection import GridSearchCV

def random_forest_tuning(X, y):
    """Grid-search Random Forest hyperparameters and evaluate the best model.

    The search clones the module-level ``regressor_rf`` for each candidate
    and cross-validates with the module-level ``kf`` splitter. No ``scoring``
    is passed, so candidates are ranked by the estimator's own ``score``.
    The best estimator is then used to predict on *X* itself, so the printed
    metrics are in-sample.

    NOTE(review): ``evaluate_model`` must be defined before this function is
    called.

    Args:
        X: Feature matrix.
        y: Target values.

    Returns:
        tuple: ``(grid_search_rf, y_pred_rf)`` -- the fitted ``GridSearchCV``
        object and the best estimator's predictions on *X*.
    """
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    grid_search_rf = GridSearchCV(estimator=regressor_rf, param_grid=param_grid, cv=kf, n_jobs=-1, verbose=2)
    grid_search_rf.fit(X, y)

    print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")

    best_rf_model = grid_search_rf.best_estimator_
    y_pred_rf = best_rf_model.predict(X)
    evaluate_model(y, y_pred_rf, "Random Forest (Tuned)")
    return grid_search_rf, y_pred_rf


# Random Forest with Grid Search hyperparameter tuning.
# The fitted search and its predictions are read at module scope further down
# (residual analysis, learning curve, CSV export), so they are bound here
# rather than left as function locals.
grid_search_rf, y_pred_rf = random_forest_tuning(X_scaled, y)

# Residual Analysis (with Quantile Residuals)
def quantile_residuals(y_true, y_pred, quantile=0.5):
    """Return the per-sample pinball (quantile) loss of *y_pred* vs *y_true*.

    The error is taken as ``y_true - y_pred``, so an under-prediction
    (``y_pred < y_true``) is weighted by ``quantile`` and an over-prediction
    by ``1 - quantile``. Taking the error with the opposite sign would swap
    the two weights and yield the loss for ``1 - quantile`` instead. At the
    default ``quantile=0.5`` both weights are equal and the result is half the
    absolute error.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned element-wise with *y_true*.
        quantile: Target quantile used to weight the two error directions.

    Returns:
        Element-wise non-negative losses, in whatever array type the
        subtraction of the inputs produces.
    """
    errors = y_true - y_pred
    return np.maximum(quantile * errors, (quantile - 1) * errors)

# Calculate quantile residuals for Random Forest and Ridge models
# (y_pred_rf and y_pred_ridge must already be bound at module scope)
quantile_residuals_rf = quantile_residuals(y, y_pred_rf, quantile=0.5)
quantile_residuals_ridge = quantile_residuals(y, y_pred_ridge, quantile=0.5)

# Plot Quantile Residuals for Random Forest and Ridge
# Both histograms are drawn onto the same figure so they can be compared.
_residual_series = (
    (quantile_residuals_rf, 'purple', "Random Forest Quantile Residuals"),
    (quantile_residuals_ridge, 'orange', "Ridge Quantile Residuals"),
)
plt.figure(figsize=(12, 6))
for _values, _colour, _label in _residual_series:
    sns.histplot(_values, kde=True, color=_colour, label=_label)
plt.title('Distribution of Quantile Residuals')
plt.xlabel('Quantile Residuals')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

# Learning Curves for Hyperparameter Tuning
def plot_learning_curve(estimator, X, y, title="Learning Curve", cv=None):
    """Plot mean training and cross-validation scores against training size.

    Scores come from ``learning_curve`` with its default training sizes and
    scoring; each curve is the per-size mean across the CV folds.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        X: Feature matrix.
        y: Target values.
        title: Figure title.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
    """
    sizes, fit_scores, cv_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=-1)

    # Collapse the fold axis so each training size has a single point
    curves = (
        (np.mean(fit_scores, axis=1), 'Training Score', 'blue'),
        (np.mean(cv_scores, axis=1), 'Cross-validation Score', 'red'),
    )

    plt.figure(figsize=(12, 6))
    for mean_scores, curve_label, curve_colour in curves:
        plt.plot(sizes, mean_scores, label=curve_label, color=curve_colour)
    plt.title(title)
    plt.xlabel('Training Samples')
    plt.ylabel('Score')
    plt.legend()
    plt.grid(True)
    plt.show()

# Learning Curve for Random Forest after Hyperparameter Tuning
# (grid_search_rf must be the fitted search object, bound at module scope)
tuned_rf_estimator = grid_search_rf.best_estimator_
plot_learning_curve(
    tuned_rf_estimator,
    X_scaled,
    y,
    title="Learning Curve (Random Forest Tuning)",
    cv=kf,
)

# Performance by Weekday vs Weekend
# 'Weekday' is one of the columns passed through the scaler, so it holds
# numbers, not day names; comparing it with 'Saturday'/'Sunday' marked every
# row as a weekday and left the weekend subset empty.
# NOTE(review): assumes the pandas convention Monday=0 ... Sunday=6 -- confirm
# against the preprocessing step that produced the CSV.
weekend_day_codes = [5, 6]
data['Weekend'] = data['Weekday'].isin(weekend_day_codes).astype(int)

# Splitting data into weekday and weekend subsets
is_weekend = data['Weekend'] == 1
X_weekday = data[~is_weekend][features]
y_weekday = data[~is_weekend][target]

X_weekend = data[is_weekend][features]
y_weekend = data[is_weekend][target]


def _evaluate_subset(X_subset, y_subset, subset_name):
    """Predict one subset with regressor_rf, print its metrics, return predictions."""
    print(f"\nEvaluating Model on {subset_name} Data...")
    if len(X_subset) == 0:
        # Predicting on zero rows would raise; report and carry on instead.
        print(f"No {subset_name} rows found; skipping evaluation.")
        return np.array([])
    # Every other model call in this script uses scaler-transformed features,
    # so the raw subset is transformed the same way before predicting.
    y_pred = regressor_rf.predict(scaler.transform(X_subset))
    # NOTE(review): evaluate_model must be defined before this point.
    evaluate_model(y_subset, y_pred, f"Random Forest ({subset_name})")
    return y_pred


def _plot_subset(y_subset, y_pred, subset_name):
    """Plot actual against predicted consumption for one subset."""
    if len(y_pred) == 0:
        return
    plt.figure(figsize=(12, 6))
    plt.plot(y_subset.values, label=f'Actual Consumption ({subset_name})', color='blue')
    plt.plot(y_pred, label=f'Predicted Consumption ({subset_name})', color='green', linestyle='dashed')
    plt.title(f'Actual vs Predicted Energy Consumption ({subset_name})')
    plt.xlabel('Data Points')
    plt.ylabel('Consumption (kWh)')
    plt.legend()
    plt.grid(True)
    plt.show()


# Evaluate performance on Weekday and Weekend separately
y_pred_weekday = _evaluate_subset(X_weekday, y_weekday, "Weekday")
y_pred_weekend = _evaluate_subset(X_weekend, y_weekend, "Weekend")

# Visualizing the predictions (weekday vs weekend)
_plot_subset(y_weekday, y_pred_weekday, "Weekday")
_plot_subset(y_weekend, y_pred_weekend, "Weekend")

# Saving the evaluation results to CSV (for analysis later)
# Predictions of the loaded linear-regression model on the scaled features;
# the export needs them and nothing earlier in the script computes them.
y_pred_lr = regressor_lr.predict(X_scaled)

# y_pred_rf, y_pred_ridge and y_pred_lasso are expected to be bound at module
# scope by the evaluation steps above, each aligned row-for-row with y.
evaluation_results = pd.DataFrame({
    'Actual Consumption': y,
    'Predicted LR': y_pred_lr,
    'Predicted RF': y_pred_rf,
    'Predicted Ridge': y_pred_ridge,
    'Predicted Lasso': y_pred_lasso
})

evaluation_results.to_csv("advanced_model_evaluation_results.csv", index=False)

print("Advanced evaluation results saved successfully!")
