In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import shap
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so repeated runs are reproducible
np.random.seed(42)

# Make sure the artifact folders exist before anything is written to them
for output_dir in ('models', 'plots'):
    os.makedirs(output_dir, exist_ok=True)

banner = "=" * 80
print(banner)
print("OPTIMIZED CATBOOST MODEL FOR SOLAR POWER PREDICTION")
print(banner)

# 1. Load the processed data
print("\n1. Loading processed data...")
train_csv = 'processed_data/train_all_predict_one/train_data.csv'
test_csv = 'processed_data/train_all_predict_one/test_data.csv'
train_data = pd.read_csv(train_csv)
test_data = pd.read_csv(test_csv)

# Parse the timestamp column of both splits so the .dt accessor can be used
for frame in (train_data, test_data):
    frame['LocalTime'] = pd.to_datetime(frame['LocalTime'])

# 2. Create feature sets focused on CatBoost strengths
print("\n2. Creating optimized CatBoost features...")

def create_optimized_catboost_features(data):
    """Build the CatBoost feature frame for one dataset.

    The result has one row per row of ``data`` (same index) and contains, in
    this order: the categorical columns (``Cloud_Type``, ``location_id``,
    ``PV_Type``, ``hour_cat``, ``month_cat``, ``dayofweek``), the raw weather
    columns, ``Capacity_MW`` when present, the ``is_daylight`` and
    ``night_mask`` hour indicators, and two GHI interaction terms when their
    inputs are available.

    Args:
        data: DataFrame with a datetime ``LocalTime`` column and a
            ``location_id`` column. Every other input column is optional; a
            missing ``Cloud_Type``, ``PV_Type`` or weather column triggers a
            printed warning and a constant fallback value. ``data`` itself is
            not modified.

    Returns:
        A new ``pandas.DataFrame`` of features aligned to ``data.index``.

    Raises:
        KeyError: if ``LocalTime`` or ``location_id`` is missing from ``data``.
    """
    # Start from the input's index so scalar fallbacks (e.g. a missing
    # Cloud_Type) broadcast to every row. On an index-less empty frame a
    # scalar assignment creates a zero-length column, which becomes all-NaN
    # as soon as the first real column is attached.
    features = pd.DataFrame(index=data.index)

    # Calendar parts are kept in locals instead of being written back onto
    # the caller's frame (no cyclical encoding is applied).
    local_time = data['LocalTime']
    hour = local_time.dt.hour
    month = local_time.dt.month
    dayofweek = local_time.dt.dayofweek

    # Categorical features - CatBoost's strength
    if 'Cloud_Type' in data.columns:
        features['Cloud_Type'] = data['Cloud_Type'].astype('category')
    else:
        print("Warning: Cloud_Type not found in data columns. Using zero values.")
        features['Cloud_Type'] = 0

    features['location_id'] = data['location_id'].astype('category')

    if 'PV_Type' in data.columns:
        features['PV_Type'] = data['PV_Type'].astype('category')
    else:
        print("Warning: PV_Type not found in data columns. Using default values.")
        features['PV_Type'] = 'unknown'

    # Time parts are exposed as categories rather than continuous numbers
    features['hour_cat'] = hour.astype('category')
    features['month_cat'] = month.astype('category')
    features['dayofweek'] = dayofweek.astype('category')

    # Weather features: copied through, or zero-filled with a warning
    weather_cols = ['Temperature', 'Pressure', 'GHI', 'DHI', 'Cloud_Coverage', 'WindSpeed', 'WindDirection']
    for col in weather_cols:
        if col in data.columns:
            features[col] = data[col]
        else:
            print(f"Warning: {col} not found in data columns. Using zero values.")
            features[col] = 0

    # Solar installation specifics (silently omitted when absent)
    if 'Capacity_MW' in data.columns:
        features['Capacity_MW'] = data['Capacity_MW']

    # 1 for hours 6..18 inclusive, else 0
    features['is_daylight'] = hour.between(6, 18).astype(int)

    # True for hours outside 5..21 inclusive; note this window is wider than
    # the is_daylight window above
    features['night_mask'] = ~hour.between(5, 21)

    # Explicit interaction terms
    if 'GHI' in data.columns and 'Temperature' in data.columns:
        features['temp_ghi_interaction'] = data['Temperature'] * data['GHI']

    if 'Cloud_Coverage' in data.columns and 'GHI' in data.columns:
        # Scale GHI down by cloud cover; the /100 assumes Cloud_Coverage is a
        # percentage (TODO confirm against the data source)
        features['adjusted_ghi'] = data['GHI'] * (1 - data['Cloud_Coverage'] / 100)

    return features

# Build identically laid-out feature frames for the training and test splits
X_train_cat = create_optimized_catboost_features(train_data)
X_test_cat = create_optimized_catboost_features(test_data)

# Regression target: the measured power column of each split
target_column = 'Power(MW)'
y_train = train_data[target_column]
y_test = test_data[target_column]

# Boolean Series that is True for test rows whose hour lies outside 5..21
test_hours = test_data['LocalTime'].dt.hour
test_night_mask = ~test_hours.between(5, 21)

# Keep only the categorical candidates that actually exist in the feature
# frame, warning about each one that is dropped
candidate_cat_features = ['Cloud_Type', 'location_id', 'PV_Type', 'hour_cat', 'month_cat', 'dayofweek']
cat_features = []
for feat in candidate_cat_features:
    if feat in X_train_cat.columns:
        cat_features.append(feat)
    else:
        print(f"Warning: Categorical feature {feat} not found. Removed from cat_features list.")

print(f"Using {len(cat_features)} categorical features: {cat_features}")

# 3. Train the CatBoost model
print("\n3. Training CatBoost model...")

# Hold out 20% of the training rows (random split, fixed seed) as the
# validation set that drives early stopping
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_cat,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Wrap every split in a Pool so CatBoost is told which columns are categorical
train_pool = Pool(data=X_train_final, label=y_train_final, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)
test_pool = Pool(data=X_test_cat, label=y_test, cat_features=cat_features)

# Training configuration
params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=100,                      # log every 100th iteration
    task_type='CPU',                  # Change to 'GPU' if available
    random_seed=42,
    max_ctr_complexity=3,             # max number of features combined into one categorical combination
    leaf_estimation_method='Newton',  # how leaf values are computed
    bootstrap_type='Bernoulli',       # row sampling scheme used together with `subsample`
    subsample=0.8,                    # fraction of rows sampled for each tree
    l2_leaf_reg=3.0,                  # L2 regularization
    one_hot_max_size=10,              # one-hot encode categoricals with at most 10 distinct values
)

# Fit with early stopping: training halts once the validation metric has not
# improved for 50 consecutive iterations
model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)

# Persist the fitted model in CatBoost's binary format
model.save_model('models/optimized_catboost_solar_model.cbm')
print("Model saved to models/optimized_catboost_solar_model.cbm")

# 4. Evaluate the model
print("\n4. Evaluating model performance...")

# Raw model output for every test row
y_pred = model.predict(X_test_cat)

# Force night-time rows to zero production; every other row keeps its raw
# prediction. The result is a new array, y_pred itself stays untouched.
y_pred_masked = np.where(test_night_mask, 0, y_pred)

# Score the masked predictions against the measured power
mse = mean_squared_error(y_test, y_pred_masked)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_masked)
r2 = r2_score(y_test, y_pred_masked)

print(f"RMSE: {rmse:.4f} MW")
print(f"MAE: {mae:.4f} MW")
print(f"R²: {r2:.4f}")

# 5. SHAP Analysis - Understanding feature importance
print("\n5. Performing SHAP analysis for feature importance...")


def _plot_top_importances(ranked_df, title, xlabel, out_path, top_n=15):
    """Save a horizontal bar chart of the first ``top_n`` rows of ``ranked_df``.

    ``ranked_df`` must have ``Feature`` and ``Importance`` columns and is
    expected to be sorted by importance already. The figure is always closed
    after saving.
    """
    top = ranked_df.head(top_n)
    plt.figure(figsize=(24, 10))
    plt.barh(top['Feature'], top['Importance'])
    plt.title(title, fontsize=16)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Features', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.tight_layout()
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close()


def _print_top_importances(ranked_df, header, top_n=15):
    """Print ``header`` followed by a numbered list of the top features."""
    print(header)
    top = ranked_df.head(top_n)
    for rank, (feature, importance) in enumerate(zip(top['Feature'], top['Importance']), start=1):
        print(f"{rank}. {feature}: {importance:.6f}")


try:
    # For CatBoost, we can use TreeExplainer directly
    explainer = shap.TreeExplainer(model)

    # SHAP values are computed on the first rows of the test set only, to
    # bound memory use
    shap_sample_size = min(500, len(X_test_cat))
    shap_sample = X_test_cat.iloc[:shap_sample_size]

    print(f"Calculating SHAP values on {shap_sample_size} samples...")
    shap_values = explainer.shap_values(shap_sample)

    # Global importance = mean absolute SHAP value per feature
    feature_importance = np.abs(shap_values).mean(axis=0)

    importance_df = pd.DataFrame({
        'Feature': X_test_cat.columns,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    # SHAP's own bar summary. plot_size=None leaves the size of the current
    # figure unchanged; the default ("auto") would resize it and discard the
    # wide 24x10 layout requested here.
    plt.figure(figsize=(24, 10))
    shap.summary_plot(shap_values, shap_sample, plot_type="bar", show=False, plot_size=None)
    plt.title('Feature Impact on Solar Power Prediction (SHAP)', fontsize=16)
    plt.tight_layout()
    plt.savefig('plots/catboost_shap_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Plain bar chart of the same ranking
    _plot_top_importances(
        importance_df,
        title='CatBoost Feature Importance',
        xlabel='Average Impact on Model Output',
        out_path='plots/catboost_feature_importance.png',
    )

    _print_top_importances(importance_df, "\nTop 15 most important features:")

    # Save feature importance to CSV
    importance_df.to_csv('models/catboost_feature_importance.csv', index=False)

    print("SHAP analysis completed and saved.")

except Exception as e:
    import traceback
    print(f"Error performing SHAP analysis: {e}")
    print(traceback.format_exc())
    print("Continuing without complete SHAP analysis...")

    # The failure may have happened mid-plot; drop any figure left open so it
    # neither leaks nor bleeds into the fallback chart
    plt.close('all')

    # Fallback to the model's native feature importance
    feature_importance = model.get_feature_importance()
    feature_names = X_test_cat.columns

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    _plot_top_importances(
        importance_df,
        title='CatBoost Feature Importance (Native Method)',
        xlabel='Importance Score',
        out_path='plots/catboost_native_feature_importance.png',
    )

    _print_top_importances(
        importance_df,
        "\nTop 15 most important features (Native CatBoost method):",
    )

# 6. Create prediction visualizations
print("\n6. Creating prediction vs actual visualizations...")

# One row per test timestamp with the measured and (night-masked) predicted
# power side by side, plus hour/date keys for grouping
test_timestamps = test_data['LocalTime']
test_results = pd.DataFrame({
    'LocalTime': test_timestamps,
    'Actual': y_test,
    'Predicted': y_pred_masked,
    'Hour': test_timestamps.dt.hour,
    'Date': test_timestamps.dt.date,
})

# Sum actual and predicted values per calendar day
daily_results = (
    test_results
    .groupby('Date')[['Actual', 'Predicted']]
    .sum()
    .reset_index()
)

# Grouped bar chart: two bars per day, offset left/right of the tick
plt.figure(figsize=(24, 10))
width = 0.35
x = np.arange(len(daily_results))
for offset, column in ((-width/2, 'Actual'), (width/2, 'Predicted')):
    plt.bar(x + offset, daily_results[column], width, label=column, alpha=0.7)

# Label each tick with its ISO date
date_labels = [day.strftime('%Y-%m-%d') for day in daily_results['Date']]
plt.xticks(x, date_labels, rotation=45, fontsize=12)
plt.xlabel('Date', fontsize=14)
plt.ylabel('Total Daily Power Production (MW)', fontsize=14)
plt.title('Daily Actual vs Predicted Solar Power Production', fontsize=16)
plt.legend(fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('plots/catboost_daily_prediction.png', dpi=300, bbox_inches='tight')
plt.close()

# Plot hourly prediction for a sample day
# Use the 11th distinct test day (index 10) when the test set is long enough;
# otherwise fall back to the last available day instead of raising IndexError.
# The day is picked by position only, not by how sunny it was.
unique_days = test_results['Date'].unique()
if len(unique_days) == 0:
    print("No test days available; skipping hourly plot.")
else:
    sample_day = unique_days[min(10, len(unique_days) - 1)]
    print(f"Creating hourly plot for example day: {sample_day}")

    day_data = test_results[test_results['Date'] == sample_day]
    plt.figure(figsize=(24, 10))
    plt.plot(day_data['Hour'], day_data['Actual'], 'o-', label='Actual', linewidth=2, markersize=10)
    plt.plot(day_data['Hour'], day_data['Predicted'], 's-', label='Predicted', linewidth=2, markersize=10)
    plt.xlabel('Hour of Day', fontsize=14)
    plt.ylabel('Power (MW)', fontsize=14)
    plt.title(f'Hourly Solar Power Prediction for {sample_day}', fontsize=16)
    plt.xticks(range(0, 24), fontsize=12)
    plt.yticks(fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.savefig('plots/catboost_hourly_prediction.png', dpi=300, bbox_inches='tight')
    plt.close()

# Create a heatmap of prediction accuracy by hour and month
print("\n7. Creating prediction accuracy heatmap by hour and month...")

# Per-row absolute error and the month key used for the pivot
test_results['AbsError'] = (test_results['Predicted'] - test_results['Actual']).abs()
test_results['Month'] = test_results['LocalTime'].dt.month

# Mean absolute error for every (hour, month) cell
hour_month_error = pd.pivot_table(
    test_results,
    values='AbsError',
    index='Hour',
    columns='Month',
    aggfunc='mean',
)

# Annotated heatmap: rows are hours, columns are months
plt.figure(figsize=(24, 12))
sns.heatmap(
    hour_month_error,
    cmap='YlOrRd',
    annot=True,
    fmt='.2f',
    annot_kws={"size": 10},
)
plt.title('Mean Absolute Error by Hour and Month', fontsize=16)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Hour of Day', fontsize=14)
plt.tight_layout()
plt.savefig('plots/catboost_error_heatmap.png', dpi=300, bbox_inches='tight')
plt.close()

# Predicted-vs-actual scatter with a dashed y = x reference line that runs
# from 0 to the largest actual value
plt.figure(figsize=(16, 16))
plt.scatter(test_results['Actual'], test_results['Predicted'], alpha=0.5)
diagonal_end = max(test_results['Actual'])
plt.plot([0, diagonal_end], [0, diagonal_end], 'r--')
plt.xlabel('Actual Power (MW)', fontsize=14)
plt.ylabel('Predicted Power (MW)', fontsize=14)
plt.title('CatBoost Predicted vs Actual Solar Power', fontsize=16)
plt.axis('equal')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('plots/catboost_prediction_scatter.png', dpi=300, bbox_inches='tight')
plt.close()

# Evaluate feature interactions
print("\n8. Analyzing feature interactions...")

# Draw up to three GHI scatter panels coloured by the raw (unmasked) model
# prediction. Any failure is reported and skipped so the rest of the script
# still runs.
try:
    top_features = importance_df['Feature'][:3].tolist()

    # Create interaction plots between top features
    if len(top_features) >= 2:
        fig, axes = plt.subplots(1, 3, figsize=(24, 8))
        try:
            has_ghi = 'GHI' in X_test_cat.columns

            if 'hour_cat' in top_features and has_ghi:
                # hour_cat is categorical; go through str to get plain ints
                hour_numeric = X_test_cat['hour_cat'].astype(str).astype(int)

                # Plot hour vs GHI colored by prediction
                sc = axes[0].scatter(hour_numeric, X_test_cat['GHI'], c=y_pred, cmap='viridis', alpha=0.6)
                axes[0].set_xlabel('Hour of Day', fontsize=12)
                axes[0].set_ylabel('GHI', fontsize=12)
                axes[0].set_title('Hour vs GHI Impact on Prediction', fontsize=14)
                plt.colorbar(sc, ax=axes[0], label='Predicted Power (MW)')

            if 'Temperature' in X_test_cat.columns and has_ghi:
                # Plot Temperature vs GHI colored by prediction
                sc = axes[1].scatter(X_test_cat['Temperature'], X_test_cat['GHI'], c=y_pred, cmap='viridis', alpha=0.6)
                axes[1].set_xlabel('Temperature', fontsize=12)
                axes[1].set_ylabel('GHI', fontsize=12)
                axes[1].set_title('Temperature vs GHI Impact on Prediction', fontsize=14)
                plt.colorbar(sc, ax=axes[1], label='Predicted Power (MW)')

            if 'Cloud_Type' in top_features and has_ghi:
                # Plot Cloud_Type vs GHI colored by prediction; astype(int)
                # raises if the cloud types are not integer-like, which is
                # handled by the outer except
                cloud_type_numeric = X_test_cat['Cloud_Type'].astype(int)
                sc = axes[2].scatter(cloud_type_numeric, X_test_cat['GHI'], c=y_pred, cmap='viridis', alpha=0.6)
                axes[2].set_xlabel('Cloud Type', fontsize=12)
                axes[2].set_ylabel('GHI', fontsize=12)
                axes[2].set_title('Cloud Type vs GHI Impact on Prediction', fontsize=14)
                plt.colorbar(sc, ax=axes[2], label='Predicted Power (MW)')

            fig.tight_layout()
            fig.savefig('plots/catboost_feature_interactions.png', dpi=300, bbox_inches='tight')
        finally:
            # Close the figure even when a panel fails, so it is not leaked
            plt.close(fig)

        print("Created feature interaction visualizations.")
except Exception as e:
    print(f"Could not create feature interaction plots: {e}")

print("\n9. Saving prediction results...")
# Persist the per-timestamp and per-day result tables without the index
csv_exports = (
    (test_results, 'models/catboost_predictions.csv'),
    (daily_results, 'models/catboost_daily_predictions.csv'),
)
for results_frame, csv_path in csv_exports:
    results_frame.to_csv(csv_path, index=False)

# Sum actual and predicted values per calendar month of the timestamp
month_of_row = test_results['LocalTime'].dt.month
monthly_results = (
    test_results
    .groupby(month_of_row)[['Actual', 'Predicted']]
    .sum()
    .reset_index()
)
monthly_results.columns = ['Month', 'Actual', 'Predicted']

# Grouped bar chart: two bars per month, offset left/right of the tick
plt.figure(figsize=(18, 10))
width = 0.35
x = np.arange(len(monthly_results))
for offset, column in ((-width/2, 'Actual'), (width/2, 'Predicted')):
    plt.bar(x + offset, monthly_results[column], width, label=column, alpha=0.7)
plt.xlabel('Month', fontsize=14)
plt.ylabel('Total Monthly Power Production (MW)', fontsize=14)
plt.title('Monthly Actual vs Predicted Solar Power Production', fontsize=16)
plt.xticks(x, monthly_results['Month'], fontsize=12)
plt.legend(fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('plots/catboost_monthly_prediction.png', dpi=300, bbox_inches='tight')
plt.close()

# Closing summary
print("\nOptimized CatBoost model training and evaluation completed!")
print("This implementation focuses on CatBoost's strengths with categorical features and weather variables")
print("Check the 'plots' directory for enhanced visualizations.")
print("=" * 80)

OPTIMIZED CATBOOST MODEL FOR SOLAR POWER PREDICTION

1. Loading processed data...

2. Creating optimized CatBoost features...
Using 6 categorical features: ['Cloud_Type', 'location_id', 'PV_Type', 'hour_cat', 'month_cat', 'dayofweek']

3. Training CatBoost model...
0:	learn: 6.6708719	test: 6.6523208	best: 6.6523208 (0)	total: 236ms	remaining: 3m 56s
100:	learn: 2.9867416	test: 3.0111281	best: 3.0111281 (100)	total: 6.99s	remaining: 1m 2s
200:	learn: 2.8391953	test: 2.8628466	best: 2.8628466 (200)	total: 14.8s	remaining: 58.6s
300:	learn: 2.7561961	test: 2.7819068	best: 2.7819068 (300)	total: 22.5s	remaining: 52.3s
400:	learn: 2.6928098	test: 2.7221724	best: 2.7221724 (400)	total: 30.3s	remaining: 45.2s
500:	learn: 2.6519218	test: 2.6831281	best: 2.6831281 (500)	total: 37.6s	remaining: 37.5s
600:	learn: 2.6155589	test: 2.6476454	best: 2.6476454 (600)	total: 45.8s	remaining: 30.4s
700:	learn: 2.5846719	test: 2.6192782	best: 2.6192782 (700)	total: 55.3s	remaining: 23.6s
