# 04_advanced_methods.ipynb - Advanced ML/DL Techniques & Feature Engineering

This notebook explores cutting-edge machine learning and deep learning approaches for house price prediction, including advanced feature engineering techniques and sophisticated model architectures.

## Advanced Models:
1. **LightGBM** - High-performance gradient boosting
2. **CatBoost** - Categorical feature-optimized boosting  
3. **Neural Networks** - Deep learning with TensorFlow/Keras
4. **Ensemble Methods** - Stacking and blending approaches
5. **AutoML** - Automated feature selection and hyperparameter tuning

## Advanced Feature Engineering:
1. **Polynomial Features** - Non-linear feature interactions
2. **Geographic Features** - Location-based engineered features
3. **Time-based Features** - Temporal patterns and seasonality
4. **Target Encoding Variants** - Advanced categorical encoding
5. **Feature Selection** - Automated important feature identification

## Evaluation Framework:
- **Comprehensive Metrics**: R², RMSE, MAPE, MAE
- **Cross-Validation**: Stratified and time-based splits (the cells below currently use a single shuffled 60/20/20 train/validation/test hold-out split)
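- **Model Interpretability**: SHAP values and feature importance (the cells below currently plot LightGBM feature importances only; SHAP values are not computed yet)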
- **Performance Profiling**: Training time and prediction speed

**Target:** Exceed XGBoost baseline (R² = 0.8972) with advanced techniques

In [None]:
# Section 1: Advanced Imports and Setup
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Advanced ML imports
# subprocess/sys are imported up front so that every install fallback can use
# them, regardless of which optional package happens to be missing.
import importlib
import subprocess
import sys


def _import_or_install(module_name, display_name):
    """Import ``module_name``, pip-installing it first if it is missing.

    Parameters
    ----------
    module_name : str
        Importable module name; also used as the pip package name.
    display_name : str
        Human-readable name used in the printed messages.

    Returns
    -------
    module
        The imported module object.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    try:
        module = importlib.import_module(module_name)
    except ImportError:
        print(f'Installing {display_name}...')
        # Install into the interpreter running this notebook, not whatever
        # "pip" happens to be first on PATH.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', module_name])
        importlib.invalidate_caches()
        module = importlib.import_module(module_name)
    print(f"{display_name} version: {getattr(module, '__version__', 'unknown')}")
    return module


lgb = _import_or_install('lightgbm', 'LightGBM')
cb = _import_or_install('catboost', 'CatBoost')

# Deep Learning imports
tf = _import_or_install('tensorflow', 'TensorFlow')
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Advanced preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer

# Other advanced tools
from datetime import datetime
import joblib
import json
from scipy import stats

print(f"Advanced Methods Setup Complete - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Section 2: Load Data and Baseline Results
# Directory layout: <ROOT>/data holds the CSV, <ROOT>/models holds the metric JSON files.
ROOT = Path(r"c:\\Users\\lpnhu\\Downloads\\home-price-prediction")
DATA_DIR, MODELS_DIR = ROOT / 'data', ROOT / 'models'
ENHANCED_PATH = DATA_DIR / 'cleaned_enhanced.csv'

print('=== LOADING DATA AND BASELINE RESULTS ===')

# Enhanced dataset (cleaned_enhanced.csv)
data = pd.read_csv(ENHANCED_PATH)
print(f'Enhanced dataset loaded: {data.shape}')

# Best baseline score; a fixed fallback R² is used when the summary file is absent.
baseline_summary_path = MODELS_DIR / 'baseline_models_summary.json'
try:
    with open(baseline_summary_path, 'r') as f:
        baseline_summary = json.load(f)
except FileNotFoundError:
    baseline_best_r2 = 0.85  # Conservative baseline
    baseline_best_model = "Unknown"
    print(f'Baseline results not found, using conservative target: R² = {baseline_best_r2:.4f}')
else:
    baseline_best_r2 = baseline_summary['best_model']['r2_score']
    baseline_best_model = baseline_summary['best_model']['name']
    print(f'Baseline best model: {baseline_best_model} (R² = {baseline_best_r2:.4f})')

# XGBoost score; the target to beat is the better of baseline and XGBoost.
xgb_metrics_path = MODELS_DIR / 'xgboost_enhanced_metrics.json'
try:
    with open(xgb_metrics_path, 'r') as f:
        xgb_metrics = json.load(f)
except FileNotFoundError:
    target_r2 = baseline_best_r2
    print('XGBoost results not found')
else:
    xgb_r2 = xgb_metrics['test_metrics']['r2']
    print(f'XGBoost enhanced R²: {xgb_r2:.4f}')
    target_r2 = max(baseline_best_r2, xgb_r2)

print(f'Target to exceed: R² > {target_r2:.4f}')

In [None]:
# Section 3: Advanced Feature Engineering
from itertools import combinations

print('=== ADVANCED FEATURE ENGINEERING ===')

# Prepare base features: every column except the target (ClosePrice) and the
# ListingId / UnparsedAddress columns.
exclude_columns = ['ClosePrice', 'ListingId', 'UnparsedAddress']
base_features = [col for col in data.columns if col not in exclude_columns]

X_base = data[base_features].copy()
y = data['ClosePrice'].copy()

# Handle missing values in numeric columns with the column median.
# numeric_only=True is required: pandas >= 2.0 raises when median() is asked
# to reduce object (string) columns, and this frame still contains them.
X_base = X_base.fillna(X_base.median(numeric_only=True))
print(f'Base features: {X_base.shape[1]}')

# 1. Geographic Clustering Features
print('Creating geographic clustering features...')
if {'Latitude', 'Longitude'}.issubset(X_base.columns):
    coords = X_base[['Latitude', 'Longitude']]
    coords = coords.fillna(coords.median())

    # K-means clustering for geographic regions at three granularities.
    # n_init is pinned because scikit-learn changed its default between releases.
    for n_clusters in [5, 10, 20]:
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        X_base[f'GeoCluster_{n_clusters}'] = kmeans.fit_predict(coords)

    # Distance from "city center" (taken to be the median lat/lon).
    # NOTE: this is a Euclidean distance in raw coordinate units, not km.
    center_lat = coords['Latitude'].median()
    center_lon = coords['Longitude'].median()
    X_base['DistanceFromCenter'] = np.sqrt(
        (coords['Latitude'] - center_lat) ** 2
        + (coords['Longitude'] - center_lon) ** 2
    )
    print('Geographic features created')

# 2. Advanced Time Features
print('Creating advanced temporal features...')
if 'BuildingAge' in X_base.columns:
    # Age-based binning. include_lowest=True keeps age 0 in 'New' (the default
    # left-open first bin would turn it into NaN), and the open upper bound
    # keeps ages above 100 in 'Historic'. Negative ages still map to NaN.
    X_base['AgeCategory'] = pd.cut(
        X_base['BuildingAge'],
        bins=[0, 5, 15, 30, 50, np.inf],
        labels=['New', 'Recent', 'Mature', 'Old', 'Historic'],
        include_lowest=True,
    )

    # Age squared / log for non-linear effects
    X_base['BuildingAge_Squared'] = X_base['BuildingAge'] ** 2
    X_base['BuildingAge_Log'] = np.log1p(X_base['BuildingAge'])

# 3. Property Size Interactions
print('Creating property size interaction features...')
size_keywords = ('sqft', 'size', 'area', 'room', 'bed', 'bath')
# Only numeric columns qualify: a text column whose name happens to contain
# one of the keywords would break the arithmetic below.
numeric_names = X_base.select_dtypes(include=[np.number]).columns
size_columns = [
    col for col in numeric_names
    if any(keyword in col.lower() for keyword in size_keywords)
]

# Ratios and products for every pair among the first three size columns
for col1, col2 in combinations(size_columns[:3], 2):
    # +1 in the denominator avoids division by zero for zero-valued columns
    X_base[f'{col1}_{col2}_ratio'] = X_base[col1] / (X_base[col2] + 1)
    X_base[f'{col1}_{col2}_product'] = X_base[col1] * X_base[col2]

# 4. Statistical Features per Categorical Group
print('Creating statistical group features...')
categorical_cols = X_base.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_base.select_dtypes(include=[np.number]).columns[:10]  # First 10 numerical

for cat_col in categorical_cols[:3]:  # Limit to prevent explosion
    if X_base[cat_col].nunique() >= 50:  # Only for reasonable cardinality
        continue
    # Work on plain object keys so that mapping a categorical column cannot
    # hand back a category-dtype result (which the numeric-only filter in the
    # next section would silently drop).
    group_keys = X_base[cat_col].astype(object)
    for num_col in numerical_cols[:3]:
        group_stats = X_base[num_col].groupby(group_keys).agg(['mean', 'std'])
        X_base[f'{cat_col}_{num_col}_mean'] = group_keys.map(group_stats['mean']).astype(float)
        X_base[f'{cat_col}_{num_col}_std'] = group_keys.map(group_stats['std']).astype(float)

print(f'Feature engineering complete. Total features: {X_base.shape[1]}')

In [None]:
# Section 4: Feature Selection and Preprocessing
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split

print('=== FEATURE SELECTION AND PREPROCESSING ===')

# Anything fitted against the target (target encoding, mutual information)
# must only see the rows that end up in the training fold, otherwise the
# validation/test scores are inflated by target leakage.
# The shuffle performed by train_test_split depends only on the number of rows,
# test_size and random_state, so repeating the two calls from Section 5 on row
# positions reproduces that partition. Keep these arguments in sync with
# Section 5 if the split there is ever changed.
row_positions = np.arange(len(X_base))
temp_pos = train_test_split(row_positions, test_size=0.2, random_state=42, shuffle=True)[0]
fit_pos = train_test_split(temp_pos, test_size=0.25, random_state=42, shuffle=True)[0]
y_fit = y.iloc[fit_pos]

# Convert categorical variables to numerical using target encoding
categorical_columns = X_base.select_dtypes(include=['object', 'category']).columns

# Smoothing factor (higher = more shrinkage towards the global training mean)
smooth = 10
target_mean = y_fit.mean()

for col in categorical_columns:
    if X_base[col].nunique() >= 100:  # Skip very high cardinality columns
        continue

    # Plain object keys keep .map() from returning a category-dtype column,
    # which select_dtypes(np.number) below would drop.
    keys = X_base[col].astype(object)
    keys_fit = keys.iloc[fit_pos]

    # Per-category statistics from training rows only
    counts = keys_fit.value_counts()
    means = y_fit.groupby(keys_fit).mean()
    smoothed_means = (counts * means + smooth * target_mean) / (counts + smooth)

    # Categories missing or unseen in the training rows fall back to the
    # training mean (encoding) and a count of zero.
    X_base[f'{col}_target_encoded'] = keys.map(smoothed_means).astype(float).fillna(target_mean)
    X_base[f'{col}_count'] = keys.map(counts).astype(float).fillna(0.0)

# Drop original categorical columns
X_processed = X_base.select_dtypes(include=[np.number]).copy()

# Handle any infinite values, then impute; columns with no finite value at all
# cannot be imputed and would make mutual_info_regression fail, so drop them.
X_processed = X_processed.replace([np.inf, -np.inf], np.nan)
X_processed = X_processed.dropna(axis=1, how='all')
X_processed = X_processed.fillna(X_processed.median())

print(f'Processed feature matrix: {X_processed.shape}')

# Feature selection using mutual information
print('Performing feature selection...')

# Calculate feature importance on the training rows only
mi_scores = mutual_info_regression(X_processed.iloc[fit_pos], y_fit, random_state=42)
feature_importance = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': mi_scores
}).sort_values('importance', ascending=False)

# Select top features (adaptive based on total features, but never zero)
n_features = max(1, min(200, int(X_processed.shape[1] * 0.8)))
top_features = feature_importance.head(n_features)['feature'].tolist()

X_selected = X_processed[top_features].copy()
print(f'Selected {len(top_features)} most important features')
print(f'Final feature matrix: {X_selected.shape}')

In [None]:
# Section 5: Advanced Train/Test Split and Evaluation Framework
from sklearn.model_selection import train_test_split

print('=== ADVANCED DATA SPLITTING ===')

# Stage 1: hold out 20% of all rows as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y, shuffle=True, random_state=42, test_size=0.2,
)

# Stage 2: take the validation set from what is left. 0.25 of the remaining
# 80% equals 20% of the full data, i.e. a 60/20/20 split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, shuffle=True, random_state=42, test_size=0.25,
)

# Report the size of each partition
for split_label, split_frame in (
    ('Training set', X_train),
    ('Validation set', X_val),
    ('Test set', X_test),
):
    print(f'{split_label}: {split_frame.shape[0]:,} samples')

# Advanced evaluation function
def advanced_evaluate_model(model, X_train, X_val, X_test, y_train, y_val, y_test,
                          model_name, fit_kwargs=None):
    """Fit ``model`` on the training split and score it on all three splits.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y, **kwargs)`` and ``predict(X)``.
    X_train, X_val, X_test : array-like
        Feature matrices for the training, validation and test splits.
    y_train, y_val, y_test : array-like
        Targets matching the three feature matrices.
    model_name : str
        Label used in the printed report and stored in the result.
    fit_kwargs : dict, optional
        Extra keyword arguments forwarded to ``model.fit`` (for example an
        ``eval_set`` and early-stopping options).

    Returns
    -------
    dict
        Keys: ``model_name``, ``train_r2``, ``val_r2``, ``test_r2``, ``rmse``,
        ``mae``, ``mape`` (the last three on the test split),
        ``training_time`` (seconds), ``model_object`` (the fitted model) and
        ``predictions`` (test-split predictions).
    """
    import time

    print(f'\n=== {model_name} ===')

    fit_kwargs = fit_kwargs or {}

    # Train model. Every supplied fit keyword is forwarded; previously they
    # were silently ignored unless 'eval_set' happened to be one of them.
    # perf_counter is monotonic, so the timing cannot be skewed by clock changes.
    fit_started = time.perf_counter()
    model.fit(X_train, y_train, **fit_kwargs)
    training_time = time.perf_counter() - fit_started

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)

    # Metrics: R² on every split, error metrics on the test split only
    train_r2 = r2_score(y_train, y_pred_train)
    val_r2 = r2_score(y_val, y_pred_val)
    test_r2 = r2_score(y_test, y_pred_test)

    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mape = mean_absolute_percentage_error(y_test, y_pred_test)

    results = {
        'model_name': model_name,
        'train_r2': train_r2,
        'val_r2': val_r2,
        'test_r2': test_r2,
        'rmse': test_rmse,
        'mae': test_mae,
        'mape': test_mape,
        'training_time': training_time,
        'model_object': model,
        'predictions': y_pred_test
    }

    print(f'  Training time: {training_time:.2f}s')
    print(f'  R² (train): {train_r2:.4f}')
    print(f'  R² (val):   {val_r2:.4f}')
    print(f'  R² (test):  {test_r2:.4f}')
    print(f'  RMSE:       ${test_rmse:,.0f}')
    print(f'  MAE:        ${test_mae:,.0f}')
    print(f'  MAPE:       {test_mape*100:.2f}%')

    # Overfitting check: flag a train/validation R² gap above 0.05
    if train_r2 - val_r2 > 0.05:
        print(f'  ⚠ Potential overfitting detected')

    return results

# Collects one result dict per evaluated model
advanced_results = []

In [None]:
# Section 6: LightGBM Implementation
print('=== LIGHTGBM ADVANCED IMPLEMENTATION ===')

# Booster configuration; passed to LGBMRegressor together with a ceiling of
# 2000 boosting rounds.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    min_child_samples=20,
    lambda_l1=0.1,
    lambda_l2=0.1,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
)

lgb_model = lgb.LGBMRegressor(n_estimators=2000, **lgb_params)

# The validation split is handed to fit() as the eval set and drives the
# early-stopping callback (100 rounds); log_evaluation(0) is presumably there
# to silence per-iteration logging.
stop_early = lgb.early_stopping(100)
quiet_log = lgb.log_evaluation(0)
fit_kwargs_lgb = {
    'eval_set': [(X_val, y_val)],
    'callbacks': [stop_early, quiet_log],
}

lgb_results = advanced_evaluate_model(
    lgb_model,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    "LightGBM",
    fit_kwargs_lgb,
)
advanced_results.append(lgb_results)

In [None]:
# Section 7: CatBoost Implementation
print('=== CATBOOST ADVANCED IMPLEMENTATION ===')

# CatBoost regressor configuration. Note that the feature matrix built in
# Section 4 is purely numeric, so no categorical columns are passed here.
catboost_settings = dict(
    iterations=2000,
    learning_rate=0.01,
    depth=8,
    l2_leaf_reg=3,
    subsample=0.8,
    colsample_bylevel=0.8,
    random_seed=42,
    logging_level='Silent',
    use_best_model=True,
    eval_metric='RMSE',
)
catboost_model = cb.CatBoostRegressor(**catboost_settings)

# The validation split is supplied as the eval set, with early stopping
# configured at 100 rounds.
fit_kwargs_cb = dict(
    eval_set=(X_val, y_val),
    early_stopping_rounds=100,
    verbose=False,
)

cb_results = advanced_evaluate_model(
    catboost_model,
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    "CatBoost",
    fit_kwargs_cb,
)
advanced_results.append(cb_results)

In [None]:
# Section 8: Neural Network Implementation
print('=== NEURAL NETWORK IMPLEMENTATION ===')

# Preprocessing for neural networks: standardise the features
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the training split only and is then
# applied unchanged to all three splits.
scaler_nn = StandardScaler()
scaler_nn.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_nn.transform(split) for split in (X_train, X_val, X_test)
)

# Build neural network architecture
def create_neural_network(input_dim):
    """Build and compile the fully connected regression network.

    The network is a stack of ReLU Dense layers (512, 256, 128, 64, 32 units)
    followed by a single-unit linear output. The first three stages are
    followed by batch normalisation and dropout, the fourth by dropout only.

    Parameters
    ----------
    input_dim : int
        Number of input features.

    Returns
    -------
    A compiled ``keras.Sequential`` model (Adam, learning rate 0.001, MSE
    loss, MAE metric).
    """
    # (units, add batch-norm?, dropout rate or None) for each hidden stage
    hidden_stages = [
        (512, True, 0.3),
        (256, True, 0.3),
        (128, True, 0.2),
        (64, False, 0.2),
        (32, False, None),
    ]

    stack = []
    for position, (units, with_batch_norm, dropout_rate) in enumerate(hidden_stages):
        # Only the very first layer declares the input shape
        first_layer_args = {'input_shape': (input_dim,)} if position == 0 else {}
        stack.append(layers.Dense(units, activation='relu', **first_layer_args))
        if with_batch_norm:
            stack.append(layers.BatchNormalization())
        if dropout_rate is not None:
            stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Dense(1))  # Output layer

    network = keras.Sequential(stack)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return network

# Create and train neural network
import time

nn_model = create_neural_network(X_train_scaled.shape[1])

# Callbacks for training: stop after 20 epochs without val_loss improvement
# (restoring the best weights) and halve the learning rate after 10.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=10, min_lr=1e-7
)

# NOTE(review): the target is passed to the network unscaled while the loss is
# MSE; if ClosePrice is in raw currency units, scaling or log-transforming the
# target would likely help convergence - confirm before relying on these scores.
print('Training Neural Network...')
nn_fit_started = time.perf_counter()
history = nn_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=0
)
# Measured wall-clock fit time (previously reported as a constant 0)
nn_training_time = time.perf_counter() - nn_fit_started

print(f'Training completed after {len(history.history["loss"])} epochs')


def _nn_predict(features):
    """Return the network's predictions for ``features`` as a flat 1-D array."""
    return nn_model.predict(features, verbose=0).flatten()


# Evaluate neural network
y_pred_nn_test = _nn_predict(X_test_scaled)
y_pred_nn_val = _nn_predict(X_val_scaled)
y_pred_nn_train = _nn_predict(X_train_scaled)

nn_results = {
    'model_name': 'Neural Network',
    'train_r2': r2_score(y_train, y_pred_nn_train),
    'val_r2': r2_score(y_val, y_pred_nn_val),
    'test_r2': r2_score(y_test, y_pred_nn_test),
    'rmse': np.sqrt(mean_squared_error(y_test, y_pred_nn_test)),
    'mae': mean_absolute_error(y_test, y_pred_nn_test),
    'mape': mean_absolute_percentage_error(y_test, y_pred_nn_test),
    'training_time': nn_training_time,
    'model_object': nn_model,
    'predictions': y_pred_nn_test,
    'scaler': scaler_nn  # needed to preprocess inputs for this model
}

print(f"Neural Network Results:")
print(f"  Training time: {nn_training_time:.2f}s")
print(f"  R² (train): {nn_results['train_r2']:.4f}")
print(f"  R² (val):   {nn_results['val_r2']:.4f}")
print(f"  R² (test):  {nn_results['test_r2']:.4f}")
print(f"  RMSE:       ${nn_results['rmse']:,.0f}")

advanced_results.append(nn_results)

In [None]:
# Section 9: Ensemble Methods
print('=== ENSEMBLE METHODS ===')

from sklearn.linear_model import Ridge


class PrefitAveragingRegressor:
    """Average the predictions of regressors that are already fitted.

    scikit-learn's VotingRegressor clones and refits every member in ``fit``.
    For the boosted models that refit would run without an eval set, so their
    early stopping / best-iteration selection would not apply. This wrapper
    reuses the fitted members as they are.

    Parameters
    ----------
    estimators : list of (name, fitted_estimator) tuples
    weights : sequence of float, optional
        Per-member weights; a plain mean is used when omitted.
    """

    def __init__(self, estimators, weights=None):
        self.estimators = list(estimators)
        self.weights = weights

    def fit(self, X, y=None, **fit_kwargs):
        """Do nothing (members are already fitted); return ``self``."""
        return self

    def predict(self, X):
        """Return the (weighted) mean of the members' predictions for ``X``."""
        member_predictions = np.column_stack([
            np.asarray(estimator.predict(X)).ravel()
            for _, estimator in self.estimators
        ])
        return np.average(member_predictions, axis=1, weights=self.weights)


# Create ensemble of best models
print('Creating ensemble model...')

# Fit the ridge model (a simple linear member for diversity)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Use the trained models (without refitting)
ensemble_models = [
    ('lightgbm', lgb_results['model_object']),
    ('catboost', cb_results['model_object']),
    ('ridge', ridge_model),
]

# Equal-weight averaging ensemble; fit() inside the evaluation helper is a no-op.
# NOTE: the class lives in this notebook, so a joblib dump of this object can
# only be loaded where PrefitAveragingRegressor is defined.
voting_ensemble = PrefitAveragingRegressor(ensemble_models)

voting_results = advanced_evaluate_model(
    voting_ensemble, X_train, X_val, X_test, y_train, y_val, y_test,
    "Voting Ensemble"
)
advanced_results.append(voting_results)

# Weighted ensemble (manual)
print('Creating weighted ensemble...')

lgb_fitted = lgb_results['model_object']
cb_fitted = cb_results['model_object']

# Weights proportional to validation R². Negative (or NaN) scores are clipped
# to zero first - otherwise a member that performs worse than a constant
# predictor would receive a negative weight and distort the normalisation.
val_scores = [lgb_results['val_r2'], cb_results['val_r2'], nn_results['val_r2']]
clipped_scores = np.clip(np.nan_to_num(np.asarray(val_scores, dtype=float), nan=0.0), 0.0, None)
if clipped_scores.sum() > 0:
    weights = clipped_scores / clipped_scores.sum()  # Normalize to sum to 1
else:
    # No member has a positive validation R²: fall back to equal weights
    weights = np.full(len(clipped_scores), 1.0 / len(clipped_scores))


def _blend(lgb_part, cb_part, nn_part):
    """Combine the three members' predictions using ``weights``."""
    return weights[0] * lgb_part + weights[1] * cb_part + weights[2] * nn_part


lgb_pred = lgb_fitted.predict(X_test)
cb_pred = cb_fitted.predict(X_test)
nn_pred = y_pred_nn_test

weighted_pred = _blend(lgb_pred, cb_pred, nn_pred)
weighted_val_pred = _blend(lgb_fitted.predict(X_val), cb_fitted.predict(X_val), y_pred_nn_val)
weighted_train_pred = _blend(lgb_fitted.predict(X_train), cb_fitted.predict(X_train), y_pred_nn_train)

weighted_r2 = r2_score(y_test, weighted_pred)
weighted_rmse = np.sqrt(mean_squared_error(y_test, weighted_pred))
weighted_mape = mean_absolute_percentage_error(y_test, weighted_pred)

print(f"\nWeighted Ensemble Results:")
print(f"  Weights: LightGBM={weights[0]:.3f}, CatBoost={weights[1]:.3f}, NN={weights[2]:.3f}")
print(f"  R² (test):  {weighted_r2:.4f}")
print(f"  RMSE:       ${weighted_rmse:,.0f}")
print(f"  MAPE:       {weighted_mape*100:.2f}%")

weighted_results = {
    'model_name': 'Weighted Ensemble',
    'train_r2': r2_score(y_train, weighted_train_pred),
    # Slightly optimistic: the weights themselves were derived from this split
    'val_r2': r2_score(y_val, weighted_val_pred),
    'test_r2': weighted_r2,
    'rmse': weighted_rmse,
    'mae': mean_absolute_error(y_test, weighted_pred),
    'mape': weighted_mape,
    'training_time': 0,  # nothing is trained; members are reused
    'model_object': None,
    'predictions': weighted_pred,
    'weights': weights
}

advanced_results.append(weighted_results)

In [None]:
# Section 10: Results Analysis and Comparison
print('=== ADVANCED METHODS RESULTS ANALYSIS ===')

# One table row per evaluated model
result_rows = []
for result in advanced_results:
    result_rows.append({
        'Model': result['model_name'],
        'R² (Train)': result.get('train_r2', np.nan),
        'R² (Val)': result.get('val_r2', np.nan),
        'R² (Test)': result['test_r2'],
        'RMSE ($)': result['rmse'],
        'MAE ($)': result['mae'],
        'MAPE (%)': result['mape'] * 100,
        'Training Time (s)': result['training_time'],
    })
results_df_advanced = pd.DataFrame(result_rows)

# Rank by test R², best model first
results_df_advanced = results_df_advanced.sort_values('R² (Test)', ascending=False)
results_df_advanced = results_df_advanced.reset_index(drop=True)

print("\nAdvanced Methods Performance Ranking:")
print(results_df_advanced.round(4).to_string(index=False))

# The top row of the ranking is the best advanced model
best_advanced = results_df_advanced.iloc[0]
print(f'\nBest Advanced Model: {best_advanced["Model"]}')
print(f'  R² Score: {best_advanced["R² (Test)"]:.4f}')
print(f'  RMSE: ${best_advanced["RMSE ($)"]:,.0f}')
print(f'  MAPE: {best_advanced["MAPE (%)"]:.2f}%')

# Compare against the target established in Section 2
print(f'\n=== PERFORMANCE COMPARISON ===')
print(f'Target to exceed: {target_r2:.4f}')
print(f'Best advanced:    {best_advanced["R² (Test)"]:.4f}')

improvement = best_advanced["R² (Test)"] - target_r2
verdict = (
    f'✓ IMPROVEMENT ACHIEVED: +{improvement:.4f} R² points'
    if improvement > 0
    else f'✗ Target not exceeded: {improvement:.4f} R² points'
)
print(verdict)

In [None]:
# Section 11: Advanced Visualizations and Model Interpretation
print('=== ADVANCED VISUALIZATIONS AND INTERPRETABILITY ===')

# Comprehensive performance visualization: a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# 1. Model Performance Comparison (horizontal bars, target as a dashed line)
ax1 = axes[0, 0]
models = results_df_advanced['Model']
r2_scores = results_df_advanced['R² (Test)']
colors = plt.cm.viridis(np.linspace(0, 1, len(models)))

bars = ax1.barh(models, r2_scores, color=colors)
ax1.axvline(x=target_r2, color='red', linestyle='--', alpha=0.7, label=f'Target: {target_r2:.3f}')
ax1.set_xlabel('R² Score')
ax1.set_title('Advanced Methods - R² Score Comparison')
ax1.legend()

# Print each score just past the end of its bar
for bar in bars:
    bar_length = bar.get_width()
    ax1.text(bar_length + 0.001, bar.get_y() + bar.get_height()/2,
             f'{bar_length:.4f}', ha='left', va='center', fontsize=10)

# 2. RMSE vs R² scatter
ax2 = axes[0, 1]
rmse_values = results_df_advanced['RMSE ($)'] / 1000
ax2.scatter(r2_scores, rmse_values, c=range(len(models)),
            s=150, cmap='viridis', alpha=0.7, edgecolors='black')

for model, r2_value, rmse_value in zip(models, r2_scores, rmse_values):
    ax2.annotate(model, (r2_value, rmse_value),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

ax2.set_xlabel('R² Score')
ax2.set_ylabel('RMSE (Thousands $)')
ax2.set_title('R² vs RMSE Trade-off')
ax2.grid(True, alpha=0.3)

# 3. Training/Validation/Test R² comparison
# Missing train/validation scores are drawn as zero-height bars.
ax3 = axes[0, 2]
train_r2 = results_df_advanced['R² (Train)'].fillna(0)
val_r2 = results_df_advanced['R² (Val)'].fillna(0)
test_r2 = results_df_advanced['R² (Test)']

x = np.arange(len(models))
width = 0.25

ax3.bar(x - width, train_r2, width, label='Train', alpha=0.8)
ax3.bar(x, val_r2, width, label='Validation', alpha=0.8)
ax3.bar(x + width, test_r2, width, label='Test', alpha=0.8)

ax3.set_xlabel('Models')
ax3.set_ylabel('R² Score')
ax3.set_title('Train/Validation/Test R² Comparison')
ax3.set_xticks(x)
ax3.set_xticklabels(models, rotation=45, ha='right')
ax3.legend()
ax3.grid(True, alpha=0.3)

# 4. Best model predictions scatter plot
ax4 = axes[1, 0]
best_model_name = best_advanced['Model']
best_predictions = next(
    (result['predictions'] for result in advanced_results
     if result['model_name'] == best_model_name),
    None,
)

if best_predictions is not None:
    ax4.scatter(y_test, best_predictions, alpha=0.6, c='blue', edgecolors='none')
    # Dashed diagonal marks perfect predictions
    ax4.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    ax4.set_xlabel('Actual ClosePrice')
    ax4.set_ylabel('Predicted ClosePrice')
    ax4.set_title(f'{best_model_name}: Actual vs Predicted')

    # Add R² text
    ax4.text(0.05, 0.95, f'R² = {best_advanced["R² (Test)"]:.4f}',
             transform=ax4.transAxes, fontsize=12, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# 5. Feature importance (for tree-based models)
ax5 = axes[1, 1]
try:
    # Try to get feature importance from LightGBM
    lgb_model_obj = lgb_results['model_object']
    feature_imp = pd.DataFrame({
        'feature': X_selected.columns,
        'importance': lgb_model_obj.feature_importances_
    }).sort_values('importance', ascending=False).head(15)

    ax5.barh(range(len(feature_imp)), feature_imp['importance'])
    ax5.set_yticks(range(len(feature_imp)))
    ax5.set_yticklabels(feature_imp['feature'])
    ax5.set_xlabel('Feature Importance')
    ax5.set_title('Top 15 Feature Importances (LightGBM)')
    ax5.invert_yaxis()
except Exception as exc:
    # Best-effort panel: keep the rest of the figure, but report the reason
    # instead of discarding it silently.
    print(f'Feature importance panel skipped: {exc}')
    ax5.text(0.5, 0.5, 'Feature importance\nnot available',
             ha='center', va='center', transform=ax5.transAxes)
    ax5.set_title('Feature Importance')

# 6. Model complexity vs performance
# Complexity scores are hand-assigned, subjective ranks (default 2).
ax6 = axes[1, 2]
model_complexity = {
    'Neural Network': 4,  # Deep model
    'LightGBM': 3,       # Gradient boosting
    'CatBoost': 3,       # Gradient boosting
    'Voting Ensemble': 3.5,  # Ensemble
    'Weighted Ensemble': 4.5  # Complex ensemble
}

complexity_scores = [model_complexity.get(model, 2) for model in models]
ax6.scatter(complexity_scores, r2_scores, s=150, c=colors, alpha=0.7)

for model, complexity, r2_value in zip(models, complexity_scores, r2_scores):
    ax6.annotate(model, (complexity, r2_value),
                 xytext=(5, 5), textcoords='offset points', fontsize=9)

ax6.set_xlabel('Model Complexity')
ax6.set_ylabel('R² Score')
ax6.set_title('Model Complexity vs Performance')
ax6.grid(True, alpha=0.3)

plt.tight_layout()

# Save visualization BEFORE showing it: once plt.show() has displayed (and,
# depending on the backend, released) the figure, a later plt.savefig() call
# typically writes an empty image. Saving via the figure object also avoids
# relying on which figure is "current".
MODELS_DIR.mkdir(parents=True, exist_ok=True)
advanced_plot_path = MODELS_DIR / 'advanced_methods_analysis.png'
fig.savefig(advanced_plot_path, dpi=300, bbox_inches='tight')
print(f'Advanced analysis plot saved to: {advanced_plot_path}')

plt.show()

In [None]:
# Section 12: Save Advanced Results and Models
print('=== SAVING ADVANCED RESULTS AND MODELS ===')

# Make sure the output directory exists before anything is written to it
MODELS_DIR.mkdir(parents=True, exist_ok=True)


def _float_or_none(value):
    """Return ``value`` as a float, or None when it is missing (None or NaN)."""
    if value is None:
        return None
    number = float(value)
    return None if np.isnan(number) else number


# Compile comprehensive results
advanced_summary = {
    'evaluation_timestamp': datetime.now().isoformat(),
    'dataset_info': {
        'total_samples': len(data),
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'features_engineered': X_selected.shape[1],
        'target_variable': 'ClosePrice'
    },
    'baseline_comparison': {
        'previous_best_r2': float(target_r2),
        'advanced_best_r2': float(best_advanced['R² (Test)']),
        'improvement': float(best_advanced['R² (Test)'] - target_r2)
    },
    'best_model': {
        'name': best_advanced['Model'],
        'r2_score': float(best_advanced['R² (Test)']),
        'rmse': float(best_advanced['RMSE ($)']),
        'mae': float(best_advanced['MAE ($)']),
        # The results table stores MAPE as a percentage; store a fraction here
        'mape': float(best_advanced['MAPE (%)']) / 100
    },
    'all_results': [
        {
            'model_name': result['model_name'],
            'test_r2': float(result['test_r2']),
            # A result may carry val_r2=None (key present, value missing);
            # float(None) would raise, so it is written as JSON null instead.
            'val_r2': _float_or_none(result.get('val_r2')),
            'rmse': float(result['rmse']),
            'mae': float(result['mae']),
            'mape': float(result['mape']),
            'training_time': float(result['training_time'])
        }
        for result in advanced_results
    ]
}

# Save summary
advanced_summary_path = MODELS_DIR / 'advanced_methods_summary.json'
with open(advanced_summary_path, 'w') as f:
    json.dump(advanced_summary, f, indent=2)

# Save the best model
best_model_result = next(result for result in advanced_results
                        if result['model_name'] == best_advanced['Model'])

if best_model_result['model_object'] is None:
    # Some entries (e.g. a manually blended ensemble) have no single model object
    print(f'No single model object to save for: {best_advanced["Model"]}')
elif best_advanced['Model'] == 'Neural Network':
    # Save neural network model
    nn_model_path = MODELS_DIR / 'best_neural_network.h5'
    best_model_result['model_object'].save(nn_model_path)

    # Save scaler (required to preprocess inputs for the network)
    scaler_path = MODELS_DIR / 'neural_network_scaler.joblib'
    joblib.dump(best_model_result['scaler'], scaler_path)
    print(f'Neural network saved to: {nn_model_path}')
    print(f'Scaler saved to: {scaler_path}')
else:
    # Save other models
    best_model_path = MODELS_DIR / f'best_advanced_{best_advanced["Model"].lower().replace(" ", "_")}.joblib'
    joblib.dump(best_model_result['model_object'], best_model_path)
    print(f'Best model saved to: {best_model_path}')

# Save results DataFrame
results_csv_path = MODELS_DIR / 'advanced_methods_results.csv'
results_df_advanced.to_csv(results_csv_path, index=False)

# Save feature names for reproducibility
feature_names_path = MODELS_DIR / 'advanced_feature_names.json'
with open(feature_names_path, 'w') as f:
    json.dump(X_selected.columns.tolist(), f, indent=2)

print(f'Advanced summary saved to: {advanced_summary_path}')
print(f'Results table saved to: {results_csv_path}')
print(f'Feature names saved to: {feature_names_path}')

print(f'\n=== ADVANCED METHODS EVALUATION COMPLETE ===')
print(f'Best performing model: {best_advanced["Model"]}')
print(f'Achieved R² = {best_advanced["R² (Test)"]:.4f}')
print(f'Improvement over target: {best_advanced["R² (Test)"] - target_r2:+.4f}')

if best_advanced["R² (Test)"] > target_r2:
    print(f'SUCCESS! Advanced methods exceeded the target performance!')
else:
    print(f'Target not achieved. Consider further hyperparameter tuning.')

print(f'\nAll advanced models and artifacts saved to: {MODELS_DIR}')