# Dynamic Pricing Model: Ridge Regression with Hyperparameter Tuning and MLflow Tracking

This notebook demonstrates a complete machine learning workflow for a dynamic pricing problem, including feature selection, hyperparameter tuning using `GridSearchCV` on a **Ridge Regression** model, and meticulous logging of all parameters, metrics, and the final model using **MLflow**.

**Prerequisites:** This notebook assumes the required libraries are installed (`pandas`, `numpy`, `scikit-learn`, `mlflow`, `matplotlib`) and that the data file `9e05bb74-2719-45b0-bbc7-0ced2f61a442_dynamicpricingmodelingprepareddata.csv` is available in the current working directory. Note that the package is installed as `scikit-learn` but imported as `sklearn`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.models.signature import infer_signature
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import os

# --- Configuration and Setup ---

# Input CSV (resolved relative to the current working directory), the column
# the model predicts, and the MLflow experiment under which runs are recorded.
FILE_NAME = "9e05bb74-2719-45b0-bbc7-0ced2f61a442_dynamicpricingmodelingprepareddata.csv"
TARGET = "LogSellingPrice"
EXPERIMENT_NAME = "Dynamic_Pricing_Model_Ridge_Tuning"

# Predictor columns passed to the model, one per line. The order is kept
# as-is because it determines the column order of the training frame.
FEATURE_COLS = [
    "UnitsSold",
    "StockStart",
    "Demand",
    "Backorders",
    "StockEnd",
    "ReorderPoint",
    "OrderPlaced",
    "OrderQty",
    "LeadTimeFloat",
    "SafetyStock",
    "CTR",
    "AbandonedCartRate",
    "BounceRate",
    "FunnelDrop_ViewToCart",
    "FunnelDrop_CartToCheckout",
    "ReturningVisitorRatio",
    "AvgSessionDuration_sec",
    "DiscountRate_Nirma",
    "DiscountRate_Surf Excel",
    "FinalPrice_Nirma",
    "FinalPrice_Surf Excel",
]

# Select the experiment before any run is started so runs land under it.
mlflow.set_experiment(EXPERIMENT_NAME)

# --- 1. Data Preparation ---

def load_and_split_data(file_name, features, target, *, test_size=0.2, random_state=42):
    """Load the prepared CSV and split it into train and test sets.

    Args:
        file_name: Path of the CSV file to read.
        features: Names of the columns used as model inputs.
        target: Name of the column to predict.
        test_size: Share of rows held out for testing. Defaults to 0.2,
            i.e. an 80/20 train/test split.
        random_state: Seed passed to the splitter so the split is
            reproducible. Defaults to 42.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If any requested feature or the target column is not
            present in the file.
    """
    print("Loading data...")
    df = pd.read_csv(file_name)

    # Normalise to a list: a tuple passed to df[...] would be read as one key.
    feature_cols = list(features)

    # Fail early with a message naming every absent column, rather than
    # letting the column selection raise somewhere less obvious.
    missing = [col for col in [*feature_cols, target] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {file_name!r}: {missing}")

    X = df[feature_cols]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print(f"Data split: Training size={len(X_train)}, Testing size={len(X_test)}")
    return X_train, X_test, y_train, y_test

# Build the train/test partitions used by the rest of the notebook
X_train, X_test, y_train, y_test = load_and_split_data(
    file_name=FILE_NAME,
    features=FEATURE_COLS,
    target=TARGET,
)

# Preview the first training rows (kept as the trailing expression)
X_train.head()

## 2. Evaluation Helper Functions

In [None]:
def calculate_metrics(y_test, y_pred, X_test):
    """Score a set of predictions against the true target values.

    ``X_test`` is only used for its shape: row count and column count feed
    the adjusted R-squared formula.

    Returns:
        Dict with the keys ``rmse``, ``mae``, ``r2``, ``mse`` and ``adj_r2``.
        ``adj_r2`` is negative infinity when there are too few samples for
        the adjustment to be defined (samples - features - 1 <= 0).
    """
    squared_error = mean_squared_error(y_test, y_pred)
    root_squared_error = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(y_test, y_pred)
    r_squared = r2_score(y_test, y_pred)

    n_samples = X_test.shape[0]
    n_features = X_test.shape[1]

    # Residual degrees of freedom; must be positive to divide by it below.
    dof = n_samples - n_features - 1
    if dof > 0:
        adjusted_r_squared = 1 - (1 - r_squared) * (n_samples - 1) / dof
    else:
        adjusted_r_squared = -float('inf')

    scores = {}
    scores['rmse'] = root_squared_error
    scores['mae'] = absolute_error
    scores['r2'] = r_squared
    scores['mse'] = squared_error
    scores['adj_r2'] = adjusted_r_squared
    return scores

def create_residual_plot(y_test, y_pred, plot_path="residual_plot.png"):
    """Save a residuals-versus-predictions scatter plot as an image file.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, in the same order as ``y_test``.
        plot_path: Destination file for the image.

    Returns:
        ``plot_path``, unchanged, so the caller can pass it on.
    """
    # Work on flat arrays so the subtraction is positional; subtracting two
    # pandas objects would align on index labels instead of row order.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    residuals = actual - predicted

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.scatter(predicted, residuals, alpha=0.6)
        # Zero-residual reference line across the range of predictions.
        ax.hlines(
            0,
            predicted.min(),
            predicted.max(),
            colors='red',
            linestyles='--',
            linewidth=2,
        )
        ax.set_xlabel("Predicted Log Selling Price", fontsize=12)
        ax.set_ylabel("Residuals (Actual - Predicted)", fontsize=12)
        ax.set_title("Residual Plot: Assessing Model Bias", fontsize=14)
        ax.grid(True, linestyle=':', alpha=0.7)
        # Use the figure's own methods rather than pyplot's "current figure",
        # so the figure that gets saved is always the one built here.
        fig.tight_layout()
        fig.savefig(plot_path)
    finally:
        # Release the figure even if drawing or saving failed.
        plt.close(fig)
    return plot_path

## 3. Hyperparameter Tuning and MLflow Tracking

In [None]:
def train_and_log_model(X_train, X_test, y_train, y_test, features):
    """Tune Ridge's ``alpha`` by grid search and log the best model to MLflow.

    A 3-fold ``GridSearchCV`` scored by R2 is fitted on the training data.
    The best estimator is then evaluated on the test set, and one MLflow run
    records its parameters, test metrics, a residual plot and the model
    itself. Individual grid candidates are not logged as separate runs.

    Relies on the module-level ``EXPERIMENT_NAME`` and ``TARGET`` constants
    and on the ``calculate_metrics`` and ``create_residual_plot`` helpers.

    Args:
        X_train: Training feature frame.
        X_test: Test feature frame.
        y_train: Training target values.
        y_test: Test target values.
        features: Names of the feature columns; logged as run parameters.

    Returns:
        The MLflow run ID of the run that holds the best model.
    """
    print(f"Starting experiment: {EXPERIMENT_NAME}")

    # Model definition: Ridge Regression
    ridge_model = Ridge(random_state=42)

    # Hyperparameter search space for alpha
    param_grid = {'alpha': [0.1, 1.0, 10.0, 100.0]}

    # Set up GridSearchCV for cross-validation
    grid_search = GridSearchCV(
        estimator=ridge_model,
        param_grid=param_grid,
        scoring='r2',
        cv=3,
        n_jobs=-1,
        return_train_score=False
    )

    print("Beginning Grid Search for best 'alpha' parameter...")
    grid_search.fit(X_train, y_train)

    # --- Extract Best Model Details ---
    best_alpha = grid_search.best_params_['alpha']
    best_score = grid_search.best_score_
    best_model = grid_search.best_estimator_

    # Predict with the best model on the held-out test set
    y_pred = best_model.predict(X_test)
    test_metrics = calculate_metrics(y_test, y_pred, X_test)

    # --- Start the FINAL Run for the BEST Model ---
    with mlflow.start_run(run_name=f"Ridge_Best_Alpha={best_alpha}") as run:
        print(f"\nLogging BEST model run: {run.info.run_id}")

        # Log Model Parameters
        mlflow.log_param("model_type", "Ridge Regression (Regularized Linear Model)")
        mlflow.log_param("best_alpha", best_alpha)
        # Kept as a formatted param for backward compatibility with earlier runs.
        mlflow.log_param("cv_best_score_r2", f"{best_score:.4f}")
        mlflow.log_param("feature_count", len(features))
        mlflow.log_param("target_variable", TARGET)

        feature_engineering = (
            "No scaling applied; feature selection based on numerical non-date columns."
        )
        mlflow.log_param("feature_engineering_details", feature_engineering)
        mlflow.log_param("features_list", features)

        # Log Evaluation Metrics
        mlflow.log_metrics(test_metrics)
        # Also record the CV score as a numeric metric so runs can be
        # compared and sorted on it; a string param cannot be.
        mlflow.log_metric("cv_best_score_r2", float(best_score))
        print(f"Logged Test Set RMSE: {test_metrics['rmse']:.4f}")
        print(f"Logged Test Set Adjusted R2: {test_metrics['adj_r2']:.4f}")

        # Create and Log Artifact (Residual Plot)
        plot_path = create_residual_plot(y_test, y_pred)
        mlflow.log_artifact(plot_path)
        print(f"Logged artifact: {plot_path}")

        # Define and Log Input-Output Signature
        signature = infer_signature(X_test, y_pred)

        # Log the Final Trained Model
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="ridge_model",
            signature=signature,
            # Pass the one-row DataFrame itself. DataFrame.to_dict() yields a
            # nested {column: {index: value}} mapping, which does not match
            # the column-based signature inferred above.
            input_example=X_test.head(1)
        )
        print("Logged final Ridge model with signature and input example.")

    return run.info.run_id

# --- Execution ---
final_run_id = train_and_log_model(X_train, X_test, y_train, y_test, FEATURE_COLS)

# Print the whole summary in one call; sep="\n" puts each item on its own line.
print(
    "\n-------------------------------------------------",
    "MLflow Execution Summary",
    f"Experiment: {EXPERIMENT_NAME}",
    f"Final Best Run ID: {final_run_id}",
    "All parameters, metrics, the model, and a residual plot have been logged.",
    "View the results in the MLflow UI by navigating to this Run ID.",
    "-------------------------------------------------",
    sep="\n",
)

# The plot was already logged as a run artifact, so remove the local copy
if os.path.exists("residual_plot.png"):
    os.remove("residual_plot.png")
