4 | 4 | on preprocessed data and calculate comprehensive regression metrics. The module |
5 | 5 | is designed to work with data that has already been preprocessed (scaled and imputed) |
6 | 6 | by the preprocessing pipeline. |
| 7 | +
| 8 | +Additionally, this module provides model persistence functionality to save and load |
| 9 | +trained models along with their preprocessing pipelines and metadata for reproducibility. |
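| | + |
| | +A typical round trip (a sketch: trained_model, fitted_pipeline, and X_new stand in |
| | +for a model from train_model, a fitted preprocessing pipeline, and new raw data): |
| | + |
| | +>>> save_model(trained_model, fitted_pipeline, |
| | +...            feature_names=['age', 'income'], target_name='price', |
| | +...            save_path='models/model.joblib') |
| | +>>> bundle = load_model('models/model.joblib') |
| | +>>> preds = bundle['model'].predict(bundle['pipeline'].transform(X_new)) |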
7 | 10 | """ |
8 | 11 |
9 | 12 | import pandas as pd |
10 | 13 | import numpy as np |
| 14 | +import joblib |
| 15 | +import pickle |
| 17 | +import platform |
| 18 | +import warnings |
| 19 | +from datetime import datetime |
| 20 | +from pathlib import Path |
11 | 21 | from sklearn.linear_model import LinearRegression |
12 | 22 | from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error |
13 | | -from typing import Dict, Tuple |
| 23 | +from sklearn.pipeline import Pipeline |
| 24 | +from typing import Dict, Tuple, List, Any |
| 25 | +import sklearn |
14 | 26 |
15 | 27 |
16 | 28 | def train_model(X: pd.DataFrame, y: pd.Series) -> Tuple[LinearRegression, Dict[str, float]]: |
@@ -180,3 +192,330 @@ def _validate_inputs(X: pd.DataFrame, y: pd.Series) -> None: |
180 | 192 | f"Insufficient data: only {X.shape[0]} sample(s) provided. " |
181 | 193 | "At least 2 samples are required for Linear Regression." |
182 | 194 | ) |
| 195 | + |
| 196 | + |
| 197 | +def _get_feature_names_out(pipeline: Pipeline, original_features: List[str]) -> List[str]: |
| 198 | + """ |
| 199 | + Extract feature names after preprocessing pipeline transformation. |
| 200 | + |
| 201 | + This helper function attempts to retrieve the feature names that result from |
| 202 | + applying the preprocessing pipeline. This is particularly important for pipelines |
| 203 | +    that include transformations like one-hot encoding, which generate new feature names. |
| 204 | + |
| 205 | + Parameters |
| 206 | + ---------- |
| 207 | + pipeline : Pipeline |
| 208 | + A fitted preprocessing pipeline. |
| 209 | + original_features : List[str] |
| 210 | + Original feature names before preprocessing. |
| 211 | + |
| 212 | + Returns |
| 213 | + ------- |
| 214 | + List[str] |
| 215 | + Feature names after preprocessing transformation. If feature names cannot |
| 216 | + be extracted (e.g., pipeline doesn't support get_feature_names_out), returns |
| 217 | + the original feature names. |
| 218 | + |
| 219 | + Notes |
| 220 | + ----- |
| 221 | +    - Attempts to use the get_feature_names_out() method if available (sklearn >= 1.0) |
| 222 | + - Falls back to original feature names if method is not available |
| 223 | + - Useful for understanding what features the model is actually using |
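| | + |
| | +    Examples |
| | +    -------- |
| | +    A minimal sketch, assuming a pipeline whose steps all support |
| | +    get_feature_names_out (most sklearn transformers from 1.0 onward): |
| | + |
| | +    >>> from sklearn.preprocessing import StandardScaler |
| | +    >>> pipe = Pipeline([('scaler', StandardScaler())]) |
| | +    >>> _ = pipe.fit(pd.DataFrame({'age': [1, 2], 'income': [3, 4]})) |
| | +    >>> _get_feature_names_out(pipe, ['age', 'income'])  # -> ['age', 'income'] |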
| 224 | + """ |
| 225 | + try: |
| 226 | + # Try to get feature names from the pipeline |
| 227 | + if hasattr(pipeline, 'get_feature_names_out'): |
| 228 | + feature_names = pipeline.get_feature_names_out() |
| 229 | + return list(feature_names) |
| 230 | + except Exception: |
| 231 | + # If anything fails, fall back to original names |
| 232 | + pass |
| 233 | + |
| 234 | + # Fallback to original feature names |
| 235 | + return original_features |
| 236 | + |
| 237 | + |
| 238 | +def save_model( |
| 239 | + model: LinearRegression, |
| 240 | + pipeline: Pipeline, |
| 241 | + feature_names: List[str], |
| 242 | + target_name: str, |
| 243 | + save_path: str |
| 244 | +) -> None: |
| 245 | + """ |
| 246 | + Save a trained model, preprocessing pipeline, and metadata to disk. |
| 247 | + |
| 248 | + This function serializes a trained LinearRegression model along with its |
| 249 | + preprocessing pipeline and comprehensive metadata for reproducibility. The |
| 250 | + saved file can be loaded later using load_model() to recreate the exact |
| 251 | + training environment. |
| 252 | + |
| 253 | + File Format |
| 254 | + ----------- |
| 255 | + The saved file is a dictionary containing three keys: |
| 256 | + - 'model': The trained LinearRegression object |
| 257 | + - 'pipeline': The fitted preprocessing Pipeline object |
| 258 | + - 'metadata': Dictionary with training information including: |
| 259 | + - 'original_feature_names': Feature names before preprocessing |
| 260 | + - 'transformed_feature_names': Feature names after preprocessing |
| 261 | + - 'target_name': Name of the target variable |
| 262 | + - 'training_timestamp': ISO 8601 formatted timestamp |
| 263 | + - 'sklearn_version': Version of scikit-learn used for training |
| 264 | + - 'python_version': Python version used for training |
| 265 | + |
| 266 | + Parameters |
| 267 | + ---------- |
| 268 | + model : LinearRegression |
| 269 | + Trained LinearRegression model to save. |
| 270 | + pipeline : Pipeline |
| 271 | + Fitted preprocessing pipeline used to transform training data. |
| 272 | + feature_names : List[str] |
| 273 | + Original feature names (before preprocessing transformations). |
| 274 | + target_name : str |
| 275 | + Name of the target variable/column. |
| 276 | + save_path : str |
| 277 | + File path where the model bundle should be saved. Parent directories |
| 278 | + will be created if they don't exist. |
| 279 | + |
| 280 | + Raises |
| 281 | + ------ |
| 282 | + TypeError |
| 283 | + If model is not a LinearRegression instance or pipeline is not a Pipeline. |
| 284 | + ValueError |
| 285 | + If feature_names is empty or target_name is empty/None. |
| 286 | + OSError |
| 287 | + If the save path directory cannot be created or file cannot be written. |
| 288 | + |
| 289 | + Examples |
| 290 | + -------- |
| 291 | + >>> from sklearn.linear_model import LinearRegression |
| 292 | + >>> from sklearn.pipeline import Pipeline |
| 293 | + >>> |
| 294 | + >>> # Assume model and pipeline are already trained |
| 295 | + >>> save_model( |
| 296 | + ... model=trained_model, |
| 297 | + ... pipeline=fitted_pipeline, |
| 298 | + ... feature_names=['age', 'income', 'credit_score'], |
| 299 | + ... target_name='loan_amount', |
| 300 | + ... save_path='models/my_model.joblib' |
| 301 | + ... ) |
| 302 | + >>> print("Model saved successfully!") |
| 303 | + |
| 304 | + Notes |
| 305 | + ----- |
| 306 | + - Uses joblib for efficient serialization of sklearn objects |
| 307 | + - Saved files are cross-platform compatible |
| 308 | + - File extension .joblib or .pkl is recommended but not enforced |
| 309 | + - Metadata enables version compatibility checks during loading |
| 310 | + - Parent directories are created automatically if they don't exist |
| 311 | + - Original feature names are preserved to ensure correct column ordering during prediction |
| 312 | + """ |
| 313 | + # Validate inputs |
| 314 | + if not isinstance(model, LinearRegression): |
| 315 | + raise TypeError( |
| 316 | + f"model must be a LinearRegression instance, got {type(model).__name__} instead." |
| 317 | + ) |
| 318 | + |
| 319 | + if not isinstance(pipeline, Pipeline): |
| 320 | + raise TypeError( |
| 321 | + f"pipeline must be a Pipeline instance, got {type(pipeline).__name__} instead." |
| 322 | + ) |
| 323 | + |
| 324 | +    if not feature_names: |
| 325 | + raise ValueError("feature_names cannot be empty.") |
| 326 | + |
| 327 | + if not target_name or not isinstance(target_name, str): |
| 328 | + raise ValueError("target_name must be a non-empty string.") |
| 329 | + |
| 330 | + # Create metadata dictionary |
| 331 | + metadata = { |
| 332 | + 'original_feature_names': feature_names, |
| 333 | + 'transformed_feature_names': _get_feature_names_out(pipeline, feature_names), |
| 334 | + 'target_name': target_name, |
| 335 | + 'training_timestamp': datetime.now().isoformat(), |
| 336 | + 'sklearn_version': sklearn.__version__, |
| 337 | + 'python_version': platform.python_version() |
| 338 | + } |
| 339 | + |
| 340 | + # Create model bundle |
| 341 | + model_bundle = { |
| 342 | + 'model': model, |
| 343 | + 'pipeline': pipeline, |
| 344 | + 'metadata': metadata |
| 345 | + } |
| 346 | + |
| 347 | + # Create parent directories if they don't exist |
| 348 | + save_path_obj = Path(save_path) |
| 349 | + save_path_obj.parent.mkdir(parents=True, exist_ok=True) |
| 350 | + |
| 351 | + # Save using joblib |
| 352 | + try: |
| 353 | + joblib.dump(model_bundle, save_path) |
| 354 | + except Exception as e: |
| 355 | + raise OSError(f"Failed to save model to {save_path}: {str(e)}") from e |
| 356 | + |
| 357 | + |
| 358 | +def _check_version_compatibility(saved_sklearn_version: str) -> None: |
| 359 | + """ |
| 360 | + Check version compatibility and issue warnings if versions differ. |
| 361 | + |
| 362 | + Compares the current scikit-learn version with the version used to train |
| 363 | + the saved model. Issues a warning if there's a mismatch, as this could |
| 364 | + potentially lead to compatibility issues or different prediction results. |
| 365 | + |
| 366 | + Parameters |
| 367 | + ---------- |
| 368 | + saved_sklearn_version : str |
| 369 | + The scikit-learn version string from the saved model metadata. |
| 370 | + |
| 371 | + Notes |
| 372 | + ----- |
| 373 | + - Uses Python's warnings module to issue version mismatch warnings |
| 374 | + - Warnings are issued at the UserWarning level |
| 375 | + - Major version differences are more likely to cause issues than minor ones |
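| | + |
| | +    Examples |
| | +    -------- |
| | +    A sketch of capturing the warning; "0.24.2" is a hypothetical old version: |
| | + |
| | +    >>> with warnings.catch_warnings(record=True) as caught: |
| | +    ...     warnings.simplefilter("always") |
| | +    ...     _check_version_compatibility("0.24.2") |
| | +    >>> len(caught)  # 1 if the running sklearn version differs |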
| 376 | + """ |
| 377 | + current_version = sklearn.__version__ |
| 378 | + |
| 379 | + if current_version != saved_sklearn_version: |
| 380 | + warnings.warn( |
| 381 | + f"Scikit-learn version mismatch: Model was trained with version " |
| 382 | + f"{saved_sklearn_version}, but current version is {current_version}. " |
| 383 | + f"This may lead to compatibility issues or different prediction results.", |
| 384 | + UserWarning, |
| 385 | + stacklevel=3 |
| 386 | + ) |
| 387 | + |
| 388 | + |
| 389 | +def load_model(load_path: str) -> Dict[str, Any]: |
| 390 | + """ |
| 391 | + Load a saved model, preprocessing pipeline, and metadata from disk. |
| 392 | + |
| 393 | + This function deserializes a model bundle previously saved with save_model(), |
| 394 | + validates its structure, and checks for version compatibility. The returned |
| 395 | + dictionary contains the model, pipeline, and metadata needed for making |
| 396 | + predictions on new data. |
| 397 | + |
| 398 | + Parameters |
| 399 | + ---------- |
| 400 | + load_path : str |
| 401 | + File path to the saved model bundle (.joblib or .pkl file). |
| 402 | + |
| 403 | + Returns |
| 404 | + ------- |
| 405 | + Dict[str, Any] |
| 406 | + Dictionary containing three keys: |
| 407 | + - 'model': The trained LinearRegression object |
| 408 | + - 'pipeline': The fitted preprocessing Pipeline object |
| 409 | + - 'metadata': Dictionary with training information: |
| 410 | + - 'original_feature_names': Feature names before preprocessing |
| 411 | + - 'transformed_feature_names': Feature names after preprocessing |
| 412 | + - 'target_name': Name of the target variable |
| 413 | + - 'training_timestamp': ISO 8601 formatted timestamp |
| 414 | + - 'sklearn_version': Version of scikit-learn used for training |
| 415 | + - 'python_version': Python version used for training |
| 416 | + |
| 417 | + Raises |
| 418 | + ------ |
| 419 | + FileNotFoundError |
| 420 | + If the specified load_path does not exist. |
| 421 | + ValueError |
| 422 | + If the loaded object doesn't have the expected structure (missing keys). |
| 423 | + EOFError |
| 424 | + If the file is corrupted or truncated. |
| 425 | + pickle.UnpicklingError |
| 426 | + If the file cannot be deserialized (corrupted or incompatible format). |
| 427 | + |
| 428 | +    Warns |
| 429 | +    ----- |
| 430 | + UserWarning |
| 431 | + Issued if the scikit-learn version differs from the one used for training. |
| 432 | + |
| 433 | + Examples |
| 434 | + -------- |
| 435 | + >>> # Load a previously saved model |
| 436 | + >>> model_bundle = load_model('models/my_model.joblib') |
| 437 | + >>> |
| 438 | + >>> # Extract components |
| 439 | + >>> model = model_bundle['model'] |
| 440 | + >>> pipeline = model_bundle['pipeline'] |
| 441 | + >>> metadata = model_bundle['metadata'] |
| 442 | + >>> |
| 443 | + >>> # Check metadata |
| 444 | + >>> print(f"Model trained on: {metadata['training_timestamp']}") |
| 445 | + >>> print(f"Features: {metadata['original_feature_names']}") |
| 446 | + >>> print(f"Target: {metadata['target_name']}") |
| 447 | + >>> |
| 448 | + >>> # Use for predictions |
| 449 | + >>> X_new_preprocessed = pipeline.transform(X_new) |
| 450 | + >>> predictions = model.predict(X_new_preprocessed) |
| 451 | + |
| 452 | + Notes |
| 453 | + ----- |
| 454 | + - The model bundle must have been created using save_model() function |
| 455 | + - Version compatibility warnings help identify potential issues |
| 456 | + - The pipeline is already fitted and ready to transform new data |
| 457 | + - Original feature names help ensure correct column ordering |
| 458 | + - Cross-platform compatible (can load models saved on different OS) |
| 459 | + """ |
| 460 | + # Check if file exists |
| 461 | + if not Path(load_path).exists(): |
| 462 | + raise FileNotFoundError( |
| 463 | + f"Model file not found at path: {load_path}. " |
| 464 | + f"Please check that the file exists and the path is correct." |
| 465 | + ) |
| 466 | + |
| 467 | + # Load the model bundle |
| 468 | + try: |
| 469 | + model_bundle = joblib.load(load_path) |
| 470 | + except EOFError as e: |
| 471 | + raise EOFError( |
| 472 | + f"Failed to load model from {load_path}: File appears to be corrupted or truncated. " |
| 473 | + f"The file may have been incompletely written or damaged." |
| 474 | + ) from e |
| 475 | + except pickle.UnpicklingError as e: |
| 476 | + raise pickle.UnpicklingError( |
| 477 | + f"Failed to deserialize model from {load_path}: File format is invalid or incompatible. " |
| 478 | + f"The file may be corrupted or created with an incompatible version." |
| 479 | + ) from e |
| 480 | + except Exception as e: |
| 481 | + raise RuntimeError( |
| 482 | + f"Unexpected error loading model from {load_path}: {str(e)}" |
| 483 | + ) from e |
| 484 | + |
| 485 | + # Validate structure |
| 486 | + if not isinstance(model_bundle, dict): |
| 487 | + raise ValueError( |
| 488 | + f"Loaded object is not a dictionary. Expected a model bundle with " |
| 489 | + f"'model', 'pipeline', and 'metadata' keys, but got {type(model_bundle).__name__}." |
| 490 | + ) |
| 491 | + |
| 492 | + required_keys = {'model', 'pipeline', 'metadata'} |
| 493 | + missing_keys = required_keys - set(model_bundle.keys()) |
| 494 | + |
| 495 | + if missing_keys: |
| 496 | + raise ValueError( |
| 497 | + f"Model bundle is missing required keys: {missing_keys}. " |
| 498 | + f"Expected keys: {required_keys}. Found keys: {set(model_bundle.keys())}." |
| 499 | + ) |
| 500 | + |
| 501 | + # Validate metadata structure |
| 502 | + metadata = model_bundle['metadata'] |
| 503 | + required_metadata_keys = { |
| 504 | + 'original_feature_names', |
| 505 | + 'transformed_feature_names', |
| 506 | + 'target_name', |
| 507 | + 'training_timestamp', |
| 508 | + 'sklearn_version' |
| 509 | + } |
| 510 | + missing_metadata_keys = required_metadata_keys - set(metadata.keys()) |
| 511 | + |
| 512 | + if missing_metadata_keys: |
| 513 | + raise ValueError( |
| 514 | + f"Metadata is missing required keys: {missing_metadata_keys}. " |
| 515 | + f"The model bundle may have been created with an older version of this module." |
| 516 | + ) |
| 517 | + |
| 518 | + # Check version compatibility |
| 519 | + _check_version_compatibility(metadata['sklearn_version']) |
| 520 | + |
| 521 | + return model_bundle |