feat(model): add predict function with input validation and preprocessing pipeline support

artemisTurintech · artemisTurintech · commit a54babaf36c6 · 2025-10-23T09:52:10.000Z
diff --git a/model.py b/model.py
@@ -519,3 +519,151 @@ def load_model(load_path: str) -> Dict[str, Any]:
     _check_version_compatibility(metadata['sklearn_version'])
     
     return model_bundle
+
+
+def predict(
+    model: LinearRegression,
+    preprocessing_pipeline: Pipeline,
+    raw_features_df: pd.DataFrame,
+    expected_feature_names: List[str]
+) -> pd.Series:
+    """
+    Generate predictions on new data using saved model and preprocessing.
+
+    This function takes raw input features and applies the same preprocessing
+    transformations that were used during training, then generates predictions
+    using the trained model. It validates that the input data has the correct
+    feature columns and handles column reordering to match the training data.
+
+    Parameters
+    ----------
+    model : LinearRegression
+        Trained LinearRegression model instance.
+    preprocessing_pipeline : Pipeline
+        Fitted sklearn Pipeline (imputer + scaler) that was used during training.
+        Must be already fitted on training data.
+    raw_features_df : pd.DataFrame
+        pandas DataFrame with raw feature data to make predictions on.
+        Must contain all expected feature columns (order doesn't matter).
+        May contain extra columns which will be ignored (with a UserWarning).
+    expected_feature_names : List[str]
+        List of feature column names from training data.
+        These are the columns the model expects to see.
+
+    Returns
+    -------
+    pd.Series
+        pandas Series named 'predictions', indexed to match the input DataFrame.
+
+    Raises
+    ------
+    TypeError
+        If inputs are not of the expected types.
+    ValueError
+        If feature columns don't match training data (missing columns).
+        If raw_features_df has no rows or expected_feature_names is empty.
+        If all features are NaN after preprocessing.
+    RuntimeError
+        If the preprocessing pipeline or the model fails; the original
+        exception is chained as ``__cause__``.
+
+    Examples
+    --------
+    >>> # Load a saved model
+    >>> model_bundle = load_model('models/my_model.joblib')
+    >>> model = model_bundle['model']
+    >>> pipeline = model_bundle['pipeline']
+    >>> feature_names = model_bundle['metadata']['original_feature_names']
+    >>>
+    >>> # Make predictions on new data
+    >>> predictions = predict(model, pipeline, new_data_df, feature_names)
+    >>> print(predictions)
+
+    Notes
+    -----
+    - The preprocessing pipeline is applied using transform() (not fit_transform!)
+    - Column order in raw_features_df doesn't matter; columns are reordered automatically
+    - Extra columns in the input are ignored
+    - Missing columns raise a clear error with details about what's missing
+    - The function preserves the index from the input DataFrame
+    - Empty DataFrames and all-NaN features raise ValueError rather than predicting
+    """
+    # Validate input types
+    if not isinstance(model, LinearRegression):
+        raise TypeError(
+            f"model must be a LinearRegression instance, got {type(model).__name__} instead."
+        )
+
+    if not isinstance(preprocessing_pipeline, Pipeline):
+        raise TypeError(
+            f"preprocessing_pipeline must be a Pipeline instance, got {type(preprocessing_pipeline).__name__} instead."
+        )
+
+    if not isinstance(raw_features_df, pd.DataFrame):
+        raise TypeError(
+            f"raw_features_df must be a pandas DataFrame, got {type(raw_features_df).__name__} instead."
+        )
+
+    if not isinstance(expected_feature_names, list):
+        raise TypeError(
+            f"expected_feature_names must be a list, got {type(expected_feature_names).__name__} instead."
+        )
+
+    # Check rows explicitly: DataFrame.empty is also True for a frame that has
+    # rows but no columns, which the missing-feature check below reports better.
+    if len(raw_features_df) == 0:
+        raise ValueError("raw_features_df is empty (no rows). Cannot make predictions on empty data.")
+
+    if not expected_feature_names:
+        raise ValueError("expected_feature_names is empty. Cannot validate features.")
+
+    # Validate feature columns
+    expected_set = set(expected_feature_names)
+    actual_set = set(raw_features_df.columns)
+    missing_features = expected_set - actual_set
+    extra_features = actual_set - expected_set
+
+    if missing_features:
+        # key=str keeps sorting from raising TypeError on mixed-type labels
+        raise ValueError(
+            f"Expected features: {sorted(expected_feature_names, key=str)}, "
+            f"got: {sorted(raw_features_df.columns.tolist(), key=str)}. "
+            f"Missing: {sorted(missing_features, key=str)}, "
+            f"extra: {sorted(extra_features, key=str)}"
+        )
+
+    # Warn about extra features if present
+    if extra_features:
+        warnings.warn(
+            f"Input data contains extra columns that will be ignored: {sorted(extra_features, key=str)}",
+            UserWarning,
+            stacklevel=2
+        )
+
+    # Select and reorder columns to match training data order
+    features_df = raw_features_df[expected_feature_names].copy()
+
+    # Apply preprocessing pipeline (using transform, not fit_transform!)
+    try:
+        preprocessed_features = preprocessing_pipeline.transform(features_df)
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to apply preprocessing pipeline: {e}. "
+            "Ensure the pipeline is fitted and compatible with the input data."
+        ) from e
+
+    # Check for all-NaN features after preprocessing
+    if np.isnan(preprocessed_features).all():
+        raise ValueError(
+            "All features are NaN after preprocessing. "
+            "Check that input data contains valid numeric values."
+        )
+
+    # Generate predictions
+    try:
+        predictions_array = model.predict(preprocessed_features)
+    except Exception as e:
+        raise RuntimeError(f"Failed to generate predictions: {e}") from e
+
+    # Return predictions as pandas Series with original index
+    return pd.Series(predictions_array, index=raw_features_df.index, name='predictions')