
Commit 1a9da27

feat(cli): implement predict subcommand with model inference and feature name consistency
Add predict subcommand with model loading, feature validation, prediction generation, and output formatting. Fix feature naming to use the pipeline's get_feature_names_out() so training and prediction stay consistent.
1 parent 6f7f64e commit 1a9da27
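
For context, the naming fix described above can be sketched in isolation: synthetic names like feature_0, feature_1, ... depend only on the column count, while get_feature_names_out() on the fitted pipeline yields names that can be reproduced identically at prediction time. The toy ColumnTransformer and column names below are illustrative assumptions, not this repository's actual preprocessing setup.

# Sketch only: why transformed columns are named via the fitted pipeline.
# The columns ("age", "city") and this ColumnTransformer are assumed for
# illustration; they are not taken from this repository.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

train_df = pd.DataFrame({"age": [25, 32, 47], "city": ["NY", "SF", "NY"]})

fitted_pipeline = ColumnTransformer([
    ("num", StandardScaler(), ["age"]),
    ("cat", OneHotEncoder(), ["city"]),
])
X_transformed = fitted_pipeline.fit_transform(train_df)

# Old behaviour: positional names that prediction code can only recreate
# by counting columns again.
synthetic_names = [f'feature_{i}' for i in range(X_transformed.shape[1])]  # ['feature_0', 'feature_1', 'feature_2']

# New behaviour: names reported by the fitted pipeline itself, so training
# and prediction agree as long as both use the same pipeline object.
pipeline_names = fitted_pipeline.get_feature_names_out().tolist()          # ['num__age', 'cat__city_NY', 'cat__city_SF']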

File tree

2 files changed: +178 -5 lines changed


cli.py

Lines changed: 168 additions & 4 deletions
@@ -96,9 +96,11 @@ def train(input_path, target_column, output_model_path, report_path):
         click.echo(f"✓ Preprocessing complete: {X_transformed.shape[1]} transformed features")

         # Convert transformed data to DataFrame for model training
+        # Use feature names from pipeline to ensure consistency with prediction
+        feature_names_from_pipeline = fitted_pipeline.get_feature_names_out().tolist()
         X_transformed_df = pd.DataFrame(
             X_transformed,
-            columns=[f'feature_{i}' for i in range(X_transformed.shape[1])]
+            columns=feature_names_from_pipeline
         )
     except Exception as e:
         click.echo(f"✗ Error during preprocessing: {str(e)}", err=True)
@@ -264,14 +266,176 @@ def train(input_path, target_column, output_model_path, report_path):
 )
 def predict(model_path, input_path, output_path):
     """Make predictions using a trained model."""
-    # Placeholder function - print arguments for now
     click.echo("=== Predict Command ===")
     click.echo(f"Model Path: {model_path}")
     click.echo(f"Input CSV: {input_path}")
     click.echo(f"Output Path: {output_path}")
+    click.echo("")

-    # TODO: Add validation for output path (check if writable)
-    # TODO: Implement actual prediction logic
+    try:
+        # Step 1: Load model bundle
+        click.echo("Step 1: Loading model...")
+        try:
+            from model import load_model, predict as make_predictions
+            model_bundle = load_model(model_path)
+
+            # Extract components
+            model = model_bundle['model']
+            pipeline = model_bundle['pipeline']
+            metadata = model_bundle['metadata']
+            expected_feature_names = metadata['original_feature_names']
+            target_name = metadata['target_name']
+
+            click.echo(f"✓ Model loaded successfully")
+            click.echo(f" - Target variable: '{target_name}'")
+            click.echo(f" - Expected features: {len(expected_feature_names)}")
+            click.echo(f" - Trained on: {metadata.get('training_timestamp', 'N/A')}")
+        except FileNotFoundError as e:
+            click.echo(f"✗ Error: Model file not found: {model_path}", err=True)
+            click.echo(" Suggestion: Check that the model file path is correct and the file exists.", err=True)
+            raise click.Abort()
+        except (EOFError, ValueError) as e:
+            click.echo(f"✗ Error loading model: {str(e)}", err=True)
+            click.echo(" Suggestion: The model file may be corrupted or incompatible.", err=True)
+            raise click.Abort()
+        except Exception as e:
+            click.echo(f"✗ Unexpected error loading model: {str(e)}", err=True)
+            raise click.Abort()
+
+        # Step 2: Load input CSV
+        click.echo("\nStep 2: Loading input data...")
+        try:
+            input_df = pd.read_csv(input_path)
+
+            if input_df.empty:
+                click.echo(f"✗ Error: Input CSV file is empty: {input_path}", err=True)
+                click.echo(" Suggestion: Ensure the CSV file contains data rows.", err=True)
+                raise click.Abort()
+
+            click.echo(f"✓ Input data loaded successfully: {input_df.shape[0]} samples, {input_df.shape[1]} features")
+        except FileNotFoundError as e:
+            click.echo(f"✗ Error: Input file not found: {input_path}", err=True)
+            click.echo(" Suggestion: Check that the input file path is correct and the file exists.", err=True)
+            raise click.Abort()
+        except pd.errors.EmptyDataError as e:
+            click.echo(f"✗ Error: Input CSV file is empty: {input_path}", err=True)
+            click.echo(" Suggestion: Ensure the CSV file contains data.", err=True)
+            raise click.Abort()
+        except Exception as e:
+            click.echo(f"✗ Error reading input CSV: {str(e)}", err=True)
+            raise click.Abort()
+
+        # Step 3: Validate features and make predictions
+        click.echo("\nStep 3: Making predictions...")
+        try:
+            predictions = make_predictions(
+                model=model,
+                preprocessing_pipeline=pipeline,
+                raw_features_df=input_df,
+                expected_feature_names=expected_feature_names
+            )
+            click.echo(f"✓ Predictions generated successfully: {len(predictions)} predictions")
+        except ValueError as e:
+            error_msg = str(e)
+            if "Missing:" in error_msg:
+                click.echo(f"✗ Error: Feature mismatch between input data and trained model", err=True)
+                click.echo(f" {error_msg}", err=True)
+                click.echo(" Suggestion: Ensure the input CSV contains all required feature columns.", err=True)
+            else:
+                click.echo(f"✗ Error: {error_msg}", err=True)
+            raise click.Abort()
+        except Exception as e:
+            click.echo(f"✗ Error making predictions: {str(e)}", err=True)
+            raise click.Abort()
+
+        # Step 4: Create output DataFrame
+        click.echo("\nStep 4: Creating output file...")
+        try:
+            # Check if predictions contain all NaN values
+            if predictions.isna().all():
+                click.echo(f"✗ Error: All predictions are NaN", err=True)
+                click.echo(" Suggestion: Check that input data contains valid numeric values.", err=True)
+                raise click.Abort()
+
+            # Create output DataFrame with original data + predictions
+            output_df = input_df.copy()
+            prediction_column_name = f"predicted_{target_name}"
+            output_df[prediction_column_name] = predictions.values
+
+            click.echo(f"✓ Output DataFrame created with column '{prediction_column_name}'")
+        except Exception as e:
+            click.echo(f"✗ Error creating output DataFrame: {str(e)}", err=True)
+            raise click.Abort()
+
+        # Step 5: Save output CSV
+        click.echo("\nStep 5: Saving predictions...")
+        try:
+            # Create parent directories if they don't exist
+            output_path_obj = Path(output_path)
+            output_path_obj.parent.mkdir(parents=True, exist_ok=True)
+
+            output_df.to_csv(output_path, index=False)
+            click.echo(f"✓ Predictions saved to: {output_path}")
+        except PermissionError as e:
+            click.echo(f"✗ Error: Permission denied writing to: {output_path}", err=True)
+            click.echo(" Suggestion: Check that you have write permissions for the output path.", err=True)
+            raise click.Abort()
+        except Exception as e:
+            click.echo(f"✗ Error saving predictions: {str(e)}", err=True)
+            click.echo(" Suggestion: Check that the output path is valid and writable.", err=True)
+            raise click.Abort()
+
+        # Step 6: Calculate and print summary statistics
+        click.echo("\nStep 6: Calculating summary statistics...")
+        try:
+            # Calculate statistics, excluding NaN values
+            valid_predictions = predictions.dropna()
+
+            if len(valid_predictions) == 0:
+                click.echo(f"⚠ Warning: All predictions are NaN, cannot calculate statistics", err=True)
+            else:
+                stats = {
+                    'count': len(valid_predictions),
+                    'mean': float(valid_predictions.mean()),
+                    'median': float(valid_predictions.median()),
+                    'std': float(valid_predictions.std()),
+                    'min': float(valid_predictions.min()),
+                    'max': float(valid_predictions.max())
+                }
+
+                click.echo("✓ Summary statistics calculated")
+                click.echo(f" - Count: {stats['count']}")
+                click.echo(f" - Mean: {stats['mean']:.4f}")
+                click.echo(f" - Median: {stats['median']:.4f}")
+                click.echo(f" - Std Dev: {stats['std']:.4f}")
+                click.echo(f" - Min: {stats['min']:.4f}")
+                click.echo(f" - Max: {stats['max']:.4f}")
+        except Exception as e:
+            click.echo(f"✗ Error calculating statistics: {str(e)}", err=True)
+            # Don't abort here, predictions are already saved
+
+        # Step 7: Print success message
+        click.echo("\n" + "=" * 60)
+        click.echo("🎉 Prediction completed successfully!")
+        click.echo("=" * 60)
+        click.echo(f"\n📊 Prediction Summary:")
+        click.echo(f" - Output file: {output_path}")
+        click.echo(f" - Number of predictions: {len(predictions)}")
+        if len(valid_predictions) > 0:
+            click.echo(f" - Prediction column: '{prediction_column_name}'")
+            click.echo(f"\n📈 Statistics:")
+            click.echo(f" - Mean: {stats['mean']:.4f}")
+            click.echo(f" - Median: {stats['median']:.4f}")
+            click.echo(f" - Range: [{stats['min']:.4f}, {stats['max']:.4f}]")
+        click.echo("")
+
+    except click.Abort:
+        # Already handled above
+        raise
+    except Exception as e:
+        click.echo(f"\n✗ Unexpected error: {str(e)}", err=True)
+        click.echo(" Please check the error message above for details.", err=True)
+        raise click.Abort()


 if __name__ == '__main__':
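
Step 1 above unpacks a model bundle with 'model', 'pipeline', and 'metadata' keys, where the metadata carries original_feature_names, target_name, and training_timestamp. As a rough sketch of a bundle that load_model() could return, assuming joblib-based persistence (the actual save/load helpers live in model.py and are not part of this diff):

# Sketch only: a bundle shaped like the one the predict command unpacks.
# joblib persistence and this helper's name/signature are assumptions; the
# real save/load logic in model.py is not shown in this diff.
from datetime import datetime, timezone

import joblib

def save_model_bundle(model, pipeline, original_feature_names, target_name, path):
    bundle = {
        "model": model,
        "pipeline": pipeline,
        "metadata": {
            "original_feature_names": list(original_feature_names),
            "target_name": target_name,
            "training_timestamp": datetime.now(timezone.utc).isoformat(),
        },
    }
    joblib.dump(bundle, path)

# Loading then mirrors what Step 1 reads back:
#   bundle = joblib.load(path)
#   bundle["model"], bundle["pipeline"], bundle["metadata"]["original_feature_names"], ...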

model.py

Lines changed: 10 additions & 1 deletion
@@ -655,9 +655,18 @@ def predict(
             "Check that input data contains valid numeric values."
         )

+    # Convert preprocessed features to DataFrame with feature names to avoid sklearn warning
+    # Get the transformed feature names from the pipeline
+    transformed_feature_names = _get_feature_names_out(preprocessing_pipeline, expected_feature_names)
+    preprocessed_df = pd.DataFrame(
+        preprocessed_features,
+        columns=transformed_feature_names,
+        index=raw_features_df.index
+    )
+
     # Generate predictions
     try:
-        predictions_array = model.predict(preprocessed_features)
+        predictions_array = model.predict(preprocessed_df)
     except Exception as e:
         raise RuntimeError(
             f"Failed to generate predictions: {str(e)}"
