feat(model): add linear regression training with comprehensive metrics and validation

artemisTurintech · artemisTurintech · commit d65b52a65f54 · 2025-10-23T06:40:37.000Z
diff --git a/model.py b/model.py
@@ -0,0 +1,182 @@
+"""Model training module for Linear Regression.
+
+This module provides functionality to train a scikit-learn LinearRegression model
+on preprocessed data and calculate comprehensive regression metrics. The module
+is designed to work with data that has already been preprocessed (scaled and imputed)
+by the preprocessing pipeline.
+"""
+
+import pandas as pd
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
+from typing import Dict, Tuple
+
+
+def train_model(X: pd.DataFrame, y: pd.Series) -> Tuple[LinearRegression, Dict[str, float]]:
+    """
+    Fit an ordinary least squares model and score it on its own training data.
+
+    The inputs are first checked by ``_validate_inputs``. A scikit-learn
+    ``LinearRegression`` is then fitted on ``X`` and ``y``, predictions are
+    generated for the same ``X``, and four regression metrics are computed by
+    comparing those predictions with ``y``.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+        Preprocessed (already scaled/imputed) feature matrix of shape
+        (n_samples, n_features). Must be non-empty and contain no NaN values.
+    y : pd.Series
+        Numerical target values, one per row of ``X``. Must be non-empty and
+        contain no NaN values.
+
+    Returns
+    -------
+    Tuple[LinearRegression, Dict[str, float]]
+        A ``(model, metrics)`` pair:
+
+        - model (LinearRegression): the fitted estimator, exposing the learned
+          ``.coef_`` and ``.intercept_`` attributes
+        - metrics (Dict[str, float]): built-in floats keyed by:
+            - 'r2': R² (coefficient of determination) score
+            - 'mse': Mean Squared Error
+            - 'rmse': Root Mean Squared Error (square root of MSE)
+            - 'mae': Mean Absolute Error
+
+    Raises
+    ------
+    TypeError
+        Propagated from ``_validate_inputs`` when X is not a pandas DataFrame
+        or y is not a pandas Series.
+    ValueError
+        Propagated from ``_validate_inputs`` when X or y are empty, contain NaN
+        values, have mismatched lengths, or provide fewer than 2 samples.
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from sklearn.datasets import make_regression
+
+    >>> # Generate synthetic data
+    >>> X_array, y_array = make_regression(
+    ...     n_samples=100, n_features=3, noise=10, random_state=42
+    ... )
+    >>> X = pd.DataFrame(X_array, columns=['feature1', 'feature2', 'feature3'])
+    >>> y = pd.Series(y_array, name='target')
+
+    >>> # Train model and read back the metric names
+    >>> model, metrics = train_model(X, y)
+    >>> sorted(metrics)
+    ['mae', 'mse', 'r2', 'rmse']
+
+    >>> # Every metric value is a built-in float
+    >>> all(isinstance(value, float) for value in metrics.values())
+    True
+
+    >>> # One coefficient is learned per feature column
+    >>> model.coef_.shape
+    (3,)
+
+    Notes
+    -----
+    - Performs no file I/O and changes no global state
+    - Input data must be preprocessed (scaled/imputed) before calling this function
+    - Metrics are calculated on the training data itself, not on held-out data
+    - For very small datasets or perfect fits, some metrics may be extreme values
+    - The model uses ordinary least squares (OLS) estimation
+    """
+    # Reject unusable inputs before any fitting work is done
+    _validate_inputs(X, y)
+
+    # fit() returns the estimator itself, so construction and training chain
+    regressor = LinearRegression().fit(X, y)
+
+    # In-sample predictions: the same rows the model was trained on
+    fitted_values = regressor.predict(X)
+
+    # RMSE is derived from MSE, so MSE is computed once and reused
+    squared_error = mean_squared_error(y, fitted_values)
+
+    # Cast every score to a built-in float for the returned mapping
+    scores = {
+        'r2': float(r2_score(y, fitted_values)),
+        'mse': float(squared_error),
+        'rmse': float(np.sqrt(squared_error)),
+        'mae': float(mean_absolute_error(y, fitted_values)),
+    }
+
+    return regressor, scores
+
+
+def _validate_inputs(X: pd.DataFrame, y: pd.Series) -> None:
+    """
+    Validate input data for model training.
+
+    Checks run in order and the first failure raises: types, emptiness (rows of
+    X, values of y, columns of X), NaN values, matching sample counts, and a
+    minimum of 2 samples.
+
+    Parameters
+    ----------
+    X : pd.DataFrame
+        Features DataFrame to validate.
+    y : pd.Series
+        Target Series to validate.
+
+    Raises
+    ------
+    TypeError
+        If X is not a pandas DataFrame or y is not a pandas Series.
+    ValueError
+        If X has no rows or no columns, y has no values, X or y contain NaN
+        values, the sample counts differ, or fewer than 2 samples are given.
+    """
+    # Check types
+    if not isinstance(X, pd.DataFrame):
+        raise TypeError(
+            f"X must be a pandas DataFrame, got {type(X).__name__} instead."
+        )
+    if not isinstance(y, pd.Series):
+        raise TypeError(
+            f"y must be a pandas Series, got {type(y).__name__} instead."
+        )
+
+    # Check emptiness per axis. `X.empty` is deliberately avoided: it is True
+    # when either axis has length 0, so a frame with rows but no columns would
+    # be reported as having "no rows" and the column check would never run.
+    n_samples, n_features = X.shape
+    if n_samples == 0:
+        raise ValueError("X DataFrame is empty (no rows).")
+    if len(y) == 0:
+        raise ValueError("y Series is empty (no values).")
+    if n_features == 0:
+        raise ValueError("X DataFrame has no columns (no features).")
+
+    # Check for NaN values; the per-column mask is computed once and reused
+    nan_by_column = X.isna().any()
+    if nan_by_column.any():
+        nan_columns = X.columns[nan_by_column].tolist()
+        raise ValueError(
+            f"X contains NaN values. Columns with NaN: {nan_columns}. "
+            "Please preprocess the data to handle missing values."
+        )
+
+    if y.isna().any():
+        raise ValueError(
+            "y contains NaN values. Please preprocess the data to handle missing values."
+        )
+
+    # Check shape matching
+    if n_samples != len(y):
+        raise ValueError(
+            f"Shape mismatch: X has {n_samples} samples but y has {len(y)} samples. "
+            "X and y must have the same number of samples."
+        )
+
+    # Check for at least 2 samples (minimum for regression)
+    if n_samples < 2:
+        raise ValueError(
+            f"Insufficient data: only {n_samples} sample(s) provided. "
+            "At least 2 samples are required for Linear Regression."
+        )