Merge pull request #61 from turintech/feat/matplotlib-visualization-plots

chrystalla92 · web-flow · commit a2316b1f7682 · 2025-10-23T10:34:03.000+03:00
Add visualizations module with predictions and residuals plotting functions
diff --git a/visualizations.py b/visualizations.py
@@ -0,0 +1,315 @@
+"""Visualization module for model evaluation and feature analysis.
+
+This module provides matplotlib-based visualization functions for evaluating
+linear regression model performance and understanding feature importance. No
+function shows or saves a plot; each returns a matplotlib Figure created via
+pyplot, so close it with plt.close(fig) once it is no longer needed.
+"""
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure
+from typing import Union, List
+
+
+def create_predictions_plot(
+    y_actual: Union[np.ndarray, pd.Series],
+    y_predicted: Union[np.ndarray, pd.Series]
+) -> Figure:
+    """
+    Create a scatter plot comparing actual vs predicted target values.
+    
+    This function generates a scatter plot with actual values on the x-axis and
+    predicted values on the y-axis. An ideal fit line (y=x diagonal) is included
+    to visualize model accuracy. Points close to the diagonal indicate good
+    predictions, while deviations show prediction errors.
+    
+    Parameters
+    ----------
+    y_actual : Union[np.ndarray, pd.Series]
+        Actual (true) target values from the dataset.
+    y_predicted : Union[np.ndarray, pd.Series]
+        Predicted target values from the model.
+    
+    Returns
+    -------
+    Figure
+        Matplotlib Figure object containing the predictions scatter plot.
+        The figure can be saved, displayed, or embedded in reports.
+    
+    Raises
+    ------
+    ValueError
+        If y_actual and y_predicted have different lengths or are empty.
+    
+    Examples
+    --------
+    >>> import numpy as np
+    >>> y_actual = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+    >>> y_predicted = np.array([1.1, 2.2, 2.9, 4.1, 4.8])
+    >>> fig = create_predictions_plot(y_actual, y_predicted)
+    >>> fig.savefig('predictions.png')
+    >>> plt.close(fig)
+    
+    Notes
+    -----
+    - Semi-transparent points (alpha=0.6) help visualize overlapping predictions
+    - The diagonal line represents perfect predictions (y_actual = y_predicted)
+    - Figure size is set to 10x6 inches for readability
+    - Use plt.close(fig) after use to prevent memory leaks
+    """
+    # Convert to numpy arrays for consistent handling
+    y_actual = np.asarray(y_actual)
+    y_predicted = np.asarray(y_predicted)
+    
+    # Validate inputs
+    _validate_arrays(y_actual, y_predicted)
+    
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+    
+    # Create scatter plot with semi-transparent points
+    ax.scatter(y_actual, y_predicted, alpha=0.6, color='steelblue', 
+               edgecolors='navy', linewidth=0.5, label='Predictions')
+    
+    # Add ideal fit line (y=x diagonal)
+    min_val = min(y_actual.min(), y_predicted.min())
+    max_val = max(y_actual.max(), y_predicted.max())
+    ax.plot([min_val, max_val], [min_val, max_val], 
+            'r--', linewidth=2, label='Ideal Fit (y=x)')
+    
+    # Labels and title
+    ax.set_xlabel('Actual Values', fontsize=12, fontweight='bold')
+    ax.set_ylabel('Predicted Values', fontsize=12, fontweight='bold')
+    ax.set_title('Actual vs Predicted Values', fontsize=14, fontweight='bold', pad=20)
+    
+    # Add legend and grid
+    ax.legend(loc='upper left', fontsize=10)
+    ax.grid(True, alpha=0.3, linestyle='--')
+    
+    # Ensure equal aspect for better visual interpretation
+    ax.set_aspect('equal', adjustable='box')
+    
+    # Prevent label cutoff
+    plt.tight_layout()
+    
+    return fig
+
+
+def create_residuals_plot(
+    y_predicted: Union[np.ndarray, pd.Series],
+    residuals: Union[np.ndarray, pd.Series]
+) -> Figure:
+    """
+    Create a residuals plot to assess model fit quality.
+    
+    This function generates a scatter plot of residuals (actual - predicted) versus
+    predicted values. A horizontal line at y=0 represents perfect predictions. Random
+    scatter around zero indicates a good model fit, while patterns suggest systematic
+    errors or model inadequacy.
+    
+    Parameters
+    ----------
+    y_predicted : Union[np.ndarray, pd.Series]
+        Predicted target values from the model.
+    residuals : Union[np.ndarray, pd.Series]
+        Residual values calculated as (actual - predicted).
+    
+    Returns
+    -------
+    Figure
+        Matplotlib Figure object containing the residuals plot.
+        The figure can be saved, displayed, or embedded in reports.
+    
+    Raises
+    ------
+    ValueError
+        If y_predicted and residuals have different lengths or are empty.
+    
+    Examples
+    --------
+    >>> import numpy as np
+    >>> y_predicted = np.array([1.1, 2.2, 2.9, 4.1, 4.8])
+    >>> residuals = np.array([-0.1, -0.2, 0.1, -0.1, 0.2])
+    >>> fig = create_residuals_plot(y_predicted, residuals)
+    >>> fig.savefig('residuals.png')
+    >>> plt.close(fig)
+    
+    Notes
+    -----
+    - Random scatter around y=0 indicates good model fit (homoscedasticity)
+    - Patterns (e.g., funnel shape) suggest heteroscedasticity or non-linearity
+    - Semi-transparent points (alpha=0.6) help visualize overlapping residuals
+    - Different color (coral) from predictions plot for visual distinction
+    - Figure size is set to 10x6 inches for readability
+    - Use plt.close(fig) after use to prevent memory leaks
+    """
+    # Convert to numpy arrays for consistent handling
+    y_predicted = np.asarray(y_predicted)
+    residuals = np.asarray(residuals)
+    
+    # Validate inputs
+    _validate_arrays(y_predicted, residuals)
+    
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+    
+    # Create scatter plot with semi-transparent points
+    ax.scatter(y_predicted, residuals, alpha=0.6, color='coral', 
+               edgecolors='darkred', linewidth=0.5, label='Residuals')
+    
+    # Add horizontal line at y=0
+    ax.axhline(y=0, color='black', linestyle='--', linewidth=2, 
+               label='Zero Line')
+    
+    # Labels and title
+    ax.set_xlabel('Predicted Values', fontsize=12, fontweight='bold')
+    ax.set_ylabel('Residuals (Actual - Predicted)', fontsize=12, fontweight='bold')
+    ax.set_title('Residuals Plot', fontsize=14, fontweight='bold', pad=20)
+    
+    # Add legend and grid
+    ax.legend(loc='upper left', fontsize=10)
+    ax.grid(True, alpha=0.3, linestyle='--')
+    
+    # Prevent label cutoff
+    plt.tight_layout()
+    
+    return fig
+
+
+def create_coefficients_plot(
+    feature_names: List[str],
+    coefficients: Union[np.ndarray, pd.Series, List[float]]
+) -> Figure:
+    """
+    Create a bar chart showing feature importance based on model coefficients.
+    
+    This function generates a horizontal bar chart of model coefficients sorted by
+    absolute value in descending order. Positive coefficients are shown in green
+    (positive correlation with target) and negative coefficients in red (negative
+    correlation), making it easy to identify the most important features and their
+    directional impact.
+    
+    Parameters
+    ----------
+    feature_names : List[str]
+        Names of the features corresponding to each coefficient.
+    coefficients : Union[np.ndarray, pd.Series, List[float]]
+        Model coefficients for each feature (e.g., from LinearRegression.coef_).
+    
+    Returns
+    -------
+    Figure
+        Matplotlib Figure object containing the coefficients bar chart.
+        The figure can be saved, displayed, or embedded in reports.
+    
+    Raises
+    ------
+    ValueError
+        If feature_names and coefficients have different lengths or are empty.
+    
+    Examples
+    --------
+    >>> feature_names = ['age', 'income', 'education', 'experience']
+    >>> coefficients = [0.5, 1.2, -0.3, 0.8]
+    >>> fig = create_coefficients_plot(feature_names, coefficients)
+    >>> fig.savefig('coefficients.png')
+    >>> plt.close(fig)
+    
+    Notes
+    -----
+    - Bars are sorted by absolute coefficient value (most important at top)
+    - Green bars indicate positive correlation with target
+    - Red bars indicate negative correlation with target
+    - Horizontal bar chart makes long feature names more readable
+    - Figure size is set to 10x6 inches for readability
+    - Use plt.close(fig) after use to prevent memory leaks
+    """
+    # Convert coefficients to numpy array for consistent handling
+    coefficients = np.asarray(coefficients)
+    
+    # Validate inputs
+    if len(feature_names) != len(coefficients):
+        raise ValueError(
+            f"Length mismatch: feature_names has {len(feature_names)} elements "
+            f"but coefficients has {len(coefficients)} elements. They must match."
+        )
+    
+    if len(feature_names) == 0:
+        raise ValueError("feature_names and coefficients cannot be empty.")
+    
+    # Create DataFrame for easier sorting
+    coef_df = pd.DataFrame({
+        'feature': feature_names,
+        'coefficient': coefficients
+    })
+    
+    # Sort by absolute value in descending order
+    coef_df['abs_coefficient'] = np.abs(coef_df['coefficient'])
+    coef_df = coef_df.sort_values('abs_coefficient', ascending=True)  # Ascending for horizontal bars
+    
+    # Create figure and axis
+    fig, ax = plt.subplots(figsize=(10, 6))
+    
+    # Create color array based on sign of coefficient
+    colors = ['green' if c > 0 else 'red' for c in coef_df['coefficient']]
+    
+    # Create horizontal bar chart
+    bars = ax.barh(coef_df['feature'], coef_df['coefficient'], color=colors, 
+                   alpha=0.7, edgecolor='black', linewidth=0.8)
+    
+    # Add vertical line at x=0
+    ax.axvline(x=0, color='black', linestyle='-', linewidth=1.5)
+    
+    # Labels and title
+    ax.set_xlabel('Coefficient Value', fontsize=12, fontweight='bold')
+    ax.set_ylabel('Feature Name', fontsize=12, fontweight='bold')
+    ax.set_title('Feature Importance (Model Coefficients)', fontsize=14, 
+                 fontweight='bold', pad=20)
+    
+    # Add grid for better readability
+    ax.grid(True, alpha=0.3, linestyle='--', axis='x')
+    
+    # Add legend
+    from matplotlib.patches import Patch
+    legend_elements = [
+        Patch(facecolor='green', alpha=0.7, edgecolor='black', label='Positive Impact'),
+        Patch(facecolor='red', alpha=0.7, edgecolor='black', label='Negative Impact')
+    ]
+    ax.legend(handles=legend_elements, loc='lower right', fontsize=10)
+    
+    # Prevent label cutoff
+    plt.tight_layout()
+    
+    return fig
+
+
+def _validate_arrays(
+    arr1: np.ndarray,
+    arr2: np.ndarray
+) -> None:
+    """
+    Validate that two arrays have the same length and are non-empty.
+    
+    Parameters
+    ----------
+    arr1 : np.ndarray
+        First array to validate.
+    arr2 : np.ndarray
+        Second array to validate.
+    
+    Raises
+    ------
+    ValueError
+        If arrays have different lengths or are empty.
+    """
+    if len(arr1) == 0 or len(arr2) == 0:
+        raise ValueError(
+            f"Arrays cannot be empty. Got lengths: {len(arr1)} and {len(arr2)}."
+        )
+    
+    if len(arr1) != len(arr2):
+        raise ValueError(
+            f"Arrays must have the same length. Got {len(arr1)} and {len(arr2)}."
+        )