In [1]:
"""
MLOps EDA Pipeline
==================
Exploratory Data Analysis for training data validation and quality checks.
Produces artifacts for downstream pipeline stages.
"""

# =============================================================================
# IMPORTS AND CONFIGURATION
# =============================================================================
import pandas as pd
import numpy as np
import os
import json
import warnings
import datetime
import logging
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import matplotlib
matplotlib.use('Agg')  # Non-interactive backend to prevent memory leaks
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
try:
    import missingno as msno
    HAS_MISSINGNO = True
except ImportError:
    HAS_MISSINGNO = False

# Configure warnings and pandas display.
# NOTE(review): 'ignore' silences every warning category for the whole session,
# including deprecation notices from pandas/seaborn — confirm this is intended.
warnings.filterwarnings('ignore')
# Render floats with three decimals when frames/series are displayed.
pd.set_option('display.float_format', lambda x: "%.3f" % x)

# Plotting defaults applied to every figure created after this point.
sns.set_style("whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6), "figure.dpi": 120})

In [2]:

# =============================================================================
# LOGGING SETUP
# =============================================================================
def setup_logger(name: str = "eda_pipeline", level: int = logging.INFO) -> logging.Logger:
    """
    Return the named logger, attaching a console handler on first use.

    The level is applied on every call. The handler is attached only when the
    logger has none yet, so calling this repeatedly (for example by re-running
    the notebook cell) does not produce duplicate log lines.

    Args:
        name: Logger name
        level: Logging level applied to the logger

    Returns:
        Configured logger instance
    """
    pipeline_logger = logging.getLogger(name)
    pipeline_logger.setLevel(level)

    # Already configured by an earlier call: nothing more to attach.
    if pipeline_logger.handlers:
        return pipeline_logger

    console = logging.StreamHandler()
    console.setFormatter(
        logging.Formatter(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S',
        )
    )
    pipeline_logger.addHandler(console)
    return pipeline_logger

# Module-level logger ("eda_pipeline", INFO) used by the functions defined in the cells below.
logger = setup_logger()

# =============================================================================
# CONFIGURATION
# =============================================================================
class Config:
    """
    Configuration constants for the EDA pipeline.

    Paths are relative to the current working directory. The attributes are
    plain class attributes, so they can be overridden before the pipeline runs.
    """
    # Root folder for everything the pipeline writes.
    ARTIFACTS_DIR = Path("./artifacts")
    # Sub-folder that receives the saved figures.
    PLOTS_DIR = ARTIFACTS_DIR / "plots"
    # Input CSV read by the loading step.
    DATA_PATH = ARTIFACTS_DIR / "raw_data.csv"

    # Date range (can be parameterized)
    MAX_DATE = "2024-03-31"
    MIN_DATE = "2023-12-01"

    # EDA parameters
    MAX_HIST_COLS = 20  # Limit histograms to prevent memory issues
    MAX_CAT_TOP = 10     # Top categories to display

    @classmethod
    def setup_directories(cls):
        """
        Create the artifacts and plots directories if they do not exist.

        ``parents=True`` makes this work when ``ARTIFACTS_DIR`` points at a
        nested location whose parent folders are missing; ``exist_ok=True``
        keeps repeated calls harmless.
        """
        cls.ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
        cls.PLOTS_DIR.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created artifacts directories: {cls.ARTIFACTS_DIR}, {cls.PLOTS_DIR}")

In [3]:
!dvc pull

Collecting                                            |0.00 [00:00,    ?entry/s]
Fetching
![A
  0% Checking cache in '/home/ivett/Documents/code/MLOPs/project/itu-sdse-projec[A
                                                                                [A
![A
  0% Checking cache in 'https://github.com/Jeppe-T-K/itu-sdse-project-data/files[A
md5: 05663184eaf8f324162e4d1bc14be387
Fetching
Building workspace index                             |8.00 [00:00, 2.27kentry/s]
Comparing indexes                                    |9.00 [00:00, 1.41kentry/s]
Applying changes                                      |0.00 [00:00,     ?file/s]
Everything is up to date.
[0m

In [4]:
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def to_json_safe(obj):
    """
    Convert numpy/pandas types to JSON-serializable Python types.

    Containers (dict, list, tuple, set, frozenset, ndarray) are converted
    recursively, so nested numpy scalars or timestamps are handled too.
    Dictionary keys are always converted with ``str``. Anything that is not
    recognised falls back to its ``str()`` representation.

    Args:
        obj: Object to convert

    Returns:
        JSON-safe version of the object
    """
    if isinstance(obj, dict):
        return {str(k): to_json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [to_json_safe(v) for v in obj]
    if isinstance(obj, np.ndarray):
        # tolist() leaves the elements of object arrays untouched (e.g.
        # Timestamps), so the resulting list has to be converted as well.
        return to_json_safe(obj.tolist())
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, (pd.Timestamp, datetime.date, datetime.datetime)):
        return obj.isoformat()
    if isinstance(obj, np.generic):
        native = obj.item()
        # item() can hand back dates, complex numbers, or (for some extended
        # precision types) the numpy scalar itself; convert the former and
        # stringify the latter so the recursion always terminates.
        if isinstance(native, np.generic):
            return str(native)
        return to_json_safe(native)
    if isinstance(obj, (int, float, str, bool, type(None))):
        return obj
    return str(obj)

def save_plot(fig: plt.Figure, filename: str) -> None:
    """
    Save a matplotlib figure under ``Config.PLOTS_DIR`` and always close it.

    Saving is best-effort: any failure is logged as an error and not raised.
    The figure is closed in every case so figures do not accumulate in memory.

    Args:
        fig: Matplotlib figure object
        filename: Output filename, resolved relative to ``Config.PLOTS_DIR``
    """
    try:
        filepath = Config.PLOTS_DIR / filename
        # Create the target folder on demand so saving also works when
        # Config.setup_directories() has not been called in this session.
        filepath.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(filepath, bbox_inches='tight', dpi=120)
        logger.debug(f"Saved plot: {filename}")
    except Exception as e:
        logger.error(f"Failed to save plot {filename}: {e}")
    finally:
        plt.close(fig)  # Critical for memory management

def describe_numeric_col(x: pd.Series) -> pd.Series:
    """
    Summarise a numeric column.

    Args:
        x: Pandas Series (numeric)

    Returns:
        Series indexed by Count, Missing, Mean, Std, Min, 25%, 50%, 75%, Max
        (in that order)
    """
    summary = {
        "Count": x.count(),
        "Missing": x.isnull().sum(),
        "Mean": x.mean(),
        "Std": x.std(),
        "Min": x.min(),
    }
    # Quartiles sit between Min and Max in the resulting index.
    for label, q in (("25%", 0.25), ("50%", 0.50), ("75%", 0.75)):
        summary[label] = x.quantile(q)
    summary["Max"] = x.max()
    return pd.Series(summary)

def impute_missing_values(x: pd.Series, method: str = "mean") -> pd.Series:
    """
    Impute missing values in a column.

    Float columns of any width (``float16``/``float32``/``float64`` and the
    nullable pandas float dtypes) and NumPy integer columns are filled with
    the mean or median when requested. Every other case — non-numeric data,
    nullable integer columns (which cannot hold a fractional mean), or any
    other ``method`` value — is filled with the most frequent value.

    Args:
        x: Pandas Series
        method: Imputation method ("mean", "median", "mode")

    Returns:
        Series with imputed values; ``x`` itself is returned unchanged when
        no mode can be computed (for example an all-missing column)
    """
    dtype = x.dtype
    is_float = pd.api.types.is_float_dtype(dtype)
    is_numpy_int = isinstance(dtype, np.dtype) and dtype.kind in "iu"

    if is_float or is_numpy_int:
        if method == "mean":
            return x.fillna(x.mean())
        if method == "median":
            return x.fillna(x.median())

    # For categorical/other types (and method="mode"), use the mode.
    modes = x.mode()
    if not modes.empty:
        return x.fillna(modes.iloc[0])
    return x


In [5]:
# =============================================================================
# DATA LOADING AND PREPROCESSING
# =============================================================================
def load_and_filter_data(
    data_path: Path,
    min_date: str,
    max_date: str
) -> Tuple[pd.DataFrame, Dict[str, str]]:
    """
    Load data and apply date filtering.

    If the file is missing, ``dvc pull`` is attempted once. When a
    ``date_part`` column exists it is converted to ``datetime.date`` values and
    rows outside ``[min_date, max_date]`` (inclusive) are dropped.

    Args:
        data_path: Path to the CSV file
        min_date: Minimum date string
        max_date: Maximum date string; a falsy value means "now"

    Returns:
        Filtered dataframe and date limits dictionary. The limits are the
        actual min/max dates of the filtered rows; when no rows remain, or the
        data has no ``date_part`` column, the requested bounds are reported
        instead.

    Raises:
        FileNotFoundError: if the file is still missing after ``dvc pull``
    """
    logger.info(f"Loading training data from {data_path}")

    # Check if file exists
    if not data_path.exists():
        logger.warning(f"Data file not found at {data_path}. Attempting DVC pull...")
        exit_status = os.system("dvc pull")
        if exit_status != 0:
            logger.error(f"'dvc pull' exited with status {exit_status}")
        if not data_path.exists():
            # Fail with an explicit message instead of a bare read_csv error.
            raise FileNotFoundError(
                f"Data file {data_path} is still missing after 'dvc pull'"
            )

    data = pd.read_csv(data_path)
    logger.info(f"Loaded {len(data):,} rows, {len(data.columns)} columns")

    # Parse dates
    max_date_parsed = pd.to_datetime(max_date or datetime.datetime.now()).date()
    min_date_parsed = pd.to_datetime(min_date).date()

    if "date_part" not in data.columns:
        logger.warning("No 'date_part' column found; skipping date filtering")
        # Keep the caller's strings; fall back to the parsed default when
        # max_date was not given so the dictionary never contains None.
        return data, {"min_date": min_date, "max_date": max_date or str(max_date_parsed)}

    data["date_part"] = pd.to_datetime(data["date_part"]).dt.date
    in_range = (
        (data["date_part"] >= min_date_parsed) &
        (data["date_part"] <= max_date_parsed)
    )
    data = data[in_range].reset_index(drop=True)

    if data.empty:
        # min()/max() of an empty column are NaN, which would be written out
        # as the string "nan"; report the requested window instead.
        logger.warning(
            f"No rows between {min_date_parsed} and {max_date_parsed} after date filtering"
        )
        date_limits = {"min_date": str(min_date_parsed), "max_date": str(max_date_parsed)}
    else:
        actual_min = data["date_part"].min()
        actual_max = data["date_part"].max()
        logger.info(f"Date filtered: {actual_min} to {actual_max}")
        date_limits = {"min_date": str(actual_min), "max_date": str(actual_max)}

    logger.info(f"Total rows after filtering: {len(data):,}")
    return data, date_limits

In [6]:
# # =============================================================================
# # EDA ANALYSIS FUNCTIONS
# # =============================================================================
# def analyze_missing_values(data: pd.DataFrame) -> pd.Series:
#     """
#     Analyze and report missing values.
    
#     Args:
#         data: Input dataframe
        
#     Returns:
#         Series of columns with missing value counts
#     """
#     missing = data.isnull().sum()
#     missing = missing[missing > 0].sort_values(ascending=False)
    
#     if not missing.empty:
#         logger.info(f"Found {len(missing)} columns with missing values")
#         for col, count in missing.head(10).items():
#             pct = 100 * count / len(data)
#             logger.info(f"  {col}: {count:,} ({pct:.2f}%)")
#     else:
#         logger.info("No missing values detected ✅")
    
#     return missing

# def analyze_numeric_columns(data: pd.DataFrame) -> pd.DataFrame:
#     """
#     Comprehensive numeric column analysis.
    
#     Args:
#         data: Input dataframe
        
#     Returns:
#         Descriptive statistics dataframe
#     """
#     numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
#     logger.info(f"Analyzing {len(numeric_cols)} numeric columns")
    
#     if not numeric_cols:
#         logger.warning("No numeric columns found")
#         return pd.DataFrame()
    
#     # Descriptive statistics
#     num_descr = data[numeric_cols].describe().T
#     num_descr['missing'] = data[numeric_cols].isnull().sum()
#     num_descr = num_descr[['count', 'missing', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']]
    
#     # Save to CSV
#     output_path = Config.ARTIFACTS_DIR / "numeric_describe.csv"
#     num_descr.to_csv(output_path)
#     logger.info(f"Saved numeric summary to {output_path}")
    
#     return num_descr

# def plot_numeric_distributions(data: pd.DataFrame, max_cols: int = 20) -> None:
#     """
#     Generate histograms for numeric columns.
    
#     Args:
#         data: Input dataframe
#         max_cols: Maximum number of columns to plot
#     """
#     numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
#     cols_to_plot = numeric_cols[:max_cols]
    
#     logger.info(f"Generating histograms for {len(cols_to_plot)} numeric columns")
    
#     for col in cols_to_plot:
#         fig, ax = plt.subplots(figsize=(10, 6))
#         try:
#             data[col].dropna().hist(bins=50, edgecolor='black', ax=ax)
#             ax.set_title(f'Distribution of {col}')
#             ax.set_xlabel(col)
#             ax.set_ylabel('Frequency')
#             save_plot(fig, f"hist_{col}.png")
#         except Exception as e:
#             logger.error(f"Failed to plot histogram for {col}: {e}")
#             plt.close(fig)

# def plot_correlation_heatmap(data: pd.DataFrame) -> None:
#     """
#     Generate correlation heatmap for numeric columns.
    
#     Args:
#         data: Input dataframe
#     """
#     numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    
#     if len(numeric_cols) < 2:
#         logger.warning("Need at least 2 numeric columns for correlation heatmap")
#         return
    
#     logger.info("Generating correlation heatmap")
    
#     try:
#         corr = data[numeric_cols].corr()
        
#         fig, ax = plt.subplots(figsize=(min(14, 1 + len(numeric_cols)), 
#                                          min(12, 1 + len(numeric_cols))))
#         sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, 
#                     cbar_kws={'shrink': 0.8}, ax=ax)
#         ax.set_title('Correlation Heatmap', fontsize=16, pad=20)
#         save_plot(fig, "correlation_heatmap.png")
#     except Exception as e:
#         logger.error(f"Failed to create correlation heatmap: {e}")

# def analyze_categorical_columns(data: pd.DataFrame) -> List[Dict]:
#     """
#     Analyze categorical columns and save summary.
    
#     Args:
#         data: Input dataframe
        
#     Returns:
#         List of dictionaries with categorical summaries
#     """
#     cat_cols = data.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
#     logger.info(f"Analyzing {len(cat_cols)} categorical columns")
    
#     if not cat_cols:
#         logger.warning("No categorical columns found")
#         return []
    
#     cat_summary = []
#     for col in cat_cols:
#         top_values = data[col].value_counts(dropna=False).head(Config.MAX_CAT_TOP)
#         unique_count = data[col].nunique(dropna=True)
#         missing_count = data[col].isnull().sum()
        
#         cat_summary.append({
#             'column': col,
#             'unique': unique_count,
#             'missing': missing_count,
#             'top_values': {str(k): int(v) for k, v in top_values.to_dict().items()}
#         })
        
#         logger.info(f"  {col}: {unique_count} unique values, {missing_count} missing")
    
#     # Save summary
#     cat_summary_safe = to_json_safe(cat_summary)
#     output_path = Config.ARTIFACTS_DIR / "categorical_summary.json"
#     with open(output_path, 'w') as f:
#         json.dump(cat_summary_safe, f, indent=2)
#     logger.info(f"Saved categorical summary to {output_path}")
    
#     return cat_summary

# def plot_categorical_distributions(data: pd.DataFrame) -> None:
#     """
#     Generate bar plots for categorical columns.
    
#     Args:
#         data: Input dataframe
#     """
#     cat_cols = data.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
#     logger.info(f"Generating bar plots for {len(cat_cols)} categorical columns")
    
#     for col in cat_cols:
#         fig, ax = plt.subplots(figsize=(10, 6))
#         try:
#             top = data[col].value_counts(dropna=False).nlargest(Config.MAX_CAT_TOP)
#             top.plot(kind='barh', ax=ax)
#             ax.set_title(f'Top {Config.MAX_CAT_TOP} Categories in {col}')
#             ax.set_xlabel('Count')
#             ax.set_ylabel('Category')
#             save_plot(fig, f"cat_{col}.png")
#         except Exception as e:
#             logger.error(f"Failed to plot categories for {col}: {e}")
#             plt.close(fig)

# def plot_missingness(data: pd.DataFrame) -> None:
#     """
#     Visualize missing data patterns.
    
#     Args:
#         data: Input dataframe
#     """
#     logger.info("Generating missingness visualizations")
    
#     if not HAS_MISSINGNO:
#         logger.warning("missingno library not available, using fallback")
#         missing = data.isnull().sum()
#         missing = missing[missing > 0].sort_values(ascending=False)
        
#         if not missing.empty:
#             fig, ax = plt.subplots(figsize=(10, 6))
#             missing.plot(kind='barh', ax=ax)
#             ax.set_title('Missing Value Counts')
#             ax.set_xlabel('Count')
#             save_plot(fig, "missing_counts.png")
#         return
    
#     try:
#         # Missingness matrix
#         fig = msno.matrix(data, figsize=(12, 6)).get_figure()
#         fig.suptitle('Missingness Matrix')
#         save_plot(fig, "missingness_matrix.png")
        
#         # Missingness heatmap
#         fig = msno.heatmap(data, figsize=(10, 8)).get_figure()
#         fig.suptitle('Missingness Correlation')
#         save_plot(fig, "missingness_heatmap.png")
#     except Exception as e:
#         logger.error(f"missingno visualization failed: {e}")

# def analyze_time_series(data: pd.DataFrame) -> None:
#     """
#     Analyze temporal patterns if date column exists.
    
#     Args:
#         data: Input dataframe
#     """
#     if 'date_part' not in data.columns:
#         logger.info("No 'date_part' column found; skipping time series analysis")
#         return
    
#     logger.info("Analyzing time series patterns")
    
#     try:
#         daily_counts = data.groupby('date_part').size()
        
#         fig, ax = plt.subplots(figsize=(12, 6))
#         daily_counts.plot(ax=ax, marker='o')
#         ax.set_title('Rows Over Time')
#         ax.set_xlabel('Date')
#         ax.set_ylabel('Row Count')
#         ax.grid(True, alpha=0.3)
#         save_plot(fig, "time_series.png")
#     except Exception as e:
#         logger.error(f"Time series analysis failed: {e}")

# def identify_target_candidates(data: pd.DataFrame) -> List[str]:
#     """
#     Identify potential target columns based on naming patterns.
    
#     Args:
#         data: Input dataframe
        
#     Returns:
#         List of potential target column names
#     """
#     target_keywords = ['target', 'label', 'y', 'lead', 'outcome', 'prediction']
#     candidates = [
#         col for col in data.columns 
#         if any(keyword in col.lower() for keyword in target_keywords)
#     ]
    
#     if candidates:
#         logger.info(f"Identified target candidates: {candidates}")
#     else:
#         logger.info("No obvious target columns found")
#         # Show top variance columns as fallback
#         numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
#         if len(numeric_cols) > 0:
#             variances = data[numeric_cols].var().sort_values(ascending=False).head(5)
#             logger.info(f"Top variance columns:\n{variances}")
    
#     return candidates

# # =============================================================================
# # MAIN PIPELINE
# # =============================================================================
# def run_eda_pipeline():
#     """Execute complete EDA pipeline."""
#     logger.info("="*70)
#     logger.info("Starting EDA Pipeline")
#     logger.info("="*70)
    
#     # Setup
#     Config.setup_directories()
    
#     # Load data
#     data, date_limits = load_and_filter_data(
#         Config.DATA_PATH,
#         Config.MIN_DATE,
#         Config.MAX_DATE
#     )
    
#     # Save date limits
#     with open(Config.ARTIFACTS_DIR / "date_limits.json", "w") as f:
#         json.dump(date_limits, f, indent=2)
    
#     # Display basic info
#     logger.info(f"Data shape: {data.shape}")
#     logger.info(f"Columns: {list(data.columns)}")
    
#     # Missing values analysis
#     missing = analyze_missing_values(data)
    
#     # Numeric analysis
#     logger.info("\n" + "="*70)
#     logger.info("Numeric Column Analysis")
#     logger.info("="*70)
#     num_stats = analyze_numeric_columns(data)
#     if not num_stats.empty:
#         plot_numeric_distributions(data, Config.MAX_HIST_COLS)
#         plot_correlation_heatmap(data)
    
#     # Categorical analysis
#     logger.info("\n" + "="*70)
#     logger.info("Categorical Column Analysis")
#     logger.info("="*70)
#     cat_summary = analyze_categorical_columns(data)
#     if cat_summary:
#         plot_categorical_distributions(data)
    
#     # Missingness visualization
#     logger.info("\n" + "="*70)
#     logger.info("Missingness Analysis")
#     logger.info("="*70)
#     plot_missingness(data)
    
#     # Time series analysis
#     logger.info("\n" + "="*70)
#     logger.info("Time Series Analysis")
#     logger.info("="*70)
#     analyze_time_series(data)
    
#     # Target identification
#     logger.info("\n" + "="*70)
#     logger.info("Target Column Identification")
#     logger.info("="*70)
#     target_candidates = identify_target_candidates(data)
    
#     logger.info("\n" + "="*70)
#     logger.info("EDA Pipeline Complete ✅")
#     logger.info(f"Artifacts saved to: {Config.ARTIFACTS_DIR}")
#     logger.info(f"Plots saved to: {Config.PLOTS_DIR}")
#     logger.info("="*70)

In [7]:
# v2 Data analysis 
# =============================================================================
# MODERN MLOPS EDA VISUALS: ONLY THE ESSENTIALS
# =============================================================================

def plot_missingness(data: pd.DataFrame) -> None:
    """
    Visualize missing data patterns.

    Saves a missingno matrix (``missingness_matrix.png``) when the optional
    ``missingno`` package was imported successfully. If it is unavailable or
    fails, a horizontal bar chart of per-column missing counts
    (``missing_counts.png``) is saved instead; nothing is plotted when no
    column has missing values.

    Args:
        data: Input dataframe
    """
    logger.info("Generating missingness visualizations")

    if HAS_MISSINGNO:
        try:
            fig = msno.matrix(data, figsize=(12, 6)).get_figure()
            fig.suptitle('Missingness Matrix')
            save_plot(fig, "missingness_matrix.png")
            return
        except Exception as e:
            logger.warning("missingno failed, using bar fallback: %s", e)
    else:
        # Check the import flag explicitly instead of relying on a NameError
        # from the undefined `msno` being caught above.
        logger.warning("missingno is not installed, using bar fallback")

    missing = data.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    if missing.empty:
        logger.info("No missing values detected ✅")
        return

    fig, ax = plt.subplots(figsize=(10, 6))
    missing.plot(kind='barh', ax=ax)
    ax.set_title('Missing Value Counts')
    ax.set_xlabel('Count')
    save_plot(fig, "missing_counts.png")


def plot_correlation_heatmap(data: pd.DataFrame) -> None:
    """
    Save a correlation heatmap of the numeric columns (``correlation_heatmap.png``).

    Only ``int64``/``float64`` columns are considered, and columns whose
    lower-cased name contains ``id`` are skipped. Nothing is plotted when
    fewer than two columns remain. Failures are logged, not raised.

    Args:
        data: Input dataframe
    """
    # NOTE(review): this is a substring match, so names such as "paid" or
    # "width" are excluded as well — confirm that is intended.
    numeric_cols = [
        c for c in data.select_dtypes(include=['int64', 'float64']).columns
        if 'id' not in c.lower()
    ]
    if len(numeric_cols) < 2:
        logger.info("Not enough numeric columns for correlation heatmap")
        return

    fig = None
    try:
        corr = data[numeric_cols].corr()
        side = 1 + len(numeric_cols)
        fig, ax = plt.subplots(figsize=(min(14, side), min(12, side)))
        sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=ax)
        ax.set_title('Correlation Heatmap')
        save_plot(fig, "correlation_heatmap.png")
    except Exception as e:
        logger.error("Failed to create correlation heatmap: %s", e)
        # The figure is only closed inside save_plot; close it here when the
        # failure happened before that call so it does not stay in memory.
        if fig is not None:
            plt.close(fig)


def plot_target_distribution(data: pd.DataFrame, target_col: str) -> None:
    """
    Plot the distribution of the target/label column.

    Object, category and bool columns get a bar chart of value counts
    (missing values included); every other dtype gets a 25-bin histogram.
    The figure is saved as ``target_<target_col>_dist.png``. A missing column
    is logged and skipped; failures are logged, not raised.

    Args:
        data: Input dataframe
        target_col: Name of the target column
    """
    if target_col not in data.columns:
        logger.warning(f"{target_col} not in columns, skipping target plot.")
        return

    fig = None
    try:
        fig, ax = plt.subplots(figsize=(8, 5))
        if data[target_col].dtype in ['object', 'category', 'bool']:
            data[target_col].value_counts(dropna=False).plot(kind='bar', ax=ax)
            ax.set_title(f"Class Distribution: {target_col}")
        else:
            data[target_col].hist(bins=25, ax=ax)
            ax.set_title(f"Distribution: {target_col}")
        save_plot(fig, f"target_{target_col}_dist.png")
    except Exception as e:
        logger.error(f"Could not plot target {target_col}: {e}")
        # The figure is only closed inside save_plot; close it here when the
        # failure happened before that call so it does not stay in memory.
        if fig is not None:
            plt.close(fig)


def plot_key_feature_distributions(data: pd.DataFrame, key_features: list) -> None:
    """
    Plot distribution for selected, relevant features only (not all columns).

    Object, category and bool columns get a horizontal bar chart of their
    ``Config.MAX_CAT_TOP`` most frequent values (missing values included);
    every other dtype gets a 25-bin histogram. Each figure is saved as
    ``<col>_dist.png``. Columns absent from ``data`` are logged and skipped;
    a failure on one column is logged and does not stop the remaining ones.

    Args:
        data: Input dataframe
        key_features: Column names to plot
    """
    for col in key_features:
        if col not in data.columns:
            logger.warning(f"{col} not in columns, skipping feature plot.")
            continue

        fig = None
        try:
            fig, ax = plt.subplots(figsize=(8, 5))
            if data[col].dtype in ['object', 'category', 'bool']:
                top_values = data[col].value_counts(dropna=False).head(Config.MAX_CAT_TOP)
                top_values.plot(kind='barh', ax=ax)
                ax.set_title(f'Top Categories in {col}')
            else:
                data[col].hist(bins=25, ax=ax)
                ax.set_title(f'Distribution of {col}')
            save_plot(fig, f"{col}_dist.png")
        except Exception as e:
            logger.error(f"Could not plot feature {col}: {e}")
            # The figure is only closed inside save_plot; close it here when
            # the failure happened before that call so figures do not pile up
            # across loop iterations.
            if fig is not None:
                plt.close(fig)


def plot_time_series_row_count(data: pd.DataFrame) -> None:
    """
    Plot the number of rows per ``date_part`` value (``time_series_row_count.png``).

    Skipped (with a log message) when the dataframe has no ``date_part``
    column. Failures are logged, not raised.

    Args:
        data: Input dataframe
    """
    if 'date_part' not in data.columns:
        logger.info("No 'date_part' column found; skipping time series row count plot")
        return

    fig = None
    try:
        fig, ax = plt.subplots(figsize=(12, 6))
        data.groupby('date_part').size().plot(ax=ax, marker='o')
        ax.set_title('Rows Over Time')
        ax.set_ylabel('Row Count')
        ax.set_xlabel('Date')
        save_plot(fig, "time_series_row_count.png")
    except Exception as e:
        logger.error("Could not plot time series row count: %s", e)
        # The figure is only closed inside save_plot; close it here when the
        # failure happened before that call so it does not stay in memory.
        if fig is not None:
            plt.close(fig)
            
# =====================================================================
# MINIMAL PIPELINE: runs only the essential plotting functions defined above
# =====================================================================
def run_eda_pipeline():
    """
    Run the minimal EDA pipeline end to end.

    Creates the output directories, loads and date-filters the data, writes
    ``date_limits.json``, logs the missing-value report and then produces the
    essential plots.

    Note: a later cell (In [8]) defines ``run_eda_pipeline`` again; when the
    notebook is run top to bottom, that later definition is the one the
    execution cell calls.
    """
    rule = "=" * 70

    logger.info(rule)
    logger.info("Starting Minimal EDA Pipeline For MLOps")
    logger.info(rule)

    Config.setup_directories()
    frame, limits = load_and_filter_data(Config.DATA_PATH, Config.MIN_DATE, Config.MAX_DATE)
    limits_file = Config.ARTIFACTS_DIR / "date_limits.json"
    with open(limits_file, "w") as out:
        json.dump(limits, out, indent=2)

    logger.info(f"Data shape: {frame.shape}")
    logger.info(f"Columns: {list(frame.columns)}")
    analyze_missing_values(frame)

    # === ESSENTIAL VISUALS ONLY ===
    plot_missingness(frame)
    plot_correlation_heatmap(frame)
    plot_target_distribution(frame, target_col='lead_indicator')  # or your actual target
    # Pass the most important features for your use case here:
    plot_key_feature_distributions(
        frame,
        key_features=['customer_group', 'time_spent', 'n_visits', 'purchases'],
    )
    plot_time_series_row_count(frame)

    logger.info(rule)
    logger.info("Minimal EDA Pipeline Complete ✅")
    logger.info(f"Artifacts saved to: {Config.ARTIFACTS_DIR / 'plots'}")
    logger.info(rule)
    

def analyze_missing_values(data: pd.DataFrame) -> pd.Series:
    """
    Count missing values per column and log a short report.

    At most the ten columns with the most missing values are logged, each
    with its count and its share of all rows.

    Args:
        data: Input dataframe

    Returns:
        Missing-value counts for the columns that have any, sorted from most
        to fewest (empty when nothing is missing)
    """
    null_counts = data.isnull().sum()
    missing = null_counts[null_counts > 0].sort_values(ascending=False)

    if missing.empty:
        logger.info("No missing values detected ✅")
        return missing

    logger.info(f"Found {len(missing)} columns with missing values")
    for col, count in missing.head(10).items():
        pct = 100 * count / len(data)
        logger.info(f"  {col}: {count:,} ({pct:.2f}%)")

    return missing




In [8]:
# =====================================================================
# MINIMAL PIPELINE (this cell redefines run_eda_pipeline from the previous cell)
# =====================================================================
def run_eda_pipeline(
    target_col: str = 'lead_indicator',
    key_features: Optional[List[str]] = None,
) -> None:
    """
    Run the minimal EDA pipeline end to end.

    Creates the output directories, loads and date-filters the data, writes
    ``date_limits.json``, logs the missing-value report and then produces the
    essential plots. An earlier cell (In [7]) also defines
    ``run_eda_pipeline``; this later definition replaces it.

    Args:
        target_col: Target/label column to plot
        key_features: Feature columns to plot; defaults to
            ``['customer_group', 'time_spent', 'n_visits', 'purchases']``
    """
    if key_features is None:
        key_features = ['customer_group', 'time_spent', 'n_visits', 'purchases']

    logger.info("="*70)
    logger.info("Starting Minimal EDA Pipeline For MLOps")
    logger.info("="*70)

    Config.setup_directories()
    data, date_limits = load_and_filter_data(Config.DATA_PATH, Config.MIN_DATE, Config.MAX_DATE)
    with open(Config.ARTIFACTS_DIR / "date_limits.json", "w", encoding="utf-8") as f:
        json.dump(date_limits, f, indent=2)

    logger.info(f"Data shape: {data.shape}")
    logger.info(f"Columns: {list(data.columns)}")
    # Called for its log report; the returned counts are not needed here.
    analyze_missing_values(data)

    # === ESSENTIAL VISUALS ONLY ===
    plot_missingness(data)
    plot_correlation_heatmap(data)
    plot_target_distribution(data, target_col=target_col)
    plot_key_feature_distributions(data, key_features=key_features)
    plot_time_series_row_count(data)

    logger.info("="*70)
    logger.info("Minimal EDA Pipeline Complete ✅")
    logger.info(f"Artifacts saved to: {Config.PLOTS_DIR}")
    logger.info("="*70)
    

In [9]:
# =============================================================================
# EXECUTION
# =============================================================================
# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    run_eda_pipeline()


2025-10-23 13:43:47 - eda_pipeline - INFO - Starting Minimal EDA Pipeline For MLOps
2025-10-23 13:43:47 - eda_pipeline - INFO - Created artifacts directories: artifacts, artifacts/plots
2025-10-23 13:43:47 - eda_pipeline - INFO - Loading training data from artifacts/raw_data.csv
2025-10-23 13:43:47 - eda_pipeline - INFO - Loaded 12,345 rows, 19 columns
2025-10-23 13:43:47 - eda_pipeline - INFO - Date filtered: 2024-01-01 to 2024-01-31
2025-10-23 13:43:47 - eda_pipeline - INFO - Total rows after filtering: 12,345
2025-10-23 13:43:47 - eda_pipeline - INFO - Data shape: (12345, 19)
2025-10-23 13:43:47 - eda_pipeline - INFO - Columns: ['lead_id', 'lead_indicator', 'date_part', 'is_active', 'marketing_consent', 'first_booking', 'existing_customer', 'last_seen', 'source', 'domain', 'country', 'visited_learn_more_before_booking', 'visited_faq', 'purchases', 'time_spent', 'customer_group', 'onboarding', 'customer_code', 'n_visits']
2025-10-23 13:43:47 - eda_pipeline - INFO - Found 2 columns wi

2025-10-23 13:43:49 - eda_pipeline - INFO - Minimal EDA Pipeline Complete ✅
2025-10-23 13:43:49 - eda_pipeline - INFO - Artifacts saved to: artifacts/plots
