# Vehicle Collision Prediction - EDA Report

This notebook performs exploratory data analysis on vehicle telematics data to identify patterns and characteristics relevant to collision prediction modeling.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Read the training set from the session's data directory
train_csv_path = '/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/data/train_set.csv'
df = pd.read_csv(train_csv_path)

# Report the (rows, columns) shape, then preview the leading rows as the cell output
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("\nFirst few rows:")
df.head()

## Dataset Structure and Basic Information

In [None]:
# Structural overview: dtypes and non-null counts, the column names, then
# descriptive statistics (left as the final expression so the notebook renders it)
section_rule = "\n" + "=" * 50

print("Dataset Info:")
df.info()

print(section_rule)
print("Column names:")
column_names = df.columns.tolist()
print(column_names)

print(section_rule)
print("Basic statistics:")
df.describe()

## Data Types and Missing Values Analysis

In [None]:
# Per-column overview of dtype, missingness and cardinality.
# The null counts are computed once and reused for both the count and the
# percentage columns; all Series below are indexed by column name, so they
# align with each other when the frame is assembled.
null_counts = df.isnull().sum()
missing_info = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df)) * 100,
    'Unique_Values': df.nunique(),
})

print("Missing Values Analysis:")
print(missing_info)

# Identify categorical vs numerical columns.
# Besides plain object columns, boolean, pandas-categorical and dedicated
# string-dtype columns are also treated as categorical; selecting only
# 'object' would leave such columns in neither list and silently exclude them
# from every later analysis step.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(
    include=['object', 'string', 'category', 'bool']
).columns.tolist()

print(f"\nNumerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

## Target Variable Analysis

In [None]:
# Resolve the target column: prefer 'collisions'; otherwise take the first
# column whose name mentions 'collision' or 'target'; failing that, fall back
# to the last column of the frame.
if 'collisions' in df.columns:
    target_col = 'collisions'
else:
    target_col = next(
        (
            name for name in df.columns
            if 'collision' in name.lower() or 'target' in name.lower()
        ),
        df.columns[-1],
    )

print(f"Target variable: {target_col}")
target_series = df[target_col]
target_dist = target_series.value_counts().sort_index()
target_pct = target_series.value_counts(normalize=True).sort_index() * 100

print("\nTarget distribution:")
for val, count in target_dist.items():
    print(f"{val}: {count} ({target_pct[val]:.2f}%)")

# Shared colour palette (reused by the later plotting cells), built from RGB
# triples at a fixed 0.8 opacity.
_palette_rgb = [
    (99, 110, 250),   # Blue
    (239, 85, 59),    # Red/Orange-Red
    (0, 204, 150),    # Green
    (171, 99, 250),   # Purple
    (255, 161, 90),   # Orange
    (25, 211, 243),   # Cyan
    (255, 102, 146),  # Pink
    (182, 232, 128),  # Light Green
    (255, 151, 255),  # Magenta
    (254, 203, 82),   # Yellow
]
app_color_palette = [f'rgba({r}, {g}, {b}, 0.8)' for r, g, b in _palette_rgb]


def _target_axis_theme():
    """Return a fresh axis-style dict for the target distribution chart."""
    return dict(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12),
    )


# Bar chart of how often each target value occurs, with counts printed above the bars
target_counts = target_dist.values
fig_target = px.bar(
    x=target_dist.index,
    y=target_counts,
    labels={'x': 'Collision Count', 'y': 'Frequency'},
    text=target_counts,
)

fig_target.update_traces(
    marker={'color': app_color_palette[0]},
    texttemplate='%{text}',
    textposition='outside',
)

fig_target.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=_target_axis_theme(),
    yaxis=_target_axis_theme(),
    legend=dict(font=dict(color='#8B5CF6', size=11)),
)

# Persist a standalone HTML copy of the chart, then render it inline
target_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/target_distribution.html"
fig_target.write_html(
    target_plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)

fig_target.show()

## Missing Values Visualization

In [None]:
# Visualise missingness: a bar per column that has nulls, or, when the data
# is complete, a placeholder chart of record counts for the first 10 columns.
missing_data = df.isnull().sum()
missing_cols = missing_data[missing_data > 0]
has_missing = len(missing_cols) > 0

if has_missing:
    fig_missing = px.bar(
        x=missing_cols.index,
        y=missing_cols.values,
        labels={'x': 'Columns', 'y': 'Missing Values Count'},
        text=missing_cols.values,
    )
    fig_missing.update_traces(
        marker={'color': app_color_palette[1]},
        texttemplate='%{text}',
        textposition='outside',
    )
else:
    print("No missing values found in the dataset.")
    # Every sampled column gets the same bar height: the total row count
    complete_data = pd.Series(len(df), index=df.columns[:10])  # Show first 10 columns
    fig_missing = px.bar(
        x=complete_data.index,
        y=complete_data.values,
        labels={'x': 'Columns (Sample)', 'y': 'Complete Records Count'},
    )
    fig_missing.update_traces(marker={'color': app_color_palette[2]})


def _missing_axis_theme():
    """Return a fresh axis-style dict for the missing-values chart."""
    return dict(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12),
    )


missing_layout = dict(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=_missing_axis_theme(),
    yaxis=_missing_axis_theme(),
)
if has_missing:
    # Only the chart of actual missing values carries legend styling
    missing_layout['legend'] = dict(font=dict(color='#8B5CF6', size=11))

fig_missing.update_layout(**missing_layout)

# Both variants are saved to the same file and then rendered inline
missing_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/missing_values.html"
fig_missing.write_html(
    missing_plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)

fig_missing.show()

## Numerical Features Distribution Analysis

In [None]:
# Numerical predictors are every numeric column except the target
num_features = [name for name in numerical_cols if name != target_col]

if not num_features:
    print("No numerical features found for analysis.")
else:
    print(f"Analyzing {len(num_features)} numerical features")

    # Rank by variance and keep at most eight features for plotting
    feature_variance = df[num_features].var().sort_values(ascending=False)
    top_features = feature_variance.head(8).index.tolist()

    print(f"Top features by variance: {top_features}")

    # One histogram per selected feature, laid out on a 2 x 4 grid
    grid_cols = 4
    fig_dist = make_subplots(
        rows=2, cols=grid_cols,
        subplot_titles=top_features,
        vertical_spacing=0.08,
        horizontal_spacing=0.06,
    )

    palette_size = len(app_color_palette)
    for position, feature in enumerate(top_features):
        # Fill the grid left-to-right, top-to-bottom (plotly cells are 1-based)
        row_offset, col_offset = divmod(position, grid_cols)
        histogram = go.Histogram(
            x=df[feature],
            name=feature,
            marker_color=app_color_palette[position % palette_size],
            showlegend=False,
        )
        fig_dist.add_trace(histogram, row=row_offset + 1, col=col_offset + 1)

    fig_dist.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=14),
    )

    # The same grid and tick styling is applied to every subplot axis
    for restyle_axes in (fig_dist.update_xaxes, fig_dist.update_yaxes):
        restyle_axes(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=9),
        )

    distributions_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/numerical_distributions.html"
    fig_dist.write_html(
        distributions_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False},
    )

    fig_dist.show()

    # Descriptive statistics for the plotted features
    print("\nSummary statistics for key numerical features:")
    print(df[top_features].describe())

## Categorical Features Analysis

In [None]:
# Categorical overview. Three outcomes, all writing to the same HTML file:
#   * a low-cardinality feature exists -> bar chart of its most frequent values
#   * only high-cardinality features   -> bar chart of unique-value counts
#   * no categorical features at all   -> bare placeholder chart
categorical_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/categorical_distribution.html"
categorical_html_config = {'responsive': True, 'displayModeBar': False}

if not categorical_cols:
    print("No categorical features found.")
    # Create placeholder
    fig_cat = px.bar(x=['No categorical features'], y=[0])
    fig_cat.write_html(
        categorical_plot_path,
        include_plotlyjs=True,
        config=categorical_html_config,
    )
else:
    print(f"Analyzing {len(categorical_cols)} categorical features")

    # Only features with <= 20 unique values are suitable for plotting
    cat_features_analysis = [
        name for name in categorical_cols if df[name].nunique() <= 20
    ]

    if cat_features_analysis:
        # The first suitable feature gets the detailed chart (top 10 values)
        feature_to_analyze = cat_features_analysis[0]
        value_counts = df[feature_to_analyze].value_counts().head(10)

        fig_cat = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            labels={'x': f'{feature_to_analyze}', 'y': 'Count'},
            text=value_counts.values,
        )
        fig_cat.update_traces(
            marker={'color': app_color_palette[3]},
            texttemplate='%{text}',
            textposition='outside',
        )
    else:
        print("No suitable categorical features found for visualization (all have >20 unique values)")
        # Fall back to charting how many distinct values each feature has
        unique_counts = pd.Series(
            [df[name].nunique() for name in categorical_cols],
            index=categorical_cols,
        )

        fig_cat = px.bar(
            x=unique_counts.index,
            y=unique_counts.values,
            labels={'x': 'Categorical Features', 'y': 'Unique Value Count'},
        )
        fig_cat.update_traces(marker={'color': app_color_palette[3]})

    def _categorical_axis_theme(**extra):
        """Return a fresh axis-style dict, optionally extended with extra keys."""
        return dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12),
            **extra,
        )

    # Both chart variants share the same theme; x tick labels are slanted
    fig_cat.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=_categorical_axis_theme(tickangle=45),
        yaxis=_categorical_axis_theme(),
    )

    fig_cat.write_html(
        categorical_plot_path,
        include_plotlyjs=True,
        config=categorical_html_config,
    )

    fig_cat.show()

    # Text summary for every plottable feature: the full value counts when
    # there are at most 10 distinct values, otherwise only the five most frequent
    for col in cat_features_analysis:
        unique_vals = df[col].nunique()
        print(f"\n{col}: {unique_vals} unique values")
        col_counts = df[col].value_counts()
        if unique_vals > 10:
            print(f"Top 5 values:\n{col_counts.head(5)}")
        else:
            print(col_counts.head(10))

## Feature Correlation Analysis

In [None]:
# Correlation analysis for numerical features.
# Writes a heatmap of (at most) ten columns to feature_correlations.html; when
# fewer than two numerical features exist, a placeholder chart is written instead.
correlation_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/feature_correlations.html"

if len(num_features) > 1:
    # Include the target in the matrix only when it is numerical
    target_is_numeric = target_col in numerical_cols
    corr_columns = num_features + [target_col] if target_is_numeric else list(num_features)
    correlation_matrix = df[corr_columns].corr()

    if target_is_numeric:
        # Rank features by absolute correlation with the target. The target's
        # own entry (always 1.0) is dropped by label so it is not reported as
        # its own "most correlated feature".
        target_corr = (
            correlation_matrix[target_col]
            .drop(target_col)
            .abs()
            .sort_values(ascending=False)
        )
        print(f"Features most correlated with {target_col}:")
        print(target_corr.head(10))

        # Heatmap shows the target plus its nine strongest features (10 columns)
        top_corr_features = [target_col] + target_corr.head(9).index.tolist()
    else:
        # If target is not numerical, show correlation among the first ten features
        top_corr_features = num_features[:10]

    # Slice the matrix already computed instead of recomputing it from the
    # data; pairwise correlations do not depend on which other columns are present.
    corr_subset = correlation_matrix.loc[top_corr_features, top_corr_features]

    fig_corr = px.imshow(
        corr_subset,
        text_auto='.2f',
        aspect='auto',
        color_continuous_scale='RdBu_r',
        labels={'x': 'Features', 'y': 'Features', 'color': 'Correlation'}
    )

    fig_corr.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=dict(
            tickfont=dict(color='#8B5CF6', size=9),
            title_font=dict(color='#7C3AED', size=12)
        ),
        yaxis=dict(
            tickfont=dict(color='#8B5CF6', size=9),
            title_font=dict(color='#7C3AED', size=12)
        )
    )

    fig_corr.write_html(
        correlation_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False}
    )

    fig_corr.show()

else:
    print("Insufficient numerical features for correlation analysis.")
    # Create placeholder so the output file always exists
    fig_corr = px.bar(x=['Insufficient features'], y=[0])
    fig_corr.write_html(
        correlation_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False}
    )

## Outlier Detection

In [None]:
# Outlier analysis using the IQR rule: a value is an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
outlier_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/outlier_analysis.html"

if not num_features:
    print("No numerical features available for outlier analysis.")
    # Create placeholder
    fig_outliers = px.bar(x=['No numerical features'], y=[0])
    fig_outliers.write_html(
        outlier_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False},
    )
else:
    def _iqr_outlier_row(feature):
        """Return the outlier count and percentage of rows for one feature."""
        values = df[feature]
        first_q = values.quantile(0.25)
        third_q = values.quantile(0.75)
        spread = third_q - first_q
        low_fence = first_q - 1.5 * spread
        high_fence = third_q + 1.5 * spread

        # Count the flagged rows directly from the boolean mask
        flagged = (values < low_fence) | (values > high_fence)
        n_flagged = int(flagged.sum())
        return {
            'Feature': feature,
            'Outlier_Count': n_flagged,
            'Outlier_Percentage': (n_flagged / len(df)) * 100,
        }

    outlier_summary = [_iqr_outlier_row(feature) for feature in num_features]

    outlier_df = pd.DataFrame(outlier_summary).sort_values(
        'Outlier_Count', ascending=False
    )

    print("Outlier Analysis (Top 10 features by outlier count):")
    print(outlier_df.head(10))

    # Chart the eight features with the most outliers
    top_outlier_features = outlier_df.head(8)

    fig_outliers = px.bar(
        top_outlier_features,
        x='Feature',
        y='Outlier_Count',
        labels={'Feature': 'Features', 'Outlier_Count': 'Number of Outliers'},
        text='Outlier_Count',
    )

    fig_outliers.update_traces(
        marker={'color': app_color_palette[4]},
        texttemplate='%{text}',
        textposition='outside',
    )

    def _outlier_axis_theme(**extra):
        """Return a fresh axis-style dict, optionally extended with extra keys."""
        return dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12),
            **extra,
        )

    fig_outliers.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=_outlier_axis_theme(tickangle=45),
        yaxis=_outlier_axis_theme(),
    )

    fig_outliers.write_html(
        outlier_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False},
    )

    fig_outliers.show()

## Target vs Features Analysis

In [None]:
# Analyze relationship between target and key features.
# Produces a box plot of the single strongest feature grouped by target value
# and writes it to target_feature_relationship.html; a placeholder chart is
# written when no usable feature is available.
relationship_plot_path = "/Users/yuvalheffetz/ds-agent-projects/session_5feb6ac6-f292-4d0c-9e41-ab6b3ffc14d6/research/plots/target_feature_relationship.html"

if len(num_features) > 0:
    # Select top 3 features most correlated with target for detailed analysis
    if target_col in numerical_cols:
        # Remove the target by label rather than assuming it sorts first:
        # a feature tied at |r| = 1, or NaN correlations, can change the
        # position of the target's own entry. Features whose correlation is
        # undefined (NaN) are discarded as well, since they cannot be ranked.
        target_correlations = (
            df[num_features + [target_col]]
            .corr()[target_col]
            .drop(target_col)
            .abs()
            .dropna()
            .sort_values(ascending=False)
        )
        top_3_features = target_correlations.head(3).index.tolist()
    else:
        # For categorical target, use features with highest variance
        feature_variance = df[num_features].var().sort_values(ascending=False)
        top_3_features = feature_variance.head(3).index.tolist()

    print(f"Analyzing relationship between target '{target_col}' and features: {top_3_features}")

    # Create box plots showing feature distribution by target class
    if len(top_3_features) >= 1:
        feature_for_analysis = top_3_features[0]

        fig_target_feature = px.box(
            df,
            x=target_col,
            y=feature_for_analysis,
            labels={target_col: 'Collision Count', feature_for_analysis: f'{feature_for_analysis}'}
        )

        fig_target_feature.update_traces(marker=dict(color=app_color_palette[5]))

        fig_target_feature.update_layout(
            height=550,
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font=dict(color='#8B5CF6', size=12),
            title_font=dict(color='#7C3AED', size=16),
            xaxis=dict(
                gridcolor='rgba(139,92,246,0.2)',
                zerolinecolor='rgba(139,92,246,0.3)',
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            ),
            yaxis=dict(
                gridcolor='rgba(139,92,246,0.2)',
                zerolinecolor='rgba(139,92,246,0.3)',
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            )
        )

        fig_target_feature.write_html(
            relationship_plot_path,
            include_plotlyjs=True,
            config={'responsive': True, 'displayModeBar': False}
        )

        fig_target_feature.show()

        # Print statistical summary by target class
        print(f"\nStatistical summary of '{feature_for_analysis}' by collision count:")
        print(df.groupby(target_col)[feature_for_analysis].describe())

    else:
        # Reached when every feature's correlation with the target is undefined
        print("No suitable features found for target-feature analysis.")
        # Create placeholder
        fig_target_feature = px.bar(x=['No suitable features'], y=[0])
        fig_target_feature.write_html(
            relationship_plot_path,
            include_plotlyjs=True,
            config={'responsive': True, 'displayModeBar': False}
        )

else:
    print("No numerical features available for target-feature analysis.")
    # Create placeholder
    fig_target_feature = px.bar(x=['No numerical features'], y=[0])
    fig_target_feature.write_html(
        relationship_plot_path,
        include_plotlyjs=True,
        config={'responsive': True, 'displayModeBar': False}
    )

## Summary and Key Findings

In [None]:
# Generate comprehensive summary: dataset overview, target distribution,
# feature types, data quality, then insights and pipeline recommendations.
banner = "=" * 60
print(banner)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print(banner)

print("\n📊 Dataset Overview:")
print(f"   • Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"   • Target variable: {target_col}")
# Heuristic: a target with at most 10 distinct values is treated as classification
print(f"   • Task type: {'Classification' if df[target_col].nunique() <= 10 else 'Regression'}")

print("\n🎯 Target Distribution:")
target_dist = df[target_col].value_counts().sort_index()
for val, count in target_dist.items():
    pct = (count / len(df)) * 100
    print(f"   • Class {val}: {count:,} ({pct:.1f}%)")

print("\n📈 Feature Types:")
print(f"   • Numerical features: {len(num_features)}")
print(f"   • Categorical features: {len(categorical_cols)}")

print("\n🔍 Data Quality:")
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.sum()
missing_pct = (total_missing / (df.shape[0] * df.shape[1])) * 100
print(f"   • Total missing values: {total_missing:,} ({missing_pct:.2f}% of all values)")
print(f"   • Columns with missing data: {missing_per_column.gt(0).sum()}")

if len(num_features) > 0:
    print("\n📊 Key Insights:")

    # Class imbalance insight.
    # The check is on the number of classes (len), not on the number of
    # distinct class counts: with nunique(), classes of exactly equal size
    # collapse to a single distinct count and the balance report is skipped.
    class_imbalance_detected = False
    if len(target_dist) > 1:
        max_class_pct = (target_dist.max() / target_dist.sum()) * 100
        class_imbalance_detected = max_class_pct > 60
        if max_class_pct > 80:
            print(f"   • ⚠️  Severe class imbalance detected ({max_class_pct:.1f}% in majority class)")
        elif max_class_pct > 60:
            print(f"   • ⚠️  Moderate class imbalance detected ({max_class_pct:.1f}% in majority class)")
        else:
            print(f"   • ✅ Balanced classes (largest class: {max_class_pct:.1f}%)")

    # Feature variability insight
    zero_var_features = (df[num_features].var() == 0).sum()
    if zero_var_features > 0:
        print(f"   • ⚠️  {zero_var_features} features have zero variance")

    # High cardinality categorical features
    high_card_cats = [col for col in categorical_cols if df[col].nunique() > 50]
    if high_card_cats:
        print(f"   • ⚠️  {len(high_card_cats)} categorical features with >50 unique values")

    print("\n🚀 Recommendations for ML Pipeline:")
    # Imbalance-specific advice is only given when imbalance was actually
    # detected above, so it cannot contradict a "Balanced classes" finding.
    if class_imbalance_detected:
        print("   • Use stratified sampling due to class imbalance")
        print("   • Consider SMOTE or other resampling techniques")
    print("   • Apply feature scaling for numerical features")
    if high_card_cats:
        print("   • Apply target encoding for high-cardinality categorical features")
    if class_imbalance_detected:
        print("   • Use PR-AUC as primary metric (ideal for imbalanced data)")
    print("   • Consider ensemble methods (Random Forest, XGBoost)")

print("\n" + "="*60)