# Vehicle Collision Prediction - Exploratory Data Analysis

This notebook performs comprehensive EDA on telematic vehicle data to identify potential car collisions.
The analysis will inform preprocessing and feature engineering decisions for the ML pipeline.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os

warnings.filterwarnings('ignore')

# Color palette for consistency
app_color_palette = [
    'rgba(99, 110, 250, 0.8)',   # Blue
    'rgba(239, 85, 59, 0.8)',    # Red/Orange-Red
    'rgba(0, 204, 150, 0.8)',    # Green
    'rgba(171, 99, 250, 0.8)',   # Purple
    'rgba(255, 161, 90, 0.8)',   # Orange
    'rgba(25, 211, 243, 0.8)',   # Cyan
    'rgba(255, 102, 146, 0.8)',  # Pink
    'rgba(182, 232, 128, 0.8)',  # Light Green
    'rgba(255, 151, 255, 0.8)',  # Magenta
    'rgba(254, 203, 82, 0.8)'    # Yellow
]

# Ensure plots directory exists
plots_dir = '/Users/yuvalheffetz/ds-agent-projects/session_6a348ddd-12b5-4ee2-af3a-daf992d9a288/research/plots'
os.makedirs(plots_dir, exist_ok=True)

## 1. Dataset Loading and Initial Inspection

In [2]:
# Load the training dataset
train_path = '/Users/yuvalheffetz/ds-agent-projects/session_6a348ddd-12b5-4ee2-af3a-daf992d9a288/data/train_set.csv'
df = pd.read_csv(train_path)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
df.head()

Dataset shape: (7667, 13)

Columns: ['driver_id', 'month', 'count_trip', 'miles', 'drive_hours', 'count_brakes', 'count_accelarations', 'time_speeding_hours', 'time_phoneuse_hours', 'highway_miles', 'night_drive_hrs', 'maximum_speed', 'collisions']

First few rows:


Unnamed: 0,driver_id,month,count_trip,miles,drive_hours,count_brakes,count_accelarations,time_speeding_hours,time_phoneuse_hours,highway_miles,night_drive_hrs,maximum_speed,collisions
0,JRN-504,Oct-22,7.0,563.82972,27.17,161.0,161.0,0.0,1.261541,129.3528,0.0,80.0,0
1,TIL-876,Nov-22,,,,,,,,,,,0
2,IWK-764,Apr-22,8.0,370.10076,19.47,201.0,201.0,0.290719,0.329752,28.1114,0.0,86.0,0
3,KTJ-773,May-22,1.0,10.19676,0.65,5.0,6.0,0.0,1.780111,0.0,0.0,81.0,0
4,JNT-352,Jun-22,3.0,15.12456,1.4,8.0,8.0,0.0,1.527254,0.9385,0.0,62.0,0


## 2. Data Types and Missing Values Analysis

In [3]:
# Data types and basic info
print("Data Types:")
print(df.dtypes)
print(f"\nDataset Info:")
df.info()

# Missing values analysis
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_summary = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

print(f"\nMissing Values Summary:")
print(missing_summary)

Data Types:
driver_id               object
month                   object
count_trip             float64
miles                  float64
drive_hours            float64
count_brakes           float64
count_accelarations    float64
time_speeding_hours    float64
time_phoneuse_hours    float64
highway_miles          float64
night_drive_hrs        float64
maximum_speed          float64
collisions               int64
dtype: object

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7667 entries, 0 to 7666
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   driver_id            7667 non-null   object 
 1   month                7667 non-null   object 
 2   count_trip           7611 non-null   float64
 3   miles                7611 non-null   float64
 4   drive_hours          7611 non-null   float64
 5   count_brakes         7611 non-null   float64
 6   count_accelarations  7611 non-null   float64

In [4]:
# Create missing values visualization
if len(missing_summary) > 0:
    fig = px.bar(
        x=missing_summary.index,
        y=missing_summary['Missing_Percentage'],
        labels={'x': 'Features', 'y': 'Missing Percentage (%)'},
    )
    
    fig.update_traces(marker=dict(color=app_color_palette[0]))
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        yaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        legend=dict(font=dict(color='#8B5CF6', size=11))
    )
    
    fig.write_html(f"{plots_dir}/missing_values_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # Create a plot showing no missing values
    fig = go.Figure()
    fig.add_annotation(
        text="No Missing Values Found in Dataset",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=20, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/missing_values_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 3. Target Variable Analysis

In [5]:
# Target variable analysis
target_col = 'collisions'
target_distribution = df[target_col].value_counts().sort_index()
target_percentage = (df[target_col].value_counts(normalize=True) * 100).sort_index()

print(f"Target Variable Distribution:")
for val, count in target_distribution.items():
    pct = target_percentage[val]
    print(f"{val} collisions: {count:,} samples ({pct:.1f}%)")

print(f"\nTarget Statistics:")
print(df[target_col].describe())

Target Variable Distribution:
0 collisions: 7,301 samples (95.2%)
1 collisions: 353 samples (4.6%)
2 collisions: 13 samples (0.2%)

Target Statistics:
count    7667.000000
mean        0.049433
std         0.224470
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         2.000000
Name: collisions, dtype: float64


In [6]:
# Create target distribution visualization
labels = [f"{int(val)} Collisions" for val in target_distribution.index]
values = target_distribution.values

fig = px.pie(
    values=values,
    names=labels,
    color_discrete_sequence=app_color_palette[:len(labels)]
)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    legend=dict(font=dict(color='#8B5CF6', size=11))
)

fig.write_html(f"{plots_dir}/target_distribution.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 4. Numerical Features Analysis

In [7]:
# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

# Basic statistics for numerical features
if len(numerical_cols) > 0:
    print(f"\nNumerical Features Statistics:")
    print(df[numerical_cols].describe())

Numerical columns (10): ['count_trip', 'miles', 'drive_hours', 'count_brakes', 'count_accelarations', 'time_speeding_hours', 'time_phoneuse_hours', 'highway_miles', 'night_drive_hrs', 'maximum_speed']
Categorical columns (2): ['driver_id', 'month']

Numerical Features Statistics:
        count_trip          miles   drive_hours  count_brakes  \
count  7611.000000    7611.000000   7611.000000   7611.000000   
mean      8.481934     508.284395     24.706039    118.184928   
std       7.693353    3616.217663    161.050536   1058.391851   
min       1.000000       0.000000      0.000000      0.000000   
25%       2.000000      38.520660      1.910000      8.000000   
50%       6.000000     166.949160      8.310000     26.000000   
75%      13.000000     579.740880     28.890000    105.000000   
max      31.000000  309382.684000  13720.630000  90489.000000   

       count_accelarations  time_speeding_hours  time_phoneuse_hours  \
count           7611.00000          7611.000000          6640

In [8]:
# Create numerical features distribution plot
if len(numerical_cols) > 0:
    # Select top numerical features for visualization
    cols_to_plot = numerical_cols[:6]  # Limit to 6 for readability
    
    fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=[col.replace('_', ' ').title() for col in cols_to_plot]
    )
    
    for i, col in enumerate(cols_to_plot):
        row = (i // 3) + 1
        col_pos = (i % 3) + 1
        
        fig.add_trace(
            go.Histogram(
                x=df[col],
                name=col,
                marker_color=app_color_palette[i % len(app_color_palette)],
                showlegend=False
            ),
            row=row, col=col_pos
        )
    
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16)
    )
    
    fig.update_xaxes(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=11)
    )
    fig.update_yaxes(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=11)
    )
    
    fig.write_html(f"{plots_dir}/numerical_distributions.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # No numerical features found
    fig = go.Figure()
    fig.add_annotation(
        text="No Numerical Features Found",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=20, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/numerical_distributions.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 5. Categorical Features Analysis

In [9]:
# Analyze categorical features
if len(categorical_cols) > 0:
    print("Categorical Features Analysis:")
    for col in categorical_cols:
        unique_vals = df[col].nunique()
        print(f"\n{col}:")
        print(f"  - Unique values: {unique_vals}")
        if unique_vals <= 20:  # Show value counts for low cardinality features
            print(f"  - Value counts:")
            for val, count in df[col].value_counts().head(10).items():
                print(f"    {val}: {count}")
        else:
            print(f"  - Top 5 values:")
            for val, count in df[col].value_counts().head(5).items():
                print(f"    {val}: {count}")

Categorical Features Analysis:

driver_id:
  - Unique values: 7663
  - Top 5 values:
    YEP-237: 2
    VNJ-703: 2
    FZQ-261: 2
    YDP-996: 2
    JRN-504: 1

month:
  - Unique values: 12
  - Value counts:
    Nov-22: 680
    Aug-22: 678
    Jan-22: 674
    May-22: 657
    Jun-22: 656
    Feb-22: 634
    Apr-22: 633
    Mar-22: 626
    Sep-22: 624
    Jul-22: 614


In [10]:
# Create categorical features visualization
if len(categorical_cols) > 0:
    # Select the first categorical feature with reasonable cardinality
    plot_col = None
    for col in categorical_cols:
        if df[col].nunique() <= 20 and df[col].nunique() >= 2:
            plot_col = col
            break
    
    if plot_col:
        value_counts = df[plot_col].value_counts().head(10)
        
        fig = px.bar(
            x=value_counts.index,
            y=value_counts.values,
            labels={'x': plot_col.replace('_', ' ').title(), 'y': 'Count'}
        )
        
        fig.update_traces(marker=dict(color=app_color_palette[0]))
        fig.update_layout(
            height=550,
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font=dict(color='#8B5CF6', size=12),
            title_font=dict(color='#7C3AED', size=16),
            xaxis=dict(
                gridcolor='rgba(139,92,246,0.2)',
                zerolinecolor='rgba(139,92,246,0.3)',
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            ),
            yaxis=dict(
                gridcolor='rgba(139,92,246,0.2)',
                zerolinecolor='rgba(139,92,246,0.3)',
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            )
        )
        
        fig.write_html(f"{plots_dir}/categorical_distribution.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
    else:
        # Create message about high cardinality
        fig = go.Figure()
        fig.add_annotation(
            text="Categorical Features Have High Cardinality<br>Individual distributions not suitable for visualization",
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False,
            font=dict(size=18, color='#8B5CF6')
        )
        fig.update_layout(
            height=550,
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        fig.write_html(f"{plots_dir}/categorical_distribution.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # No categorical features
    fig = go.Figure()
    fig.add_annotation(
        text="No Categorical Features Found",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=20, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/categorical_distribution.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 6. Feature Correlation Analysis

In [11]:
# Correlation analysis with target variable
if len(numerical_cols) > 0:
    # Calculate correlations with target
    correlations = df[numerical_cols + [target_col]].corr()[target_col].drop(target_col).sort_values(key=abs, ascending=False)
    
    print(f"Correlations with Target Variable ({target_col}):")
    for feature, corr in correlations.items():
        print(f"{feature}: {corr:.4f}")
    
    # Overall correlation matrix for top features
    top_features = correlations.head(10).index.tolist() + [target_col]
    corr_matrix = df[top_features].corr()
    
    print(f"\nCorrelation Matrix (Top Features):")
    print(corr_matrix.round(3))

Correlations with Target Variable (collisions):
count_trip: 0.2456
drive_hours: 0.1097
miles: 0.1051
count_brakes: 0.0983
count_accelarations: 0.0983
highway_miles: 0.0892
time_speeding_hours: 0.0672
maximum_speed: -0.0646
night_drive_hrs: 0.0168
time_phoneuse_hours: -0.0040

Correlation Matrix (Top Features):
                     count_trip  drive_hours  miles  count_brakes  \
count_trip                1.000        0.193  0.179         0.144   
drive_hours               0.193        1.000  0.996         0.984   
miles                     0.179        0.996  1.000         0.988   
count_brakes              0.144        0.984  0.988         1.000   
count_accelarations       0.144        0.984  0.988         1.000   
highway_miles             0.125        0.984  0.992         0.977   
time_speeding_hours       0.144        0.833  0.847         0.827   
maximum_speed            -0.030       -0.006 -0.005        -0.006   
night_drive_hrs           0.076        0.142  0.106         0.099  

In [12]:
# Create correlation heatmap
if len(numerical_cols) > 0 and len(correlations) > 0:
    # Select top correlated features for visualization
    top_corr_features = correlations.head(10).index.tolist()
    if len(top_corr_features) > 0:
        corr_data = df[top_corr_features + [target_col]].corr()
        
        fig = px.imshow(
            corr_data.values,
            x=corr_data.columns,
            y=corr_data.columns,
            color_continuous_scale='RdBu_r',
            aspect='auto'
        )
        
        # Add correlation values as text
        for i in range(len(corr_data.columns)):
            for j in range(len(corr_data.columns)):
                fig.add_annotation(
                    x=j, y=i,
                    text=str(round(corr_data.iloc[i, j], 2)),
                    showarrow=False,
                    font=dict(color='white' if abs(corr_data.iloc[i, j]) > 0.5 else 'black')
                )
        
        fig.update_layout(
            height=550,
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font=dict(color='#8B5CF6', size=12),
            title_font=dict(color='#7C3AED', size=16),
            xaxis=dict(
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            ),
            yaxis=dict(
                tickfont=dict(color='#8B5CF6', size=11),
                title_font=dict(color='#7C3AED', size=12)
            )
        )
        
        fig.write_html(f"{plots_dir}/correlation_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
    else:
        # No significant correlations
        fig = go.Figure()
        fig.add_annotation(
            text="No Strong Correlations Found",
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False,
            font=dict(size=20, color='#8B5CF6')
        )
        fig.update_layout(
            height=550,
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            xaxis=dict(visible=False),
            yaxis=dict(visible=False)
        )
        fig.write_html(f"{plots_dir}/correlation_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # No numerical features for correlation
    fig = go.Figure()
    fig.add_annotation(
        text="No Numerical Features Available for Correlation Analysis",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=18, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/correlation_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 7. Outlier Detection

In [13]:
# Outlier detection using IQR method
outlier_summary = {}

if len(numerical_cols) > 0:
    print("Outlier Analysis (IQR Method):")
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outlier_count = len(outliers)
        outlier_percentage = (outlier_count / len(df)) * 100
        
        outlier_summary[col] = {
            'count': outlier_count,
            'percentage': outlier_percentage,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound
        }
        
        print(f"{col}: {outlier_count} outliers ({outlier_percentage:.1f}%)")

    # Select feature with most outliers for visualization
    if outlier_summary:
        plot_feature = max(outlier_summary.keys(), key=lambda x: outlier_summary[x]['count'])
        print(f"\nVisualizing outliers for: {plot_feature}")

Outlier Analysis (IQR Method):
count_trip: 59 outliers (0.8%)
miles: 726 outliers (9.5%)
drive_hours: 695 outliers (9.1%)
count_brakes: 946 outliers (12.3%)
count_accelarations: 950 outliers (12.4%)
time_speeding_hours: 1306 outliers (17.0%)
time_phoneuse_hours: 0 outliers (0.0%)
highway_miles: 926 outliers (12.1%)
night_drive_hrs: 195 outliers (2.5%)
maximum_speed: 0 outliers (0.0%)

Visualizing outliers for: time_speeding_hours


In [14]:
# Create outlier visualization
if len(numerical_cols) > 0 and outlier_summary:
    # Create box plots for top numerical features
    top_features = list(outlier_summary.keys())[:6]  # Top 6 features
    
    fig = go.Figure()
    
    for i, col in enumerate(top_features):
        fig.add_trace(go.Box(
            y=df[col],
            name=col.replace('_', ' ').title(),
            marker_color=app_color_palette[i % len(app_color_palette)]
        ))
    
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        yaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        legend=dict(font=dict(color='#8B5CF6', size=11))
    )
    
    fig.write_html(f"{plots_dir}/outlier_detection.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # No numerical features for outlier analysis
    fig = go.Figure()
    fig.add_annotation(
        text="No Numerical Features Available for Outlier Detection",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=18, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/outlier_detection.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## 8. Feature vs Target Analysis

In [15]:
# Analyze feature distributions by target classes
if len(numerical_cols) > 0:
    # Select most correlated feature with target for detailed analysis
    if len(correlations) > 0:
        top_feature = correlations.index[0]
        print(f"Analyzing {top_feature} by collision classes:")
        
        for collision_class in sorted(df[target_col].unique()):
            subset = df[df[target_col] == collision_class]
            print(f"\nCollision class {collision_class} ({len(subset)} samples):")
            print(f"  {top_feature} - Mean: {subset[top_feature].mean():.3f}, Std: {subset[top_feature].std():.3f}")
            print(f"  {top_feature} - Min: {subset[top_feature].min():.3f}, Max: {subset[top_feature].max():.3f}")

Analyzing count_trip by collision classes:

Collision class 0 (7301 samples):
  count_trip - Mean: 8.069, Std: 7.421
  count_trip - Min: 1.000, Max: 31.000

Collision class 1 (353 samples):
  count_trip - Mean: 16.415, Std: 8.282
  count_trip - Min: 1.000, Max: 31.000

Collision class 2 (13 samples):
  count_trip - Mean: 26.000, Std: 4.021
  count_trip - Min: 18.000, Max: 30.000


In [16]:
# Create feature vs target visualization
if len(numerical_cols) > 0 and len(correlations) > 0:
    top_feature = correlations.index[0]
    
    fig = px.box(
        df, x=target_col, y=top_feature,
        labels={target_col: 'Number of Collisions', top_feature: top_feature.replace('_', ' ').title()}
    )
    
    fig.update_traces(marker=dict(color=app_color_palette[0]))
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        yaxis=dict(
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        )
    )
    
    fig.write_html(f"{plots_dir}/feature_target_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
else:
    # No suitable features for target analysis
    fig = go.Figure()
    fig.add_annotation(
        text="No Suitable Features Available for Target Analysis",
        xref="paper", yref="paper",
        x=0.5, y=0.5, xanchor='center', yanchor='middle',
        showarrow=False,
        font=dict(size=18, color='#8B5CF6')
    )
    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(visible=False),
        yaxis=dict(visible=False)
    )
    fig.write_html(f"{plots_dir}/feature_target_analysis.html", include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})

## Summary and Conclusions

This EDA has provided comprehensive insights into the vehicle collision prediction dataset. Key findings will inform the preprocessing and feature engineering steps in the ML pipeline development.