# Machine Failure Prediction - Exploratory Data Analysis

This notebook contains the exploratory data analysis for the machine failure prediction dataset.
The goal is to understand the data characteristics and inform preprocessing decisions for the ML pipeline.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Set up paths
# NOTE(review): both locations are absolute and machine-specific; consider
# deriving them from a configurable project root so the notebook runs elsewhere.
data_path = '/Users/yuvalheffetz/ds-agent-projects/session_ce64539f-782b-46c7-ab41-9bf37519daed/data/train_set.csv'
plots_dir = Path('/Users/yuvalheffetz/ds-agent-projects/session_ce64539f-782b-46c7-ab41-9bf37519daed/research/plots')
# parents=True creates any missing intermediate directories instead of raising
# FileNotFoundError; exist_ok=True keeps re-running this cell harmless.
plots_dir.mkdir(parents=True, exist_ok=True)

# App color palette: ten semi-transparent RGBA colors, indexed by the plotting
# cells (index 0 and 1 are used for the two target classes).
app_color_palette = [
    'rgba(99, 110, 250, 0.8)',   # Blue
    'rgba(239, 85, 59, 0.8)',    # Red/Orange-Red
    'rgba(0, 204, 150, 0.8)',    # Green
    'rgba(171, 99, 250, 0.8)',   # Purple
    'rgba(255, 161, 90, 0.8)',   # Orange
    'rgba(25, 211, 243, 0.8)',   # Cyan
    'rgba(255, 102, 146, 0.8)',  # Pink
    'rgba(182, 232, 128, 0.8)',  # Light Green
    'rgba(255, 151, 255, 0.8)',  # Magenta
    'rgba(254, 203, 82, 0.8)'    # Yellow
]

## 1. Load and Examine Dataset Structure

In [None]:
# Read the training CSV into a DataFrame
df = pd.read_csv(data_path)

# Unpack the dimensions once; the feature count leaves out one column
# (presumably the target -- confirm against the target analysis cell).
n_samples, n_columns = df.shape

print(f"Dataset shape: {df.shape}")
print(f"Number of features: {n_columns - 1}")
print(f"Number of samples: {n_samples}")
print("\nColumn names:", df.columns.tolist(), sep="\n")

# Preview: the trailing expression is rendered as the cell's rich output
print("\nFirst few rows:")
df.head()

## 2. Data Types and Missing Values Analysis

In [None]:
# Check data types and missing values
print("Data types:")
print(df.dtypes)

print("\nMissing values:")
# Per-column null counts and their share of all rows (in percent)
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent
})
# Keep only the columns that actually contain nulls, worst offenders first
missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)

# Print the table only when it has rows; otherwise an "Empty DataFrame" dump
# would appear right before the "no missing values" message.
if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print(missing_df)

# Basic statistics (trailing expression is rendered as the cell output)
print("\nBasic statistics:")
df.describe()

## 3. Target Variable Analysis

In [None]:
# Analyze target variable
target_col = 'failure'
target_counts = df[target_col].value_counts()
target_props = df[target_col].value_counts(normalize=True)

# .get(label, default) keeps the report working when one class is absent from
# the data; direct label indexing would raise KeyError in that case.
n_no_failure = target_counts.get(0, 0)
n_failure = target_counts.get(1, 0)

print(f"Target variable '{target_col}' distribution:")
print(f"Class 0 (No failure): {n_no_failure} ({target_props.get(0, 0.0):.2%})")
print(f"Class 1 (Failure): {n_failure} ({target_props.get(1, 0.0):.2%})")
if n_no_failure:
    print(f"Class balance ratio: {n_failure / n_no_failure:.2f}")
else:
    # The ratio has no meaningful value without any class-0 samples
    print("Class balance ratio: undefined (no class 0 samples)")

# Create target distribution plot
# Passing `color` makes plotly express emit one trace per class, so the colors
# are assigned per class through color_discrete_map. Setting a two-element
# marker color array on every trace afterwards would paint both single-bar
# traces with the first color only.
status_labels = ['No Failure', 'Failure']
fig = px.bar(x=status_labels,
             y=[n_no_failure, n_failure],
             labels={'x': 'Machine Status', 'y': 'Count'},
             color=status_labels,
             color_discrete_map={'No Failure': app_color_palette[0],
                                 'Failure': app_color_palette[1]})

# Apply styling
fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12)
    ),
    yaxis=dict(
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12)
    ),
    legend=dict(font=dict(color='#8B5CF6', size=11)),
    showlegend=False  # the x-axis categories already name each bar
)

# Persist a standalone HTML copy next to the other plots, then render inline
fig.write_html(plots_dir / "target_distribution.html", 
               include_plotlyjs=True, 
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 4. Feature Types and Categories

In [None]:
# Partition the non-target columns into numeric and non-numeric groups
features = [name for name in df.columns if name != target_col]
feature_frame = df[features]
numerical_features = feature_frame.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = feature_frame.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Total features: {len(features)}")
print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# For each non-numeric column, report its cardinality and value frequencies
if categorical_features:
    print(f"\nCategorical features: {categorical_features}")
    for cat_col in categorical_features:
        cat_values = df[cat_col]
        print(f"\n{cat_col} unique values: {cat_values.nunique()}")
        print(cat_values.value_counts())

# Long lists are truncated to the first ten names followed by an ellipsis
if len(numerical_features) > 10:
    print(f"\nNumerical features: {numerical_features[:10]}...")
else:
    print(f"\nNumerical features: {numerical_features}")

## 5. Numerical Features Distribution Analysis

In [None]:
# Descriptive statistics for every numeric feature
print("Numerical features statistics:")
num_stats = df[numerical_features].describe()
print(num_stats)

# Rank features by the magnitude of their skew, most asymmetric first
raw_skew = df[numerical_features].skew()
skewness = raw_skew.sort_values(key=abs, ascending=False)
print("\nSkewness (absolute values > 1 indicate high skewness):")
print(skewness.head(10))

# The six most skewed features get one histogram each
most_skewed_features = skewness.head(6).index.tolist()

# 2 x 3 grid, one panel per feature, titled with the feature name
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=most_skewed_features,
    vertical_spacing=0.08
)

for i, feature in enumerate(most_skewed_features):
    # Fill the grid row by row; plotly grid positions are 1-based
    grid_row, grid_col = divmod(i, 3)
    histogram = go.Histogram(
        x=df[feature],
        name=feature,
        showlegend=False,
        marker_color=app_color_palette[i % len(app_color_palette)],
    )
    fig.add_trace(histogram, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14)
)

# Both axis families of every subplot share the same grid/tick styling
axis_style = dict(
    gridcolor='rgba(139,92,246,0.2)',
    zerolinecolor='rgba(139,92,246,0.3)',
    tickfont=dict(color='#8B5CF6', size=9),
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Save a standalone HTML copy, then render inline
fig.write_html(plots_dir / "feature_distributions.html",
               include_plotlyjs=True,
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 6. Outlier Detection

In [None]:
# Detect outliers using IQR method
# A value counts as an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile of its feature.
outlier_counts = {}
outlier_info = []
n_rows = len(df)

for feature in numerical_features:
    values = df[feature]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Count straight from the boolean mask; filtering the whole DataFrame just
    # to take its length would copy every column once per feature.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(is_outlier.sum())
    # Guard the division so an empty frame reports 0% instead of raising
    outlier_percentage = (outlier_count / n_rows) * 100 if n_rows else 0.0
    
    outlier_counts[feature] = outlier_count
    outlier_info.append({
        'feature': feature,
        'outlier_count': outlier_count,
        'outlier_percentage': outlier_percentage,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    })

# Sort by outlier count
outlier_df = pd.DataFrame(outlier_info).sort_values('outlier_count', ascending=False)
print("Features with most outliers:")
print(outlier_df.head(10))

# Create boxplot for features with most outliers
top_outlier_features = outlier_df.head(6)['feature'].tolist()

# 2 x 3 grid, one box plot per feature
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=top_outlier_features,
    vertical_spacing=0.1
)

for i, feature in enumerate(top_outlier_features):
    # Row-major placement; plotly grid positions are 1-based
    row = i // 3 + 1
    col = i % 3 + 1
    
    fig.add_trace(
        go.Box(y=df[feature], name=feature, showlegend=False,
               marker_color=app_color_palette[i % len(app_color_palette)]),
        row=row, col=col
    )

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14)
)

fig.update_xaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9)
)
fig.update_yaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9)
)

# Save a standalone HTML copy, then render inline
fig.write_html(plots_dir / "outlier_analysis.html", 
               include_plotlyjs=True, 
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 7. Feature Correlation Analysis

In [None]:
# Calculate correlation matrix (numeric features plus the target)
corr_matrix = df[numerical_features + [target_col]].corr()

# Absolute correlation above which two features are reported as highly correlated
HIGH_CORR_THRESHOLD = 0.7

# Find highly correlated feature pairs (excluding target)
# Select the feature-by-feature block by label first, so the positional indices
# used below are guaranteed to line up with `numerical_features` rather than
# relying on corr() returning its columns in the requested order.
feature_corr = corr_matrix.loc[numerical_features, numerical_features].to_numpy()

high_corr_pairs = []
for i, feature1 in enumerate(numerical_features):
    # j > i visits each unordered pair once and skips the diagonal
    for j in range(i + 1, len(numerical_features)):
        corr_val = feature_corr[i, j]
        if abs(corr_val) > HIGH_CORR_THRESHOLD:
            high_corr_pairs.append({
                'feature1': feature1,
                'feature2': numerical_features[j],
                'correlation': corr_val
            })

print(f"Highly correlated feature pairs (|correlation| > {HIGH_CORR_THRESHOLD}): {len(high_corr_pairs)}")
if high_corr_pairs:
    high_corr_df = pd.DataFrame(high_corr_pairs).sort_values('correlation', key=abs, ascending=False)
    print(high_corr_df.head(10))

# Correlation with target, strongest (by magnitude) first
target_correlations = corr_matrix[target_col].drop(target_col).sort_values(key=abs, ascending=False)
print("\nTop 10 features correlated with target:")
print(target_correlations.head(10))

# Create correlation heatmap for top features
top_features = target_correlations.head(15).index.tolist() + [target_col]
# Slice the matrix computed above instead of recomputing the correlations
corr_subset = corr_matrix.loc[top_features, top_features]

fig = go.Figure(data=go.Heatmap(
    z=corr_subset.values,
    x=corr_subset.columns,
    y=corr_subset.columns,
    colorscale='RdBu',
    zmid=0,  # center the diverging scale on zero correlation
    text=np.round(corr_subset.values, 2),
    texttemplate="%{text}",
    textfont={"size": 8},
    hoverongaps=False
))

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14),
    xaxis=dict(
        tickangle=45,
        tickfont=dict(color='#8B5CF6', size=9)
    ),
    yaxis=dict(
        tickfont=dict(color='#8B5CF6', size=9)
    )
)

# Save a standalone HTML copy, then render inline
fig.write_html(plots_dir / "correlation_heatmap.html", 
               include_plotlyjs=True, 
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 8. Feature-Target Relationship Analysis

In [None]:
# Analyze top correlated features with target by class
top_target_features = target_correlations.head(8).index.tolist()

# 2 x 4 grid: one panel per feature, each holding one box per target class
fig = make_subplots(
    rows=2, cols=4,
    subplot_titles=top_target_features,
    vertical_spacing=0.1
)

# Target value -> display name used for trace names and legend groups
class_labels = {0: 'No Failure', 1: 'Failure'}

for i, feature in enumerate(top_target_features):
    # Row-major placement; plotly grid positions are 1-based
    row = i // 4 + 1
    col = i % 4 + 1
    
    # Create separate traces for each class
    for class_val, class_name in class_labels.items():
        class_data = df.loc[df[target_col] == class_val, feature]
        
        fig.add_trace(
            go.Box(y=class_data, 
                   name=class_name,
                   # Tie every trace of a class to one legend group so the
                   # single legend entry toggles that class in all subplots,
                   # not only in the first one.
                   legendgroup=class_name,
                   showlegend=(i == 0),  # Only show legend for first subplot
                   marker_color=app_color_palette[class_val]),
            row=row, col=col
        )

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=12),
    legend=dict(font=dict(color='#8B5CF6', size=10))
)

fig.update_xaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=8)
)
fig.update_yaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=8)
)

# Save a standalone HTML copy, then render inline
fig.write_html(plots_dir / "feature_target_relationship.html", 
               include_plotlyjs=True, 
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 9. Class Separation Analysis

In [None]:
# Quantify how far apart the two classes sit on the six features most
# correlated with the target
most_discriminative = target_correlations.head(6).index.tolist()

# Split the rows by class once; every feature below reuses these two views
no_failure_rows = df[df[target_col] == 0]
failure_rows = df[df[target_col] == 1]

class_differences = []
for feature in most_discriminative:
    mean_0, std_0 = no_failure_rows[feature].mean(), no_failure_rows[feature].std()
    mean_1, std_1 = failure_rows[feature].mean(), failure_rows[feature].std()
    mean_gap = abs(mean_1 - mean_0)

    # Effect size (Cohen's d) using the unweighted average of the two class
    # variances as the pooled variance.
    # NOTE(review): with very unequal class sizes a size-weighted pooled SD may
    # be preferable -- confirm which definition is intended.
    pooled_std = np.sqrt((std_0**2 + std_1**2) / 2)
    if pooled_std > 0:
        cohens_d = mean_gap / pooled_std
    else:
        # Zero (or NaN) spread: report no effect rather than dividing
        cohens_d = 0

    class_differences.append({
        'feature': feature,
        'mean_no_failure': mean_0,
        'mean_failure': mean_1,
        'difference': mean_gap,
        'cohens_d': cohens_d
    })

class_diff_df = pd.DataFrame(class_differences).sort_values('cohens_d', ascending=False)
print("Class separation analysis (Cohen's d):")
print(class_diff_df)

# Pairwise scatter plots of the three features with the largest effect size
top_3_features = class_diff_df.head(3)['feature'].tolist()
class_labels = df[target_col].map({0: 'No Failure', 1: 'Failure'})

fig = px.scatter_matrix(
    df,
    dimensions=top_3_features,
    color=class_labels,
    color_discrete_map={'No Failure': app_color_palette[0], 'Failure': app_color_palette[1]}
)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=12),
    legend=dict(font=dict(color='#8B5CF6', size=10))
)

# Hide the diagonal panels and shrink the markers
fig.update_traces(
    diagonal_visible=False,
    marker=dict(size=3, opacity=0.7)
)

# Save a standalone HTML copy, then render inline
fig.write_html(plots_dir / "class_separation_analysis.html",
               include_plotlyjs=True,
               config={'responsive': True, 'displayModeBar': False})
fig.show()

## 10. Summary Statistics by Class

In [None]:
# Per-class descriptive statistics for the ten features most correlated
# with the target
key_features = target_correlations.head(10).index.tolist()

print("Summary statistics by class for top 10 most correlated features:")
rows_by_class = df.groupby(target_col)
for feature in key_features:
    print(f"\n{feature}:")
    summary = rows_by_class[feature].describe()
    print(summary)

# Collect the headline findings from the earlier cells in one in-memory dict
# for report generation (nothing is written to disk here)
eda_results = dict(
    dataset_shape=df.shape,
    target_distribution=target_counts.to_dict(),
    feature_types=dict(
        numerical=len(numerical_features),
        categorical=len(categorical_features),
    ),
    missing_values=len(missing_df),          # number of columns with nulls
    high_correlations=len(high_corr_pairs),  # number of flagged feature pairs
    top_target_correlations=target_correlations.head(10).to_dict(),
    class_differences=class_diff_df.to_dict('records'),
)

print("\nEDA completed successfully!")
print(f"Results saved to: {plots_dir}")