# 🔧 Predictive Maintenance - Exploratory Data Analysis

**Project:** IoT Machine Failure Prediction  
**Team:** Infotact Solutions  
**Date:** December 2024

---

## Objectives

1. Understand the structure and quality of sensor data
2. Analyze class distribution and imbalance
3. Explore sensor patterns and correlations
4. Identify potential feature engineering opportunities
5. Validate data quality for modeling

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Suppress every warning for the rest of the session, presumably to keep
# cell output uncluttered.
# NOTE(review): this is a blanket filter, so warnings that point at real
# data or API problems are hidden as well.
warnings.filterwarnings('ignore')

# Style settings: apply the matplotlib style sheet first, then set the
# seaborn colour palette used by subsequent plots.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Simple confirmation that the setup cell ran to completion.
print('Libraries loaded successfully!')

In [None]:
# Load data
# All locations are relative to the directory the notebook is run from.
DATA_DIR = Path('../data')
OUTPUT_DIR = DATA_DIR.parent / 'outputs'
raw_data_path = DATA_DIR / 'raw' / 'Predictive Maintainance dataset.csv'
processed_data_path = DATA_DIR / 'processed' / 'processed_data.csv'

# Later cells save figures under ../outputs. savefig() does not create
# missing directories, so make sure the folder exists before any plot is
# written (no-op when it is already there).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f'Raw data path: {raw_data_path}')
print(f'Processed data path: {processed_data_path}')

# Load raw data (required: a missing file raises and stops the notebook here)
df_raw = pd.read_csv(raw_data_path)
print(f'\nRaw data shape: {df_raw.shape}')

# Load processed data if available. It is optional: downstream cells check
# `df_processed is not None` before using it.
if processed_data_path.exists():
    # First CSV column becomes the index and is parsed as dates.
    df_processed = pd.read_csv(processed_data_path, index_col=0, parse_dates=True)
    print(f'Processed data shape: {df_processed.shape}')
else:
    print('Processed data not found. Run preprocessing first.')
    df_processed = None

## 1. Data Overview

In [None]:
# Display first rows
# head() is called without arguments, i.e. with its default row count.
# It is left as the final bare expression of the cell, presumably so the
# notebook renders the resulting table.
print('First 5 rows of raw data:')
df_raw.head()

In [None]:
# Data info
# info() prints its report (column dtypes and non-null counts, per the
# heading below) directly rather than returning it.
print('Data Types and Non-Null Counts:')
df_raw.info()

In [None]:
# Statistical summary
# describe() is called with default arguments and left as the final bare
# expression of the cell, presumably so the notebook renders the table.
print('Statistical Summary:')
df_raw.describe()

## 2. Target Variable Analysis

In [None]:
# Target distribution
target_col = 'Machine failure'

# Fixed presentation order: class 0 first, class 1 second.
class_labels = ['No Failure', 'Failure']
colors = ['#2ecc71', '#e74c3c']

# value_counts() orders classes by frequency, not by label. Pin the order
# to [0, 1] so the labels/colours above are always attached to the right
# class, and fill a missing class with 0 instead of dropping it.
counts = df_raw[target_col].value_counts().reindex([0, 1], fill_value=0)
n_ok, n_fail = counts.values

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Count plot
bars = ax1.bar(class_labels, counts.values, color=colors)
ax1.set_title('Machine Failure Distribution', fontsize=14, fontweight='bold')
ax1.set_ylabel('Count')

# Annotate each bar with its count; bar_label places the text relative to
# the bar top, so it does not depend on the magnitude of the counts.
ax1.bar_label(bars, labels=[f'{c:,}' for c in counts.values],
              padding=3, fontweight='bold')

# Pie chart (failure slice pulled out slightly for emphasis)
ax2.pie(counts.values, labels=class_labels, autopct='%1.1f%%',
        colors=colors, explode=[0, 0.1], startangle=90)
ax2.set_title('Failure Rate', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/eda_target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# The ratio is undefined when no failures are present; report that
# explicitly instead of failing on a division by zero.
if n_fail:
    print(f'\nClass Imbalance Ratio: {n_ok/n_fail:.1f}:1')
else:
    print('\nClass Imbalance Ratio: undefined (no failures recorded)')
print(f'Failure Rate: {n_fail/len(df_raw)*100:.2f}%')

## 3. Failure Type Analysis

In [None]:
# Failure types: one indicator column per failure mode, plus display names
failure_cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
failure_names = {
    'TWF': 'Tool Wear Failure',
    'HDF': 'Heat Dissipation Failure',
    'PWF': 'Power Failure',
    'OSF': 'Overstrain Failure',
    'RNF': 'Random Failure'
}

# Column sums give the number of occurrences per mode; sorted ascending so
# the largest bar ends up last in the horizontal bar chart.
failure_counts = df_raw[failure_cols].sum().sort_values()
mode_labels = [failure_names[code] for code in failure_counts.index]
shades = plt.cm.Reds(np.linspace(0.3, 0.9, len(failure_counts)))

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(mode_labels, failure_counts.values, color=shades)
ax.set_title('Failure Types Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Number of Failures')

# Write each total just to the right of its bar, vertically centred
for rect, total in zip(bars, failure_counts.values):
    y_mid = rect.get_y() + rect.get_height()/2
    ax.text(rect.get_width() + 2, y_mid, f'{total}',
            va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/eda_failure_types.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Sensor Data Distributions

In [None]:
# Sensor columns
sensor_cols = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# 2x3 grid flattened to a 1-D array of axes; five sensors use the first
# five panels and the sixth is blanked out below.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, sensor_cols):
    # One overlaid, density-normalised histogram per target class
    for status, subset in df_raw.groupby(target_col):
        if status == 0:
            legend_text, shade = 'No Failure', '#2ecc71'
        else:
            legend_text, shade = 'Failure', '#e74c3c'
        ax.hist(subset[col], bins=50, alpha=0.5, label=legend_text,
                color=shade, density=True)

    ax.set_title(f'Distribution: {col}', fontsize=11)
    ax.set_xlabel(col)
    ax.set_ylabel('Density')
    ax.legend()

# Hide last subplot (unused sixth panel)
axes[-1].axis('off')

plt.suptitle('Sensor Distributions by Failure Status', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../outputs/eda_sensor_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix over every numeric column of the raw data
numeric_cols = df_raw.select_dtypes(include=[np.number]).columns
corr_matrix = df_raw[numeric_cols].corr()

# Boolean mask covering the upper triangle (diagonal included) so each
# pair of features is drawn only once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

heatmap_options = {
    'mask': mask,
    'annot': True,
    'fmt': '.2f',
    'cmap': 'RdBu_r',
    'center': 0,
    'square': True,
    'linewidths': 0.5,
    'annot_kws': {'fontsize': 8},
}

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/eda_correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Top correlations with target: take the target's column of the
# correlation matrix, drop its self-correlation, and order the remaining
# features by absolute strength (strongest first).
target_corr = (
    corr_matrix[target_col]
    .drop(target_col)
    .sort_values(key=abs, ascending=False)
)
positions = range(len(target_corr))

# Red for positive correlations, blue for everything else
colors = []
for r in target_corr.values:
    colors.append('#e74c3c' if r > 0 else '#3498db')

fig, ax = plt.subplots(figsize=(10, 8))
bars = ax.barh(positions, target_corr.values, color=colors)
ax.set_yticks(positions)
ax.set_yticklabels(target_corr.index)
# Thin vertical reference line at zero correlation
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Correlation with Machine Failure')
ax.set_title('Feature Correlations with Target', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/eda_target_correlations.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Product Type Analysis

In [None]:
# Product types (skipped entirely when the dataset has no 'Type' column)
if 'Type' in df_raw.columns:
    type_colors = ['#3498db', '#2ecc71', '#e74c3c']

    # Distribution, ordered by frequency (value_counts default)
    type_counts = df_raw['Type'].value_counts()

    # Failure rate by type, in percent. groupby() orders the types
    # alphabetically while value_counts() orders them by frequency, so
    # re-align to the same order; otherwise the positional colour list
    # would give the same product type different colours (and positions)
    # in the two panels.
    failure_by_type = (
        df_raw.groupby('Type')[target_col].mean() * 100
    ).reindex(type_counts.index)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    axes[0].bar(type_counts.index, type_counts.values, color=type_colors)
    axes[0].set_title('Product Type Distribution', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('Count')

    axes[1].bar(failure_by_type.index, failure_by_type.values, color=type_colors)
    axes[1].set_title('Failure Rate by Product Type', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('Failure Rate (%)')

    plt.tight_layout()
    plt.savefig('../outputs/eda_product_types.png', dpi=150, bbox_inches='tight')
    plt.show()

    print('Failure Rate by Product Type:')
    print(failure_by_type)

## 7. Sensor Relationships

In [None]:
# Pairplot of key sensors, coloured by failure status
key_sensors = ['Air temperature [K]', 'Process temperature [K]', 'Torque [Nm]', 'Tool wear [min]']

g = sns.pairplot(df_raw[key_sensors + [target_col]],
                 hue=target_col,
                 palette={0: '#2ecc71', 1: '#e74c3c'},
                 diag_kind='kde',
                 plot_kws={'alpha': 0.5})

# Use the grid's `figure` accessor (the older `fig` alias is deprecated)
# and save through that figure explicitly, so the file written is this
# pairplot regardless of which figure matplotlib considers "current".
g.figure.suptitle('Sensor Relationships by Failure Status', fontsize=16, fontweight='bold', y=1.02)
g.figure.savefig('../outputs/eda_pairplot.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Engineered Features Analysis (if processed data available)

In [None]:
# Count engineered features per category (only when processed data was loaded)
if df_processed is not None:
    print(f'Processed data contains {len(df_processed.columns)} features:')

    def _derived_suffix(column):
        # Return the column name with every base sensor name removed, so
        # keyword matching only sees the engineered part of the name.
        for base in sensor_cols:
            column = column.replace(base, '')
        return column

    # Matching keywords against the full column name misclassifies columns:
    # 'roc' is a substring of 'Process temperature [K]', so every feature
    # derived from that sensor would be counted as ROC.
    # NOTE(review): assumes engineered columns embed the raw sensor name
    # verbatim plus a keyword such as 'lag'/'roll'/'ema'/'roc' - confirm
    # against the preprocessing step.
    suffix_of = {c: _derived_suffix(c) for c in df_processed.columns}

    # Feature categories
    feature_cats = {
        # Only the untouched sensor columns, not features derived from them
        'Original': [c for c in df_processed.columns if c in sensor_cols],
        'Lag': [c for c in df_processed.columns if 'lag' in suffix_of[c]],
        'Rolling': [c for c in df_processed.columns if 'roll' in suffix_of[c]],
        'EMA': [c for c in df_processed.columns if 'ema' in suffix_of[c]],
        'ROC': [c for c in df_processed.columns if 'roc' in suffix_of[c]],
        'Interaction': [c for c in df_processed.columns if c in ['Temp_diff', 'Power', 'Wear_rate']]
    }

    print('\nFeature Categories:')
    for cat, features in feature_cats.items():
        print(f'  {cat}: {len(features)} features')

## 9. Key Insights Summary

In [None]:
# Summary
# Headline figures and observations are computed from df_raw so the printed
# text cannot contradict the data that was actually analysed.
failure_rate = df_raw[target_col].mean()

# Majority-to-minority ratio; undefined when the data contains no failures.
if failure_rate > 0:
    imbalance_text = f'{(1 - failure_rate) / failure_rate:.1f}:1'
else:
    imbalance_text = 'n/a (no failures)'

# Failure mode with the highest total count.
top_failure = df_raw[failure_cols].sum().idxmax()

# Sensors ranked by absolute correlation with the target; keep the top two.
sensor_corr = (
    df_raw[sensor_cols + [target_col]].corr()[target_col]
    .drop(target_col)
    .sort_values(key=abs, ascending=False)
)
top_sensors = ' and '.join(sensor_corr.index[:2])

print('=' * 60)
print('EDA SUMMARY')
print('=' * 60)
print(f'\n📊 Dataset Size: {len(df_raw):,} records')
print(f'📈 Features: {len(df_raw.columns)} columns')
print(f'🎯 Failure Rate: {failure_rate*100:.2f}%')
print(f'⚖️ Class Imbalance: {imbalance_text}')

print('\n📌 Key Observations:')
print(f'  1. Failure class accounts for ~{failure_rate*100:.1f}% of records')
print(f'  2. {failure_names[top_failure]} ({top_failure}) is most common failure type')
print(f'  3. Sensors most correlated with failures: {top_sensors}')
# Product type is optional in the dataset (same guard as the product-type plot).
if 'Type' in df_raw.columns:
    worst_type = df_raw.groupby('Type')[target_col].mean().idxmax()
    print(f'  4. Product type {worst_type} has the highest failure rate')

print('\n🔧 Recommended Actions:')
print('  1. Use class weighting or SMOTE for imbalance')
print('  2. Engineer lag and rolling features for temporal patterns')
print('  3. Focus on Torque, Temperature, and Tool Wear features')
print('  4. Use time-series CV to prevent data leakage')

---

**End of EDA Notebook**  
*Outputs saved to `outputs/` directory*