# Manufacturing Equipment Output Prediction - EDA

This notebook explores the manufacturing dataset to understand:
- Data structure and summary statistics
- Missing-value patterns
- Feature correlations
- Distributions of key features
- Target variable analysis

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot appearance: matplotlib's default style, seaborn's "husl" palette,
# and a shared figure size / font size applied through a single rcParams update.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

In [None]:
# Load dataset
# Jupyter kernels do not define `__file__`, so referencing it directly raises
# NameError on a fresh run. Use it when available (e.g. the notebook exported
# as a script) and otherwise start from the working directory, which is
# usually the notebook's own folder.
try:
    start_dir = Path(__file__).resolve().parent
except NameError:
    start_dir = Path.cwd().resolve()

data_rel_path = Path('data') / 'manufacturing_dataset_1000_samples.csv'

# Walk upward from the starting folder so the cell works whether the notebook
# sits in the project root or in a sub-folder of it.
base_dir = next(
    (candidate for candidate in (start_dir, *start_dir.parents)
     if (candidate / data_rel_path).is_file()),
    None,
)
if base_dir is None:
    raise FileNotFoundError(
        f"Could not find {data_rel_path} in {start_dir} or any of its parent directories"
    )

df = pd.read_csv(base_dir / data_rel_path)

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head()

In [None]:
# Dataset overview: dimensions, in-memory footprint, and how many columns
# there are of each dtype, followed by a preview of the first rows.
print("=== DATASET INFO ===")
print(f"Shape: {df.shape}")

# deep=True also counts the payload of object (string) columns
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

dtype_counts = df.dtypes.value_counts()
print(f"\nData types:")
print(dtype_counts)

print("\n=== FIRST FEW ROWS ===")
df.head()

In [None]:
# Descriptive statistics for numeric columns, then value frequencies for each
# object/category column.
print("=== NUMERICAL FEATURES SUMMARY ===")
numerical_cols = df.select_dtypes(include=[np.number]).columns
numeric_summary = df[numerical_cols].describe()
print(numeric_summary)

print("\n=== CATEGORICAL FEATURES SUMMARY ===")
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    level_counts = df[col].value_counts()
    print(f"\n{col}:")
    print(level_counts)

In [None]:
# Missing values analysis
# Counts nulls per column, expresses them as a percentage of all rows, and
# charts only the columns that actually contain missing values.
print("=== MISSING VALUES ANALYSIS ===")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing_Count': missing_data,
    'Missing_Percent': missing_percent
}).sort_values('Missing_Count', ascending=False)

features_with_missing = missing_df[missing_df['Missing_Count'] > 0]

if features_with_missing.empty:
    # Bar-plotting an empty selection either errors or draws a blank chart,
    # so report the clean result explicitly and skip the figure.
    print("No missing values found in the dataset.")
else:
    print(features_with_missing)

    # Visualize missing values
    fig, ax = plt.subplots(figsize=(12, 6))
    features_with_missing['Missing_Percent'].plot(kind='bar', ax=ax)
    ax.set_title('Missing Values Percentage by Feature')
    ax.set_xlabel('Features')
    ax.set_ylabel('Missing Percentage (%)')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Correlation heatmap for numerical features
# Computes pairwise correlations between all numeric columns, draws the lower
# triangle as an annotated heatmap, and ranks features by the strength of
# their correlation with the target.
print("=== CORRELATION ANALYSIS ===")
numerical_df = df.select_dtypes(include=[np.number])

# Calculate correlation matrix
correlation_matrix = numerical_df.corr()

# Create heatmap
plt.figure(figsize=(14, 10))
# The matrix is symmetric, so mask the upper triangle (diagonal included)
# to avoid showing every pair twice.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix,
            mask=mask,
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            fmt='.2f',
            cbar_kws={"shrink": .8})
plt.title('Correlation Heatmap of Numerical Features', fontsize=16, pad=20)
plt.tight_layout()
plt.show()

# Show top correlations with target variable
if 'Parts_Per_Hour' in correlation_matrix.columns:
    # Drop the target's correlation with itself (always 1.0); otherwise it
    # takes the first slot of the "top features" list.
    target_correlations = (
        correlation_matrix['Parts_Per_Hour']
        .drop(labels='Parts_Per_Hour')
        .abs()
        .sort_values(ascending=False)
    )
    print("\nTop 10 features correlated with Parts_Per_Hour:")
    print(target_correlations.head(10))

In [None]:
# Histograms (with KDE overlay) for the key numeric features, laid out on a
# three-column grid. Features absent from the dataset are skipped.
print("=== DISTRIBUTION OF KEY FEATURES ===")

# Candidate features to plot (the target is handled separately)
key_features = [
    'Injection_Temperature', 'Injection_Pressure', 'Cycle_Time',
    'Cooling_Time', 'Material_Viscosity', 'Ambient_Temperature',
    'Machine_Age', 'Operator_Experience', 'Maintenance_Hours',
    'Temperature_Pressure_Ratio', 'Total_Cycle_Time', 'Efficiency_Score',
    'Machine_Utilization'
]

# Keep only the candidates that are actual columns
existing_features = [name for name in key_features if name in df.columns]

if not existing_features:
    print("No expected key features found in the dataset.")
else:
    n_features = len(existing_features)
    n_cols = 3
    n_rows = (n_features + n_cols - 1) // n_cols  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
    # subplots returns a 1-D array for a single row and a 2-D array otherwise;
    # ravel gives a flat sequence of axes in both cases.
    axes = np.ravel(axes)

    # One histogram per feature; zip stops at the shorter sequence
    for ax, feature in zip(axes, existing_features):
        sns.histplot(data=df, x=feature, kde=True, ax=ax, bins=30)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')

    # Hide the grid cells that received no feature
    for unused_ax in axes[n_features:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Target variable analysis: summary statistics, histogram + box plot, and an
# IQR-based outlier count for Parts_Per_Hour.
print("=== TARGET VARIABLE ANALYSIS ===")

if 'Parts_Per_Hour' not in df.columns:
    print("Target column 'Parts_Per_Hour' not found in dataset.")
else:
    target = df['Parts_Per_Hour']

    # Summary statistics, each computed right before it is printed
    print(f"Target variable: Parts_Per_Hour")
    for label, method_name in (
        ("Mean", "mean"),
        ("Median", "median"),
        ("Std", "std"),
        ("Min", "min"),
        ("Max", "max"),
    ):
        print(f"{label}: {getattr(target, method_name)():.2f}")
    print(f"Skewness: {target.skew():.3f}")
    print(f"Kurtosis: {target.kurtosis():.3f}")

    # Side-by-side histogram (left) and box plot (right)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    sns.histplot(data=df, x='Parts_Per_Hour', kde=True, ax=ax1, bins=30)
    ax1.set(title='Distribution of Parts_Per_Hour',
            xlabel='Parts Per Hour',
            ylabel='Frequency')

    sns.boxplot(data=df, y='Parts_Per_Hour', ax=ax2)
    ax2.set(title='Box Plot of Parts_Per_Hour', ylabel='Parts Per Hour')

    plt.tight_layout()
    plt.show()

    # Outlier fences at 1.5 * IQR beyond the first and third quartiles
    Q1, Q3 = target.quantile(0.25), target.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    below_fence = target < lower_bound
    above_fence = target > upper_bound
    outliers = target[below_fence | above_fence]

    outlier_pct = len(outliers)/len(target)*100
    print(f"\nOutliers (IQR method): {len(outliers)} ({outlier_pct:.1f}%)")
    print(f"Lower bound: {lower_bound:.2f}")
    print(f"Upper bound: {upper_bound:.2f}")

In [None]:
# Categorical features analysis
# Draws a count plot for each expected categorical column that exists in the
# dataset and, when the target is available, a box plot of Parts_Per_Hour
# split by each of those columns. Both figures use a fixed 2x2 grid, which
# fits the four candidate features listed below.
print("=== CATEGORICAL FEATURES ANALYSIS ===")

categorical_features = ['Shift', 'Machine_Type', 'Material_Grade', 'Day_of_Week']
existing_cat_features = [f for f in categorical_features if f in df.columns]

if existing_cat_features:
    n_features = len(existing_cat_features)
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    for i, feature in enumerate(existing_cat_features):
        if i < len(axes):
            # Count plot
            sns.countplot(data=df, x=feature, ax=axes[i])
            axes[i].set_title(f'Distribution of {feature}')
            axes[i].set_xlabel(feature)
            axes[i].set_ylabel('Count')
            axes[i].tick_params(axis='x', rotation=45)

    # Hide empty subplots
    for i in range(n_features, len(axes)):
        axes[i].set_visible(False)

    plt.tight_layout()
    plt.show()

    # Target variable by categorical features
    if 'Parts_Per_Hour' in df.columns:
        print("\n=== TARGET VARIABLE BY CATEGORICAL FEATURES ===")
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes = axes.flatten()

        for i, feature in enumerate(existing_cat_features):
            if i < len(axes):
                # Box plot
                sns.boxplot(data=df, x=feature, y='Parts_Per_Hour', ax=axes[i])
                # This line was over-indented, which made the whole cell fail
                # with an IndentationError.
                axes[i].set_title(f'Parts_Per_Hour by {feature}')
                axes[i].set_xlabel(feature)
                axes[i].set_ylabel('Parts Per Hour')
                axes[i].tick_params(axis='x', rotation=45)

        # Hide empty subplots
        for i in range(n_features, len(axes)):
            axes[i].set_visible(False)

        plt.tight_layout()
        plt.show()
else:
    print("No categorical features found in the dataset.")

In [None]:
# Summary of findings
# Recaps dataset size, feature-type counts, total missing values and the
# target's range. Relies on `numerical_cols` and `categorical_cols` computed
# in the summary-statistics cell earlier in the notebook.
total_missing = df.isnull().sum().sum()

print("=== EDA SUMMARY ===")
print(f"Dataset contains {df.shape[0]} samples with {df.shape[1]} features")
print(f"Target variable: Parts_Per_Hour")
print(f"Numerical features: {len(numerical_cols)}")
print(f"Categorical features: {len(categorical_cols)}")
print(f"Missing values: {total_missing} total")

if 'Parts_Per_Hour' in df.columns:
    print(f"Target range: {df['Parts_Per_Hour'].min():.1f} - {df['Parts_Per_Hour'].max():.1f}")
    print(f"Target mean: {df['Parts_Per_Hour'].mean():.1f}")

print("\nKey insights:")
print("- Dataset appears to be manufacturing equipment data with sensor readings")
print("- Contains both continuous (temperature, pressure, time) and categorical (shift, machine type) features")
print("- Target variable shows variation suitable for regression modeling")
# Only claim that missing values exist when the data actually contains some,
# so this line cannot contradict the count printed above.
if total_missing > 0:
    print("- Some features have missing values that need to be handled during preprocessing")
else:
    print("- No missing values were found, so no imputation is needed during preprocessing")