# Cluster Performance Data Exploratory Analysis

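This notebook performs an exploratory data analysis (EDA) of the cluster performance dataset: data quality, feature and target identification, feature and target distributions, correlations, summary statistics, and a preview of the preprocessing pipeline.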

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_preprocessor import ClusterDataPreprocessor

# Set style
# Apply the matplotlib style sheet first and the seaborn palette second; keep
# this order, since both calls configure global plot styling.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires
# matplotlib >= 3.6 — confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Render figures inline in the notebook output (usually already the default in
# current Jupyter, so this line is harmless but likely redundant).
%matplotlib inline

## Load and Explore Data

In [None]:
# Load the raw dataset through the project preprocessor.
# NOTE(review): the path is relative to the notebook's working directory —
# confirm the notebook is run from its own folder.
preprocessor = ClusterDataPreprocessor()
df = preprocessor.load_data('../data/raw/cluster_data.csv')

# Report the overall size, then show the first rows as the cell's rich output.
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {len(df.columns)}",
    sep="\n",
)
df.head()

## Data Quality Analysis

In [None]:
# Missing values: per-column null count and its share of all rows (in percent).
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# Keep only columns that actually have gaps, worst offenders first.
missing_df = (
    pd.DataFrame({
        'Missing Count': missing_data,
        'Missing Percentage': missing_percent,
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

print(f"Columns with missing values: {len(missing_df)}")
missing_df.head(10)

## Feature and Target Identification

In [None]:
# Split the columns into features and targets using the preprocessor's rules.
feature_cols, target_cols = preprocessor.identify_columns(df)

print(f"Feature columns: {len(feature_cols)}")
print(f"Target columns: {len(target_cols)}")

# List every feature column, but only the first 20 target columns.
column_listings = (
    ("\nFeature columns:", feature_cols),
    ("\nTarget columns (first 20):", target_cols[:20]),
)
for heading, listed_cols in column_listings:
    print(heading)
    for col in listed_cols:
        print(f"  - {col}")

## Feature Analysis

In [None]:
# Analyze categorical features
# Draw one bar chart per object-dtype feature column (at most as many as the
# 2x3 grid holds), each showing that column's ten most frequent values.
categorical_features = df[feature_cols].select_dtypes(include=['object']).columns

print(f"Categorical features: {len(categorical_features)}")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# Only as many features as there are grid cells can be shown.
plotted_features = categorical_features[:len(axes)]

for ax, col in zip(axes, plotted_features):
    value_counts = df[col].value_counts().head(10)
    positions = range(len(value_counts))
    ax.bar(positions, value_counts.values)
    ax.set_title(f'{col} Distribution')
    ax.set_ylabel('Count')
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation=45, ha='right')

# Hide grid cells that received no plot so the figure does not show blank,
# framed axes when there are fewer categorical features than cells.
for unused_ax in axes[len(plotted_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Target Metrics Analysis

In [None]:
# Analyze target distributions
# Histogram the first 12 numeric target columns, one per cell of a 3x4 grid.
numerical_targets = df[target_cols].select_dtypes(include=[np.number]).columns[:12]

fig, axes = plt.subplots(3, 4, figsize=(20, 15))
axes = axes.ravel()

for ax, col in zip(axes, numerical_targets):
    df[col].hist(bins=30, ax=ax, alpha=0.7)
    ax.set_title(col)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Hide grid cells that received no histogram so the figure does not show
# blank, framed axes when fewer than 12 numeric targets exist.
for unused_ax in axes[len(numerical_targets):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Correlation heatmap over the numeric feature columns plus the first 10
# numeric target columns.
numerical_features = df[feature_cols].select_dtypes(include=[np.number]).columns
sample_targets = df[target_cols].select_dtypes(include=[np.number]).columns[:10]

if len(numerical_features) == 0 or len(sample_targets) == 0:
    # Nothing to correlate unless both groups have at least one numeric column.
    print("No numerical features or targets found for correlation analysis")
else:
    # The matrix is square over the combined columns, so it also contains the
    # feature-feature and target-target correlations.
    corr_data = df[[*numerical_features, *sample_targets]]
    correlation_matrix = corr_data.corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix (Features vs Sample Targets)')
    plt.show()

## Summary Statistics

In [None]:
# Descriptive statistics for every numeric column in the dataset.
numerical_cols = df.select_dtypes(include=[np.number]).columns
summary_stats = df.loc[:, numerical_cols].describe()

print("Summary statistics for numerical columns:")
# Rounded to four decimals for display only; summary_stats keeps full precision.
summary_stats.round(decimals=4)

## Data Preprocessing Preview

In [None]:
# Dry run of the preprocessing pipeline: clean, encode, then split into X / y.
print("Preprocessing pipeline preview...")

# Step 1: clean the raw frame and report how many nulls remain overall.
df_clean = preprocessor.clean_data(df)
remaining_missing = df_clean.isnull().sum().sum()
print(f"After cleaning - Missing values: {remaining_missing}")

# Step 2: encode categoricals (fit=True is passed to the encoder here) and
# count the object-dtype columns still left afterwards.
df_encoded = preprocessor.encode_categorical_features(df_clean, fit=True)
object_cols_left = df_encoded.select_dtypes(include=['object']).columns
print(f"After encoding - Categorical columns: {len(object_cols_left)}")

# Step 3: separate features from targets.
# NOTE(review): assumes cleaning/encoding keep the column names held in
# feature_cols and target_cols — confirm against the preprocessor.
X = df_encoded[feature_cols].copy()
y = df_encoded[target_cols].copy()

print(f"\nFeatures shape: {X.shape}")
print(f"Targets shape: {y.shape}")

print("\nPreprocessing completed successfully!")