# Feature Visualization and Analysis
This notebook visualizes extracted features and their importance for classification.

In [None]:
import sys
import os
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Global plot styling: select the Matplotlib style sheet and the seaborn
# 'husl' color palette before any figures are created.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

## Load Extracted Features

In [None]:
# Read the extracted training feature table from disk.
features_path = '../data/features/train_features.csv'
features_df = pd.read_csv(features_path)

# Summarise what was loaded: table dimensions and a preview of the first
# ten column names (the reported feature count excludes the 'LABEL' column).
n_columns = len(features_df.columns)
print(f"Features shape: {features_df.shape}")
print(f"\nFeature columns ({n_columns-1} features):")
print(list(features_df.columns)[:10])

# Every column except 'LABEL' is a model input; 'LABEL' is the target.
is_feature = features_df.columns != 'LABEL'
feature_cols = features_df.columns[is_feature].tolist()
X = features_df.loc[:, feature_cols].values
y = features_df.loc[:, 'LABEL'].values

# Show how many rows fall into each class.
class_counts = pd.Series(y).value_counts()
print(f"\nLabel distribution:")
print(class_counts)

## Feature Importance Analysis

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the full feature table purely to rank the features;
# random_state is fixed so the ranking is the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Pair each feature name with its importance score, highest score first.
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot the highest-ranked features (at most 20).
top_features = importance.head(20)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['importance'].values)
plt.yticks(bar_positions, top_features['feature'].values)
# barh places position 0 at the bottom of the axes; flip the y-axis so the
# most important feature is listed first, at the top of the chart.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
# Report the number of bars actually drawn, which is below 20 when the table
# has fewer than 20 features.
plt.title(f'Top {len(top_features)} Most Important Features')
plt.tight_layout()
plt.show()

print("Top 10 most important features:")
print(importance.head(10))

## Feature Correlation Analysis

In [None]:
# Restrict the correlation analysis to the 15 highest-ranked features.
top_feature_names = importance['feature'].head(15).values
top_features_data = features_df[top_feature_names]

# Pairwise correlation between the selected feature columns.
corr_matrix = top_features_data.corr()

# Draw the matrix as an annotated heatmap with the color scale centered on 0.
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix (Top 15 Features)')
plt.tight_layout()
plt.show()

## PCA Visualization

In [None]:
# Scale the feature matrix with StandardScaler before projecting it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the scaled features onto the first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_

# Scatter plot of the projection, one color per class label
# (label 1 -> 'No Planet' in blue, label 2 -> 'Planet' in red).
colors = ['blue', 'red']
labels = ['No Planet', 'Planet']

plt.figure(figsize=(10, 8))
for label_val, color, name in zip([1, 2], colors, labels):
    mask = y == label_val
    pc1 = X_pca[mask, 0]
    pc2 = X_pca[mask, 1]
    plt.scatter(pc1, pc2, c=color, label=name, alpha=0.6, s=30)

plt.xlabel(f'PC1 ({explained[0]:.2%} variance)')
plt.ylabel(f'PC2 ({explained[1]:.2%} variance)')
plt.title('PCA Visualization of Features')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Combined share of variance captured by the two plotted components.
print(f"Total variance explained: {sum(explained):.2%}")

## t-SNE Visualization

In [None]:
# Apply t-SNE (on a subset of at most 1000 rows for speed)
sample_size = min(1000, len(X))

# Draw the subset from a seeded generator so the same rows are selected on
# every run; together with TSNE's fixed random_state this makes the plot
# reproducible.
rng = np.random.default_rng(42)
indices = rng.choice(len(X), sample_size, replace=False)
X_sample = X_scaled[indices]
y_sample = y[indices]

# scikit-learn requires perplexity to be smaller than the number of samples,
# so cap the usual value of 30 when the dataset is very small.
perplexity = min(30, sample_size - 1)
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
X_tsne = tsne.fit_transform(X_sample)

# Plot the embedding, one color per class label. `colors` and `labels` are
# the per-class styling lists defined in the PCA cell.
plt.figure(figsize=(10, 8))

for i, label_val in enumerate([1, 2]):
    mask = y_sample == label_val
    plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                c=colors[i], label=labels[i], alpha=0.6, s=30)

plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.title('t-SNE Visualization of Features')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Feature Distribution by Class

In [None]:
# Plot per-class distributions of the top-ranked features on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

top_9_features = importance.head(9)['feature'].values

# Split the table by class once instead of re-filtering for every feature.
planet_rows = features_df[features_df['LABEL'] == 2]
no_planet_rows = features_df[features_df['LABEL'] == 1]

for ax, feature in zip(axes, top_9_features):
    # Same class colors as the PCA and t-SNE scatter plots
    # (Planet = red, No Planet = blue) so the figures read consistently.
    ax.hist(planet_rows[feature], bins=30, alpha=0.5,
            label='Planet', color='red', density=True)
    ax.hist(no_planet_rows[feature], bins=30, alpha=0.5,
            label='No Planet', color='blue', density=True)
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide any subplot left over when fewer than nine features are available.
for unused_ax in axes[len(top_9_features):]:
    unused_ax.set_visible(False)

plt.suptitle('Feature Distributions by Class', fontsize=16)
plt.tight_layout()
plt.show()

## Transit-Specific Features Analysis

In [None]:
# Analyze transit-specific features: every feature column whose name
# contains 'transit' (case-insensitive).
transit_features = [col for col in feature_cols if 'transit' in col.lower()]
print(f"Transit-specific features: {transit_features}")

if transit_features:
    # Mean of each transit feature per class: one row per class, one column
    # per feature.
    no_planet_rows = features_df[features_df['LABEL'] == 1]
    planet_rows = features_df[features_df['LABEL'] == 2]
    transit_comparison = pd.DataFrame(
        [
            no_planet_rows[transit_features].mean(),
            planet_rows[transit_features].mean(),
        ],
        index=['No Planet', 'Planet'],
    )

    # Plot comparison. DataFrame.plot creates its own figure, so the size is
    # passed to it directly; calling plt.figure() first would only leave an
    # extra empty figure behind.
    ax = transit_comparison.T.plot(kind='bar', figsize=(12, 6))
    ax.set_xlabel('Transit Features')
    ax.set_ylabel('Mean Value')
    ax.set_title('Transit Feature Comparison by Class')
    plt.xticks(rotation=45, ha='right')
    ax.legend(title='Class')
    plt.tight_layout()
    plt.show()

    print("\nTransit feature statistics:")
    print(transit_comparison)