# Phase 2-1: Feature Importance Analysis

This notebook analyzes the LGBM importance results from `compute_importance.py` and identifies
deletion candidates for Phase 2-2 permutation testing.

## Objectives
1. Visualize importance distributions
2. Identify low-importance features (target range: bottom 20-30%; this notebook uses the 25th percentile of `mean_gain`)
3. Extract stable low-importance features as candidates
4. Generate `phase2_importance_candidates.json`

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime, timezone

# Plot defaults shared by every figure in this notebook:
# seaborn's "whitegrid" theme and a 12x6 inch default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Importance Data

In [None]:
# Load the per-feature importance summary.
# Fail fast when the file is missing: every later cell reads `df`, so merely
# printing a message here would only defer the failure to an unrelated
# NameError further down the notebook.
summary_path = Path("../../results/feature_selection/tier1_importance_summary.csv")

if not summary_path.exists():
    raise FileNotFoundError(
        f"{summary_path} not found. Run compute_importance.py first."
    )

df = pd.read_csv(summary_path)

# Columns that later cells in this notebook index by name.
required_columns = [
    'feature_name', 'mean_gain', 'std_gain', 'mean_split', 'mean_gain_normalized'
]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{summary_path} is missing required columns: {missing_columns}"
    )

print(f"Loaded importance for {len(df)} features")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
display(df.head())

## 2. Data Summary

In [None]:
# Descriptive statistics for the importance columns
stat_columns = ['mean_gain', 'std_gain', 'mean_split', 'mean_gain_normalized']
print("Summary Statistics:")
print(df[stat_columns].describe())

# Rows whose mean_gain is exactly zero
zero_importance = df.loc[df['mean_gain'].eq(0)]
print(f"\nFeatures with zero mean_gain: {len(zero_importance)}")

## 3. Importance Distribution Visualizations

In [None]:
# Histograms of mean_gain: linear scale (all features) next to a log-scaled
# view restricted to features with strictly positive mean_gain.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# 25th percentile over ALL features (zeros included); marked on both panels.
p25_mean_gain = df['mean_gain'].quantile(0.25)

# Linear scale
axes[0].hist(df['mean_gain'], bins=50, edgecolor='black')
axes[0].set_xlabel('Mean Gain Importance')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Mean Gain Importance (Linear Scale)')
axes[0].axvline(p25_mean_gain, color='red', linestyle='--', label='25th percentile')
axes[0].legend()

# Log scale (for features with non-zero importance)
non_zero = df[df['mean_gain'] > 0]['mean_gain']

# Equal-width linear bins look badly distorted once the x-axis is switched to
# log, so use geometrically spaced edges (50 bins -> 51 edges). Fall back to a
# plain bin count when the data cannot span a geometric range.
if len(non_zero) > 1 and non_zero.min() < non_zero.max():
    log_bins = np.geomspace(non_zero.min(), non_zero.max(), 51)
else:
    log_bins = 50

axes[1].hist(non_zero, bins=log_bins, edgecolor='black')
axes[1].set_xlabel('Mean Gain Importance')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Distribution of Mean Gain Importance (Log Scale, non-zero only)')
axes[1].set_xscale('log')

# A non-positive value has no position on a log axis, so only draw the
# percentile marker (and its legend) when it can actually be shown.
if p25_mean_gain > 0:
    axes[1].axvline(p25_mean_gain, color='red', linestyle='--', label='25th percentile')
    axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# Scatter of per-feature mean_gain against std_gain on log-log axes,
# with the two selection thresholds overlaid as reference lines.
mean_threshold = df['mean_gain'].quantile(0.25)
std_threshold = df['std_gain'].median()

scatter_fig, scatter_ax = plt.subplots(figsize=(12, 8))
scatter_ax.scatter(df['mean_gain'], df['std_gain'], alpha=0.5, s=20)
scatter_ax.set_xlabel('Mean Gain Importance')
scatter_ax.set_ylabel('Std Gain Importance')
scatter_ax.set_title('Mean Gain vs Std Gain')
scatter_ax.set_xscale('log')
scatter_ax.set_yscale('log')

# Threshold lines: vertical = 25th percentile of mean_gain, horizontal = median std_gain
scatter_ax.axvline(
    mean_threshold,
    color='red',
    linestyle='--',
    label=f'25th percentile mean_gain = {mean_threshold:.6f}',
)
scatter_ax.axhline(
    std_threshold,
    color='blue',
    linestyle='--',
    label=f'Median std_gain = {std_threshold:.6f}',
)

scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)
plt.show()

## 4. Top and Bottom Features

In [None]:
# Top 20 features by mean_gain (fewer if the summary has fewer than 20 rows)
print("Top 20 features by mean_gain:")
top_20 = df.nlargest(20, 'mean_gain')[['feature_name', 'mean_gain', 'std_gain', 'mean_gain_normalized']]
display(top_20)

# Size the bar positions from the rows actually returned: a hard-coded
# range(20) makes barh() fail with a shape mismatch on smaller feature sets.
n_top = len(top_20)
bar_positions = range(n_top)

# Horizontal bar plot; values are reversed so the largest bar ends up on top.
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_20['mean_gain'].values[::-1])
plt.yticks(bar_positions, top_20['feature_name'].values[::-1])
plt.xlabel('Mean Gain Importance')
plt.title(f'Top {n_top} Features by Mean Gain Importance')
plt.tight_layout()
plt.show()

In [None]:
# Bottom 20 features by mean_gain (fewer if the summary has fewer than 20 rows)
print("Bottom 20 features by mean_gain:")
bottom_20 = df.nsmallest(20, 'mean_gain')[['feature_name', 'mean_gain', 'std_gain', 'mean_gain_normalized']]
display(bottom_20)

# Size the bar positions from the rows actually returned: a hard-coded
# range(20) makes barh() fail with a shape mismatch on smaller feature sets.
n_bottom = len(bottom_20)
bar_positions = range(n_bottom)

# Horizontal bar plot in ascending mean_gain order.
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, bottom_20['mean_gain'].values)
plt.yticks(bar_positions, bottom_20['feature_name'].values)
plt.xlabel('Mean Gain Importance')
plt.title(f'Bottom {n_bottom} Features by Mean Gain Importance')
plt.tight_layout()
plt.show()

## 5. Extract Deletion Candidates

Apply selection criteria:
- `mean_gain < quantile(0.25)` (bottom 25%)
- `std_gain < median(std_gain)` (stable, low variance across folds)

This identifies features whose importance is consistently low across folds: a small mean gain combined with a small fold-to-fold spread.

In [None]:
# Thresholds: 25th percentile of mean_gain and the median of std_gain
mean_gain_threshold = df['mean_gain'].quantile(0.25)
std_gain_threshold = df['std_gain'].median()

print(f"Selection criteria:")
print(f"  mean_gain < {mean_gain_threshold:.8f} (25th percentile)")
print(f"  std_gain < {std_gain_threshold:.8f} (median)")

# A feature is a candidate only when BOTH conditions hold.
# NOTE(review): both comparisons are strict, so if the 25th percentile of
# mean_gain is 0 no feature can satisfy the first one — confirm this is intended.
is_low_gain = df['mean_gain'].lt(mean_gain_threshold)
is_low_spread = df['std_gain'].lt(std_gain_threshold)

# Copy so that later column additions do not touch `df`.
candidates_df = df.loc[is_low_gain & is_low_spread].copy()

n_candidates = len(candidates_df)
print(f"\nCandidates identified: {n_candidates} features")
print(f"Ratio: {n_candidates / len(df):.1%} of total features")

# Show the candidates, smallest mean_gain first
print("\nCandidate features (sorted by mean_gain):")
candidates_sorted = candidates_df.sort_values('mean_gain')
display(candidates_sorted[['feature_name', 'mean_gain', 'std_gain', 'mean_gain_normalized']])

## 6. Candidate Feature Name Patterns

Analyze which feature groups are represented in the candidates.

In [None]:
def categorize_feature(name):
    """Map a feature name to a category label based on its naming pattern.

    Rules are checked in order:
      1. names starting with ``co_miss``  -> ``'SU5 (co_miss)'``
      2. names containing ``/``           -> ``'SU1 (<text before the first slash>)'``
      3. anything else                    -> ``'Raw'``
    """
    if name.startswith('co_miss'):
        return 'SU5 (co_miss)'

    if '/' not in name:
        return 'Raw'

    # Text preceding the first '/' becomes the group label.
    group, _, _ = name.partition('/')
    return f'SU1 ({group})'

# Label every candidate with its name-pattern category
candidates_df['category'] = candidates_df['feature_name'].apply(categorize_feature)

# Number of candidates per category, most frequent first
category_counts = candidates_df['category'].value_counts()
print("Candidates by feature category:")
print(category_counts)

# Bar chart of the per-category counts
category_fig, category_ax = plt.subplots(figsize=(10, 6))
category_counts.plot(kind='bar', ax=category_ax)
category_ax.set_xlabel('Feature Category')
category_ax.set_ylabel('Count')
category_ax.set_title('Deletion Candidates by Feature Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Generate Candidates JSON

Output: `results/feature_selection/phase2_importance_candidates.json`

In [None]:
# One JSON record per candidate feature, in candidates_df row order.
# float() converts the values to built-in floats before serialization.
candidate_fields = candidates_df[['feature_name', 'mean_gain', 'std_gain', 'mean_gain_normalized']]
candidates_list = [
    {
        "feature_name": feature,
        "mean_gain": float(gain),
        "std_gain": float(spread),
        "share_of_total": float(share),
        "note": "Low and stable importance across folds"
    }
    for feature, gain, spread, share in candidate_fields.itertuples(index=False, name=None)
]

total_features = len(df)
candidate_count = len(candidates_list)

# Assemble the output document: metadata, the criteria applied, the candidates, and a summary
selection_criteria = {
    "method": "lgbm_importance",
    "metric": "gain",
    "threshold_quantile": 0.25,
    "threshold_mean_gain": float(mean_gain_threshold),
    "threshold_std_gain": float(std_gain_threshold),
    "require_stable_low": True
}
output_json = {
    "version": "phase2-v1",
    "created_at": datetime.now(timezone.utc).isoformat(),
    "source_tier": "tier1",
    "selection_criteria": selection_criteria,
    "candidates": candidates_list,
    "summary": {
        "total_features": int(total_features),
        "candidate_count": int(candidate_count),
        "candidate_ratio": float(candidate_count / total_features)
    }
}

# Write the document, creating the results directory if needed
output_path = Path("../../results/feature_selection/phase2_importance_candidates.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
    json.dumps(output_json, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

# Echo what was written
summary = output_json['summary']
print(f"Saved candidates to: {output_path}")
print(f"\nSummary:")
print(f"  Total features: {summary['total_features']}")
print(f"  Candidates: {summary['candidate_count']}")
print(f"  Ratio: {summary['candidate_ratio']:.1%}")

## 8. Next Steps

1. Review the candidates list and ensure no critical features are included
2. Run `permutation_importance.py` with these candidates
3. Analyze permutation results to confirm which features can be safely deleted