# 02 – Feature Extraction & Selection

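Takes `train_clean.csv`, removes highly correlated columns (absolute correlation > 0.95), keeps up to the 100 best features by **Mutual Information**, ranks those by **tree-based importance** (random forest), and finally keeps the features whose cumulative importance stays within 95 % (or the top 20 when fewer than 20 qualify). The names of the selected features are saved as `selected_features.pkl`.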

**Input**  
`train_clean.csv`

**Outputs**  
- `X_selected.csv` (selected numeric features)
- `selected_features.pkl`
- `y.csv`

In [None]:
!pip install -q pandas numpy scikit-learn

In [None]:
import pandas as pd, numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
import pickle, os
from datetime import datetime
# Record when this run started, using the local time returned by datetime.now().
print(f"Start: {datetime.now():%Y-%m-%d %H:%M:%S}")

## 1. Load clean data

In [None]:
# Read the cleaned training table, then split it into the target column
# (`Toxicity_Class`) and the remaining columns, which become the candidate
# feature matrix.
clean_path = '/content/drive/MyDrive/QuantumBoost2025/train_clean.csv'
df = pd.read_csv(clean_path)
y = df['Toxicity_Class']
X = df.drop(columns=['Toxicity_Class'])
print(f"Loaded {X.shape[1]} features")

## 2. Correlation filter

In [None]:
# Absolute pairwise correlation between all feature columns.
corr = X.corr().abs()
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is inspected exactly once; everything else becomes NaN.
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
# A column is dropped when it correlates above 0.95 with any column that
# precedes it in the matrix.
to_drop = upper.columns[(upper > 0.95).any(axis=0).to_numpy()].tolist()
X = X.drop(columns=to_drop)
print(f"After correlation filter: {X.shape[1]} features")

## 3. Mutual-Information selection

In [None]:
# Keep the (at most) 100 features that share the most mutual information
# with the target.
from functools import partial

# Never ask for more features than are available after the correlation filter.
k = min(100, X.shape[1])
# mutual_info_classif adds small random noise to continuous features, so its
# scores (and therefore the selected columns) can differ between runs unless
# random_state is fixed. Use the same seed as the random-forest step (42) so
# the whole selection pipeline is reproducible.
mi_score = partial(mutual_info_classif, random_state=42)
mi = SelectKBest(mi_score, k=k).fit(X, y)
# get_support() is a boolean mask over the current columns of X.
X = X[X.columns[mi.get_support()]]
print(f"After MI: {X.shape[1]} features")

## 4. Tree-based importance (cumulative 95 % or top-20)

In [None]:
# Fit a seeded random forest on the MI-filtered features and rank the features
# by the forest's impurity-based importances, highest first.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1).fit(X, y)
imp = (
    pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
)
# Running total of importance in ranked order.
imp['cumulative'] = imp['importance'].cumsum()

# Keep the features whose running total stays at or below 0.95 ...
top = imp.loc[imp['cumulative'] <= 0.95, 'feature'].tolist()
# ... but fall back to the 20 highest-ranked features when that rule yields
# fewer than 20 names.
if len(top) < 20:
    top = imp['feature'].head(20).tolist()

X_selected = X[top]
print(f"Final selected features: {len(top)}")

## 5. Save

In [None]:
# Persist the selected feature matrix, the target column and the list of
# selected feature names to the project folder.
out_dir = '/content/drive/MyDrive/QuantumBoost2025/'
features_csv = os.path.join(out_dir, 'X_selected.csv')
target_csv = os.path.join(out_dir, 'y.csv')
names_pkl = os.path.join(out_dir, 'selected_features.pkl')

# index=False keeps the row index out of both CSV files.
X_selected.to_csv(features_csv, index=False)
y.to_csv(target_csv, index=False)
# The pickle stores `top`, the plain Python list of selected column names.
with open(names_pkl, 'wb') as f:
    pickle.dump(top, f)
print("Feature-extraction artefacts saved.")