In [5]:
# src/preprocessing/feature_engineer.py
import os
import pandas as pd
import numpy as np
import joblib
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ======================
# File paths
# ======================
MERGED_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\merged_catalog.parquet"
FEATURES_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\features.parquet"
OUTPUT_DIR = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed"
PIPELINE_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\src\preprocessing\feature_pipeline.pkl"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ======================
# Load data
# ======================
catalog = pd.read_parquet(MERGED_FILE)
features = pd.read_parquet(FEATURES_FILE)

print("Catalog shape:", catalog.shape)
print("Features shape:", features.shape)

# ======================
# Identify columns
# ======================
numeric_cols = features.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = features.select_dtypes(exclude=["number"]).columns.tolist()

print(f"Numeric cols: {len(numeric_cols)}")
print(f"Categorical cols: {len(categorical_cols)}")

# ======================
# Prepare X and y
# ======================
X = features.copy()
y = catalog["disposition"]

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ======================
# Preprocessing pipeline with SimpleImputer
# ======================
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),   # fill numeric NaNs
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # fill categorical NaNs
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
X_processed = pipeline.fit_transform(X)

# ======================
# Verify no NaNs remain
# ======================
from scipy import sparse
if sparse.issparse(X_processed):
    assert X_processed.data.size == 0 or not np.isnan(X_processed.data).any(), "X_processed contains NaNs!"
else:
    assert not np.isnan(X_processed).any(), "X_processed contains NaNs!"

print("Final X shape:", X_processed.shape)
print("Encoded y shape:", y_encoded.shape)

# ======================
# Train/validation split
# ======================
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Train size:", X_train.shape, "Val size:", X_val.shape)

# ======================
# Save outputs
# ======================
# Full dataset
X_file = os.path.join(OUTPUT_DIR, "X_processed.npz")
y_file = os.path.join(OUTPUT_DIR, "y_encoded.npy")
scipy.sparse.save_npz(X_file, X_processed)
np.save(y_file, y_encoded)

# Train/Validation splits
scipy.sparse.save_npz(os.path.join(OUTPUT_DIR, "X_train.npz"), X_train)
scipy.sparse.save_npz(os.path.join(OUTPUT_DIR, "X_val.npz"), X_val)
np.save(os.path.join(OUTPUT_DIR, "y_train.npy"), y_train)
np.save(os.path.join(OUTPUT_DIR, "y_val.npy"), y_val)

# Save pipeline for later inference
joblib.dump(pipeline, PIPELINE_FILE)

print("✅ Full dataset and train/val splits saved for modeling")
print(f"Pipeline saved to: {PIPELINE_FILE}")

# ======================
# Baseline ML models
# ======================
RANDOM_STATE = 42

baseline_models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        n_estimators=200, learning_rate=0.1, max_depth=6,
        random_state=RANDOM_STATE, n_jobs=-1, use_label_encoder=False, eval_metric="mlogloss"
    ),
    "LogisticRegression": LogisticRegression(
        max_iter=1000, random_state=RANDOM_STATE, n_jobs=-1
    ),
}

print("🔹 Training baseline models...")
baseline_results = {}
for name, model in baseline_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    baseline_results[name] = acc
    print(f"{name} Validation Accuracy: {acc:.4f}")

    # Save each model
    model_file = os.path.join(OUTPUT_DIR, f"{name}_baseline_model.pkl")
    joblib.dump(model, model_file)
    print(f"Saved {name} model to: {model_file}")

print("✅ Baseline models trained and saved")


Catalog shape: (19761, 445)
Features shape: (19761, 441)
Numeric cols: 341
Categorical cols: 100


 'pl_eqtsymerr' 'pl_insolsymerr' 'pl_occdeperr1' 'pl_occdeperr2' 'raerr1'
 'raerr2' 'rasymerr' 'sectors' 'sy_icmag' 'sy_icmagerr1' 'sy_icmagerr2'
 'sy_icmagstr' 'sy_kepmagerr1' 'sy_kepmagerr2']. At least one non-missing value is needed for imputation with strategy='median'.


Final X shape: (19761, 108807)
Encoded y shape: (19761,)
Train size: (15808, 108807) Val size: (3953, 108807)
✅ Full dataset and train/val splits saved for modeling
Pipeline saved to: C:\Users\vibho\Downloads\Engineering\exoplanet-ai\src\preprocessing\feature_pipeline.pkl
🔹 Training baseline models...
RandomForest Validation Accuracy: 0.9960
Saved RandomForest model to: C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\RandomForest_baseline_model.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Validation Accuracy: 1.0000
Saved XGBoost model to: C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\XGBoost_baseline_model.pkl
LogisticRegression Validation Accuracy: 0.9992
Saved LogisticRegression model to: C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\LogisticRegression_baseline_model.pkl
✅ Baseline models trained and saved
