In [4]:
# src/preprocessing/feature_engineer.py
import os
import pandas as pd
import numpy as np
import joblib
import scipy.sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ======================
# File paths
# ======================
MERGED_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\merged_catalog.parquet"
FEATURES_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed\features.parquet"

OUTPUT_DIR = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\data\processed"
PIPELINE_FILE = r"C:\Users\vibho\Downloads\Engineering\exoplanet-ai\src\preprocessing\feature_pipeline.pkl"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ======================
# Load data
# ======================
catalog = pd.read_parquet(MERGED_FILE)
features = pd.read_parquet(FEATURES_FILE)

print("Catalog shape:", catalog.shape)
print("Features shape:", features.shape)

# ======================
# Identify columns
# ======================
numeric_cols = features.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = features.select_dtypes(exclude=["number"]).columns.tolist()

print(f"Numeric cols: {len(numeric_cols)}")
print(f"Categorical cols: {len(categorical_cols)}")

# ======================
# Prepare X and y
# ======================
X = features.copy()
y = catalog["disposition"]

# Handle missing values
for col in numeric_cols:
    X[col] = X[col].fillna(X[col].median())
for col in categorical_cols:
    X[col] = X[col].fillna("missing")

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# ======================
# Preprocessing pipeline
# ======================
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

X_processed = pipeline.fit_transform(X)

print("Final X shape:", X_processed.shape)
print("Encoded y shape:", y_encoded.shape)

# ======================
# Train/validation split
# ======================
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Train size:", X_train.shape, "Val size:", X_val.shape)

# ======================
# Save outputs
# ======================
# Full dataset
X_file = os.path.join(OUTPUT_DIR, "X_processed.npz")
y_file = os.path.join(OUTPUT_DIR, "y_encoded.npy")

scipy.sparse.save_npz(X_file, X_processed)
np.save(y_file, y_encoded)

# Train/Validation splits
scipy.sparse.save_npz(os.path.join(OUTPUT_DIR, "X_train.npz"), X_train)
scipy.sparse.save_npz(os.path.join(OUTPUT_DIR, "X_val.npz"), X_val)
np.save(os.path.join(OUTPUT_DIR, "y_train.npy"), y_train)
np.save(os.path.join(OUTPUT_DIR, "y_val.npy"), y_val)

# Save pipeline for later inference
joblib.dump(pipeline, PIPELINE_FILE)

print("✅ Full dataset and train/val splits saved for modeling")
print(f"Pipeline saved to: {PIPELINE_FILE}")


Catalog shape: (19761, 445)
Features shape: (19761, 441)
Numeric cols: 341
Categorical cols: 100


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Final X shape: (19761, 108827)
Encoded y shape: (19761,)
Train size: (15808, 108827) Val size: (3953, 108827)
✅ Full dataset and train/val splits saved for modeling
Pipeline saved to: C:\Users\vibho\Downloads\Engineering\exoplanet-ai\src\preprocessing\feature_pipeline.pkl
