In [12]:
# === 1. Imports ===
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [15]:
# === 2. Load datasets ===
# Each variant is read in one pass: the names on the left line up, in order,
# with the CSV paths on the right. Training splits come from the
# "*_scaled_resampled" files; validation/test features from the "*_scaled" files.

# --- "transformed" variant ---
(
    X_train_transform,
    y_train_transform,
    X_val_transform,
    X_test_transform,
    y_test_transform,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/processed/transformed/X_train_transform_scaled_resampled.csv",
        "../../data/processed/transformed/y_train_transform_scaled_resampled.csv",
        "../../data/processed/transformed/X_val_transform_scaled.csv",
        "../../data/processed/transformed/X_test_transform_scaled.csv",
        "../../data/processed/transformed/y_test_transform.csv",
    )
)

# --- "no_transform" variant ---
(
    X_train_no_transform,
    y_train_no_transform,
    X_val_no_transform,
    X_test_no_transform,
    y_test_no_transform,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/processed/no_transformed/X_train_no_transform_scaled_resampled.csv",
        "../../data/processed/no_transformed/y_train_no_transform_scaled_resampled.csv",
        "../../data/processed/no_transformed/X_val_no_transform_scaled.csv",
        "../../data/processed/no_transformed/X_test_no_transform_scaled.csv",
        "../../data/processed/no_transformed/y_test_no_transform.csv",
    )
)

In [23]:
# === Helpers shared by sections 3 and 4 ===
def _fit_selector(X_train, y_train):
    """Fit a random forest and wrap it in a median-importance feature selector.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features. Its column labels are used to name the kept features.
    y_train : pd.DataFrame
        Training target as loaded by ``pd.read_csv``; flattened to 1-D with
        ``.values.ravel()`` before fitting.

    Returns
    -------
    tuple
        ``(forest, selector, selected_cols)``: the fitted
        ``RandomForestClassifier``, the ``SelectFromModel`` wrapping it, and the
        ``pd.Index`` of column labels where ``selector.get_support()`` is True.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train.values.ravel())

    # prefit=True reuses the already-fitted forest instead of refitting it;
    # threshold='median' uses the median feature importance as the cut-off.
    selector = SelectFromModel(forest, threshold='median', prefit=True)
    selected_cols = X_train.columns[selector.get_support()]
    return forest, selector, selected_cols


def _apply_selector(selector, selected_cols, X):
    """Reduce ``X`` to the selected features, keeping column labels and row index.

    ``selector.transform`` returns a bare array by default, so wrapping it in
    ``pd.DataFrame`` alone yields integer column labels (0..k-1). Passing
    ``columns``/``index`` keeps the real feature names in the result and in any
    CSV written from it.

    NOTE(review): assumes ``X`` has the same columns, in the same order, as the
    frame the selector's forest was trained on -- confirm for val/test files.
    """
    return pd.DataFrame(selector.transform(X), columns=selected_cols, index=X.index)


# === 3. Train RF on transformed data ===
rf_trans, selector_trans, selected_cols_trans = _fit_selector(
    X_train_transform, y_train_transform
)

X_train_trans_selected = _apply_selector(selector_trans, selected_cols_trans, X_train_transform)
X_val_trans_selected = _apply_selector(selector_trans, selected_cols_trans, X_val_transform)
X_test_trans_selected = _apply_selector(selector_trans, selected_cols_trans, X_test_transform)

# store the selector and selected columns
joblib.dump(selector_trans, "../../models/selector_transform.pkl")
selected_cols_trans.to_series().to_csv("../../data/processed/selected_columns_transform.csv", index=False)

# === 4. Train RF on no-transform data ===
rf_raw, selector_raw, selected_cols_raw = _fit_selector(
    X_train_no_transform, y_train_no_transform
)

X_train_raw_selected = _apply_selector(selector_raw, selected_cols_raw, X_train_no_transform)
X_val_raw_selected = _apply_selector(selector_raw, selected_cols_raw, X_val_no_transform)
X_test_raw_selected = _apply_selector(selector_raw, selected_cols_raw, X_test_no_transform)

# store the selector and selected columns
joblib.dump(selector_raw, "../../models/selector_no_transform.pkl")
selected_cols_raw.to_series().to_csv("../../data/processed/selected_columns_no_transform.csv", index=False)




In [20]:
# === 5. Save selected datasets ===
# (destination CSV, reduced frame) pairs, written in the order listed.
# index=False keeps the row index out of the files.
for out_path, selected_frame in (
    ("../../data/processed/transformed/X_train_transform_selected.csv", X_train_trans_selected),
    ("../../data/processed/transformed/X_val_transform_selected.csv", X_val_trans_selected),
    ("../../data/processed/transformed/X_test_transform_selected.csv", X_test_trans_selected),
    ("../../data/processed/no_transformed/X_train_no_transform_selected.csv", X_train_raw_selected),
    ("../../data/processed/no_transformed/X_val_no_transform_selected.csv", X_val_raw_selected),
    ("../../data/processed/no_transformed/X_test_no_transform_selected.csv", X_test_raw_selected),
):
    selected_frame.to_csv(out_path, index=False)
