In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix,accuracy_score,f1_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif,chi2
import numpy as np
from tqdm import tqdm

In [2]:

class FeatureSelector:
    """
    Feature selection using Forward Selection and Backward Elimination
    Supports both Classification and Regression
    """
    
    def __init__(self, estimator=None, scoring='accuracy', task='classification'):
        """
        Parameters:
        -----------
        estimator : sklearn estimator (default: DecisionTreeClassifier)
        scoring : str, 'accuracy', 'f1', 'roc_auc' for classification; 'r2', 'rmse' for regression
        task : str, 'classification' or 'regression'
        """
        self.estimator = estimator if estimator else DecisionTreeClassifier()
        self.scoring = scoring
        self.task = task
        
    def _evaluate_features(self, X_train, y_train, X_val, y_val, features):
        """Evaluate a subset of features"""
        if len(features) == 0:
            return -np.inf
        
        X_train_subset = X_train[:, features]
        X_val_subset = X_val[:, features]
        
        # Train model
        self.estimator.fit(X_train_subset, y_train)
        
        # Predict
        if self.task == 'classification':
            y_pred = self.estimator.predict(X_val_subset)
            
            # Calculate score based on metric
            if self.scoring == 'accuracy':
                score = accuracy_score(y_val, y_pred)
            elif self.scoring == 'f1':
                score = f1_score(y_val, y_pred, average='weighted')
            elif self.scoring == 'roc_auc':
                # Need probability predictions for ROC AUC
                if hasattr(self.estimator, 'predict_proba'):
                    y_pred_proba = self.estimator.predict_proba(X_val_subset)
                    if y_pred_proba.shape[1] == 2:
                        score = roc_auc_score(y_val, y_pred_proba[:, 1])
                    else:
                        score = roc_auc_score(y_val, y_pred_proba, multi_class='ovr', average='weighted')
                else:
                    score = accuracy_score(y_val, y_pred)  # Fallback
        else:  # regression
            y_pred = self.estimator.predict(X_val_subset)
            from sklearn.metrics import r2_score, mean_squared_error
            if self.scoring == 'r2':
                score = r2_score(y_val, y_pred)
            else:  # rmse
                score = -np.sqrt(mean_squared_error(y_val, y_pred))
        
        return score
    
    def forward_selection(self, X_train, y_train, X_val, y_val, 
                         max_features=30, min_improvement=0.0001, verbose=True):
        """
        Forward Selection: Start with no features, add one at a time
        
        Parameters:
        -----------
        X_train : array-like, training features
        y_train : array-like, training target
        X_val : array-like, validation features
        y_val : array-like, validation target
        max_features : int, maximum features to select
        min_improvement : float, minimum improvement to continue
        verbose : bool, print progress
        
        Returns:
        --------
        dict with selected features and scores
        """
        n_features = X_train.shape[1]
        selected_features = []
        remaining_features = list(range(n_features))
        scores_history = []
        
        best_score = -np.inf
        
        print("=" * 70)
        print("FORWARD SELECTION")
        print("=" * 70)
        print(f"Total available features: {n_features}")
        print(f"Maximum features to select: {max_features}")
        print(f"Scoring metric: {self.scoring}")
        print("=" * 70)
        
        for iteration in range(min(max_features, n_features)):
            best_feature = None
            best_iter_score = -np.inf
            
            if verbose:
                print(f"\n--- Iteration {iteration + 1} ---")
                print(f"Evaluating {len(remaining_features)} remaining features...")
            
            # Try adding each remaining feature
            for idx, feature in enumerate(remaining_features):
                candidate_features = selected_features + [feature]
                score = self._evaluate_features(X_train, y_train, X_val, y_val, candidate_features)
                
                if score > best_iter_score:
                    best_iter_score = score
                    best_feature = feature
            
            # Check for improvement
            improvement = best_iter_score - best_score
            
            if improvement < min_improvement and len(selected_features) > 0:
                if verbose:
                    print(f"\n✓ Stopping: Improvement {improvement:.6f} < {min_improvement}")
                break
            
            # Add best feature
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            best_score = best_iter_score
            scores_history.append({
                'n_features': len(selected_features),
                'feature_added': best_feature,
                'score': best_score
            })
            
            if verbose:
                print(f"✓ Added feature {best_feature} | "
                      f"{self.scoring.upper()}: {best_score:.6f} | "
                      f"Improvement: +{improvement:.6f} | "
                      f"Total features: {len(selected_features)}")
        
        print("\n" + "=" * 70)
        print(f"FORWARD SELECTION COMPLETE")
        print(f"Selected {len(selected_features)} features")
        print(f"Final {self.scoring}: {best_score:.6f}")
        print("=" * 70)
        
        return {
            'selected_features': selected_features,
            'scores_history': scores_history,
            'final_score': best_score,
            'n_features': len(selected_features)
        }
    
    def backward_elimination(self, X_train, y_train, X_val, y_val,
                           min_features=5, min_improvement=0.0001, verbose=True):
        """
        Backward Elimination: Start with all features, remove one at a time
        
        Parameters:
        -----------
        X_train : array-like, training features
        y_train : array-like, training target
        X_val : array-like, validation features
        y_val : array-like, validation target
        min_features : int, minimum features to keep
        min_improvement : float, minimum improvement to continue
        verbose : bool, print progress
        
        Returns:
        --------
        dict with selected features and scores
        """
        n_features = X_train.shape[1]
        selected_features = list(range(n_features))
        scores_history = []
        
        # Initial score with all features
        best_score = self._evaluate_features(X_train, y_train, X_val, y_val, selected_features)
        
        print("\n" + "=" * 70)
        print("BACKWARD ELIMINATION")
        print("=" * 70)
        print(f"Starting with all {n_features} features")
        print(f"Minimum features to keep: {min_features}")
        print(f"Initial {self.scoring}: {best_score:.6f}")
        print("=" * 70)
        
        iteration = 0
        while len(selected_features) > min_features:
            iteration += 1
            worst_feature = None
            best_iter_score = -np.inf
            
            if verbose:
                print(f"\n--- Iteration {iteration} ---")
                print(f"Evaluating removal of {len(selected_features)} features...")
            
            # Try removing each feature
            for feature in selected_features:
                candidate_features = [f for f in selected_features if f != feature]
                score = self._evaluate_features(X_train, y_train, X_val, y_val, candidate_features)
                
                if score > best_iter_score:
                    best_iter_score = score
                    worst_feature = feature
            
            # Check if removal improves or maintains performance
            improvement = best_iter_score - best_score
            
            # Stop if removing features hurts performance significantly
            if improvement < -abs(min_improvement):
                if verbose:
                    print(f"\n✓ Stopping: Removing features degrades performance by {abs(improvement):.6f}")
                break
            
            # Remove worst feature
            selected_features.remove(worst_feature)
            best_score = best_iter_score
            scores_history.append({
                'n_features': len(selected_features),
                'feature_removed': worst_feature,
                'score': best_score
            })
            
            if verbose:
                print(f"✓ Removed feature {worst_feature} | "
                      f"{self.scoring.upper()}: {best_score:.6f} | "
                      f"Change: {improvement:+.6f} | "
                      f"Remaining: {len(selected_features)}")
        
        print("\n" + "=" * 70)
        print(f"BACKWARD ELIMINATION COMPLETE")
        print(f"Selected {len(selected_features)} features")
        print(f"Final {self.scoring}: {best_score:.6f}")
        print("=" * 70)
        
        return {
            'selected_features': selected_features,
            'scores_history': scores_history,
            'final_score': best_score,
            'n_features': len(selected_features)
        }

In [3]:


df = pd.read_csv('./train1.csv')
X = df.drop(['target','id'], axis=1)
y = df['target']

In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=29173, stratify=y)

In [5]:
categorical_cols = [col for col in X_train.columns if col.endswith('_cat')]
X_train[categorical_cols] = X_train[categorical_cols].astype('category')
X_val[categorical_cols] = X_val[categorical_cols].astype('category')
binary_cols = [col for col in X_train.columns if col.endswith('_bin')]
X_train[binary_cols] = X_train[binary_cols].astype(bool)
X_val[binary_cols] = X_val[binary_cols].astype(bool)

In [6]:
def class_based_imputer(X, y, categorical_columns=None):
    """
    Class-based imputation for categorical columns using mode values per class.
    """
    X_imputed = X.copy()
    if categorical_columns is None:
        categorical_columns = X.columns.tolist()

    print("="*60)
    print("CLASS-BASED IMPUTATION STARTED")
    print("="*60)
    
    for col in categorical_columns:
        if col not in X_imputed.columns:
            continue

        if X_imputed[col].isnull().sum() == 0:
            continue

        for class_val in [0, 1]:
            class_mask = (y == class_val)
            missing_mask = X_imputed[col].isnull()
            impute_mask = class_mask & missing_mask

            if impute_mask.sum() > 0:
                class_values = X_imputed.loc[class_mask & ~missing_mask, col]
                if len(class_values) > 0:
                    fill_value = class_values.mode()[0]
                else:
                    fill_value = 'Missing'
                X_imputed.loc[impute_mask, col] = fill_value

    print("="*60)
    print("IMPUTATION COMPLETED")
    print("="*60)
    return X_imputed


In [7]:
print(f"Dataset shape: {X_train.shape}")
class_dist = y_train.value_counts(normalize=True) * 100
print(f"Class 0: {class_dist[0]:.2f}% | Class 1: {class_dist[1]:.2f}%")

print("Missing values per column:")
print(X_train.isnull().sum())

print("="*80)
print("APPLYING CLASS-BASED IMPUTATION")
print("="*80)

X_train = class_based_imputer(X_train, y_train, categorical_cols)
print(f"Dataset shape: {X_val.shape}")
class_dist = y_val.value_counts(normalize=True) * 100
print(f"Class 0: {class_dist[0]:.2f}% | Class 1: {class_dist[1]:.2f}%")

print("Missing values per column:")
print(X_train.isnull().sum())

print("="*80)
print("APPLYING CLASS-BASED IMPUTATION")
print("="*80)

X_val = class_based_imputer(X_val, y_val, categorical_cols)


Dataset shape: (207346, 65)
Class 0: 94.87% | Class 1: 5.13%
Missing values per column:
ps_ind_02_cat       91
ps_ind_04_cat       33
ps_ind_05_cat     2030
ps_car_01_cat       42
ps_car_02_cat        3
                 ...  
feature4         37556
feature5             0
feature6             0
feature7             0
feature8             0
Length: 65, dtype: int64
APPLYING CLASS-BASED IMPUTATION
CLASS-BASED IMPUTATION STARTED
IMPUTATION COMPLETED
Dataset shape: (88863, 65)
Class 0: 94.87% | Class 1: 5.13%
Missing values per column:
ps_ind_02_cat        0
ps_ind_04_cat        0
ps_ind_05_cat        0
ps_car_01_cat        0
ps_car_02_cat        0
                 ...  
feature4         37556
feature5             0
feature6             0
feature7             0
feature8             0
Length: 65, dtype: int64
APPLYING CLASS-BASED IMPUTATION
CLASS-BASED IMPUTATION STARTED
IMPUTATION COMPLETED


In [8]:
def class_based_num_imputer(X, y, num_cols=None):
    """
    Perform class-based imputation for numerical columns.
    Each class's missing values are filled using that class's mean.
    """
    X_imputed = X.copy()

    # Identify columns
    categorical_cols = [col for col in X.columns if col.endswith('_cat')]
    binary_cols = [col for col in X.columns if col.endswith('_bin')]

    if num_cols is None:
        num_cols = [col for col in X.columns if not (col.endswith('_cat') or col.endswith('_bin'))]

    # Impute numerical columns class-wise (using mean)
    for col in num_cols:
        if X_imputed[col].isnull().sum() == 0:
            continue

        for class_val in [0, 1]:
            class_mask = (y == class_val)
            missing_mask = X_imputed[col].isnull()
            impute_mask = class_mask & missing_mask

            if impute_mask.sum() > 0:
                class_values = X_imputed.loc[class_mask & ~missing_mask, col]
                if len(class_values) > 0:
                    fill_value = class_values.mean()
                else:
                    fill_value = X_imputed[col].mean()  # fallback global mean
                X_imputed.loc[impute_mask, col] = fill_value

    print("="*60)
    print("IMPUTATION COMPLETED")
    print("="*60)
    return X_imputed


In [9]:
num_cols = [col for col in X_train.columns if not col.endswith(('_cat', '_bin'))]
X_train =  class_based_num_imputer(X_train,y_train,num_cols)
X_val =  class_based_num_imputer(X_val,y_val,num_cols)

IMPUTATION COMPLETED
IMPUTATION COMPLETED


In [None]:
# Your Decision Tree
dt = DecisionTreeClassifier(
    class_weight=None, 
    criterion='gini', 
    max_depth=5, 
    min_samples_leaf=5, 
    min_samples_split=2,
    random_state=29173
)
roc_auc

# Run Feature Selection
selector = FeatureSelector(estimator=dt, scoring='', task='classification')

forward_results = selector.forward_selection(
    X_train.values, y_train,
    X_val.values, y_val,
    max_features=35
)

backward_results = selector.backward_elimination(
    X_train.values, y_train,
    X_val.values, y_val,
    min_features=35
)

FORWARD SELECTION
Total available features: 65
Maximum features to select: 35
Scoring metric: accuracy

--- Iteration 1 ---
Evaluating 65 remaining features...
✓ Added feature 0 | ACCURACY: 0.948730 | Improvement: +inf | Total features: 1

--- Iteration 2 ---
Evaluating 64 remaining features...

✓ Stopping: Improvement 0.000000 < 0.0001

FORWARD SELECTION COMPLETE
Selected 1 features
Final accuracy: 0.948730

BACKWARD ELIMINATION
Starting with all 65 features
Minimum features to keep: 35
Initial accuracy: 0.948730

--- Iteration 1 ---
Evaluating removal of 65 features...
✓ Removed feature 0 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 64

--- Iteration 2 ---
Evaluating removal of 64 features...
✓ Removed feature 1 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 63

--- Iteration 3 ---
Evaluating removal of 63 features...
✓ Removed feature 2 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 62

--- Iteration 4 ---
Evaluating removal of 62 features...
✓ Removed feature

In [11]:
forward_results2 = selector.forward_selection(
    X_train.values, y_train,
    X_val.values, y_val,
    max_features=66
)

backward_results2 = selector.backward_elimination(
    X_train.values, y_train,
    X_val.values, y_val,
    min_features=1
)

FORWARD SELECTION
Total available features: 65
Maximum features to select: 66
Scoring metric: accuracy

--- Iteration 1 ---
Evaluating 65 remaining features...
✓ Added feature 0 | ACCURACY: 0.948730 | Improvement: +inf | Total features: 1

--- Iteration 2 ---
Evaluating 64 remaining features...

✓ Stopping: Improvement 0.000000 < 0.0001

FORWARD SELECTION COMPLETE
Selected 1 features
Final accuracy: 0.948730

BACKWARD ELIMINATION
Starting with all 65 features
Minimum features to keep: 1
Initial accuracy: 0.948730

--- Iteration 1 ---
Evaluating removal of 65 features...
✓ Removed feature 0 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 64

--- Iteration 2 ---
Evaluating removal of 64 features...
✓ Removed feature 1 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 63

--- Iteration 3 ---
Evaluating removal of 63 features...
✓ Removed feature 2 | ACCURACY: 0.948730 | Change: +0.000000 | Remaining: 62

--- Iteration 4 ---
Evaluating removal of 62 features...
✓ Removed feature 

In [13]:
# Your Decision Tree
dt = DecisionTreeClassifier(
    class_weight=None, 
    criterion='gini', 
    max_depth=5, 
    min_samples_leaf=5, 
    min_samples_split=2,
    random_state=29173
)


# Run Feature Selection
selector = FeatureSelector(estimator=dt, scoring='roc_auc', task='classification')

forward_results3 = selector.forward_selection(
    X_train.values, y_train,
    X_val.values, y_val,
    max_features=66
)

backward_results3 = selector.backward_elimination(
    X_train.values, y_train,
    X_val.values, y_val,
    min_features=1
)

FORWARD SELECTION
Total available features: 65
Maximum features to select: 66
Scoring metric: roc_auc

--- Iteration 1 ---
Evaluating 65 remaining features...
✓ Added feature 34 | ROC_AUC: 0.579119 | Improvement: +inf | Total features: 1

--- Iteration 2 ---
Evaluating 64 remaining features...
✓ Added feature 27 | ROC_AUC: 0.593442 | Improvement: +0.014324 | Total features: 2

--- Iteration 3 ---
Evaluating 63 remaining features...
✓ Added feature 60 | ROC_AUC: 0.617423 | Improvement: +0.023981 | Total features: 3

--- Iteration 4 ---
Evaluating 62 remaining features...
✓ Added feature 25 | ROC_AUC: 0.623358 | Improvement: +0.005934 | Total features: 4

--- Iteration 5 ---
Evaluating 61 remaining features...
✓ Added feature 6 | ROC_AUC: 0.627596 | Improvement: +0.004239 | Total features: 5

--- Iteration 6 ---
Evaluating 60 remaining features...
✓ Added feature 30 | ROC_AUC: 0.629835 | Improvement: +0.002239 | Total features: 6

--- Iteration 7 ---
Evaluating 59 remaining features...
✓

In [14]:
forward_selected_idx = forward_results3['selected_features']
backward_selected_idx = backward_results3['selected_features']


In [15]:
forward_selected_idx

[34, 27, 60, 25, 6, 30, 56]

In [16]:
backward_selected_idx

[2, 6, 27, 31, 35, 36]

In [17]:
forward_selected_features = X_train.columns[forward_selected_idx]
backward_selected_features = X_train.columns[backward_selected_idx]


In [18]:
forward_selected_features

Index(['ps_car_13', 'ps_ind_17_bin', 'feature4', 'ps_ind_15', 'ps_car_04_cat',
       'ps_reg_02', 'ps_calc_20_bin'],
      dtype='object')

In [19]:
backward_selected_features

Index(['ps_ind_05_cat', 'ps_car_04_cat', 'ps_ind_17_bin', 'ps_reg_03',
       'ps_car_14', 'ps_car_15'],
      dtype='object')

In [20]:
# Your Decision Tree
dt = DecisionTreeClassifier(random_state=29173)


# Run Feature Selection
selector = FeatureSelector(estimator=dt, scoring='roc_auc', task='classification')

forward_results3 = selector.forward_selection(
    X_train.values, y_train,
    X_val.values, y_val,
    max_features=66
)

backward_results3 = selector.backward_elimination(
    X_train.values, y_train,
    X_val.values, y_val,
    min_features=1
)

FORWARD SELECTION
Total available features: 65
Maximum features to select: 66
Scoring metric: roc_auc

--- Iteration 1 ---
Evaluating 65 remaining features...
✓ Added feature 13 | ROC_AUC: 0.569937 | Improvement: +inf | Total features: 1

--- Iteration 2 ---
Evaluating 64 remaining features...
✓ Added feature 27 | ROC_AUC: 0.582728 | Improvement: +0.012791 | Total features: 2

--- Iteration 3 ---
Evaluating 63 remaining features...
✓ Added feature 6 | ROC_AUC: 0.589139 | Improvement: +0.006411 | Total features: 3

--- Iteration 4 ---
Evaluating 62 remaining features...

✓ Stopping: Improvement 0.000065 < 0.0001

FORWARD SELECTION COMPLETE
Selected 3 features
Final roc_auc: 0.589139

BACKWARD ELIMINATION
Starting with all 65 features
Minimum features to keep: 1
Initial roc_auc: 0.501034

--- Iteration 1 ---
Evaluating removal of 65 features...
✓ Removed feature 2 | ROC_AUC: 0.508825 | Change: +0.007791 | Remaining: 64

--- Iteration 2 ---
Evaluating removal of 64 features...
✓ Removed f