In [119]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from utils import custom_score, load_data

In [99]:
# Read the training features and labels through the project's load_data helper.
X_train, y_train = load_data('x_train.txt'), load_data('y_train.txt')

# Array versions used by the selection routines below: the whole feature table,
# and the entry keyed 0 of the label table (presumably its only column - verify).
X, y = X_train.to_numpy(), y_train[0].to_numpy()

In [102]:
def kbest_feature_selection(X, y, base_model, n_features=range(1, 21), n_splits=5):
    """Choose ``k`` for ``SelectKBest(f_classif)`` by stratified cross-validation.

    For every candidate ``k`` a pipeline ``SelectKBest -> StandardScaler ->
    clone(base_model)`` is fitted on each training fold only, so neither the
    selector nor the scaler sees the validation rows. The second column of
    ``predict_proba`` on the validation fold is passed to ``custom_score``
    together with ``k``; the ``k`` with the highest mean score wins. One
    summary line per ``k`` is printed.

    Parameters
    ----------
    X : 2-D array
        Feature matrix; rows are indexed with integer index arrays.
    y : 1-D array
        Class labels. The accuracy column rounds probabilities and compares
        them with ``y``, so it assumes labels are encoded as 0/1.
    base_model : estimator
        Unfitted classifier exposing ``predict_proba``; cloned for every fit.
    n_features : iterable of int, default ``range(1, 21)``
        Candidate values of ``k``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    best_features : ndarray of int
        Column indices kept by ``SelectKBest(k=best_k)`` refitted on all of
        ``X`` and ``y``.
    best_score : float
        Mean custom CV score obtained with ``best_k``.

    Raises
    ------
    ValueError
        If no candidate yields a comparable mean score (``n_features`` is
        empty, or every mean score is NaN).
    """
    # The seed is fixed, so the folds are identical for every k; build them once.
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X, y))

    # Start from -inf rather than 0: the custom score is not guaranteed to be
    # positive, and with a 0 baseline an all-negative sweep left best_k unset.
    best_score = -np.inf
    best_k = None

    for num_features in n_features:
        scores = []
        accs = []

        for train_idx, val_idx in folds:
            X_cv, X_val = X[train_idx], X[val_idx]
            y_cv, y_val = y[train_idx], y[val_idx]

            pipeline = Pipeline([
                ('select', SelectKBest(score_func=f_classif, k=num_features)),
                ('scale', StandardScaler()),
                ('clf', clone(base_model))
            ])

            pipeline.fit(X_cv, y_cv)
            y_val_prob = pipeline.predict_proba(X_val)[:, 1]

            scores.append(custom_score(y_val, y_val_prob, num_features))
            # Hard labels via rounding the probability (0/1 label encoding assumed).
            accs.append(accuracy_score(y_val, np.round(y_val_prob)))

        avg_score = np.mean(scores)
        if avg_score > best_score:
            best_score = avg_score
            best_k = num_features

        print(f"N features = {num_features}, Custom CV score: {avg_score:.2f} +/- {np.std(scores):.2f}, Accuracy: {np.mean(accs):.2f} +/- {np.std(accs):.2f}")

    if best_k is None:
        raise ValueError(
            "kbest_feature_selection: no candidate k produced a comparable CV score "
            "(n_features is empty or every score is NaN)"
        )

    # Refit the selector on the full data to report which columns best_k keeps.
    selector = SelectKBest(score_func=f_classif, k=best_k)
    selector.fit(X, y)
    best_features = selector.get_support(indices=True)

    return best_features, best_score

In [108]:
def greedy_feature_selection(X, y, base_model, max_features=20, cv_splits=5, verbose=True):
    """Forward (greedy) feature selection driven by the cross-validated custom score.

    Starting from an empty set, each round tries every remaining column as an
    addition to the current selection, fits ``StandardScaler ->
    clone(base_model)`` on each training fold, and scores the second column of
    ``predict_proba`` on the validation fold with ``custom_score`` (which also
    receives the number of features in use). The column with the best mean
    score is kept if it strictly improves on the best score so far; otherwise
    the search stops.

    Parameters
    ----------
    X : 2-D numpy array
        Feature matrix (row and column fancy indexing is used).
    y : 1-D array
        Class labels.
    base_model : estimator
        Unfitted classifier exposing ``predict_proba``; cloned for every fit.
    max_features : int, default 20
        Upper bound on the number of selected columns.
    cv_splits : int, default 5
        Number of stratified folds.
    verbose : bool, default True
        When True, show a progress bar per round and print one line per round.
        When False, both are suppressed.

    Returns
    -------
    selected : list of int
        Column indices in the order they were added.
    best_score : float
        Mean custom CV score of the final selection (``-inf`` if nothing was
        selected).
    """
    selected = []
    remaining = list(range(X.shape[1]))
    best_score = -np.inf

    # The splitter is seeded, so every call to split() yields the same folds;
    # compute them once instead of once per candidate per round.
    kf = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    folds = list(kf.split(X, y))

    iteration = 1
    while len(selected) < max_features and remaining:
        scores = []
        for candidate in tqdm(remaining, total=len(remaining), disable=not verbose):
            current_features = selected + [candidate]
            X_sub = X[:, current_features]
            fold_scores = []

            for train_idx, val_idx in folds:
                pipeline = Pipeline([
                    ('scale', StandardScaler()),
                    ('clf', clone(base_model))
                ])
                pipeline.fit(X_sub[train_idx], y[train_idx])
                y_val_prob = pipeline.predict_proba(X_sub[val_idx])[:, 1]

                fold_scores.append(
                    custom_score(y[val_idx], y_val_prob, num_features=len(current_features))
                )

            scores.append((np.mean(fold_scores), candidate))

        # Highest mean score wins; on an exact tie the larger column index wins,
        # which is the same ordering a descending sort of the tuples gives.
        top_score, top_feature = max(scores)

        if top_score > best_score:
            selected.append(top_feature)
            remaining.remove(top_feature)
            best_score = top_score
            if verbose:
                print(f"Iteration {iteration}: added feature {top_feature}, score improved to {top_score:.2f}")
        else:
            if verbose:
                print(f"Iteration {iteration}: no further improvement, stopping")
            break
        iteration += 1

    return selected, best_score

In [103]:
# Sweep k for SelectKBest, scoring each k with a logistic-regression classifier.
selected_features, best_cv_score = kbest_feature_selection(
    X=X, y=y, base_model=LogisticRegression(max_iter=1000)
)

# Report the winning column indices and their mean custom CV score.
print(f"\nFinal selected features: {selected_features}")
print(f"Best CV score: {best_cv_score:.2f}")

N features = 1, Custom CV score: 7050.00 +/- 331.66, Accuracy: 0.71 +/- 0.01
N features = 2, Custom CV score: 6850.00 +/- 364.69, Accuracy: 0.71 +/- 0.01
N features = 3, Custom CV score: 6660.00 +/- 367.97, Accuracy: 0.71 +/- 0.01
N features = 4, Custom CV score: 6460.00 +/- 392.94, Accuracy: 0.71 +/- 0.01
N features = 5, Custom CV score: 6260.00 +/- 377.36, Accuracy: 0.72 +/- 0.01
N features = 6, Custom CV score: 6100.00 +/- 380.79, Accuracy: 0.71 +/- 0.01
N features = 7, Custom CV score: 5880.00 +/- 331.06, Accuracy: 0.72 +/- 0.01
N features = 8, Custom CV score: 5700.00 +/- 314.64, Accuracy: 0.72 +/- 0.01
N features = 9, Custom CV score: 5490.00 +/- 281.78, Accuracy: 0.72 +/- 0.01
N features = 10, Custom CV score: 5290.00 +/- 310.48, Accuracy: 0.72 +/- 0.01
N features = 11, Custom CV score: 5060.00 +/- 287.05, Accuracy: 0.72 +/- 0.01
N features = 12, Custom CV score: 4870.00 +/- 274.95, Accuracy: 0.72 +/- 0.01
N features = 13, Custom CV score: 4660.00 +/- 297.32, Accuracy: 0.72 +/- 

In [104]:
# Same k sweep as with logistic regression, but scored with a default-configured XGBoost classifier.
selected_features, best_cv_score = kbest_feature_selection(
    X=X, y=y, base_model=XGBClassifier()
)

# Report the winning column indices and their mean custom CV score.
print(f"\nFinal selected features: {selected_features}")
print(f"Best CV score: {best_cv_score:.2f}")

N features = 1, Custom CV score: 7050.00 +/- 176.07, Accuracy: 0.69 +/- 0.01
N features = 2, Custom CV score: 6860.00 +/- 177.20, Accuracy: 0.67 +/- 0.01
N features = 3, Custom CV score: 6550.00 +/- 148.32, Accuracy: 0.67 +/- 0.01
N features = 4, Custom CV score: 6530.00 +/- 180.55, Accuracy: 0.67 +/- 0.00
N features = 5, Custom CV score: 6320.00 +/- 81.24, Accuracy: 0.67 +/- 0.01
N features = 6, Custom CV score: 6100.00 +/- 144.91, Accuracy: 0.69 +/- 0.01
N features = 7, Custom CV score: 5790.00 +/- 220.00, Accuracy: 0.68 +/- 0.00
N features = 8, Custom CV score: 5650.00 +/- 83.67, Accuracy: 0.68 +/- 0.01
N features = 9, Custom CV score: 5450.00 +/- 100.00, Accuracy: 0.68 +/- 0.01
N features = 10, Custom CV score: 5090.00 +/- 80.00, Accuracy: 0.68 +/- 0.01
N features = 11, Custom CV score: 4920.00 +/- 196.47, Accuracy: 0.68 +/- 0.01
N features = 12, Custom CV score: 4760.00 +/- 80.00, Accuracy: 0.68 +/- 0.01
N features = 13, Custom CV score: 4570.00 +/- 199.00, Accuracy: 0.68 +/- 0.01

In [87]:
# Forward selection over all columns, scored with a logistic-regression classifier.
selected_features, best_cv_score = greedy_feature_selection(
    X=X, y=y, base_model=LogisticRegression(max_iter=1000)
)

# Report the columns in the order they were added and the final mean custom CV score.
print(f"\nFinal selected features: {selected_features}")
print(f"Best CV score: {best_cv_score:.2f}")

Iteration 1: added feature 8, score improved to 7210.00
Iteration 2: no further improvement, stopping

Final selected features: [8]
Best CV score: 7210.00


In [120]:
# Forward selection over all columns, scored with a decision tree.
# The tree is seeded: scikit-learn permutes features at every split even with
# the "best" splitter, so an unseeded tree can give different selections on
# each run of this cell.
selected_features, best_cv_score = greedy_feature_selection(
    X, y,
    base_model=DecisionTreeClassifier(random_state=42)
)

# Report the columns in the order they were added and the final mean custom CV score.
print(f"\nFinal selected features: {selected_features}")
print(f"Best CV score: {best_cv_score:.2f}")

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [01:10<00:00,  7.07it/s]


Iteration 1: added feature 2, score improved to 5540.00


100%|██████████| 499/499 [01:20<00:00,  6.16it/s]


Iteration 2: added feature 113, score improved to 5730.00


100%|██████████| 498/498 [01:35<00:00,  5.20it/s]


Iteration 3: no further improvement, stopping

Final selected features: [2, 113]
Best CV score: 5730.00
