In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from utils import custom_score, load_data
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed from
# this point on. NOTE(review): this presumably also hides convergence /
# separation warnings emitted while the logistic models below are fitted —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [16]:
from sklearn.model_selection import StratifiedKFold


def backward_forward_selection(X, y, initial_n, p_thresh=0.05, random_state=42, n_splits=5, add_flg=False, locked_features=None):
    """Stepwise (backward + forward) feature selection for a logistic model.

    Starting from ``initial_n`` randomly drawn columns plus ``locked_features``,
    the search alternates two moves until neither improves the cross-validated
    ``custom_score``:

    * backward: drop the first non-locked selected feature whose p-value
      exceeds ``p_thresh`` if doing so raises the score;
    * forward: add the first remaining feature whose p-value is below
      ``p_thresh`` if doing so raises the score.

    Parameters
    ----------
    X : 2-D array-like, shape (n_samples, n_features)
        Feature matrix. It is standardised with ``StandardScaler`` before use.
    y : 1-D array-like
        Binary target passed to ``sm.Logit`` and used for stratification.
    initial_n : int
        Number of features drawn at random for the starting set.
    p_thresh : float, default 0.05
        P-value threshold used by both the removal and the addition test.
    random_state : int, default 42
        Seed for the initial draw and for the ``StratifiedKFold`` shuffling.
    n_splits : int, default 5
        Number of cross-validation folds.
    add_flg : bool, default False
        When True, a removed feature goes back to the candidate pool and may
        be re-added later; when False it is discarded for good.
    locked_features : sequence of int, optional
        Column indices that are always selected and never removed.

    Returns
    -------
    selected : list of int
        Indices of the finally selected columns.
    model : statsmodels results object or None
        Logit fit from the last successfully scored CV fold of the accepted
        feature set (it is NOT refit on the full data); None if no fold of the
        initial set could be fitted and nothing was accepted afterwards.
    best_score : float
        Mean cross-validated ``custom_score`` of the selected set
        (``-inf`` if no fold could be evaluated).

    Raises
    ------
    ValueError
        If ``initial_n`` is outside ``[0, n_features]`` or any locked index is
        outside ``[0, n_features)``.

    Notes
    -----
    * ``np.random.seed(random_state)`` reseeds NumPy's *global* generator.
    * The scaler is fitted on all of ``X`` before the folds are split, so the
      validation folds influence the scaling statistics.
    """
    _, n_features = X.shape
    all_indices = list(range(n_features))

    # Positional indexing below (y[train_idx]) must not turn into label-based
    # lookup when a pandas object is passed in.
    y = np.asarray(y)

    # Normalise the locked set without touching the caller's list.
    locked = sorted({int(f) for f in (locked_features or [])})
    locked_set = set(locked)

    # Fail fast on bad indices: inside the CV loop they would raise in every
    # fold, be swallowed there, and the search would silently end at -inf.
    bad_locked = [f for f in locked if not 0 <= f < n_features]
    if bad_locked:
        raise ValueError(
            f"locked_features contains indices outside [0, {n_features}): {bad_locked}"
        )
    if not 0 <= initial_n <= n_features:
        raise ValueError(
            f"initial_n must be between 0 and {n_features} (number of features), got {initial_n}"
        )

    np.random.seed(random_state)
    initial = np.random.choice(all_indices, size=initial_n, replace=False).tolist()

    # Sorted order keeps the greedy search (which stops at the first improving
    # candidate) reproducible instead of depending on set iteration order.
    selected = sorted(set(initial) | locked_set)
    remaining = sorted(set(all_indices) - set(selected))

    print(f"Initially selected {len(selected)} features (locked: {len(locked)}).")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    def fit_model_cv(feature_indices):
        """Cross-validate a Logit on the given columns.

        Returns ``(model, mean_score, mean_accuracy)`` where ``model`` is the
        fit from the last fold that was scored successfully, or
        ``(None, -inf, 0.0)`` when every fold failed.
        """
        scores = []
        accs = []
        last_model = None

        for train_idx, test_idx in skf.split(X_scaled, y):
            X_train_fold = X_scaled[train_idx][:, feature_indices]
            y_train_fold = y[train_idx]
            X_test_fold = X_scaled[test_idx][:, feature_indices]
            y_test_fold = y[test_idx]

            X_train_fold = sm.add_constant(X_train_fold, has_constant='add')
            X_test_fold = sm.add_constant(X_test_fold, has_constant='add')

            try:
                fold_model = sm.Logit(y_train_fold, X_train_fold).fit(disp=0)
                y_pred = fold_model.predict(X_test_fold)
                score = custom_score(y_test_fold, y_pred, len(feature_indices))
                acc = accuracy_score(y_test_fold, np.round(y_pred))
            except Exception:
                # Best effort: a fold that cannot be fitted or scored (e.g.
                # singular design, separation) is skipped rather than fatal.
                continue

            scores.append(score)
            accs.append(acc)
            last_model = fold_model

        if not scores:
            return None, -np.inf, 0.0

        return last_model, np.mean(scores), np.mean(accs)

    model, best_score, acc = fit_model_cv(selected)
    print(f"Initial custom score: {best_score:.4f}")
    print(f"Initial accuracy: {acc:.4f}")

    improved = True
    iteration = 0

    while improved:
        improved = False
        iteration += 1
        print(f"\nIteration {iteration}")

        try:
            # Index 0 is the intercept; the rest align with `selected`.
            pvalues = np.asarray(model.pvalues)[1:]
        except Exception:
            # No usable model (e.g. model is None): treat every feature as
            # insignificant so each one becomes a removal candidate.
            pvalues = np.ones(len(selected))

        # Backward step: accept the first removal that improves the score.
        for i, feature in enumerate(list(selected)):
            if feature in locked_set:
                continue  # skip locked features

            if pvalues[i] > p_thresh:
                trial_selected = selected.copy()
                trial_selected.remove(feature)
                trial_model, trial_score, trial_acc = fit_model_cv(trial_selected)
                if trial_score > best_score:
                    print(f"Removed feature {feature} (better score: {trial_score:.4f}), features: {len(trial_selected)}, acc: {trial_acc:.4f}")
                    selected = trial_selected
                    if add_flg:
                        remaining.append(feature)
                    model = trial_model
                    best_score = trial_score
                    improved = True
                    break

        # Forward step: accept the first significant addition that improves the score.
        for feature in list(remaining):
            trial_selected = selected + [feature]
            trial_model, trial_score, trial_acc = fit_model_cv(trial_selected)
            if trial_model is None:
                continue

            try:
                # The candidate was appended last, so its p-value is the last one.
                pval = np.asarray(trial_model.pvalues)[-1]
            except Exception:
                pval = 1.0

            if pval < p_thresh and trial_score > best_score:
                print(f"Added feature {feature} (p={pval:.4f}) (better score: {trial_score:.4f}), features: {len(trial_selected)}, acc: {trial_acc:.4f}")
                selected = trial_selected
                remaining.remove(feature)
                model = trial_model
                best_score = trial_score
                improved = True
                break

    print(f"\nSelection finished. Final number of features: {len(selected)}. Final score: {best_score:.4f}")
    return selected, model, best_score


In [18]:
# Read the training features and targets through the project loader.
X_train, y_train = load_data('x_train.txt'), load_data('y_train.txt')

# Feature indices handed to the selector as locked (it never removes them).
locked = [0, 2]

# The selector is fed NumPy arrays; the target vector is taken from
# y_train[0] (presumably the single label column — TODO confirm).
X, y = X_train.to_numpy(), y_train[0].to_numpy()

# Stepwise search: 100 randomly drawn starting features plus the locked
# ones, with a p-value threshold of 0.03.
selected_features, final_model, final_score = backward_forward_selection(
    X=X,
    y=y,
    initial_n=100,
    p_thresh=0.03,
    locked_features=locked,
)

print("Wybrane zmienne:", selected_features)
print("Ostateczny score:", final_score)