In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from utils import custom_score, load_data
import warnings
warnings.filterwarnings('ignore')

In [21]:
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

def backward_forward_selection(X, y, p_thresh=0.05, random_state=42, n_initial=10, test_size=0.3):
    """Stepwise (backward + forward) feature selection for a logistic regression.

    Starts from a random subset of columns and then repeats two passes until
    neither of them improves the score:

    1. Backward: for each selected feature whose p-value in the current model
       exceeds ``p_thresh``, refit without it; accept the first removal that
       strictly raises the hold-out score.
    2. Forward: for each remaining feature, refit with it appended; accept the
       first addition whose coefficient p-value is below ``p_thresh`` and that
       strictly raises the hold-out score.

    Scoring uses ``custom_score(y_test, y_pred, n_selected)`` imported from
    ``utils`` (its exact semantics are not visible here; higher is treated as
    better). NOTE(review): the same hold-out split drives the selection and
    produces the reported score, so the returned score is an optimistic
    estimate rather than an independent test result.

    Parameters
    ----------
    X : 2-D numpy array
        Feature matrix; columns are indexed positionally (``X[:, idx]``).
    y : 1-D array-like
        Binary target passed to ``sm.Logit``.
    p_thresh : float, default 0.05
        Significance threshold used by both passes.
    random_state : int, default 42
        Seed for the initial random subset and for the train/hold-out split.
    n_initial : int, default 10
        Number of randomly drawn starting features; clamped to the number of
        columns in ``X``.
    test_size : float, default 0.3
        Fraction of rows held out for scoring (forwarded to
        ``train_test_split``).

    Returns
    -------
    (selected, model, best_score)
        ``selected`` is a list of plain-int column indices, ``model`` the
        fitted statsmodels result for that subset and ``best_score`` its
        hold-out score.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D. Errors from the very first model fit are also
        propagated; errors from later trial fits are printed and skipped.
    """
    # A local generator gives the same draw as np.random.seed(random_state)
    # followed by np.random.choice, without reseeding the global RNG.
    rng = np.random.RandomState(random_state)
    _, n_features = X.shape

    # Draw the starting subset (never more features than X actually has).
    n_start = min(n_initial, n_features)
    selected = rng.choice(n_features, size=n_start, replace=False).tolist()
    chosen = set(selected)
    # Ascending order keeps the first-improvement forward pass deterministic.
    remaining = [idx for idx in range(n_features) if idx not in chosen]

    print(f"Początkowo wybrano {len(selected)} zmiennych.")

    # Split once; every candidate model is fitted and scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    def fit_model(feature_indices):
        """Fit a Logit on the given columns; return (model, score, accuracy)."""
        # has_constant='add' forces an intercept column at position 0 even if
        # one of the selected columns is itself constant.
        X_train_sel = sm.add_constant(X_train[:, feature_indices], has_constant='add')
        X_test_sel = sm.add_constant(X_test[:, feature_indices], has_constant='add')
        fitted = sm.Logit(y_train, X_train_sel).fit(disp=0)
        y_pred = fitted.predict(X_test_sel)
        score = custom_score(y_test, y_pred, len(feature_indices))
        accuracy = accuracy_score(y_test, np.round(y_pred))
        return fitted, score, accuracy

    model, best_score, acc = fit_model(selected)
    print(f"Startowy score: {best_score:.4f}")
    print(f"Startowy acc: {acc:.4f}")

    improved = True
    iteration = 0

    while improved:
        improved = False
        iteration += 1
        print(f"\nIteracja {iteration}")

        # p-values of the current model, intercept dropped, so that
        # pvalues[i] lines up with selected[i].
        try:
            pvalues = model.pvalues[1:]
        except Exception:
            # Fallback: treat every feature as non-significant (removable).
            pvalues = np.ones(len(selected))

        # Backward pass: stop at the first removal that improves the score.
        for i, feature in enumerate(selected.copy()):
            if pvalues[i] > p_thresh:
                trial_selected = selected.copy()
                trial_selected.remove(feature)
                try:
                    trial_model, trial_score, trial_acc = fit_model(trial_selected)
                except Exception as e:
                    print(f"Błąd przy usuwaniu {feature}: {e}")
                    continue
                if trial_score > best_score:
                    print(f"Usunięto {feature} (lepszy score: {trial_score:.4f}), liczba zmiennych: {len(trial_selected)}, acc: {trial_acc}")
                    selected = trial_selected
                    remaining.append(feature)
                    model = trial_model
                    best_score = trial_score
                    improved = True
                    break

        # Forward pass: stop at the first significant addition that improves
        # the score.
        for feature in remaining.copy():
            trial_selected = selected + [feature]
            try:
                trial_model, trial_score, trial_acc = fit_model(trial_selected)
                # +1 skips the intercept at position 0 of the design matrix.
                pval_index = trial_selected.index(feature) + 1
                pval = trial_model.pvalues[pval_index]
            except Exception as e:
                print(f"Błąd przy dodawaniu {feature}: {e}")
                continue
            if pval < p_thresh and trial_score > best_score:
                print(f"Dodano {feature} (p={pval:.4f}) (lepszy score: {trial_score:.4f}), liczba zmiennych: {len(trial_selected)}, acc: {trial_acc}")
                selected = trial_selected
                remaining.remove(feature)
                model = trial_model
                best_score = trial_score
                improved = True
                break

    print(f"Zakończono. Wybrano {len(selected)} zmiennych. Ostateczny score: {best_score:.4f}")
    return selected, model, best_score


In [22]:
# Read the feature table and the label table via the project's load_data helper.
X_train, y_train = load_data('x_train.txt'), load_data('y_train.txt')

# Convert to plain numpy arrays; the labels are taken from the entry keyed 0
# (presumably the single label column — confirm against load_data).
X, y = X_train.to_numpy(), y_train[0].to_numpy()

# Run the stepwise selection with its default settings.
selected_features, final_model, final_score = backward_forward_selection(X=X, y=y)

# Report the chosen column indices and the score reached.
print("Wybrane zmienne:", selected_features)
print("Ostateczny score:", final_score)

Początkowo wybrano 10 zmiennych.
Startowy score: 2666.6667
Startowy acc: 0.4807

Iteracja 1
Usunięto 361 (lepszy score: 2700.0000), liczba zmiennych: 9, acc: 0.482
Dodano 0 (p=0.0000) (lepszy score: 4933.3333), liczba zmiennych: 10, acc: 0.6566666666666666

Iteracja 2
Usunięto 73 (lepszy score: 5100.0000), liczba zmiennych: 9, acc: 0.6526666666666666
Dodano 2 (p=0.0000) (lepszy score: 5266.6667), liczba zmiennych: 10, acc: 0.696

Iteracja 3
Usunięto 374 (lepszy score: 5500.0000), liczba zmiennych: 9, acc: 0.694

Iteracja 4
Usunięto 104 (lepszy score: 5666.6667), liczba zmiennych: 8, acc: 0.6933333333333334

Iteracja 5
Usunięto 394 (lepszy score: 5900.0000), liczba zmiennych: 7, acc: 0.6913333333333334

Iteracja 6
Usunięto 377 (lepszy score: 6066.6667), liczba zmiennych: 6, acc: 0.6953333333333334

Iteracja 7
Usunięto 124 (lepszy score: 6266.6667), liczba zmiennych: 5, acc: 0.6966666666666667

Iteracja 8
Usunięto 68 (lepszy score: 6466.6667), liczba zmiennych: 4, acc: 0.696

Iteracja 9
