In [39]:
from utils import *
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

In [41]:
# Read the processed feature table, then split off the target:
# `y` receives the 'col500' column and `X` keeps every remaining column.
X = pd.read_csv('../data_after_processing/step_by_step_6_features.csv')
y = X.pop('col500')

In [42]:
# Stratified 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Standardisation statistics are learned from the training rows only and
# then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
def custom_score_sklearn(estimator: BaseEstimator, X_val, y_val):
    """Evaluate a fitted classifier with ``custom_score`` through an
    ``(estimator, X, y)`` call signature.

    Parameters
    ----------
    estimator : BaseEstimator
        Fitted classifier exposing ``predict_proba``.
    X_val : 2-D array-like with a ``shape`` attribute
        Validation features; its column count is forwarded as the number
        of features.
    y_val : array-like
        Validation labels, forwarded to ``custom_score`` unchanged.

    Returns
    -------
    Whatever ``custom_score`` returns.
    NOTE(review): ``custom_score`` is not defined in this notebook; it
    presumably comes from ``utils`` via the star import -- confirm.
    """
    # Column 1 of predict_proba, i.e. the second entry of estimator.classes_.
    class1_proba = estimator.predict_proba(X_val)[:, 1]
    return custom_score(y_val, class1_proba, X_val.shape[1])

In [44]:
def _select_rows(data, indices):
    """Return the rows of ``data`` at integer positions ``indices``.

    pandas objects are indexed positionally via ``.iloc``; anything else is
    converted with ``np.asarray`` and indexed directly.
    """
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    return np.asarray(data)[indices]


def evaluate_model_with_cv(model, X, y, n_splits=5, random_state=42):
    """Cross-validate ``model`` with stratified k-fold and ``custom_score``.

    For every fold a new ``StandardScaler`` is fitted on the training part
    only and a fresh ``clone`` of ``model`` is trained on the scaled data, so
    the estimator passed in is never fitted by this function.

    Parameters
    ----------
    model : estimator
        Unfitted classifier that supports ``clone`` and ``predict_proba``.
    X : pandas.DataFrame or 2-D array-like
        Feature matrix.
    y : pandas.Series or 1-D array-like
        Labels used both for stratification and for scoring.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed for the shuffled ``StratifiedKFold``; the default reproduces the
        previously hard-coded value.

    Returns
    -------
    tuple
        ``(np.mean(scores), np.std(scores))`` over the per-fold scores
        (``np.std`` default, i.e. population standard deviation).
    """
    splitter = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )
    # The full feature count is forwarded to custom_score for every fold.
    n_features = np.shape(X)[1]
    fold_scores = []

    for train_idx, val_idx in splitter.split(X, y):
        X_fold_train = _select_rows(X, train_idx)
        X_fold_val = _select_rows(X, val_idx)
        y_fold_train = _select_rows(y, train_idx)
        y_fold_val = _select_rows(y, val_idx)

        # Fit the scaler on the training fold only; the validation fold is
        # transformed with those statistics.
        fold_scaler = StandardScaler()
        X_fold_train_scaled = fold_scaler.fit_transform(X_fold_train)
        X_fold_val_scaled = fold_scaler.transform(X_fold_val)

        fold_model = clone(model)
        fold_model.fit(X_fold_train_scaled, y_fold_train)
        # Column 1 of predict_proba, i.e. the second entry of classes_.
        y_prob = fold_model.predict_proba(X_fold_val_scaled)[:, 1]

        fold_scores.append(
            custom_score(np.asarray(y_fold_val), y_prob, n_features)
        )

    return np.mean(fold_scores), np.std(fold_scores)

In [40]:
# Seed shared by the stochastic estimators below so that re-running this
# cell prints the same scores (the CV splitter already uses a fixed seed).
RANDOM_STATE = 42

# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
# releases -- confirm against the installed version before removing it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, random_state=RANDOM_STATE
    ),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    ),
}

# Cross-validate every candidate on the full data set and report the mean
# and standard deviation of the per-fold custom score.
for name, model in models.items():
    mean_score, std_score = evaluate_model_with_cv(model, X, y, n_splits=5)
    print(f"{name}: mean custom score = {mean_score:.2f} ± {std_score:.2f}")


Logistic Regression: mean custom score = 6280.00 ± 369.59
Random Forest: mean custom score = 6400.00 ± 221.36
Gradient Boosting: mean custom score = 6190.00 ± 185.47
XGBoost: mean custom score = 6070.00 ± 156.84
