In [1]:
from utils import *
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [18]:
def custom_scorer(estimator, X, y):
    """Scorer callable with the ``(estimator, X, y)`` signature used by ``scoring=``.

    Takes column 1 of ``estimator.predict_proba(X)`` and forwards it, together
    with the true labels, to ``custom_score``.

    Parameters
    ----------
    estimator : fitted object exposing ``predict_proba``.
    X : 2-D array-like with a ``shape`` attribute; ``X.shape[1]`` is passed on
        as ``num_features``.
    y : true labels (pandas Series, NumPy array or plain sequence).

    Returns
    -------
    Whatever ``custom_score`` returns for these inputs.

    Notes
    -----
    ``custom_score`` is not defined in this file (presumably it comes from the
    ``utils`` star import -- TODO confirm). ``top_k_reference`` is fixed at
    1000 here because the scorer signature leaves no room for extra arguments.
    """
    # Column 1 = second class reported by predict_proba (presumably the
    # positive class of a binary problem -- verify against the estimator).
    y_prob = estimator.predict_proba(X)[:, 1]
    # np.asarray accepts Series, ndarray and list alike; the previous
    # `y.to_numpy()` raised AttributeError for anything that was not pandas.
    y_true = np.asarray(y)
    return custom_score(y_true, y_prob, num_features=X.shape[1], top_k_reference=1000)

In [19]:
def metrics_top_and_global(y_true, y_prob, num_features=2,top_k=1000):
    """Compute the custom score plus top-k and global accuracy at a 0.5 threshold.

    Parameters
    ----------
    y_true : array-like of true labels (ndarray, pandas Series or list).
    y_prob : array-like of predicted probabilities, aligned positionally
        with ``y_true``.
    num_features : forwarded unchanged to ``custom_score``.
    top_k : number of highest-probability samples used for ``acc_top``; also
        forwarded to ``custom_score`` as ``top_k_reference``. Values larger
        than the sample count use every sample.

    Returns
    -------
    dict with keys
        ``"score"``      -- return value of ``custom_score``,
        ``"acc_top"``    -- accuracy over the ``top_k`` samples with the
                            highest ``y_prob`` (NaN when that set is empty),
        ``"acc_global"`` -- accuracy over all samples.

    Notes
    -----
    ``custom_score`` is not defined in this file (presumably it comes from the
    ``utils`` star import -- TODO confirm).
    """
    # Work on plain arrays so indexing below is always positional. With a
    # pandas Series carrying a non-default integer index, `y_true[idx]` would
    # look up *labels* and silently pick the wrong rows; lists would fail.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    score = custom_score(y_true, y_prob, num_features=num_features, top_k_reference=top_k)

    # Hard predictions at the 0.5 threshold, shared by both accuracies.
    y_pred_all = (y_prob >= 0.5).astype(int)
    acc_global = np.mean(y_true == y_pred_all)

    # Clamp k to [0, n]. A raw `[-top_k:]` slice with top_k == 0 would select
    # every sample instead of none.
    k = max(0, min(top_k, y_prob.shape[0]))
    if k == 0:
        acc_top = float("nan")
    else:
        top_k_idx = np.argsort(y_prob)[-k:]
        acc_top = np.mean(y_true[top_k_idx] == y_pred_all[top_k_idx])

    return {
        "score": score,
        "acc_top": acc_top,
        "acc_global": acc_global
    }


In [20]:
def evaluate_model_with_gridsearch(model, param_grid, X, y, n_splits=5, num_features=2,top_k=1000):
    """Tune ``model`` with a grid search, then score out-of-fold predictions.

    The model is wrapped in a ``StandardScaler -> clf`` pipeline, tuned with
    ``GridSearchCV`` using ``custom_scorer``, and the best pipeline is then
    passed to ``cross_val_predict`` on the same stratified folds to obtain
    probabilities for ``metrics_top_and_global``.

    Parameters
    ----------
    model : classifier placed in the pipeline step named ``"clf"``; it must
        expose ``predict_proba``.
    param_grid : dict, or list of dicts, keyed by the *model's own* parameter
        names; the ``"clf__"`` prefix is added here.
    X : feature matrix.
    y : labels (pandas Series, NumPy array or plain sequence).
    n_splits : number of stratified folds (shuffled, ``random_state=42``).
    num_features, top_k : forwarded to ``metrics_top_and_global``.

    Returns
    -------
    tuple ``(best_model, best_score, best_params, metrics)`` where the first
    three are ``grid.best_estimator_``, ``grid.best_score_`` and
    ``grid.best_params_`` (keys carry the ``"clf__"`` prefix) and ``metrics``
    is the dict returned by ``metrics_top_and_global``.

    Notes
    -----
    NOTE(review): the grid search is scored by ``custom_scorer``, which uses
    ``X.shape[1]`` and a fixed ``top_k_reference=1000``, whereas the final
    metrics use the ``num_features`` / ``top_k`` arguments given here, so
    ``best_score`` and ``metrics["score"]`` may not be directly comparable.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", model)
    ])

    # GridSearchCV accepts a single dict or a list of dicts; prefix both forms
    # so the keys address the "clf" pipeline step.
    if isinstance(param_grid, dict):
        prefixed_grid = {"clf__" + k: v for k, v in param_grid.items()}
    else:
        prefixed_grid = [
            {"clf__" + k: v for k, v in sub_grid.items()}
            for sub_grid in param_grid
        ]

    grid = GridSearchCV(
        estimator=pipe,
        param_grid=prefixed_grid,
        cv=skf,
        scoring=custom_scorer,
        n_jobs=-1
    )

    grid.fit(X, y)
    best_model = grid.best_estimator_
    # Out-of-fold probabilities for column 1 of predict_proba, on the same splitter.
    y_prob_all = cross_val_predict(best_model, X, y, cv=skf, method="predict_proba")[:, 1]
    # np.asarray instead of y.to_numpy(): a non-pandas `y` used to raise
    # AttributeError here, only after the whole grid search had already run.
    metrics = metrics_top_and_global(np.asarray(y), y_prob_all, num_features, top_k)

    return best_model, grid.best_score_, grid.best_params_, metrics
