In [None]:
import importlib
import subprocess
import sys
from utils.environment_specific import is_local_development

def install_if_missing(package_name, pip_name=None):
    """Import ``package_name``; pip-install it into the running interpreter if the import fails.

    Args:
        package_name: Importable module name to probe (e.g. ``"dotenv"``).
        pip_name: Distribution name handed to pip when it differs from the
            module name (e.g. ``"python-dotenv"``). Defaults to ``package_name``.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits non-zero.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        # Use this interpreter's pip so the package lands in the kernel's own environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package_name])
        # The import system caches finder results; refresh them so the freshly
        # installed package can be imported by this already-running process.
        importlib.invalidate_caches()

# Outside local development, make sure the packages this notebook imports are available,
# installing any that cannot be imported.
if not is_local_development():
    # The importable module is `dotenv`, but the pip distribution is `python-dotenv`.
    install_if_missing("dotenv", "python-dotenv")
    install_if_missing("onnxruntime")
    install_if_missing("tldextract")

In [None]:
import time
import random
import os

import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from sklearn.linear_model import LogisticRegression
import mlflow 

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from utils.dataset import load_public_dataset, load_full_private_df, split_df_by_folds
from utils.url_features import extract_all_vec
from utils.base_models import find_decision_threshold_maximizing_f1, log_persistent_performance
from utils.output import print_dict_level1_inline

RANDOM_STATE = 42

In [None]:
# Seed NumPy's global random generator with the notebook-wide seed.
np.random.seed(RANDOM_STATE)
# Also seed the standard-library generator, in case any stdlib code draws random numbers.
random.seed(RANDOM_STATE)

In [None]:
# Look for the environment file next to the notebook first; only if that does
# not load anything, fall back to the copy two directories up.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
# Later cells read their configuration from the environment, so stop here if
# neither file could be loaded.
assert loaded is True

In [None]:
# In local development bind `spark` to None so the name exists when get_dataset passes it on.
# NOTE(review): outside local development `spark` is not defined anywhere in this notebook;
# presumably the hosting environment injects it into the namespace — confirm.
if is_local_development():
    spark = None

In [None]:
def get_dataset(name):
    """Load one dataset and return its extracted URL features joined with the labels.

    Args:
        name: Dataset identifier. ``"private_data"`` is loaded with
            ``load_full_private_df`` (using the notebook-level ``spark``) and split
            with ``split_df_by_folds`` (fold 4 is the evaluation fold); every other
            name is passed to ``load_public_dataset``.

    Returns:
        dict with two DataFrames:
            ``"train"``: feature columns plus ``label`` and ``fold``.
            ``"test"``: feature columns plus ``label``.
    """
    if name != "private_data":
        df_train_urls, df_test_urls = load_public_dataset(name)
    else:
        df = load_full_private_df(spark)
        df_train_urls, df_test_urls = split_df_by_folds(
            df,
            train_folds=None,
            eval_folds=[4],
            shorten_string_train=None,
            shorten_string_eval=None,
            # Use the notebook-wide seed rather than repeating the literal, so the
            # split stays in sync if RANDOM_STATE is ever changed.
            seed=RANDOM_STATE,
        )

    df_train_features = extract_all_vec(df_train_urls["url"]).reset_index(drop=True)
    df_test_features = extract_all_vec(df_test_urls["url"]).reset_index(drop=True)

    # Both sides are re-indexed from 0 so the column-wise concat pairs rows by
    # position instead of by whatever index the source frames carried.
    df_train = pd.concat([df_train_features, df_train_urls[["label", "fold"]].reset_index(drop=True)], axis=1)
    df_test = pd.concat([df_test_features, df_test_urls[["label"]].reset_index(drop=True)], axis=1)

    return {"train": df_train, "test": df_test}

In [None]:
# Every dataset is loaded and featurized once up front; the model sections below
# iterate over this mapping. Each key must match the name passed to get_dataset.
datasets = {
    "grambeddings": get_dataset("grambeddings"),
    "kaggle_binary": get_dataset("kaggle_binary"),
    "kaggle_multiple": get_dataset("kaggle_multiple"),
    "mendeley": get_dataset("mendeley"),
    # Previously this entry loaded "mendeley" a second time, so every "joined"
    # result was a duplicate of the mendeley one.
    # NOTE(review): assumes load_public_dataset accepts the name "joined" — confirm.
    "joined": get_dataset("joined"),
    "private_data": get_dataset("private_data"),
}

In [0]:
def get_ready_dataset(name, val_fold=3):
    """Split a loaded dataset into train / validation / test feature and label sets.

    Args:
        name: Key into the notebook-level ``datasets`` mapping.
        val_fold: Value of the ``fold`` column whose training rows become the
            validation split. Defaults to 3.

    Returns:
        dict with ``X_train``/``y_train``, ``X_val``/``y_val``, ``X_test``/``y_test``
        (pandas objects) and ``X_train_val``/``y_train_val`` (NumPy arrays holding
        the train rows followed by the validation rows).

    Raises:
        AssertionError: If a required column is missing or no training row
            belongs to ``val_fold``.
    """
    df_train, df_test = datasets[name]["train"], datasets[name]["test"]

    assert "label" in df_test.columns, f"Missing 'label' in test data for {name}"
    for required in ("label", "fold"):
        assert required in df_train.columns, f"Missing '{required}' in train data for {name}"

    is_val = df_train["fold"] == val_fold
    df_val = df_train[is_val].reset_index(drop=True)
    df_train = df_train[~is_val].reset_index(drop=True)
    # An empty validation split would only fail later, inside scaling or the
    # hyper-parameter search, with a much less helpful message.
    assert len(df_val) > 0, f"No rows with fold == {val_fold} in train data for {name}"

    df_train = df_train.drop(columns=["fold"])
    df_val = df_val.drop(columns=["fold"])
    # The test frame may or may not carry a fold column.
    df_test = df_test.drop(columns=["fold"], errors="ignore")

    X_train, y_train = df_train.drop(columns=["label"]), df_train["label"]
    X_val, y_val = df_val.drop(columns=["label"]), df_val["label"]
    X_test, y_test = df_test.drop(columns=["label"]), df_test["label"]

    # np.concatenate returns plain arrays, so column names are not kept here.
    X_train_val = np.concatenate([X_train, X_val], axis=0)
    y_train_val = np.concatenate([y_train, y_val], axis=0)

    return {
        "X_train": X_train,
        "y_train": y_train,
        "X_val": X_val,
        "y_val": y_val,
        "X_test": X_test,
        "y_test": y_test,
        "X_train_val": X_train_val,
        "y_train_val": y_train_val,
    }

In [0]:
from sklearn.preprocessing import StandardScaler


def scale_dataset(data: dict) -> dict:
    """Standardize the feature matrices of a prepared dataset.

    Two independent ``StandardScaler`` instances are fitted:
    one on ``X_train`` (also applied to ``X_val``) for model selection, and one
    on ``X_train_val`` (also applied to ``X_test``) for the final model.

    Returns a new dict with the scaled feature matrices, the label entries
    passed through unchanged, and both fitted scalers under
    ``"scaler_train_val"`` and ``"scaler_whole"``.
    """
    features = {key: data[key] for key in ("X_train", "X_val", "X_test", "X_train_val")}
    targets = {key: data[key] for key in ("y_train", "y_val", "y_test", "y_train_val")}

    # Model-selection scaler: statistics come from the training rows only.
    selection_scaler = StandardScaler()
    train_scaled = selection_scaler.fit_transform(features["X_train"])
    val_scaled = selection_scaler.transform(features["X_val"])

    # Final-model scaler: statistics come from train + validation rows.
    final_scaler = StandardScaler()
    train_val_scaled = final_scaler.fit_transform(features["X_train_val"])
    test_scaled = final_scaler.transform(features["X_test"])

    return {
        "X_train": train_scaled,
        "y_train": targets["y_train"],
        "X_val": val_scaled,
        "y_val": targets["y_val"],
        "X_test": test_scaled,
        "y_test": targets["y_test"],
        "X_train_val": train_val_scaled,
        "y_train_val": targets["y_train_val"],
        "scaler_train_val": selection_scaler,
        "scaler_whole": final_scaler,
    }

# Models

In [0]:
def calculate_metrics(
    true_labels,
    probabilities,
    predictions,
):
    """
    Build the evaluation metrics for one set of predictions.

    true_labels   : 1-D array-like of ground-truth classes
    probabilities : array-like of shape (n_samples, n_classes)
    predictions   : 1-D array-like of predicted classes (cast to int)

    Returns the ``classification_report`` dict extended with
    "confusion_matrix" (nested lists) and, when there are exactly two
    probability columns, "roc_auc_score" computed from column 1.
    Raises ValueError when the three inputs differ in length.
    """
    y_true = np.asarray(true_labels)
    y_prob = np.asarray(probabilities)
    y_pred = np.asarray(predictions, dtype=int)

    lengths_match = len(y_true) == len(y_prob) == len(y_pred)
    if not lengths_match:
        raise ValueError("true_labels, probabilities and predictions must have equal length")

    is_binary = y_prob.shape[1] == 2

    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    # Stored as nested lists rather than as an ndarray.
    report["confusion_matrix"] = confusion_matrix(y_true, y_pred).tolist()

    if is_binary:
        report["roc_auc_score"] = roc_auc_score(y_true, y_prob[:, 1])

    return report


def predict(class_probs):
    """Return, for every row, the index of the column with the highest probability.

    ``class_probs`` must have at least two columns; ties resolve to the
    lowest column index.
    """
    n_columns = class_probs.shape[1]
    assert n_columns >= 2
    winners = np.argmax(class_probs, axis=1)
    return winners


def predict_with_threshold(class_probs, threshold):
    """Binary decision rule: label 1 where the column-1 probability is >= ``threshold``, else 0.

    ``class_probs`` must have exactly two columns.
    """
    n_columns = class_probs.shape[1]
    assert n_columns == 2
    positive_probs = class_probs[:, 1]
    is_positive = positive_probs >= threshold
    return is_positive.astype(int)


In [0]:
def _as_numpy(X):
    """Safely convert pandas or NumPy to ndarray for concatenation."""
    return X.values if isinstance(X, (pd.DataFrame, pd.Series)) else np.asarray(X)


def run_experiment(
    data: dict,
    model,
    param_distributions: dict,
    *,
    n_iter: int = 30,
    scoring: str = "f1_macro",
    random_state: int = 42,
    path_to_log: str,
):
    """Tune ``model`` with a randomized search, refit it on train + val and evaluate on test.

    Args:
        data: Mapping with ``X_train``/``y_train``, ``X_val``/``y_val``,
            ``X_test``/``y_test`` and ``X_train_val``/``y_train_val``.
        model: Estimator handed to ``RandomizedSearchCV``; it must provide
            ``predict_proba`` and ``predict``.
        param_distributions: Search space for ``RandomizedSearchCV``.
        n_iter: Number of parameter settings sampled by the search.
        scoring: Scoring name used to rank the sampled settings.
        random_state: Seed for the parameter sampling.
        path_to_log: Prefix under which metrics artifacts are logged.

    Returns:
        ``(to_store, best_model)``: ``to_store`` holds the best validation score,
        the best parameters, the test metrics at the default (argmax) decision
        and the measured inference speed; ``best_model`` is the best estimator
        re-fitted on ``X_train_val``.
    """
    X_train, y_train = _as_numpy(data["X_train"]), _as_numpy(data["y_train"])
    X_val, y_val = _as_numpy(data["X_val"]), _as_numpy(data["y_val"])
    X_test, y_test = _as_numpy(data["X_test"]), _as_numpy(data["y_test"])
    X_train_val, y_train_val = _as_numpy(data["X_train_val"]), _as_numpy(data["y_train_val"])

    # Single predefined split: rows marked -1 are only ever trained on, rows
    # marked 0 form the one validation fold.
    X_search = np.concatenate([X_train, X_val], axis=0)
    y_search = np.concatenate([y_train, y_val], axis=0)
    test_fold = np.concatenate([np.full(len(X_train), -1), np.zeros(len(X_val))])
    cv = PredefinedSplit(test_fold)

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv,
        random_state=random_state,
        verbose=1,
    )
    search.fit(X_search, y_search)
    print(f"\nBest val {scoring}: {search.best_score_:.4f}")
    print("Best params:", search.best_params_)

    # Fit the winning configuration on the caller-provided train + val matrix
    # (which may be scaled differently from X_search).
    best_model = search.best_estimator_
    best_model.fit(X_train_val, y_train_val)

    # evaluate on test
    y_probs_test = best_model.predict_proba(X_test)
    predictions = predict(y_probs_test)
    metrics = calculate_metrics(
        true_labels=y_test,
        probabilities=y_probs_test,
        predictions=predictions,
    )

    if y_probs_test.shape[1] == 2:
        class_1_probs = y_probs_test[:, 1]
        # NOTE(review): the threshold is selected using the test labels, so the
        # "best threshold" metrics are optimistic; consider choosing it on the
        # validation split instead.
        best_threshold = find_decision_threshold_maximizing_f1(probs=class_1_probs, labels=y_test)
        metrics_best = calculate_metrics(
            true_labels=y_test,
            probabilities=y_probs_test,
            predictions=predict_with_threshold(y_probs_test, best_threshold),
        )
        metrics_best["best_threshold"] = best_threshold
        prefix_path = f'{path_to_log}/'
        log_persistent_performance(
            metrics=metrics,
            best_threshold_metrics=metrics_best,
            true_labels=y_test,
            class_probabilities=y_probs_test,
            predictions=predictions,
            store_predictions=False,
            prefix=prefix_path,
        )
    else:
        # No threshold tuning for multi-class output. `metrics` is logged as-is:
        # the previous code aliased it as `metrics_best` and wrote a
        # "best_threshold" key into it, polluting the logged and returned test metrics.
        mlflow.log_dict(metrics, artifact_file=f"{path_to_log}/best_model_metrics.json")

    print_dict_level1_inline(metrics)

    # inference speed
    t0 = time.perf_counter()
    _ = best_model.predict(X_test)
    t_elapsed = time.perf_counter() - t0
    sps = len(X_test) / t_elapsed
    sps_inv = t_elapsed / len(X_test)
    print("\nInference speed")
    print(f"total time (s): {t_elapsed:.4f}")
    print(f"samples / sec : {sps:.2f}")
    print(f"sec / sample  : {sps_inv:.6f}")

    # pack results
    perf_score = {"n_per_s": sps, "s_per_1": sps_inv}
    to_store = {
        "best_val_score": search.best_score_,
        "best_params": search.best_params_,
        "test_metrics": metrics,
        "perf_score": perf_score,
    }
    return to_store, best_model

In [0]:
def store_sorted_feature_importance_dict(feature_importance_dict: dict, artifact_file: str):
    """Log feature importances to MLflow, ordered from most to least important.

    The artifact is a dict with a single "feature_importance" key holding a
    list of {"feature", "importance"} records; importances are cast to float.
    """
    ranked_pairs = sorted(
        feature_importance_dict.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    records = []
    for feature_name, importance in ranked_pairs:
        records.append({"feature": feature_name, "importance": float(importance)})

    mlflow.log_dict({"feature_importance": records}, artifact_file=artifact_file)

In [0]:
def get_path_to_log(model_name, dataset_name):
    """Announce the dataset, create ``<model_name>/<dataset_name>`` locally and return that path.

    The directory is created relative to the current working directory; an
    already existing directory is not an error.
    """
    banner = f"=== {dataset_name} ==="
    print(banner)
    log_dir = os.path.join(model_name, dataset_name)
    os.makedirs(log_dir, exist_ok=True)
    return log_dir

In [0]:
client = mlflow.MlflowClient()

# Folder under which the MLflow experiment is created; read from the environment.
experiments_folder_path = os.getenv("EXPERIMENTS_PATH")
if experiments_folder_path is None:
    # Fail with a clear message instead of the TypeError os.path.join raises on None.
    raise RuntimeError("EXPERIMENTS_PATH environment variable is not set")

if is_local_development():
    experiment_name = "feature_models"
else:
    experiment_name = os.getenv("EXPERIMENT_NAME")
    if experiment_name is None:
        raise RuntimeError("EXPERIMENT_NAME environment variable is not set")

experiment_path = os.path.join(experiments_folder_path, experiment_name)
mlflow.set_experiment(experiment_path)

In [0]:
# Open one MLflow run for all model sections below; it is closed by the
# mlflow.end_run() call in the notebook's last cell.
run = mlflow.start_run(run_name="Feature based models")
# Print the run id for reference.
print(run.info.run_id)

## Logistic regression

In [0]:
# Search space for LogisticRegression: 6 values of C x 2 values of max_iter = 12 settings.
logistic_regression_param_space = {
    # C is the inverse of the regularization strength
    "C": [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    "penalty": ["l2"],
    "solver": ["lbfgs"],
    "max_iter": [300, 800],
}

for ds_name in datasets.keys():
    path = get_path_to_log("logistic_regression", ds_name)

    # Keep the unscaled frames: their column names label the coefficients below,
    # while the scaled dataset holds the StandardScaler outputs.
    unscaled_dataset = get_ready_dataset(ds_name)
    dataset = scale_dataset(unscaled_dataset)
    # n_iter=3 samples only 3 of the settings in the search space.
    # NOTE(review): `max_iter` is also part of the search space, so the constructor's
    # max_iter=1000 is presumably overridden for every candidate — confirm intent.
    to_store, model = run_experiment(dataset, LogisticRegression(max_iter=1000, n_jobs=-1, class_weight="balanced"), logistic_regression_param_space, n_iter=3, path_to_log=path)

    feature_names = list(unscaled_dataset["X_train"].columns)
    coef = model.coef_
    # Importance = absolute coefficient; for a 2-D coefficient matrix the
    # absolute values are averaged over its rows to get one value per feature.
    if coef.ndim == 1:
        values = np.abs(coef)
    else:
        values = np.mean(np.abs(coef), axis=0)

    feature_importance_dict = dict(zip(feature_names, values))
    store_sorted_feature_importance_dict(
        feature_importance_dict,
        artifact_file=f"{path}/feature_importance.json"
    )

## Naive Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB

# Search space for GaussianNB: three candidate values for var_smoothing.
naive_bayes_param_space = {
    "var_smoothing": [1e-11, 1e-9, 1e-7],
}

for ds_name in datasets.keys():
    path = get_path_to_log("naive_bayes", ds_name)

    # Unlike the logistic-regression section, the features are used unscaled here.
    dataset = get_ready_dataset(ds_name)

    # n_iter=3 equals the number of candidate values in the search space.
    to_store, model = run_experiment(
        dataset,
        GaussianNB(),
        naive_bayes_param_space,
        n_iter=3,
        path_to_log=path
    )

## XGBoost

In [0]:
from xgboost import XGBClassifier

# Search space for XGBClassifier; run_experiment samples n_iter settings from it.
xgboost_param_space = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'min_child_weight': [1, 3, 5]
}

for ds_name in datasets.keys():
    path = get_path_to_log("xgboost", ds_name)

    # Features are used unscaled in this section.
    dataset = get_ready_dataset(ds_name)

    # Binary vs multi-class is decided from the labels of the training split only.
    y_train = dataset["y_train"]
    classes = np.unique(y_train)
    is_binary = len(classes) == 2

    if is_binary:
        # Class-imbalance weight = negatives / positives, counted on the training split.
        # NOTE(review): assumes binary labels are encoded as 0 and 1 — confirm.
        pos = np.sum(y_train == 1)
        neg = np.sum(y_train == 0)
        scale_pos_weight = neg / pos if pos > 0 else 1.0
        extra = {
            "objective": "binary:logistic",
            "scale_pos_weight": scale_pos_weight,
        }
    else:
        extra = {
            "objective": "multi:softprob",
            "num_class": len(classes),
        }

    to_store, model = run_experiment(
        dataset,
        XGBClassifier(
            tree_method="hist",
            # use_label_encoder=False,
            eval_metric="logloss",
            **extra,
        ),
        xgboost_param_space,
        n_iter=25,
        path_to_log=path,
    )

    # Pair the model's importances with the training-frame column names.
    feature_names = list(dataset["X_train"].columns)
    feature_importance_dict = dict(zip(feature_names, model.feature_importances_))

    store_sorted_feature_importance_dict(
        feature_importance_dict,
        artifact_file=f"{path}/feature_importance.json"
    )

In [0]:
# Close the MLflow run opened earlier with mlflow.start_run.
mlflow.end_run()