In [None]:
import importlib
import subprocess
import sys
from utils.environment_specific import is_local_development

def install_if_missing(package_name, pip_name=None):
    """Import ``package_name``; if that fails, pip-install it for this interpreter.

    Args:
        package_name: Importable module name used to probe whether the package
            is already available.
        pip_name: Requirement string handed to ``pip install``. When omitted
            (or falsy), ``package_name`` is used instead.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
    """
    try:
        importlib.import_module(package_name)
    except ImportError:
        # Invoke pip through the running interpreter so the package is
        # installed into the same environment this process uses.
        subprocess.check_call([sys.executable, "-m", "pip", "install", pip_name or package_name])
        # The import system caches directory listings; refresh them so the
        # freshly installed package can be imported without a restart.
        importlib.invalidate_caches()

# Outside local development, install whichever of these packages cannot be
# imported. Each entry is (importable module name, pip requirement); a pip
# requirement of None makes install_if_missing reuse the module name.
if not is_local_development():
    for _module_name, _pip_requirement in (
        ("dotenv", "python-dotenv"),
        ("onnxruntime", "onnxruntime-gpu==1.17.0"),
        ("tldextract", None),
    ):
        install_if_missing(_module_name, _pip_requirement)

In [None]:
import random
import os

import numpy as np
from dotenv import load_dotenv
import mlflow 

from utils.dataset import load_public_dataset, load_full_private_df, split_df_by_folds
from utils.base_models import find_decision_threshold_maximizing_f1
from utils.output import print_dict_level1_inline

RANDOM_STATE = 42

In [None]:
# Seed both global RNGs from the shared constant so reruns are repeatable.
# The standard-library generator is seeded too, in case any library draws from it.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [None]:
# Try the .env file in the working directory first, then the one two levels
# up; `or` short-circuits, so the fallback only runs when the first load fails.
loaded = load_dotenv(".env") or load_dotenv("../../.env")
# Stop the notebook here when neither location produced a successful load.
assert loaded is True

In [None]:
# For local development, bind `spark` to None so the name exists;
# get_dataset() passes `spark` to load_full_private_df for "private_data".
# NOTE(review): outside local development `spark` is presumably injected by
# the notebook runtime — confirm, otherwise that branch raises NameError.
if is_local_development():
    spark = None

In [None]:
def get_dataset(name):
    """Load the train/test split for the dataset called ``name``.

    ``"private_data"`` is read through ``load_full_private_df`` using the
    module-level ``spark`` object and split with ``split_df_by_folds``
    (evaluation fold 4, seed 42). Every other name is forwarded to
    ``load_public_dataset``.

    Returns:
        A dict with the keys ``"train"`` and ``"test"``.
    """
    if name == "private_data":
        private_df = load_full_private_df(spark)
        train_part, test_part = split_df_by_folds(
            private_df,
            train_folds=None,
            eval_folds=[4],
            shorten_string_train=None,
            shorten_string_eval=None,
            seed=42,
        )
        return dict(train=train_part, test=test_part)

    train_part, test_part = load_public_dataset(name)
    return dict(train=train_part, test=test_part)

In [None]:
# Map each dataset name to its {"train": ..., "test": ...} split, loading them
# in the listed order. Uncomment "private_data" to include the private dataset.
datasets = {
    dataset_key: get_dataset(dataset_key)
    for dataset_key in (
        "grambeddings",
        "kaggle_binary",
        "kaggle_multiple",
        "mendeley",
        "joined",
        # "private_data",
    )
}

In [None]:
from baselines_code.URLNet.train import get_default_train_args, run_training
from baselines_code.URLNet.test import get_default_test_args
from utils.base_models import calculate_metrics_binary, log_persistent_performance



In [None]:
client = mlflow.MlflowClient()

# Folder under which the experiment is registered; read from the environment
# (populated from the .env file loaded earlier in the notebook).
experiments_folder_path = os.getenv("EXPERIMENTS_PATH")
if experiments_folder_path is None:
    # Fail with a clear message instead of the TypeError os.path.join would
    # raise on a None component.
    raise RuntimeError("Environment variable EXPERIMENTS_PATH is not set.")

if is_local_development():
    # Local runs always use this fixed experiment name.
    experiment_name = "feature_models"
else:
    experiment_name = os.getenv("EXPERIMENT_NAME")
    if experiment_name is None:
        raise RuntimeError("Environment variable EXPERIMENT_NAME is not set.")

# Select the experiment that runs started later in the notebook are logged to.
experiment_path = os.path.join(experiments_folder_path, experiment_name)
mlflow.set_experiment(experiment_path)

In [None]:
def run_pipeline(train_df, test_df, dataset_name, emb_mode=5):
    """Train and evaluate URLNet on one dataset, then log the resulting metrics.

    Args:
        train_df: Training data, forwarded to ``run_training`` as ``df_train``.
        test_df: Evaluation data, forwarded to ``run_training`` as ``df_test``.
        dataset_name: Dataset label; ``"<dataset_name>/"`` is used as the
            prefix when the performance is logged.
        emb_mode: Value assigned to ``model_emb_mode`` on both the train and
            the test args. Defaults to 5, the value this notebook previously
            hard-coded.
            NOTE(review): presumably selects the URLNet embedding variant —
            confirm against the URLNet argument definitions.

    Returns:
        None. Metrics are printed and handed to ``log_persistent_performance``.
    """
    train_args = get_default_train_args()
    train_args.model_emb_mode = emb_mode
    test_args = get_default_test_args()
    test_args.model_emb_mode = emb_mode

    result = run_training(df_train=train_df, df_test=test_df, train_args=train_args, test_args=test_args)

    y_true = result["targets"]
    y_pred = result["predictions"]
    y_probs = result["probabilities"]

    # Quick sanity output: how often each distinct prediction value occurs,
    # plus the first few probability rows.
    _, counts = np.unique(y_pred, return_counts=True)
    print(f"pred counts: {counts}")
    print(f"probs: {y_probs[:5]}")

    # Assuming y_probs is an array of shape (n_samples, 2) for binary
    # classification; column 1 is used as the class-1 probability.
    class1_p = y_probs[:, 1]

    metrics = calculate_metrics_binary(y_true, y_probs, y_pred)
    # Carry over the throughput and model-size figures reported by training.
    for result_key in ("n_per_s_total", "n_per_s_batched", "params_count"):
        metrics[result_key] = result[result_key]

    # Re-score the same probabilities using the threshold that maximizes F1.
    best_thr = find_decision_threshold_maximizing_f1(class1_p, y_true)
    metrics["best_decision_threshold"] = best_thr
    alt_metrics = calculate_metrics_binary(y_true, y_probs, (class1_p >= best_thr).astype(int))
    print_dict_level1_inline(metrics)

    prefix_path = f"{dataset_name}/"
    log_persistent_performance(
        metrics=metrics,
        best_threshold_metrics=alt_metrics,
        true_labels=y_true,
        class_probabilities=y_probs,
        predictions=y_pred,
        prefix=prefix_path,
        # store_predictions=True,
    )

In [None]:
# One MLflow run named "URLNet" covers every dataset; the run id is printed
# before the pipeline is executed for each dataset in turn.
with mlflow.start_run(run_name="URLNet") as run:
    print(run.info.run_id)
    for dataset_name, dataset_splits in datasets.items():
        run_pipeline(dataset_splits["train"], dataset_splits["test"], dataset_name)

In [None]:
# Explicitly end the active MLflow run.
# NOTE(review): the `with mlflow.start_run(...)` block in the previous cell
# presumably already ends its run on exit, which would make this call a
# safety net only — confirm against the MLflow documentation.
mlflow.end_run()