In [10]:
import catboost as cb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

from catboost import CatBoostClassifier, EShapCalcType, EFeaturesSelectionAlgorithm
from catboost import Pool

import mlflow

import optuna

#paper->https://doi.org/10.1016/j.ins.2019.01.064


In [11]:
df = pd.read_csv("Phishing.csv")
df.isnull().sum()
df.shape

(10000, 50)

In [12]:
X = df.loc[:, ~df.columns.isin(["CLASS_LABEL", "id"])]
y = df.loc[:, df.columns =="CLASS_LABEL"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y, 
    stratify=y,
    random_state=42
)

## Without feature selection training the model

In [14]:
model = CatBoostClassifier(iterations=2000, random_seed=42)

In [15]:
model.fit(X_train, y_train)

Learning rate set to 0.012899
0:	learn: 0.6719728	total: 7.68ms	remaining: 15.4s
1:	learn: 0.6515904	total: 26.2ms	remaining: 26.1s
2:	learn: 0.6330576	total: 30.3ms	remaining: 20.1s
3:	learn: 0.6128296	total: 43.3ms	remaining: 21.6s
4:	learn: 0.5955118	total: 47.1ms	remaining: 18.8s
5:	learn: 0.5787113	total: 55.9ms	remaining: 18.6s
6:	learn: 0.5636739	total: 69.6ms	remaining: 19.8s
7:	learn: 0.5494391	total: 80ms	remaining: 19.9s
8:	learn: 0.5344159	total: 102ms	remaining: 22.5s
9:	learn: 0.5204508	total: 113ms	remaining: 22.5s
10:	learn: 0.5080630	total: 125ms	remaining: 22.6s
11:	learn: 0.4933179	total: 140ms	remaining: 23.2s
12:	learn: 0.4810719	total: 151ms	remaining: 23s
13:	learn: 0.4693515	total: 160ms	remaining: 22.7s
14:	learn: 0.4573107	total: 166ms	remaining: 21.9s
15:	learn: 0.4474812	total: 173ms	remaining: 21.5s
16:	learn: 0.4357476	total: 181ms	remaining: 21.1s
17:	learn: 0.4248280	total: 192ms	remaining: 21.1s
18:	learn: 0.4151850	total: 197ms	remaining: 20.6s
19:	lea

<catboost.core.CatBoostClassifier at 0x7f3c30b5bb80>

In [16]:
y_pred = model.predict(X_test)

In [17]:
roc_auc_score(y_test, y_pred)

0.9876

In [18]:
accuracy_score(y_test, y_pred)

0.9876

In [19]:
confusion_matrix(y_test, y_pred)

array([[1232,   18],
       [  13, 1237]])

## Feature selection

In [20]:
feature_names = list(X.columns)

train_pool = Pool(
    X_train,
    y_train,
    feature_names=feature_names
)
test_pool = Pool(
    X_test,
    y_test,
    feature_names=feature_names
)

In [21]:
def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1):
    print('Algorithm:', algorithm)
    model = CatBoostClassifier(iterations=2000, random_seed=42)
    summary = model.select_features(
        train_pool,
        eval_set=test_pool,
        features_for_select=list(range(train_pool.num_col())),
        num_features_to_select=16,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=True,
        logging_level='Silent',
        plot=True
    )
    print('Selected features:', summary['selected_features_names'])
    return summary


In [22]:
shap_summary = select_features(
    algorithm = EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    steps=10
)

Algorithm: EFeaturesSelectionAlgorithm.RecursiveByShapValues


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Selected features: ['NumDots', 'PathLevel', 'NumDash', 'NoHttps', 'PathLength', 'QueryLength', 'NumSensitiveWords', 'PctExtHyperlinks', 'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms', 'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch', 'SubmitInfoToEmail', 'ExtMetaScriptLinkRT', 'PctExtNullSelfRedirectHyperlinksRT']


In [23]:
shap_summary["selected_features"]

[0, 2, 4, 14, 21, 22, 24, 26, 27, 28, 29, 33, 34, 38, 46, 47]

In [24]:
important_features = []

for feat in shap_summary['selected_features']:
    important_features.append(feature_names[feat])

In [25]:
important_features

['NumDots',
 'PathLevel',
 'NumDash',
 'NoHttps',
 'PathLength',
 'QueryLength',
 'NumSensitiveWords',
 'PctExtHyperlinks',
 'PctExtResourceUrls',
 'ExtFavicon',
 'InsecureForms',
 'PctNullSelfRedirectHyperlinks',
 'FrequentDomainNameMismatch',
 'SubmitInfoToEmail',
 'ExtMetaScriptLinkRT',
 'PctExtNullSelfRedirectHyperlinksRT']

## Training the model to see how well it predicts

In [26]:
X2 = X.loc[:, X.columns.isin(important_features)]
y2 = y

In [27]:
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2, 
    stratify=y2,
    random_state=42
)

In [28]:
model = CatBoostClassifier(iterations=2000, random_seed=42)

In [29]:
model.fit(X2_train, y2_train)

Learning rate set to 0.012899
0:	learn: 0.6723839	total: 12.1ms	remaining: 24.1s
1:	learn: 0.6520877	total: 27.7ms	remaining: 27.7s
2:	learn: 0.6317021	total: 39.9ms	remaining: 26.6s
3:	learn: 0.6152184	total: 59.6ms	remaining: 29.8s
4:	learn: 0.5970065	total: 65ms	remaining: 25.9s
5:	learn: 0.5813629	total: 81.8ms	remaining: 27.2s
6:	learn: 0.5647777	total: 88ms	remaining: 25s
7:	learn: 0.5488160	total: 91.9ms	remaining: 22.9s
8:	learn: 0.5324009	total: 110ms	remaining: 24.3s
9:	learn: 0.5169699	total: 113ms	remaining: 22.5s
10:	learn: 0.5020042	total: 116ms	remaining: 20.9s
11:	learn: 0.4887251	total: 119ms	remaining: 19.7s
12:	learn: 0.4765475	total: 124ms	remaining: 18.9s
13:	learn: 0.4646858	total: 128ms	remaining: 18.2s
14:	learn: 0.4516929	total: 134ms	remaining: 17.8s
15:	learn: 0.4414279	total: 139ms	remaining: 17.2s
16:	learn: 0.4299810	total: 143ms	remaining: 16.6s
17:	learn: 0.4193732	total: 149ms	remaining: 16.4s
18:	learn: 0.4098169	total: 157ms	remaining: 16.4s
19:	learn

<catboost.core.CatBoostClassifier at 0x7f3c0bfc8ca0>

In [30]:
y2_pred = model.predict(X2_test)

In [31]:
roc_auc_score(y2_test, y2_pred)

0.984

In [32]:
accuracy_score(y2_test, y2_pred)

0.984

In [33]:
confusion_matrix(y2_test, y2_pred)

array([[1230,   20],
       [  20, 1230]])

## HyperParameter Tuning

In [34]:
mlflow.set_tracking_uri("http://localhost:5000")

from mlflow.models import infer_signature

def objective(trial):
    with mlflow.start_run(nested=True):
        params = {
        "iterations": trial.suggest_int("iterations", 2000, 3000),
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-3, 1e-1, log=True
        ),
        "depth": trial.suggest_int(
            "depth", 1, 10
        ),
        "min_data_in_leaf": trial.suggest_int(
            "min_data_in_leaf",
            1, 100
        )
    }
    
    
        # enable pruning by setting a callback
        pruning_callback = optuna.integration.CatBoostPruningCallback(trial, "Logloss")
    
        model = cb.CatBoostClassifier(**params, silent=True, random_seed=42)
    
        model.fit(X2_train,y2_train,
              eval_set=[(X2_test, y2_test)],
              verbose=0,
              callbacks=[pruning_callback])
        # predict
        y_pred = model.predict(X2_test)
        y_pred = y_pred.astype(int)
    
        # roc and auc
        roc_auc = roc_auc_score(y2_test, y_pred)
        
        # log to mlflow
        mlflow.log_params(params)
        mlflow.log_metric("roc_auc", roc_auc)
    
    return roc_auc

In [35]:
run_name="sixth_run"


def get_or_create_experiment(experiment_name):
    """
    Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist.

    This function checks if an experiment with the given name exists within MLflow.
    If it does, the function returns its ID. If not, it creates a new experiment
    with the provided name and returns its ID.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.

    Returns:
    - str: ID of the existing or newly created MLflow experiment.
    """

    if experiment := mlflow.get_experiment_by_name(experiment_name):
        return experiment.experiment_id
    else:
        return mlflow.create_experiment(experiment_name)
    
experiment_id = get_or_create_experiment("PhishingExperiment")


with mlflow.start_run(experiment_id=experiment_id, run_name=run_name, nested=True):
    study = optuna.create_study(
    direction="maximize",
    pruner = optuna.pruners.MedianPruner(),
    study_name="hyperparameter_optimization_catboost"
    )
    study.optimize(objective, n_trials=30)
    
    
    mlflow.log_params(study.best_params)
    mlflow.log_metric("best_roc_auc", study.best_value)
    
    #logs tags
    
    mlflow.set_tags(
        tags={
            "project": "Phishing Detection project",
            "optimizer_engine": "Optuna",
            "model_family": "catboost",
            "feature_set_version": 1
            
        }
    )
    
    # log the model instanc
    model = cb.CatBoostClassifier(**study.best_params, silent=True)
    model.fit(X2_train, y2_train)
    
    
    artifact_path="model"
    
    predict = model.predict(X2_test)
    signature = infer_signature(X2_test, predict)
    
    mlflow.catboost.log_model(
        cb_model=model,
        artifact_path=artifact_path,
        signature=signature,
        input_example=X2_train.iloc[[0]],
        metadata={"model_data_version": 1}
    )
    
    model_uri = mlflow.get_artifact_uri(artifact_path)

[I 2023-12-18 15:09:28,541] A new study created in memory with name: hyperparameter_optimization_catboost

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[I 2023-12-18 15:10:01,559] Trial 0 finished with value: 0.9836 and parameters: {'iterations': 2811, 'learning_rate': 0.0031270020388982364, 'depth': 9, 'min_data_in_leaf': 37}. Best is trial 0 with value: 0.9836.

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[I 2023-12-18 15:10:25,620] Trial 1 finished with value: 0.9832000000000001 and parameters: {'iterations': 2745, 'learning_rate': 0.006216152131842099, 'depth': 8, 'min_data_in_leaf': 69}. Best is trial 0 with value: 0.9836.

CatBoostPruningCallback is experimental (supported from v3.0.0). The interface can change in the future.

[I 2023-12-18 15:11:02,753] Trial 2 finished with value: 0.9812 and parameters: {'iterations': 2504, 'learning_rate': 0.02732393704217

In [36]:
study.best_params

{'iterations': 2709,
 'learning_rate': 0.003722236625796399,
 'depth': 8,
 'min_data_in_leaf': 37}

In [37]:
study.best_value

0.9840000000000001

In [38]:
model_uri

'mlflow-artifacts:/472960832933254073/726c095950d144b682f4fa6b7ea2a8cf/artifacts/model'