In [None]:
import json
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Modeling

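In this notebook, we build a simple model for demonstration purposes: a logistic-regression pipeline trained and evaluated on the phishing train/test splits, then saved to disk.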

## Constants & Hyperparameters

In [None]:
# Input locations: directory holding the split CSVs, plus the two files read below.
DATA_SPLIT_DIR = Path("..") / "data" / "split"
DATA_TRAIN_FILEPATH = DATA_SPLIT_DIR.joinpath("phishing_train.csv")
DATA_TEST_FILEPATH = DATA_SPLIT_DIR.joinpath("phishing_test.csv")

# Output locations: the serialized model is written under MODELS_DIR.
OUTPUTS_DIR = Path("..") / "outputs"
MODELS_DIR = OUTPUTS_DIR.joinpath("models")

## Dataset

In [None]:
# Read the train and test CSVs (in that order) into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (DATA_TRAIN_FILEPATH, DATA_TEST_FILEPATH)
)

# Report (rows, columns) for each split as a quick sanity check.
print(f"Train size: {df_train.shape}\nTest size: {df_test.shape}")

Train size: (9080, 24)
Test size: (2270, 24)


In [None]:
df_train.head()

Unnamed: 0,month,ext,urlLength,numDigits,numParams,num_%20,num_@,entropy,has_ip,hasHttp,...,bodyLength,numTitles,numImages,numLinks,specialChars,scriptLength,sbr,bscr,sscr,target
0,1,com,98,2,0,0,0,-4.356572,0,True,...,336430,0,36,58,86530,331704,0.985953,0.257201,3.066719,0
1,7,com,121,7,0,0,0,-4.515268,0,True,...,46050,19,3,248,10656,11141,0.241933,0.231401,1.045514,0
2,1,net,101,29,0,0,0,-4.113054,0,True,...,71,1,0,0,14,0,0.0,0.197183,0.0,0
3,3,com,99,11,0,0,0,-4.373665,0,True,...,87682,1,0,20,20368,16184,0.184576,0.232294,0.635664,0
4,12,other,85,19,0,0,0,-4.458455,0,True,...,16524,48,35,176,6553,11620,0.619211,0.300169,2.062872,0


In [None]:
df_test.head()

Unnamed: 0,month,ext,urlLength,numDigits,numParams,num_%20,num_@,entropy,has_ip,hasHttp,...,bodyLength,numTitles,numImages,numLinks,specialChars,scriptLength,sbr,bscr,sscr,target
0,3,com,100,10,0,0,0,-4.566468,0,True,...,32897,12,43,200,8872,19447,0.591148,0.26969,1.753562,0
1,6,other,116,3,0,0,0,-4.494696,0,False,...,293,1,0,0,45,0,0.0,0.153584,0.0,0
2,3,html,90,7,0,0,0,-4.148278,0,True,...,1759,0,0,1,578,1588,0.902786,0.328596,2.197924,0
3,8,other,95,8,0,0,0,-4.397216,0,True,...,169,2,0,0,29,0,0.0,0.171598,0.0,0
4,6,com,123,2,0,0,0,-4.334384,0,True,...,0,0,0,0,0,0,0.0,0.0,0.0,0


In [None]:
# Separate the "target" label column from the remaining feature columns, per split.
y_train = df_train["target"]
X_train = df_train.drop(columns="target")

y_test = df_test["target"]
X_test = df_test.drop(columns="target")

## Define Pipeline

In [None]:
# Three-step pipeline:
#   1. "transformer": one-hot encode the categorical "ext" column and pass every
#      other column through unchanged.
#   2. "scaler": standardize all resulting columns.
#   3. "classifier": logistic regression on the scaled features.
#
# handle_unknown="ignore" makes the encoder emit an all-zero one-hot row for an
# "ext" value that was not present when fitting, instead of raising at predict
# time. Encoding of categories seen during fit is unaffected.
model = Pipeline(
    steps=[
        (
            "transformer",
            ColumnTransformer(
                transformers=[
                    ("encoder", OneHotEncoder(handle_unknown="ignore"), ["ext"]),
                ],
                remainder="passthrough",
            ),
        ),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression()),
    ]
)

In [None]:
# Show the assembled (not yet fitted) pipeline as the cell output.
model

## Model Training & Evaluation

In [None]:
# Fit the whole pipeline (encoder, scaler, classifier) on the training split only.
model.fit(X_train, y_train)

In [None]:
# Predict labels for both splits with the fitted pipeline (train first, then test).
preds_train, preds_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [None]:
# Build the per-class precision/recall/F1 report for each split, then print both
# together, separated by a blank line.
report_train = classification_report(y_train, preds_train)
report_test = classification_report(y_test, preds_test)

print(
    "Training performances..",
    report_train,
    "",
    "Test performances..",
    report_test,
    sep="\n",
)

Training performances..
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      8803
           1       0.75      0.22      0.34       277

    accuracy                           0.97      9080
   macro avg       0.86      0.61      0.66      9080
weighted avg       0.97      0.97      0.97      9080


Test performances..
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2197
           1       0.84      0.22      0.35        73

    accuracy                           0.97      2270
   macro avg       0.91      0.61      0.67      2270
weighted avg       0.97      0.97      0.97      2270



## Save Model

In [None]:
# Create the models directory together with any missing parents. exist_ok=True
# makes this a no-op when the directory is already there, so the cell is safe to
# re-run and avoids the separate exists()-then-mkdir() check.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Verify the path is actually a directory (not merely "something exists there")
# before the model is written into it.
assert MODELS_DIR.is_dir()

In [None]:
# Serialize the fitted pipeline to MODELS_DIR / "model.joblib"; the value returned
# by joblib.dump is shown as the cell output.
joblib.dump(model, MODELS_DIR / "model.joblib")

['../outputs/models/model.joblib']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8f5c09b4-3349-4c4e-9128-93e08a4345f5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>