In [2]:
# Imports
import mlcroissant as mlc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
# feature engineering
import tldextract
import Levenshtein

# logistic regression from sklearn
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc, classification_report
)

# svm and decision tree from sklearn
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import math

# XGBoost and Random Forest
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Neural network
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, Input, Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout, BatchNormalization, concatenate
from sklearn.utils import class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau

### Load Data Set

In [3]:
# Load the Croissant (JSON-LD) description of the Kaggle dataset
croissant_dataset = mlc.Dataset('https://www.kaggle.com/datasets/naveenbhadouria/malicious/croissant/download')

# Show which record sets the dataset exposes
record_sets = croissant_dataset.metadata.record_sets
print(record_sets)

# Materialise the first record set as a DataFrame
record_set_df = pd.DataFrame(croissant_dataset.records(record_set=record_sets[0].uuid))

# Shorten the column names; rename() returns a new frame, so record_set_df keeps the raw names
df = record_set_df.rename(columns={'malicious_phish1.csv/url' : 'url', 'malicious_phish1.csv/type' : 'type'})

# Any value that is still raw bytes is decoded as UTF-8; errors='replace' swaps
# undecodable bytes for a placeholder character instead of raising
for column in ('url', 'type'):
    df[column] = df[column].apply(
        lambda value: value.decode('utf-8', errors='replace') if isinstance(value, bytes) else value
    )

  -  [Metadata(Malicious_URL's_Dataset)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


[RecordSet(uuid="malicious_phish1.csv")]


### Feature Engineering
##### Feature extraction is wrapped in a function so the same code can be reused on a single URL (e.g. from the UI)

In [4]:
def extract_url_features(url):
    """Build the hand-crafted feature dictionary for a single URL.

    Parameters
    ----------
    url : any
        The URL; it is coerced with ``str()`` before any feature is computed.

    Returns
    -------
    dict
        Keys, in insertion order: the count features (``url_length``,
        ``num_digits``, ``num_periods``, ``num_slashes``, ``num_ats``,
        ``digit_len_ratio``, ``entropy``), the 0/1 regex flags (``has_html``,
        ``has_query_param``, ``has_https``, ``has_http``, ``has_ip_address``,
        ``has_non_ascii_chars``), ``has_suspicious_kw``, ``has_brand_kw``,
        ``tld`` (the ``suffix`` field of the tldextract result, a string),
        ``min_brand_dist`` and ``is_typosquat``.
    """
    url = str(url)
    lowered = url.lower()
    length = len(url)
    digit_count = sum(ch.isdigit() for ch in url)

    # Count-based features
    features = {
        'url_length': length,
        'num_digits': digit_count,
        'num_periods': url.count('.'),
        'num_slashes': url.count('/'),
        'num_ats': url.count('@'),
        'digit_len_ratio': digit_count / length if length > 0 else 0,
        'entropy': calc_entropy(url),
    }

    # 0/1 flags: does the (case-insensitive) pattern occur anywhere in the URL?
    flag_patterns = (
        ('has_html', r'\.html?'),
        ('has_query_param', r'\?query='),
        ('has_https', r'^https://'),
        ('has_http', r'^http://'),
        ('has_ip_address', r'://(?:\d{1,3}\.){3}\d{1,3}'),
        ('has_non_ascii_chars', r'[^\x00-\x7F]'),
    )
    features.update(
        (flag, int(re.search(regex, url, re.IGNORECASE) is not None))
        for flag, regex in flag_patterns
    )

    # Keyword flags, matched as substrings of the lower-cased URL
    suspicious_kw = ['login', 'secure', 'payment', 'verify']
    features['has_suspicious_kw'] = int(any(kw in lowered for kw in suspicious_kw))

    brands = ['google', 'paypal', 'microsoft', 'apple', 'amazon', 'netflix', 'facebook']
    features['has_brand_kw'] = int(any(brand in lowered for brand in brands))

    # Domain / TLD split; the raw TLD string is one-hot encoded by a later step
    parts = tldextract.extract(url)
    features['tld'] = parts.suffix

    # Smallest edit distance from the domain to any brand name, capped at 100
    domain_lower = parts.domain.lower()
    closest = min([100] + [Levenshtein.distance(domain_lower, brand) for brand in brands])
    features['min_brand_dist'] = closest

    # A near miss (distance 1 or 2) counts as a typosquat; an exact brand match does not
    features['is_typosquat'] = 1 if 0 < closest <= 2 else 0

    return features

# Calculate Shannon entropy - measures how random the url is (if its gibberish its probably bad)
def calc_entropy(text):
    """Return the Shannon entropy (bits per character) of ``str(text)``.

    Parameters
    ----------
    text : any
        Value whose string form is analysed. Falsy values (``''``, ``None``,
        ``0``) short-circuit to ``0``.

    Returns
    -------
    int or float
        ``0`` for falsy input, otherwise ``-sum(p * log2(p))`` over the
        relative frequency ``p`` of each distinct character.
    """
    if not text:
        return 0
    # Convert once and use the same string for both the counts and the length.
    # Previously the counts came from str(text) while the denominator was
    # len(text), which raised TypeError for non-string input such as an int.
    chars = str(text)
    total = len(chars)
    entropy = 0
    for count in Counter(chars).values():
        p = count / total
        entropy -= p * math.log2(p)
    return entropy

In [6]:
# 1. GENERATE FEATURES FOR ALL DATA
print("Generating features...")
# One feature dict per URL (see extract_url_features), assembled into a frame in one step
features_list = [extract_url_features(single_url) for single_url in df['url']]
df_features = pd.DataFrame(features_list)

# 2. HANDLE TLDs (Crucial Step for consistency)
# Keep the 20 most frequent TLDs. Note: the counts are taken over every row of
# df_features, not over a training subset.
tld_counts = df_features['tld'].value_counts()
top_tlds = tld_counts.nlargest(20).index.tolist()

# Function to handle TLD columns manually
def process_tlds(tld_series, top_tlds_list):
    """One-hot encode TLDs against a fixed vocabulary, with an 'other' bucket.

    Parameters
    ----------
    tld_series : pandas.Series
        One TLD value per row; each value is formatted into a column name as
        ``f'tld_{value}'``.
    top_tlds_list : list
        TLDs that get their own ``tld_<name>`` column, in this order.

    Returns
    -------
    pandas.DataFrame
        int64 frame indexed like ``tld_series`` with one column per entry of
        ``top_tlds_list`` followed by ``tld_other``. Every row has exactly one
        1: in its TLD's column if that TLD is in the vocabulary, otherwise in
        ``tld_other``.
    """
    # Vocabulary columns, de-duplicated; 'tld_other' is reserved for the
    # catch-all bucket and always placed last.
    known_cols = [
        col for col in dict.fromkeys(f'tld_{t}' for t in top_tlds_list)
        if col != 'tld_other'
    ]

    # Column name each row belongs to (same formatting as the vocabulary)
    row_cols = tld_series.map(lambda t: f'tld_{t}')

    # Build each indicator column in one vectorised comparison instead of
    # writing cells one at a time with .at. Plain arrays are used so that the
    # result is positional: rows sharing an index label stay independent.
    data = {col: (row_cols == col).to_numpy().astype('int64') for col in known_cols}
    data['tld_other'] = (~row_cols.isin(known_cols)).to_numpy().astype('int64')

    return pd.DataFrame(data, index=tld_series.index)

# One-hot encode every row's TLD against the top-TLD vocabulary
tld_dummies = process_tlds(df_features['tld'], top_tlds)

# 3. PREPARE NUMERIC INPUT (X_num)
# Swap the raw 'tld' string column for its indicator columns
numeric_part = df_features.drop(columns=['tld'])
X_num_raw = pd.concat([numeric_part, tld_dummies], axis=1)

# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/test split performed in a later cell.
scaler = StandardScaler().fit(X_num_raw)
X_num = scaler.transform(X_num_raw)

# 4. PREPARE TEXT INPUT (X_text)
# Character-level tokenizer with 'UNK' as its out-of-vocabulary token
tokenizer = Tokenizer(char_level=True, oov_token='UNK')
url_texts = df['url']
tokenizer.fit_on_texts(url_texts)
sequences = tokenizer.texts_to_sequences(url_texts)

# Every sequence is padded / truncated at the end to exactly max_len tokens
max_len = 200
X_text = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

Generating features...


### Create Labels & Split Data

In [None]:
# --- 1. PREPARE LABELS ---
# Binary Target (for XGBoost)
# No visible cell creates df['malicious'], so on a fresh kernel the lookup
# raised KeyError. Derive the column from 'type' when it is missing; an
# existing column is left untouched.
if 'malicious' not in df.columns:
    # assumes the non-malicious class is labelled exactly 'benign' in
    # df['type'] — TODO confirm against df['type'].unique()
    df['malicious'] = (df['type'] != 'benign').astype(int)
y_binary = df['malicious'].values

# Multiclass Target (for Neural Network)
# LabelEncoder maps each distinct 'type' string to an integer id, and
# to_categorical turns those ids into one-hot rows.
encoder = LabelEncoder()
y_multi_int = encoder.fit_transform(df['type'])
y_multi = to_categorical(y_multi_int)
num_classes = y_multi.shape[1]

# --- 2. SPLIT DATA ---
# All four arrays go through a single train_test_split call so that row i of
# every *_train / *_test array refers to the same URL.
X_num_train, X_num_test, \
X_text_train, X_text_test, \
y_bin_train, y_bin_test, \
y_multi_train, y_multi_test = train_test_split(
    X_num,
    X_text,
    y_binary,
    y_multi,
    test_size=0.2,
    random_state=42
)

print(f"Training Samples: {len(X_num_train)}")
print(f"Test Samples: {len(X_num_test)}")

In [None]:
# GridSearchCV and matplotlib.pyplot are already imported in the notebook's
# import cell, so they are not re-imported here.
print("Tuning XGBoost (This will take a few minutes)...")

# 1. Define the Hyperparameter Grid
param_grid_xg = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.05],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# 2. Set up Grid Search
# GridSearchCV's default refit=True retrains one final model with the best
# parameters on the whole of X_num_train once the search finishes.
# random_state is fixed because subsample / colsample_bytree < 1 make the
# training stochastic; without it the scores change from run to run.
# NOTE(review): use_label_encoder is deprecated / ignored by recent XGBoost
# releases — confirm the installed version and drop the argument if so.
grid_search = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_grid_xg,
    scoring='f1',
    cv=3,       # 3-fold internal validation
    n_jobs=-1,  # use all CPU cores
    verbose=1
)

# 3. Fit on the Training Data
grid_search.fit(X_num_train, y_bin_train)

# 4. Extract the Best Model
best_xgb = grid_search.best_estimator_
print(f"\nBest Parameters Found: {grid_search.best_params_}")

# 5. Evaluate on the held-out test split
y_pred_xgb = best_xgb.predict(X_num_test)

print("\nFinal XGBoost Evaluation:")
print(confusion_matrix(y_bin_test, y_pred_xgb))
print(classification_report(y_bin_test, y_pred_xgb))

# --- Feature Importance Visualization ---
# X_num is a bare numpy array, so column names are taken from X_num_raw, the
# DataFrame it was scaled from. This replaces a hand-maintained list that had
# to be kept in sync with extract_url_features and the TLD columns.
feature_names = list(X_num_raw.columns)

importances = best_xgb.feature_importances_
assert len(feature_names) == len(importances), (
    "feature_names does not match the number of model features"
)

# argsort is ascending, so the last 20 indices are the 20 largest importances
sorted_idx = np.argsort(importances)[-20:]
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh([feature_names[i] for i in sorted_idx], importances[sorted_idx])
ax.set_title("Top 20 Feature Importance (XGBoost)")
ax.set_xlabel("Importance Score")
plt.show()

# 6. UPDATE VARIABLE FOR SAVING
# Expose the tuned estimator under the name xgb_model for later cells
xgb_model = best_xgb

In [None]:
print("Training Neural Network (Multiclass)...")

def build_hybrid_model():
    """Build and compile the two-branch (numeric + character-CNN) classifier.

    Reads the notebook-level names ``X_num_train`` (numeric feature width),
    ``max_len`` (token sequence length), ``tokenizer`` (vocabulary size) and
    ``num_classes`` (output width).

    Returns
    -------
    Model
        Compiled Keras model taking ``[text_input, numeric_input]`` (in that
        order) and producing a softmax over ``num_classes``.
    """
    # --- Numeric branch: Dense -> BatchNorm -> Dropout ---
    n_numeric = X_num_train.shape[1]
    numeric_in = Input(shape=(n_numeric,), name='numeric_input')
    numeric_branch = layers.Dense(128, activation='relu')(numeric_in)
    numeric_branch = layers.BatchNormalization()(numeric_branch)
    numeric_branch = layers.Dropout(0.3)(numeric_branch)

    # --- Text branch: embedding followed by two 1-D convolutions ---
    text_in = Input(shape=(max_len,), name='text_input')

    # +1 because tokenizer indices start at 1 and 0 is the padding value
    vocab_size = len(tokenizer.word_index) + 1
    text_branch = layers.Embedding(input_dim=vocab_size, output_dim=32)(text_in)

    text_branch = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(text_branch)
    text_branch = layers.MaxPooling1D(pool_size=2)(text_branch)

    text_branch = layers.Conv1D(filters=128, kernel_size=5, activation='relu')(text_branch)
    text_branch = layers.GlobalMaxPooling1D()(text_branch)
    text_branch = layers.Dropout(0.3)(text_branch)

    # --- Merge both branches and classify ---
    joint = concatenate([numeric_branch, text_branch])
    head = layers.Dense(128, activation='relu')(joint)
    head = layers.Dropout(0.3)(head)
    class_probs = layers.Dense(num_classes, activation='softmax')(head)

    # Input order here fixes the order callers must use in fit/predict
    hybrid = Model(inputs=[text_in, numeric_in], outputs=class_probs)
    hybrid.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return hybrid

# Calculate class weights for imbalance
# (class_weight is already imported from sklearn.utils in the import cell)
y_multi_int_train = np.argmax(y_multi_train, axis=1)
present_classes = np.unique(y_multi_int_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=present_classes,
    y=y_multi_int_train
)
# Pair each weight with its actual class id. dict(enumerate(...)) is only
# correct when the classes present in the training split are exactly 0..k-1;
# if a class is absent, enumerate would attach weights to the wrong labels.
weights_dict = {
    int(label): float(weight)
    for label, weight in zip(present_classes, class_weights)
}

# Build and Train
nn_model = build_hybrid_model()

# Stop after 5 epochs without val_loss improvement and roll back to the best
# weights; halve the learning rate after 2 stagnant epochs, down to 1e-5.
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.00001)

history = nn_model.fit(
    [X_text_train, X_num_train],  # order must match Model(inputs=[text, numeric])
    y_multi_train,
    validation_split=0.2,
    epochs=20,  # upper bound; early stopping may end training sooner
    batch_size=32,
    callbacks=[early_stop, lr_scheduler],
    class_weight=weights_dict,
    verbose=1
)