In [3]:

# Import necessary libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

# Read the cleaned dataset from disk
df_cleaned = pd.read_csv('../data/unseen/client_ceas_unseen_cleaned.csv')

# Columns pulled from the file: 23 in total, i.e. 22 predictors plus the
# 'label' target. Order matters, because it fixes the column order of X.
features = [
    # sender attributes
    'sender_domain_length',
    'sender_has_digits',
    'sender_has_special_chars',
    'sender_tld',
    'sender_is_public_domain',
    # receiver attributes
    'receiver_is_undisclosed',
    'receiver_is_public_domain',
    'sender_equals_receiver',
    # timing
    'email_hour',
    'is_weekend',
    # URL indicators
    'url_present',
    'url_has_ip',
    'url_has_special_chars',
    'url_has_redirect',
    'url_suspicious_tld',
    # text statistics
    'subject_length',
    'body_length',
    'text_combined_length',
    'uppercase_ratio',
    'exclamation_count',
    # target column (removed from X below)
    'label',
    # URL ratios
    'url_length_ratio',
    'url_density',
]

# X: every listed column except the target; y: the target column itself
X = df_cleaned.loc[:, features].drop('label', axis=1)
y = df_cleaned.loc[:, 'label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Wrap each partition (with its labels) in XGBoost's DMatrix container
dtrain, dval = (
    xgb.DMatrix(part_X, label=part_y)
    for part_X, part_y in ((X_train, y_train), (X_val, y_val))
)

# Booster hyperparameters for the native xgb.train API.
# NOTE: "n_estimators" is deliberately absent. It is a parameter of the
# scikit-learn wrapper and is ignored by xgb.train, where it only triggers a
# 'Parameters: { "n_estimators" } are not used.' warning. The number of
# boosting rounds is set exclusively through num_boost_round below.
params = {
    "objective": "binary:logistic",  # Binary classification, outputs P(class 1)
    "eval_metric": "logloss",        # Metric reported for each entry in `evals`
    "max_depth": 6,                  # Maximum depth of a tree
    "learning_rate": 0.1,            # Step size shrinkage
}

# Single source of truth for the number of boosting rounds
NUM_BOOST_ROUND = 100

# Train the booster; the validation log loss is printed after each round
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=[(dval, 'eval')],
)

# Score the validation set; with binary:logistic each value is P(class 1)
y_pred_prob = bst.predict(dval)

# Quick sanity check on the raw scores
print("Predicted probabilities (first 5 samples):", y_pred_prob[:5])

# Threshold the scores into hard 0/1 labels, then evaluate both views:
# accuracy on the hard labels, log loss on the probabilities
decision_threshold = 0.5
y_pred_class = (y_pred_prob > decision_threshold).astype(int)
accuracy, logloss = (
    accuracy_score(y_val, y_pred_class),
    log_loss(y_val, y_pred_prob),
)

print(f"Validation Accuracy: {accuracy}")
print(f"Validation Log Loss: {logloss}")

# Destination for the per-row class probabilities
probabilities_save_path = "../data/unseen/xgboost_probabilities.csv"

# Two-column table: P(class 0) is the complement of the predicted P(class 1)
probability_columns = {
    'prob_class_0': 1 - y_pred_prob,
    'prob_class_1': y_pred_prob,
}
probabilities_df = pd.DataFrame(probability_columns)

# Write without the positional index so the file holds only the two columns
probabilities_df.to_csv(probabilities_save_path, index=False)

print(f"✅ XGBoost probabilities saved to {probabilities_save_path}")


[0]	eval-logloss:0.61324
[1]	eval-logloss:0.54795
[2]	eval-logloss:0.49365
[3]	eval-logloss:0.44780
[4]	eval-logloss:0.40759
[5]	eval-logloss:0.37434
[6]	eval-logloss:0.34556
[7]	eval-logloss:0.31849
[8]	eval-logloss:0.29541
[9]	eval-logloss:0.27538
[10]	eval-logloss:0.25732
[11]	eval-logloss:0.24160
[12]	eval-logloss:0.22767
[13]	eval-logloss:0.21417
[14]	eval-logloss:0.20361
[15]	eval-logloss:0.19339
[16]	eval-logloss:0.18279
[17]	eval-logloss:0.17580
[18]	eval-logloss:0.16789
[19]	eval-logloss:0.16096
[20]	eval-logloss:0.15539
[21]	eval-logloss:0.15066
[22]	eval-logloss:0.14547
[23]	eval-logloss:0.14094
[24]	eval-logloss:0.13777
[25]	eval-logloss:0.13430
[26]	eval-logloss:0.13033
[27]	eval-logloss:0.12674
[28]	eval-logloss:0.12513
[29]	eval-logloss:0.12206
[30]	eval-logloss:0.11988
[31]	eval-logloss:0.11800
[32]	eval-logloss:0.11565
[33]	eval-logloss:0.11484
[34]	eval-logloss:0.11311
[35]	eval-logloss:0.11081
[36]	eval-logloss:0.10940
[37]	eval-logloss:0.10744
[38]	eval-logloss:0.10

Parameters: { "n_estimators" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[73]	eval-logloss:0.08833
[74]	eval-logloss:0.08802
[75]	eval-logloss:0.08809
[76]	eval-logloss:0.08762
[77]	eval-logloss:0.08788
[78]	eval-logloss:0.08777
[79]	eval-logloss:0.08714
[80]	eval-logloss:0.08700
[81]	eval-logloss:0.08655
[82]	eval-logloss:0.08651
[83]	eval-logloss:0.08661
[84]	eval-logloss:0.08645
[85]	eval-logloss:0.08609
[86]	eval-logloss:0.08535
[87]	eval-logloss:0.08553
[88]	eval-logloss:0.08544
[89]	eval-logloss:0.08527
[90]	eval-logloss:0.08552
[91]	eval-logloss:0.08534
[92]	eval-logloss:0.08554
[93]	eval-logloss:0.08559
[94]	eval-logloss:0.08548
[95]	eval-logloss:0.08538
[96]	eval-logloss:0.08503
[97]	eval-logloss:0.08461
[98]	eval-logloss:0.08471
[99]	eval-logloss:0.08426
Predicted probabilities (first 5 samples): [0.97605604 0.42612672 0.9972556  0.00110426 0.00177454]
Validation Accuracy: 0.9670658682634731
Validation Log Loss: 0.08425596146906676
✅ XGBoost probabilities saved to ../data/unseen/xgboost_probabilities.csv
