In [1]:
!pip install catboost imbalanced-learn pytorch-tabnet

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3->pytorch-tabnet)
  Downloading nvidia_cublas_c

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb


In [70]:
X_train,y_train,X_test,y_test = pd.read_csv("/content/X_train.csv"),pd.read_csv("/content/y_train.csv"),pd.read_csv("/content/X_test.csv"),pd.read_csv("/content/y_test.csv")
features = ['purchase_bins_encoded', 'weekday_encoded',
       'email_text_encoded', 'email_version_encoded', 'user_country_encoded','hour','opened']
X_train['opened'] = y_train['opened']
X_test['opened'] = y_test['opened']

In [71]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score, precision_score, recall_score, roc_auc_score


In [72]:
cat_features = features

# Initialize the CatBoost model
model = CatBoostClassifier(
    iterations=5000,
    learning_rate=0.01,
    depth=6,
    l2_leaf_reg=3.0,
    random_strength=1.5,
    bagging_temperature=1.0,
    early_stopping_rounds=50,
    class_weights=[1, 20],

    use_best_model=True,
    cat_features=cat_features,
    random_seed=42,
    verbose=10
)




In [73]:
model.fit(X_train, y_train['clicked'], eval_set=(X_test, y_test['clicked']))

0:	learn: 0.6772231	test: 0.6772213	best: 0.6772213 (0)	total: 104ms	remaining: 8m 38s
10:	learn: 0.5427084	test: 0.5427819	best: 0.5427819 (10)	total: 870ms	remaining: 6m 34s
20:	learn: 0.4474937	test: 0.4475933	best: 0.4475933 (20)	total: 1.61s	remaining: 6m 21s
30:	learn: 0.3801073	test: 0.3802221	best: 0.3802221 (30)	total: 2.26s	remaining: 6m 2s
40:	learn: 0.3320675	test: 0.3322302	best: 0.3322302 (40)	total: 2.92s	remaining: 5m 53s
50:	learn: 0.2980396	test: 0.2982186	best: 0.2982186 (50)	total: 3.48s	remaining: 5m 37s
60:	learn: 0.2734212	test: 0.2736167	best: 0.2736167 (60)	total: 4.13s	remaining: 5m 34s
70:	learn: 0.2554140	test: 0.2556462	best: 0.2556462 (70)	total: 4.77s	remaining: 5m 31s
80:	learn: 0.2421723	test: 0.2424091	best: 0.2424091 (80)	total: 5.3s	remaining: 5m 22s
90:	learn: 0.2322092	test: 0.2324724	best: 0.2324724 (90)	total: 5.95s	remaining: 5m 21s
100:	learn: 0.2247276	test: 0.2250022	best: 0.2250022 (100)	total: 6.54s	remaining: 5m 17s
110:	learn: 0.2190193	t

<catboost.core.CatBoostClassifier at 0x7b4330ae0550>

In [74]:
y_pred_catb = model.predict(X_test)
f1 = f1_score(y_test['clicked'], y_pred_catb)
print("F1 Score on test set:", f1)


F1 Score on test set: 0.33212996389891697


In [75]:
from sklearn.metrics import classification_report
print(classification_report(y_test['clicked'], y_pred_catb))


              precision    recall  f1-score   support

           0       1.00      0.92      0.96     19576
           1       0.20      0.98      0.33       424

    accuracy                           0.92     20000
   macro avg       0.60      0.95      0.64     20000
weighted avg       0.98      0.92      0.94     20000



In [76]:
dtrain = xgb.DMatrix(X_train, label=y_train['clicked'])
dtest = xgb.DMatrix(X_test, label=y_test['clicked'])

In [77]:
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 7,
    'eta': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'lambda': 1.0,
    'alpha': 0.5,
    'seed': 42,
    'scale_pos_weight': 20,

}


In [78]:
watchlist = [(dtrain, 'train'), (dtest, 'test')]

# Train with early stopping
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50

)

[0]	train-logloss:0.38913	test-logloss:0.38927
[50]	train-logloss:0.16126	test-logloss:0.16505
[100]	train-logloss:0.13711	test-logloss:0.14483
[150]	train-logloss:0.12510	test-logloss:0.13717
[200]	train-logloss:0.11531	test-logloss:0.13101
[250]	train-logloss:0.10590	test-logloss:0.12570
[300]	train-logloss:0.09820	test-logloss:0.12175
[350]	train-logloss:0.09155	test-logloss:0.11840
[400]	train-logloss:0.08543	test-logloss:0.11539
[450]	train-logloss:0.08005	test-logloss:0.11295
[500]	train-logloss:0.07532	test-logloss:0.11088
[550]	train-logloss:0.07096	test-logloss:0.10903
[600]	train-logloss:0.06703	test-logloss:0.10750
[650]	train-logloss:0.06348	test-logloss:0.10616
[700]	train-logloss:0.06026	test-logloss:0.10465
[750]	train-logloss:0.05756	test-logloss:0.10383
[800]	train-logloss:0.05438	test-logloss:0.10286
[850]	train-logloss:0.05197	test-logloss:0.10211
[900]	train-logloss:0.04984	test-logloss:0.10156
[950]	train-logloss:0.04758	test-logloss:0.10125
[1000]	train-logloss:0.

In [79]:
from sklearn.metrics import f1_score
y_pred_prob = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
y_pred_xgb = (y_pred_prob > 0.5).astype(int)

# Evaluate

f1 = f1_score(y_test['clicked'], y_pred_xgb)
print(f"F1_score: {f1:.4f}")
print(f"Best iteration: {bst.best_iteration}")
print(f"Best score: {bst.best_score}")

F1_score: 0.2631
Best iteration: 1123
Best score: 0.10048042858328828


In [80]:
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
X_train_np = X_train.values
X_test_np = X_test.values
y_train_np = y_train['clicked'].values.flatten()
y_test_np = y_test['clicked'].values.flatten()

# Step 2: Initialize and train TabNet
tabnet_model = TabNetClassifier(
    n_d=16, n_a=16, n_steps=5,
    gamma=1.5, lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='sparsemax',
    verbose=True,
    seed=42
)



In [81]:

tabnet_model.fit(
    X_train_np, y_train_np,
    eval_set=[(X_test_np, y_test_np)],
    eval_name=["val"],
    eval_metric=["logloss"],
    max_epochs=20,
    patience=5,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    weights={0:1,1:20}

)

epoch 0  | loss: 0.28051 | val_logloss: 0.33909 |  0:00:08s
epoch 1  | loss: 0.20616 | val_logloss: 0.18928 |  0:00:17s
epoch 2  | loss: 0.1991  | val_logloss: 0.16806 |  0:00:26s
epoch 3  | loss: 0.19908 | val_logloss: 0.16584 |  0:00:34s
epoch 4  | loss: 0.20141 | val_logloss: 0.14943 |  0:00:43s
epoch 5  | loss: 0.19472 | val_logloss: 0.14837 |  0:00:53s
epoch 6  | loss: 0.19629 | val_logloss: 0.17609 |  0:01:02s
epoch 7  | loss: 0.19723 | val_logloss: 0.17634 |  0:01:11s
epoch 8  | loss: 0.20068 | val_logloss: 0.17035 |  0:01:20s
epoch 9  | loss: 0.19357 | val_logloss: 0.17438 |  0:01:30s
epoch 10 | loss: 0.20015 | val_logloss: 0.15366 |  0:01:39s

Early stopping occurred at epoch 10 with best_epoch = 5 and best_val_logloss = 0.14837




In [82]:
# Predictions
preds_proba = tabnet_model.predict_proba(X_test_np)[:, 1]
preds_tabnet = (preds_proba > 0.5).astype(int)

# Metrics
accuracy = accuracy_score(y_test_np, preds_tabnet)
precision = precision_score(y_test_np, preds_tabnet)
recall = recall_score(y_test_np, preds_tabnet)
f1 = f1_score(y_test_np, preds_tabnet)


# Output results
print("\nTabNet Performance:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")




TabNet Performance:
Accuracy:  0.9167
Precision: 0.1997
Recall:    0.9741
F1 Score:  0.3315


In [83]:
def majority_vote(preds1, preds2,preds3):
    preds = np.array([preds1, preds2,preds3])
    # Sum predictions and take majority vote
    maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x)), axis=0, arr=preds)
    return maj_vote

In [84]:
ensemble_preds = majority_vote(y_pred_catb, preds_tabnet, y_pred_xgb)

In [85]:

# Evaluate accuracy
accuracy = accuracy_score(y_test['clicked'], ensemble_preds)
precision = precision_score(y_test['clicked'], ensemble_preds)
recall = recall_score(y_test['clicked'], ensemble_preds)
f1 = f1_score(y_test['clicked'], ensemble_preds)

print(f"Ensemble Accuracy: {accuracy:.4f}")
print(f"Ensemble Precision: {precision:.4f}")
print(f"Ensemble Recall: {recall:.4f}")
print(f"Ensemble F1 Score: {f1:.4f}")

# Compare with individual models
print("\nIndividual Model Performance:")
print(f"CatBoost F1: {f1_score(y_test['clicked'], y_pred_catb):.4f}")
print(f"XGBoost F1: {f1_score(y_test['clicked'], y_pred_xgb):.4f}")
print(f"Tabnet F1 : {f1_score(y_test['clicked'],preds_tabnet)}")

Ensemble Accuracy: 0.9167
Ensemble Precision: 0.1997
Ensemble Recall: 0.9741
Ensemble F1 Score: 0.3315

Individual Model Performance:
CatBoost F1: 0.3321
XGBoost F1: 0.2631
Tabnet F1 : 0.33146067415730335


In [86]:
opened_preds = pd.read_csv('/content/opened_prediction.csv')

In [89]:
#Ensemble_preds opened with F1_metrics


X_test_new = X_test.copy()
X_test_new['opened'] = opened_preds['0']

y_pred_catb_new = model.predict(X_test_new)
preds_proba = tabnet_model.predict_proba(X_test_new.values)[:, 1]
preds_tabnet_new = (preds_proba > 0.5).astype(int)

y_pred_prob = bst.predict(xgb.DMatrix(X_test_new, label=y_test['clicked']), iteration_range=(0, bst.best_iteration + 1))
y_pred_xgb_new = (y_pred_prob > 0.5).astype(int)




In [90]:
ensemble_preds_new = majority_vote(y_pred_catb_new, preds_tabnet_new, y_pred_xgb_new)

In [91]:

# Evaluate accuracy
accuracy = accuracy_score(y_test['clicked'], ensemble_preds_new)
precision = precision_score(y_test['clicked'], ensemble_preds_new)
recall = recall_score(y_test['clicked'], ensemble_preds_new)
f1 = f1_score(y_test['clicked'], ensemble_preds_new)

print(f"Ensemble Accuracy: {accuracy:.4f}")
print(f"Ensemble Precision: {precision:.4f}")
print(f"Ensemble Recall: {recall:.4f}")
print(f"Ensemble F1 Score: {f1:.4f}")

# Compare with individual models
print("\nIndividual Model Performance:")
print(f"CatBoost F1: {f1_score(y_test['clicked'], y_pred_catb_new):.4f}")
print(f"XGBoost F1: {f1_score(y_test['clicked'], y_pred_xgb_new):.4f}")
print(f"Tabnet F1 : {f1_score(y_test['clicked'],preds_tabnet_new)}")

Ensemble Accuracy: 0.6370
Ensemble Precision: 0.0381
Ensemble Recall: 0.6651
Ensemble F1 Score: 0.0721

Individual Model Performance:
CatBoost F1: 0.0723
XGBoost F1: 0.0643
Tabnet F1 : 0.07208588957055215
