In [1]:
"""An ad click-through rate (CTR) prediction task is essentially a binary classification task.
The goal is to predict whether a user will click on an ad given a set of features.
This project is a simple implementation of CTR prediction with tree-based models, 
including decision trees, random forests, and gradient boosting trees (GBT)."""

## Load the dataset
import pandas as pd

# Upper bound on how many rows of the CSV are read into memory
n_rows = 300_000
df: pd.DataFrame = pd.read_csv(filepath_or_buffer='train/train.csv', nrows=n_rows)

# Print the first five rows as a quick look at the available columns
preview = df.head(n=5)
print(preview)

             id  click      hour    C1  banner_pos   site_id site_domain  \
0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   
1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   
2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   
3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   
4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   

  site_category    app_id app_domain  ... device_type device_conn_type    C14  \
0      28905ebd  ecad2386   7801e8d9  ...           1                2  15706   
1      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
2      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
3      28905ebd  ecad2386   7801e8d9  ...           1                0  15706   
4      0569f928  ecad2386   7801e8d9  ...           1                0  18993   

   C15  C16   C17  C18  C19     C20  C21  
0  320   50  

In [2]:
## Preprocess the dataset
import numpy as np

# Columns excluded from the feature matrix: the target ('click') plus
# 'id', 'hour', 'device_id' and 'device_ip'
excluded_columns = ['click', 'id', 'hour', 'device_id', 'device_ip']

# Target vector and feature matrix as NumPy arrays
Y = df['click'].to_numpy()
X: np.ndarray = df.drop(columns=excluded_columns).to_numpy()

# Report the dimensions of the features first, then of the target
for array in (X, Y):
    print(array.shape)

(300000, 19)
(300000,)


In [3]:
"""Normally, splitting datasets is done by randomly picking samples. 
However, in this case, the samples are in chronological order, as indicated in the hour field. 
Obviously, we cannot use future samples to predict past ones. Hence, 
we take the first 90% as training samples and the rest as testing samples:"""

## Split the dataset
# Size the split from the rows that were actually loaded rather than from the
# requested n_rows: read_csv(nrows=...) returns fewer rows when the file is
# shorter, and a split point computed from n_rows would then leave the test
# set empty or undersized.
n_samples = len(X)
assert len(Y) == n_samples, "X and Y must contain the same number of rows"
n_train = int(0.9 * n_samples)

# Order is preserved: the first 90% of the rows train, the remaining rows test
X_train: np.ndarray = X[:n_train]
Y_train: np.ndarray = Y[:n_train]
X_test: np.ndarray = X[n_train:]
Y_test: np.ndarray = Y[n_train:]

# Show one raw (not yet encoded) training sample
print(X_train[0])

[1005 0 '1fbe01fe' 'f3845767' '28905ebd' 'ecad2386' '7801e8d9' '07d7df22'
 '44956a24' 1 2 15706 320 50 1722 0 35 -1 79]


In [4]:
## Encode the training and test set with the OneHotEncoder
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder.
# handle_unknown='ignore' prevents errors when the test set contains
# categorical values that were not seen while fitting.
encoder = OneHotEncoder(handle_unknown='ignore')

# Learn the categories from the training data only, then reuse the fitted
# encoder for the test data so both share the same column layout.
# The encoder output is a CSR sparse matrix (see the printed row below),
# not a dense ndarray, so it is annotated accordingly.
X_train_encoded: csr_matrix = encoder.fit_transform(X_train)
X_test_encoded: csr_matrix = encoder.transform(X_test)

# Show the encoded form of the first training sample
print(X_train_encoded[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 19 stored elements and shape (1, 8204)>
  Coords	Values
  (0, 2)	1.0
  (0, 6)	1.0
  (0, 188)	1.0
  (0, 2608)	1.0
  (0, 2679)	1.0
  (0, 3771)	1.0
  (0, 3885)	1.0
  (0, 3929)	1.0
  (0, 4879)	1.0
  (0, 7315)	1.0
  (0, 7319)	1.0
  (0, 7475)	1.0
  (0, 7824)	1.0
  (0, 7828)	1.0
  (0, 7869)	1.0
  (0, 7977)	1.0
  (0, 7982)	1.0
  (0, 8021)	1.0
  (0, 8189)	1.0


In [5]:
## Experiment with the Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: 4 depths x 5 split thresholds = 20 combinations
param_grid:  dict[str, list] = dict(
    max_depth=[10, 15, 20, 30],
    min_samples_split=[10, 20, 30, 40, 50],
)

# Base estimator; the seed is fixed so repeated runs build the same trees
tree_clf = DecisionTreeClassifier(random_state=17)

# Exhaustive search over the grid, ranked by ROC AUC.
# cv=3 keeps the number of fits small for this large dataset, refit=True
# retrains the best combination on the full training set afterwards.
# NOTE(review): with an integer cv and a classifier, scikit-learn uses
# unshuffled stratified folds, so some folds are validated on rows that come
# before part of their training rows. The hold-out split in this notebook is
# chronological; TimeSeriesSplit would be the matching CV scheme - confirm
# before changing, since it alters the selected parameters.
grid_search = GridSearchCV(
    estimator=tree_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the one-hot encoded training data
grid_search.fit(X_train_encoded, Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [6]:
# Report the winning decision-tree configuration and its cross-validated ROC AUC
dt_best_params = grid_search.best_params_
dt_best_cv_score = grid_search.best_score_
print("Best parameters found for Decision Tree: ", dt_best_params)
print(f"Best score found for Decision Tree: {dt_best_cv_score:.3f}")

Best parameters found for Decision Tree:  {'max_depth': 30, 'min_samples_split': 50}
Best score found for Decision Tree: 0.721


In [7]:
## Evaluate the best model of Decision Trees on the test set
from sklearn.metrics import roc_auc_score


# The estimator refitted by the grid search with the best parameters
best_model = grid_search.best_estimator_

# Class probabilities for every test sample; column 1 is the positive class
y_pred_prob: np.ndarray = best_model.predict_proba(X_test_encoded)
dt_positive_scores = y_pred_prob[:, 1]

roc_auc_test: float = roc_auc_score(Y_test, dt_positive_scores)
print("Test ROC AUC for Decision Tree: ", roc_auc_test)

Test ROC AUC for Decision Tree:  0.7337658251198956


In [8]:
from sklearn.metrics import classification_report

# Hard class predictions of the tuned decision tree on the test set
y_pred: np.ndarray = best_model.predict(X_test_encoded)

# Per-class precision / recall / F1 summary
dt_report = classification_report(Y_test, y_pred)
print(dt_report)

              precision    recall  f1-score   support

           0       0.86      0.98      0.92     25495
           1       0.52      0.12      0.20      4505

    accuracy                           0.85     30000
   macro avg       0.69      0.55      0.56     30000
weighted avg       0.81      0.85      0.81     30000



In [9]:
## Experiment with the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates: 3 forest sizes x 3 depths x 4 split thresholds = 36 combinations
rf_param_grid: dict[str, list] = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 15, 20],
    min_samples_split=[10, 20, 30, 50],
)

# Base estimator with a fixed seed.
# NOTE(review): both the forest and the grid search below request n_jobs=-1.
rf_clf = RandomForestClassifier(random_state=17, n_jobs=-1)

# Exhaustive search ranked by ROC AUC; cv=3 keeps the number of fits small
# for this large dataset, refit=True retrains the best combination afterwards.
# NOTE(review): integer cv is not a chronological split, unlike the hold-out
# split used for testing in this notebook - confirm this is intended.
rf_grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=3,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the one-hot encoded training data
rf_grid_search.fit(X_train_encoded, Y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [10]:
# Report the winning random-forest configuration and its cross-validated ROC AUC
rf_best_params = rf_grid_search.best_params_
rf_best_cv_score = rf_grid_search.best_score_
print("Best parameters found for Random Forest: ", rf_best_params)
print(f"Best score found for Random Forest: {rf_best_cv_score:.3f}")

Best parameters found for Random Forest:  {'max_depth': 20, 'min_samples_split': 20, 'n_estimators': 500}
Best score found for Random Forest: 0.729


In [11]:
# Evaluate the best random forest on the test set
rf_best_model = rf_grid_search.best_estimator_

# Class probabilities for every test sample; column 1 is the positive class
y_pred_prob_rf: np.ndarray = rf_best_model.predict_proba(X_test_encoded)
rf_positive_scores = y_pred_prob_rf[:, 1]

# Ranking quality (ROC AUC) and the value returned by the estimator's score()
roc_auc_test_rf: float = roc_auc_score(Y_test, rf_positive_scores)
rf_test_accuracy = rf_best_model.score(X_test_encoded, Y_test)

print("Test ROC AUC for Random Forest: ", roc_auc_test_rf)
print("Average accuracy on test set for Random Forest: ", rf_test_accuracy)

Test ROC AUC for Random Forest:  0.746900079861582
Average accuracy on test set for Random Forest:  0.851


In [12]:
## Experiment with gradient boosted trees (XGBoost's XGBClassifier)
from xgboost import XGBClassifier

# Hyper-parameter candidates: 3 ensemble sizes x 3 learning rates x 3 depths = 27 combinations
xgb_param_grid: dict[str, list] = dict(
    n_estimators=[500, 1000, 1500],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
)

# Base estimator with a fixed seed; everything else is left at its default
xgb_clf = XGBClassifier(random_state=17)

# Exhaustive search ranked by ROC AUC with 3-fold cross-validation.
# NOTE(review): integer cv is not a chronological split, unlike the hold-out
# split used for testing in this notebook - confirm this is intended.
grid_search_xgb = GridSearchCV(
    xgb_clf,
    xgb_param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the one-hot encoded training data
grid_search_xgb.fit(X_train_encoded, Y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [13]:
# Report the winning XGBoost configuration and its cross-validated ROC AUC
xgb_best_params = grid_search_xgb.best_params_
xgb_best_cv_score = grid_search_xgb.best_score_
print("best parameters found for XGBoost: ", xgb_best_params)
print("best score found for XGBoost: ", xgb_best_cv_score)

best parameters found for XGBoost:  {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 1500}
best score found for XGBoost:  0.7456587656850333


In [14]:
# Evaluate the best XGBoost model on the test set
xgb_best_model = grid_search_xgb.best_estimator_

# Class probabilities for every test sample; column 1 is the positive class
y_pred_prob_xgb: np.ndarray = xgb_best_model.predict_proba(X_test_encoded)
xgb_positive_scores = y_pred_prob_xgb[:, 1]

roc_auc_test_xgb: float = roc_auc_score(Y_test, xgb_positive_scores)
print("Test ROC AUC for XGBoost: ", roc_auc_test_xgb)

Test ROC AUC for XGBoost:  0.7708956490565603


Click-through behavior depends on many intricate human factors, which is why predicting it is not an easy task. Against that background, the test ROC AUC of 0.771 achieved by XGBoost is actually pretty good. <br>
<br>
In summary:

- <b>Decision tree (CART)</b>: the simplest and most interpretable algorithm. It is usually used for smaller datasets.
- <b>Random forest</b>: more robust to overfitting than a single decision tree, and can handle larger or more complex datasets well.
- <b>GBT</b>: considered the most powerful algorithm for complex problems, and the most popular tree-based algorithm in the industry. At the same time, however, it can be prone to overfitting. Hence, hyperparameter tuning and regularization techniques are recommended to avoid overfitting.
