

# Predicting ad clicking behavior with LASSO, elastic net, decision tree, random forest, and XGBoost.

In this exercise, we will predict users' ad clicking behavior using a LASSO model, a elastic net model, a decision tree, a random forest, and XGBoost.

<a id='1.1'></a>
## Loading the python packages

In [1]:
# Load libraries


import warnings
warnings.filterwarnings('ignore')

#import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# for higher resolution
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg','pdf')

# nice format for matplotlib https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html
plt.style.use('bmh')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
#Libraries for Deep Learning Models
from keras.models import Sequential
from keras.layers import Dense
import xgboost as xgb
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
# from keras.wrappers.scikit_learn import KerasClassifier
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from keras.optimizers import SGD
from tensorflow.keras.optimizers import SGD


<a id='1.2'></a>
## Loading the Data



In [2]:
# load sample dataset (only load the first 300000 rows) (1pts)
data = pd.read_csv('PS2_casestudy_data.csv')
data = data.drop(data.columns[0], axis=1).head(300000) #try a smaller size first
data

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1.350725e+18,1,14102102,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,...,1,0,20093,300,250,2295,2,35,-1,23
299996,1.350725e+17,0,14102102,1005,0,85f751fd,c4e18dd6,50e219e0,cb06b587,2347f47a,...,1,0,21647,320,50,2487,1,547,-1,51
299997,1.350745e+19,0,14102102,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,...,1,0,21647,320,50,2487,1,547,-1,51
299998,1.350752e+18,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15699,320,50,1722,0,35,-1,79


In [3]:
#Explore the data (print the first and last 5 rows) (2pts)
print('First five rows:')
print(data.head(5))
print('Last five rows:')
print(data.tail(5))

First five rows:
             id  click      hour    C1  banner_pos   site_id site_domain  \
0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   
1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   
2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   
3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   
4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   

  site_category    app_id app_domain  ... device_type device_conn_type    C14  \
0      28905ebd  ecad2386   7801e8d9  ...           1                2  15706   
1      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
2      28905ebd  ecad2386   7801e8d9  ...           1                0  15704   
3      28905ebd  ecad2386   7801e8d9  ...           1                0  15706   
4      0569f928  ecad2386   7801e8d9  ...           1                0  18993   

   C15  C16   C17  C18  C19     C20  C2

In [4]:
#remove nas from dataframe by simply dropping these rows (2pts)
data.dropna()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1.350725e+18,1,14102102,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,...,1,0,20093,300,250,2295,2,35,-1,23
299996,1.350725e+17,0,14102102,1005,0,85f751fd,c4e18dd6,50e219e0,cb06b587,2347f47a,...,1,0,21647,320,50,2487,1,547,-1,51
299997,1.350745e+19,0,14102102,1005,1,57fe1b20,5b626596,f028772b,ecad2386,7801e8d9,...,1,0,21647,320,50,2487,1,547,-1,51
299998,1.350752e+18,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15699,320,50,1722,0,35,-1,79


In [5]:
#create Y and X. Y is the "click" column, and X is the other 19 columns (Do not use ['id', 'hour', 'device_id', 'device_ip']) (1pts)
Y = data['click']
X = data.drop(columns=['click', 'id', 'hour', 'device_id', 'device_ip'])


In [6]:
# Use sklearn one-hot encoder to transform string variables in X to categorical columns (make sure to save the data as a sparse matrxi) (3pts)

string_cols = [i for i in X.columns if X[i].dtype is np.dtype('object')]

encoder = OneHotEncoder(sparse=True, handle_unknown = 'ignore')  #sparse=True)#, drop='first')

encoded_x = encoder.fit_transform(X[string_cols])


unencoded_x = X[[col for col in X.columns if col not in list(string_cols)]]

unencoded_x_sparse = sparse.csr_matrix(unencoded_x)

X_encoded = sparse.hstack([unencoded_x_sparse, encoded_x])



In [7]:
#do a train-test split. Use the first 90% of the data as training. (1pts)

X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y.values, test_size=0.1, random_state = 42)



## LASSO and ElasticNet
First use a sklearn LASSO model with alpha=0.005 (regularization penalty) to predict clicking behavior. Report the prediction accuracy. Then use sklearn elastic net with alpha = 0.005 and l1_ratio=0.5 (l1_ratio is a number from 0 to 1 which represents the portion of L1 penalization in the total penalization term)

In [8]:
#initialize and training the model (1pts)
lasso_model = Lasso(alpha=0.005)
lasso_model.fit(X_train, Y_train)


In [9]:
#testing the model (2pts)
Y_pred_lasso = lasso_model.predict(X_test)
Y_pred_lasso_binary = (Y_pred_lasso >= 0.5).astype(int)
lasso_accuracy = accuracy_score(Y_test,Y_pred_lasso_binary)
lasso_auc = roc_auc_score(Y_test,Y_pred_lasso)
print(f"LASSO accuracy: {lasso_accuracy}")
print(f"LASSO ROC AUC Score: {lasso_auc}")

LASSO accuracy: 0.8315
LASSO ROC AUC Score: 0.6624092241421361


In [10]:
#initialize and training the elastic net (2pts)
elasticnet_model = ElasticNet(alpha=0.005, l1_ratio=0.5)
elasticnet_model.fit(X_train, Y_train)

In [11]:
#testing the elastic net (2pts)
Y_pred_els = elasticnet_model.predict(X_test)
Y_pred_els_binary = (Y_pred_els >= 0.5).astype(int)
elasticnet_accuracy = accuracy_score(Y_pred_els_binary, Y_test)
els_auc = roc_auc_score(Y_test,Y_pred_els)
print(f"ElasticNet accuracy: {elasticnet_accuracy}")
print(f"Elastic Net ROC AUC Score: {els_auc}")

ElasticNet accuracy: 0.8315
Elastic Net ROC AUC Score: 0.6804212765889573


What do you think is causing these differences? 

Elastic Net has the same accuracy with LASSO but outperforms from ROC AUC perspective. It's because that LASSO uses L1 regularization to perform variable selection, but it may miss capturing complex relationships between features. ElasticNet combines both L1 and L2 regularization, which can better handle scenarios with highly correlated variables, providing a balance between variable selection and coefficient shrinkage. 

## Decision tree and random forest

In [12]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
#build a single decision tree model using gini impurity and roc_auc scoring (2pts)
decision_tree = DecisionTreeClassifier(criterion='gini',random_state = 42)


In [13]:
#do a grid search on the max-depth variable [3,10,None]. (3pts)
param_grid = {'max_depth': [3, 10, None]}
grid_search_dt = GridSearchCV(decision_tree, param_grid, scoring='roc_auc', cv=5)
grid_search_dt.fit(X_train, Y_train)


In [15]:
#print the auc of the optimal model applied on the test set (3pts)
best_grid_search_dt = grid_search_dt.best_estimator_
Y_pred_dt_grid = best_grid_search_dt.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred_dt_grid)

# Calculate the Youden's J statistic for each threshold
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]

# Convert predictions to binary outcomes (0 or 1)
Y_pred_dt_grid_binary = (Y_pred_dt_grid >= optimal_threshold).astype(int)

accuracy_dt = accuracy_score(Y_test, Y_pred_dt_grid_binary)

roc_auc_dt = roc_auc_score(Y_test, Y_pred_dt_grid)

print(f"Best Decision Tree Model Prediction Accuracy: {accuracy_dt:.4f}")
print(f"Best Decision Tree Model: {grid_search_dt.best_params_}")
print(f"Best Decision Tree Model ROC AUC Score: {roc_auc_dt:.4f}")


Best Decision Tree Model Prediction Accuracy: 0.6277
Best Decision Tree Model: {'max_depth': 10}
Best Decision Tree Model ROC AUC Score: 0.7131


In [29]:
#build a random forest using gini impurity and roc_auc scoring (2pts)
random_forest = RandomForestClassifier(criterion='gini')


In [30]:
#do grid search to tune n_estimators and max_depth -- 'max_depth': [3, 10, None],'n_estimators': [10,50,100,200]. (3pts)
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [3, 10, None]
}
grid_search_rf = GridSearchCV(random_forest, param_grid_rf, scoring='roc_auc', cv=2, n_jobs = -1, verbose = 3)
grid_search_rf.fit(X_train, Y_train)


Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2] END ......max_depth=3, n_estimators=50;, score=0.691 total time=   2.6s
[CV 1/2] END ....max_depth=10, n_estimators=200;, score=0.709 total time=  23.1s
[CV 2/2] END ......max_depth=3, n_estimators=50;, score=0.685 total time=   3.3s
[CV 2/2] END ....max_depth=10, n_estimators=200;, score=0.711 total time=  30.2s
[CV 1/2] END .....max_depth=3, n_estimators=100;, score=0.695 total time=   4.7s
[CV 2/2] END ...max_depth=None, n_estimators=10;, score=0.701 total time=  31.9s
[CV 2/2] END .....max_depth=3, n_estimators=100;, score=0.694 total time=   4.7s
[CV 1/2] END ...max_depth=None, n_estimators=10;, score=0.705 total time=  32.0s
[CV 2/2] END ......max_depth=3, n_estimators=10;, score=0.677 total time=   0.7s
[CV 2/2] END .....max_depth=10, n_estimators=50;, score=0.710 total time=   6.3s
[CV 2/2] END ...max_depth=None, n_estimators=50;, score=0.712 total time= 2.5min
[CV 1/2] END ......max_depth=3, n_estimators=10;

In [31]:
#print the performance of the best model (3pts)
best_rf_model = grid_search_rf.best_estimator_

Y_pred_rf_grid = best_rf_model.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred_rf_grid)

# Calculate the Youden's J statistic for each threshold
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]

# Convert predictions to binary outcomes (0 or 1)
Y_pred_rf_grid_binary = (Y_pred_rf_grid >= optimal_threshold).astype(int)

accuracy_rf = accuracy_score(Y_test, Y_pred_rf_grid_binary)

roc_auc_rf = roc_auc_score(Y_test, Y_pred_rf_grid)

print(f"Random Forest Model Prediction Accuracy: {accuracy_rf:.4f}")
print(f"Random Forest Model ROC AUC Score: {roc_auc_rf:.4f}")
print(f"Best Random Forest Model: {grid_search_rf.best_params_}")

Random Forest Model Prediction Accuracy: 0.6394
Random Forest Model ROC AUC Score: 0.7290
Best Random Forest Model: {'max_depth': None, 'n_estimators': 200}


## XGBoost

Use the XGBoost classifier to predict clicking behavior. Fine-tune n_estimators over $[10,50,100]$ and $\eta$ (eta) over $[0.01,0.05,0.1]$. Use roc_auc as the scoring criterion for CV.

In [20]:
#initialize the parameter grid and the model (2pts）

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
param_grid_xgb = {
    'n_estimators': [10, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1]
}

In [21]:
#do grid search (3pts)
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, scoring='roc_auc', cv=5)
grid_search_xgb.fit(X_train, Y_train)


In [22]:
#print the performance of the best model (2pts)
best_xgb_model = grid_search_xgb.best_estimator_
Y_pred_xgb_grid = best_xgb_model.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred_xgb_grid)

# Calculate the Youden's J statistic for each threshold
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]

# Convert predictions to binary outcomes (0 or 1)
Y_pred_xgb_grid_binary = (Y_pred_xgb_grid >= optimal_threshold).astype(int)


accuracy_dt = accuracy_score(Y_test, Y_pred_xgb_grid_binary)

roc_auc_xgb = roc_auc_score(Y_test, Y_pred_xgb_grid)


print(f"XGBoost Model Prediction Accuracy: {accuracy_dt:.4f}")
print(f"XGBoost Tree Model ROC AUC Score: {roc_auc_xgb:.4f}")
print(f"Best XGBoost Model: {grid_search_xgb.best_params_}")


XGBoost Model Prediction Accuracy: 0.6435
XGBoost Tree Model ROC AUC Score: 0.7364
Best XGBoost Model: {'learning_rate': 0.1, 'n_estimators': 100}


In [23]:
#Looking at the CV results of the decision tree, random forest, and XGBoost model, which ones are likely underfitted/overfitted. (3pts)
decision_tree = pd.DataFrame(grid_search_dt.cv_results_)
print('Decision Tree CV results')
print(decision_tree[['params','mean_test_score','std_test_score','rank_test_score']])

Decision Tree CV results
                params  mean_test_score  std_test_score  rank_test_score
0     {'max_depth': 3}         0.671974        0.001887                3
1    {'max_depth': 10}         0.715806        0.002743                1
2  {'max_depth': None}         0.679233        0.003621                2


In [32]:
random_forest = pd.DataFrame(grid_search_rf.cv_results_)
print('Random Forest CV results')
print(random_forest[['params','mean_test_score','std_test_score','rank_test_score']])

Random Forest CV results
                                      params  mean_test_score  std_test_score  \
0       {'max_depth': 3, 'n_estimators': 10}         0.668123        0.008921   
1       {'max_depth': 3, 'n_estimators': 50}         0.687827        0.003172   
2      {'max_depth': 3, 'n_estimators': 100}         0.694764        0.000301   
3      {'max_depth': 3, 'n_estimators': 200}         0.693698        0.001719   
4      {'max_depth': 10, 'n_estimators': 10}         0.695555        0.000811   
5      {'max_depth': 10, 'n_estimators': 50}         0.710814        0.000451   
6     {'max_depth': 10, 'n_estimators': 100}         0.709994        0.000517   
7     {'max_depth': 10, 'n_estimators': 200}         0.709952        0.000714   
8    {'max_depth': None, 'n_estimators': 10}         0.702569        0.002064   
9    {'max_depth': None, 'n_estimators': 50}         0.714130        0.001805   
10  {'max_depth': None, 'n_estimators': 100}         0.715460        0.001596   
11 

In [25]:
xgboost = pd.DataFrame(grid_search_xgb.cv_results_)
print('XGBoost CV results')
print(xgboost[['params','mean_test_score','std_test_score','rank_test_score']])

XGBoost CV results
                                         params  mean_test_score  \
0   {'learning_rate': 0.01, 'n_estimators': 10}         0.705638   
1   {'learning_rate': 0.01, 'n_estimators': 50}         0.715202   
2  {'learning_rate': 0.01, 'n_estimators': 100}         0.720825   
3   {'learning_rate': 0.05, 'n_estimators': 10}         0.714687   
4   {'learning_rate': 0.05, 'n_estimators': 50}         0.729680   
5  {'learning_rate': 0.05, 'n_estimators': 100}         0.734381   
6    {'learning_rate': 0.1, 'n_estimators': 10}         0.720424   
7    {'learning_rate': 0.1, 'n_estimators': 50}         0.734335   
8   {'learning_rate': 0.1, 'n_estimators': 100}         0.738713   

   std_test_score  rank_test_score  
0        0.002671                9  
1        0.004370                7  
2        0.002666                5  
3        0.004169                8  
4        0.003021                4  
5        0.003080                2  
6        0.002643                6  
7   

Decision Tree has the best performance with 'max_depth' as 10. 
Overfitting: The decision tree model with unlimited max depth has the highest standard deviation of test score, which may cause a unstable performance and potential overfitting issue.

Random Forest has the best performance with 'max_depth' as none and n_estimates as 200.
Overfitting: The random forest model with unlimited max depth and large number of estimates tends has the best mean test score and the largest standard deviation of the test score which could cause overfitting issue. However, since random forest takes the average result of multiple trees as final result, it has less likelihood of overfitting as decision tree.

XGBoost has the best performance with 'learning_rate' as 0.1 and n_estimates as 100.
XGBoost seems to offer the best performance with minimal risk of overfitting and underfitting.