Aim: To get the best performance we can.

**Experiment 1**: Try multiple ways of handling missing values

1. Use the dataset after dropping ALL rows with missing values:

  A. Accuracy: 61%

  B. F1 Score: 43%
2. Use the dataset after handling missing values in all columns:

  A. Accuracy: 65%

  B. F1 Score: 40%

3. Use the dataset after handling missing values in the age column only:

  A. Accuracy: 64%

  B. F1 Score: 43%

**Experiment 2**: Try multiple algorithms

1. Using logistic regression:

  A. Accuracy: 64%

  B. F1 Score: 43%

2. Using random forest:

  A. Accuracy: 86%

  B. F1 Score: 83%


Result: To get the best performance from this dataset, replace the missing values in the age column with the column mean, drop the rows that still have missing values in other columns, and then train a random forest model.

In [1]:
import kagglehub

# Fetch the newest published version of the dataset and report where it was saved
path = kagglehub.dataset_download(
    "marius2303/ad-click-prediction-dataset"
)
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/marius2303/ad-click-prediction-dataset?dataset_version_number=5...


100%|██████████| 81.2k/81.2k [00:00<00:00, 35.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/marius2303/ad-click-prediction-dataset/versions/5





In [2]:
import pandas as pd

# Locate the CSV inside the downloaded dataset folder, load it, and preview the first rows
path_to_dataset = f"{path}/ad_click_dataset.csv"
ad_data = pd.read_csv(path_to_dataset)
ad_data.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,,,1
2,5912,User5912,41.0,Non-Binary,,Side,Education,Night,1
3,5418,User5418,34.0,Male,,,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,,,Social Media,Morning,0


In [3]:
# Summarize the frame: index range, per-column non-null counts, dtypes, and memory usage
ad_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.2+ KB


In [6]:
# check for missing values: count the nulls in every column
num_of_missing_values = ad_data.isnull().sum()
print(num_of_missing_values)
print()

# Share of values that are PRESENT (non-missing) in each column, in percent:
#   percentage = (total rows - missing values) * 100 / total rows
# The row count is read from the frame instead of being hard-coded as 10000,
# so the figures stay correct if the dataset size changes.
total_rows = len(ad_data)
percent_of_present_values = (total_rows - num_of_missing_values) * 100 / total_rows
print(percent_of_present_values)

id                     0
full_name              0
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
dtype: int64

id                  100.00
full_name           100.00
age                  52.34
gender               53.07
device_type          80.00
ad_position          80.00
browsing_history     52.18
time_of_day          80.00
click               100.00
dtype: float64


In [10]:
# Experiment 1: keep only the rows that have no missing value in any column

dropna_data = ad_data.dropna(axis=0, how="any")
dropna_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 816 entries, 17 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                816 non-null    int64  
 1   full_name         816 non-null    object 
 2   age               816 non-null    float64
 3   gender            816 non-null    object 
 4   device_type       816 non-null    object 
 5   ad_position       816 non-null    object 
 6   browsing_history  816 non-null    object 
 7   time_of_day       816 non-null    object 
 8   click             816 non-null    int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 63.8+ KB


In [12]:
# feature engineering: list the categorical columns that need encoding
cat_variables = [
    'gender',
    'device_type',
    'ad_position',
    'browsing_history',
    'time_of_day',
]

# expand each categorical column into one indicator column per category
encoded_data = pd.get_dummies(data=dropna_data, columns=cat_variables)

In [16]:
# separate the target from the predictors, then hold out a test set

from sklearn.model_selection import train_test_split

cols_to_drop = ["id", "full_name", "click"]  # identifiers plus the label itself

# labels (y) are the click flag; features (X) are every remaining column
y = encoded_data["click"]
X = encoded_data.drop(cols_to_drop, axis=1)

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# check the shape of the train/test data
print(X_train.shape, X_test.shape)

(652, 19) (164, 19)


In [17]:
# model building: logistic regression with a raised iteration cap

from sklearn.linear_model import LogisticRegression as lr

logreg = lr(max_iter=1000)

# fit on the training split
logreg.fit(X=X_train, y=y_train)

In [18]:
# score the fitted model on the held-out test split

from sklearn.metrics import classification_report

y_pred = logreg.predict(X_test)

# per-class precision/recall/F1 plus accuracy and the averaged scores
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report: \n", class_report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.50      0.06      0.11        64
           1       0.62      0.96      0.75       100

    accuracy                           0.61       164
   macro avg       0.56      0.51      0.43       164
weighted avg       0.57      0.61      0.50       164



-------------------------------------------------------------

In [19]:
# Share of values that are PRESENT (non-missing) in each column, in percent:
#   percentage = (total rows - missing values) * 100 / total rows
# The row count is read from the frame instead of being hard-coded as 10000.
total_rows = len(ad_data)
percent_of_present_values = (total_rows - num_of_missing_values) * 100 / total_rows
print(percent_of_present_values)

id                  100.00
full_name           100.00
age                  52.34
gender               53.07
device_type          80.00
ad_position          80.00
browsing_history     52.18
time_of_day          80.00
click               100.00
dtype: float64


In [27]:
# Handle Missing Values
# Strategy: impute only the age column with its mean, then drop every row that
# still has a missing value in another column.
ad_data_2 = ad_data.copy()

# Assign the filled column back instead of calling
# `ad_data_2['age'].fillna(..., inplace=True)`: that chained in-place form acts on
# an intermediate object, emits a FutureWarning, and will stop working in pandas 3.0.
# NOTE(review): the mean is computed over all rows, before the train/test split,
# so test rows influence the imputed value -- consider fitting it on the training rows only.
ad_data_2['age'] = ad_data_2['age'].fillna(ad_data_2['age'].mean())  # Fill age with the mean value
ad_data_2 = ad_data_2.dropna()

# Alternative strategy (disabled): keep every row by filling the categorical
# columns with a placeholder category. To try it, run these lines before the
# dropna() call above.
# ad_data_2['gender'] = ad_data_2['gender'].fillna('unknown')
# ad_data_2['device_type'] = ad_data_2['device_type'].fillna('unknown')
# ad_data_2['ad_position'] = ad_data_2['ad_position'].fillna('unknown')
# ad_data_2['browsing_history'] = ad_data_2['browsing_history'].fillna('no_data')
# ad_data_2['time_of_day'] = ad_data_2['time_of_day'].fillna('unknown')

print(ad_data_2.isnull().sum())

id                  0
full_name           0
age                 0
gender              0
device_type         0
ad_position         0
browsing_history    0
time_of_day         0
click               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ad_data_2['age'].fillna(ad_data_2['age'].mean(), inplace=True) # Fill age with the mean value


In [28]:
# feature engineering: one-hot encode the categorical columns of the age-imputed data
cat_variables = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
encoded_data = pd.get_dummies(data=ad_data_2, columns=cat_variables)

# separate the target from the predictors

from sklearn.model_selection import train_test_split

cols_to_drop = ["id", "full_name", "click"]  # identifiers plus the label itself

# labels (y) are the click flag; features (X) are every remaining column
y = encoded_data["click"]
X = encoded_data.drop(cols_to_drop, axis=1)

# 80/20 train/test split with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# check the shape of the train/test data
print(X_train.shape, X_test.shape)

# model building: logistic regression with a raised iteration cap

from sklearn.linear_model import LogisticRegression as lr

logreg = lr(max_iter=1000)
logreg.fit(X=X_train, y=y_train)

# score the fitted model on the held-out test split

from sklearn.metrics import classification_report

y_pred = logreg.predict(X_test)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report: \n", class_report)

(1212, 19) (304, 19)
Classification Report: 
               precision    recall  f1-score   support

           0       0.29      0.05      0.08       101
           1       0.67      0.94      0.78       203

    accuracy                           0.64       304
   macro avg       0.48      0.50      0.43       304
weighted avg       0.54      0.64      0.55       304



In [29]:
from sklearn.ensemble import RandomForestClassifier as RFC

# build a 100-tree forest with a fixed seed and fit it on the training split
rf_model = RFC(n_estimators=100, random_state=42)
rf_model.fit(X=X_train, y=y_train)

# predict the held-out rows and summarize the per-class metrics
y_pred = rf_model.predict(X_test)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report: \n", class_report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.65      0.75       101
           1       0.85      0.96      0.90       203

    accuracy                           0.86       304
   macro avg       0.87      0.81      0.83       304
weighted avg       0.86      0.86      0.85       304

