In [None]:
"""This project is a simple example of click-through rate (CTR) prediction using logistic regression and PyTorch."""

import pandas as pd
import numpy as np

# load the ctr dataset
file_path = r'train\train.csv'
num_rows: int = 200000
ctr_data: pd.DataFrame = pd.read_csv(file_path, nrows=num_rows)
print(ctr_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                200000 non-null  float64
 1   click             200000 non-null  int64  
 2   hour              200000 non-null  int64  
 3   C1                200000 non-null  int64  
 4   banner_pos        200000 non-null  int64  
 5   site_id           200000 non-null  object 
 6   site_domain       200000 non-null  object 
 7   site_category     200000 non-null  object 
 8   app_id            200000 non-null  object 
 9   app_domain        200000 non-null  object 
 10  app_category      200000 non-null  object 
 11  device_id         200000 non-null  object 
 12  device_ip         200000 non-null  object 
 13  device_model      200000 non-null  object 
 14  device_type       200000 non-null  int64  
 15  device_conn_type  200000 non-null  int64  
 16  C14               20

In [2]:
Y: np.ndarray = ctr_data['click'].values
X: pd.DataFrame = ctr_data.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1)
print(X.describe())

                  C1     banner_pos    device_type  device_conn_type  \
count  200000.000000  200000.000000  200000.000000     200000.000000   
mean     1005.020940       0.207390       1.045545          0.198685   
std         1.028794       0.409769       0.548254          0.635249   
min      1001.000000       0.000000       0.000000          0.000000   
25%      1005.000000       0.000000       1.000000          0.000000   
50%      1005.000000       0.000000       1.000000          0.000000   
75%      1005.000000       0.000000       1.000000          0.000000   
max      1010.000000       7.000000       5.000000          5.000000   

                 C14            C15            C16            C17  \
count  200000.000000  200000.000000  200000.000000  200000.000000   
mean    17794.362140     318.352400      57.065250    1979.110140   
std      3250.084833      13.192933      37.470529     398.900228   
min       375.000000     120.000000      20.000000     112.000000   
25%   

In [3]:
## Split the dataset
n_train = int(0.9 * num_rows)
X_train: pd.DataFrame = X[:n_train]
Y_train: np.ndarray = Y[:n_train]
X_test: pd.DataFrame = X[n_train:]
Y_test: np.ndarray = Y[n_train:]

print(X_train.iloc[0])

C1                      1005
banner_pos                 0
site_id             1fbe01fe
site_domain         f3845767
site_category       28905ebd
app_id              ecad2386
app_domain          7801e8d9
app_category        07d7df22
device_model        44956a24
device_type                1
device_conn_type           2
C14                    15706
C15                      320
C16                       50
C17                     1722
C18                        0
C19                       35
C20                       -1
C21                       79
Name: 0, dtype: object


In [4]:
## Preprocess the features with ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define the categorical and numerical features
categorical_features: list[str] = ['site_id', 'site_domain', 'site_category', 
                                   'app_id', 'app_domain', 'app_category', 'device_model'
                                   ]
numerical_features: list[str] = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

# Create the ColumnTransformer to preprocess the features
preprocessor: ColumnTransformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'  # Keep the remaining columns as they are
)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

clf_lr =  Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=500, n_jobs=-1, random_state=17))
])

# Fit the model
clf_lr.fit(X_train, Y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [6]:
# Evaluate the model on training and test data
from sklearn.metrics import roc_auc_score, accuracy_score

Y_train_pred_prob: np.ndarray = clf_lr.predict_proba(X_train)[:, 1]
Y_train_pred: np.ndarray = clf_lr.predict(X_train)
print(f'Train AUC for logistics regression: {roc_auc_score(Y_train, Y_train_pred_prob):.3f}')
print(f'Train Accuracy for logistics regression: {accuracy_score(Y_train, Y_train_pred):.3f}' )

Y_test_pred_prob: np.ndarray = clf_lr.predict_proba(X_test)[:, 1]
Y_test_pred: np.ndarray = clf_lr.predict(X_test)
print(f'Test AUC for logistics regression: {roc_auc_score(Y_test, Y_test_pred_prob):.3f}' )
print(f'Test Accuracy for logistics regression: {accuracy_score(Y_test, Y_test_pred):.3f}')

Train AUC for logistics regression: 0.763
Train Accuracy for logistics regression: 0.832
Test AUC for logistics regression: 0.741
Test Accuracy for logistics regression: 0.825


#### Implementing logistic regression using PyTorch


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# 1, Define a custom dataset class for the CTR dataset
class CTRDataset(Dataset):
    def __init__(self, X: np.ndarray | pd.DataFrame, y: np.ndarray) -> None:
        if isinstance(X, pd.DataFrame):
            X = X.values
        self.X: torch.Tensor = torch.tensor(X,  # keep data on cpu (loading large dataset directly to GPU can lead to memory issues)
                                            dtype=torch.float32)  # Convert to full precision (float32) for better numerical stability
        self.y: torch.Tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)  

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor]:
        return self.X[idx], self.y[idx]


# 2, Define a simple feedforward MLP model
class LogisticsRegressionCTRModel(nn.Module):
    def __init__(self, input_dim: int) -> None:
        super(LogisticsRegressionCTRModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [8]:
# 3, Preprocess the data using the preprocessor and create DataLoader objects
# Preprocess the features
X_train_processed: np.ndarray = preprocessor.fit_transform(X_train)
X_test_processed: np.ndarray = preprocessor.transform(X_test)

# 4, Create datasets and dataloaders
batch_size: int = 128
train_dataset: CTRDataset = CTRDataset(X_train_processed, Y_train)
train_loader: DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset: CTRDataset = CTRDataset(X_test_processed, Y_test)
test_loader: DataLoader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
# 5, Initialize the model, loss function, and optimizer
input_dim: int = X_train_processed.shape[1]
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

model: LogisticsRegressionCTRModel = LogisticsRegressionCTRModel(input_dim).to(device)  # Move model to GPU if available
criterion: nn.BCELoss = nn.BCELoss()
optimizer: optim.AdamW= optim.AdamW(model.parameters(),  # use AdamW optimizer with weight decay, instead of Adam
                                 weight_decay=0.1,  # L2 regularization
                                 lr=1e-3)  


# 6, Train the model
num_epochs: int = 50
best_loss: float = float('inf')  # Initialize best_loss to infinity for early stopping
patience_counter: int = 0  # Initialize patience counter for early stopping

for epoch in range(num_epochs):
    model.train()
    running_loss: float = 0.0

    for batch_X, batch_y in train_loader:
        batch_X: torch.Tensor = batch_X.to(device)  # Move data to GPU if available
        batch_y: torch.Tensor = batch_y.to(device)

        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs: torch.Tensor = model(batch_X)
        loss: torch.Tensor = criterion(outputs, batch_y)
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Apply early stopping if the loss does not improve for 5 epochs
    epoch_loss: float = running_loss / len(train_loader)
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_model_state: dict = model.state_dict()
        # Save the best model state
        torch.save(best_model_state, 'ctr_model_best.pth')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= 5:
            # Early stopping condition met
            print(f'Early stopping at epoch {epoch + 1}')
            break

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

Epoch [1/50], Loss: 0.4142
Epoch [2/50], Loss: 0.3995
Epoch [3/50], Loss: 0.3938
Epoch [4/50], Loss: 0.3889
Epoch [5/50], Loss: 0.3841
Epoch [6/50], Loss: 0.3801
Epoch [7/50], Loss: 0.3764
Epoch [8/50], Loss: 0.3731
Epoch [9/50], Loss: 0.3705
Epoch [10/50], Loss: 0.3683
Epoch [11/50], Loss: 0.3662
Epoch [12/50], Loss: 0.3645
Epoch [13/50], Loss: 0.3633
Epoch [14/50], Loss: 0.3621
Epoch [15/50], Loss: 0.3616
Epoch [16/50], Loss: 0.3604
Epoch [17/50], Loss: 0.3595
Epoch [18/50], Loss: 0.3592
Epoch [19/50], Loss: 0.3587
Epoch [20/50], Loss: 0.3576
Epoch [21/50], Loss: 0.3572
Epoch [22/50], Loss: 0.3564
Epoch [23/50], Loss: 0.3564
Epoch [24/50], Loss: 0.3559
Epoch [25/50], Loss: 0.3555
Epoch [26/50], Loss: 0.3552
Epoch [27/50], Loss: 0.3549
Epoch [28/50], Loss: 0.3545
Epoch [29/50], Loss: 0.3543
Epoch [30/50], Loss: 0.3542
Epoch [31/50], Loss: 0.3534
Epoch [32/50], Loss: 0.3534
Epoch [33/50], Loss: 0.3535
Epoch [34/50], Loss: 0.3530
Epoch [35/50], Loss: 0.3526
Epoch [36/50], Loss: 0.3529
E

In [16]:
# 7, Load the best model state for evaluation
model.load_state_dict(torch.load('ctr_model_best.pth', 
                                 weights_only=True))


# 8, Evaluate the model on the test set
model.eval()
test_loss: float = 0.0
y_true: list[float] = []
y_pred: list[float] = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X: torch.Tensor = batch_X.to(device)  # Move data to GPU if available
        batch_y: torch.Tensor = batch_y.to(device)

        outputs: torch.Tensor = model(batch_X)  # Outputs are sigmoid values for the positive class
        loss: torch.Tensor = criterion(outputs, batch_y)
        test_loss += loss.item()

        y_true.extend(batch_y.cpu().numpy())  # Move to CPU and convert to numpy array
        y_pred.extend(outputs.cpu().numpy())  
    test_loss /= len(test_loader)

print(f'Test Loss for MLP: {test_loss:.4f}')
print(f'Test AUC for MLP: {roc_auc_score(y_true, y_pred):.4f}')

Test Loss for MLP: 0.4717
Test AUC for MLP: 0.7246
