In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *

# Preprocessing

In [74]:
data = pd.read_csv('smaller_sample.csv')
events = pd.read_csv('Event Definitions.csv')
# data.customer_id = list(range(0, len(data)))
df = fingerhut_data_cleaner(data, events)

In [75]:
idxs = list(df[df['event_name'] == 'promotion_created'].index)
df.drop(idxs, inplace=True)
df.reset_index(drop=True, inplace=True)

In [76]:
def add_downpayment_cleared(df):
    downpaymen_cleared = df.groupby('customer_id')['ed_id'].apply(lambda x: 27 in x.values).reset_index(name='downpayment_cleared')
    return pd.merge(df, downpaymen_cleared, on='customer_id')

Add some other features. Will need more in the future.

In [77]:
df = add_n_accounts(df)
df = add_has_discover(df)
df = add_downpayment_cleared(df)

Filter customers with at least 10 events and downpayment not cleared in the first 10 events. Want to see if models can predict whether or not a customer clears downpayment by only looking at first 10 actions.

In [78]:
df = df.sort_values(
        ['customer_id', 'journey_steps_until_end']
    ).groupby(['customer_id'], sort=False).agg(
        {
            'ed_id': lambda x: list(x)[:10],
            'journey_steps_until_end': lambda x: list(x)[-1],
            'has_discover': 'first',
            'downpayment_cleared': 'first',
            'n_accounts': 'first',
        }
)

df = df[df.ed_id.apply(lambda x: len(x) == 10)]

In [79]:
df = df.reset_index()

In [80]:
idx = df[df.ed_id.apply(lambda x: 27 in x)].index
df.drop(idx, inplace=True)

In [436]:
df_X = df.drop(columns=['downpayment_cleared', 'customer_id'])
target = df.downpayment_cleared.astype(int)

# Read in available data

In [13]:
df = pd.read_csv('data_with_embeddings.csv')

In [14]:
df.columns

Index(['num_journeys', 'max_journey', 'discover', 'number_accounts',
       'one_more_journey', 'most_repeated_event', 'average_length_seq',
       'approved_credit', 'order_ships', 'has_prospecting',
       'has_pre_application', 'initial_device', 'time_in_discover',
       'time_in_apply', 'event_id_0', 'event_id_1', 'event_id_2', 'event_id_3',
       'event_id_4', 'time_0', 'time_1', 'time_2', 'time_3', 'time_4'],
      dtype='object')

In [15]:
# df.drop(columns=['max_milestone', 'downpayment_cleared', 'first_purchase', 
# 'downpayment_received', 'account_activitation', 'customer_id'], inplace=True)
df = df.dropna(axis=0)

df_0, df_1 = df[df.order_ships == 0], df[df.order_ships == 1]
df_0 = df_0.sample(n=len(df_1))
# df_1 = df_1.sample(n=(len(df_0)), replace=True)
df_balanced = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)

# shuffle
df_balanced = df_balanced.sample(frac=1)

df_X = df_balanced.drop(columns='order_ships')
target = df_balanced.order_ships

# Model training

## Embeddings

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [15]:
class Embedding(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.fc1 = nn.Linear(10, embedding_dim, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, embedding_dim, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [439]:
# Embed event id sequence into smaller dimension
event_id = df_X.ed_id
event_id = torch.tensor(event_id.to_list()).float()
emb = Embedding(5)
event_id = emb(event_id)

## Regression / Boost

In [17]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
event_id_df = pd.DataFrame(event_id.detach().numpy())
ori_dfx = df_X.drop(columns='ed_id').reset_index(drop=True)
new_dfx = pd.concat([ori_dfx, event_id_df], axis=1)

In [20]:
# X_train, X_test, y_train, y_test = train_test_split(new_dfx, target, train_size=.8, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(df_X, target, train_size=.8, random_state=2024)

In [33]:
log_clf = LogisticRegression(random_state=2024)
log_clf.fit(X_train, y_train)
prediction = log_clf.predict(X_test)
accuracy_score(y_test, prediction)

0.5602726991013325

In [22]:
clf = XGBClassifier(n_estimators=200, 
                    max_depth=10, 
                    learning_rate=0.1, 
                    tree_method='approx',
                    objective='binary:logistic',)

clf.fit(X_train, y_train)

preds = clf.predict(X_test)

accuracy_score(y_test, preds)

0.8379299659126124

In [27]:
ada_clf = AdaBoostClassifier(n_estimators=200,
                             learning_rate=1.0,
                             random_state=2024)

ada_clf.fit(X_train, y_train)
ada_prediction = ada_clf.predict(X_test)
accuracy_score(y_test, ada_prediction)

0.8021382088627208

In [32]:
gb_clf = GradientBoostingClassifier(n_estimators=200,
                                    learning_rate=0.1,
                                    loss='log_loss',
                                    max_depth=10,
                                    tol=1e-5,
                                    random_state=2024)

gb_clf.fit(X_train, y_train)
gb_prediction = gb_clf.predict(X_test)
accuracy_score(y_test, gb_prediction)

0.8359157111868608

Seems like boosting algorithms (XGBoost, AdaBoost, Gradient Boosting) are achieving better results.

## NN

In [112]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [116]:
feature_columns = df_X.columns.to_list()
feature_tensors = [torch.tensor(df_balanced[feature].values).float().unsqueeze(1) for feature in feature_columns]

# res = torch.cat([event_id] + feature_tensors, dim=1).to(device)
res = torch.cat(feature_tensors, dim=1).to(device)
target = torch.tensor(target, dtype=torch.float32).view(-1, 1).to(device)

X_train, X_test, y_train, y_test = train_test_split(res, target, train_size=.7, random_state=42)
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

In [117]:
class Classifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.leaky_relu = nn.LeakyReLU()
        self.fc2 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.leaky_relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [125]:
torch.manual_seed(1)
model = Classifier(input_dim=23).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=2e-5)

num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs.detach_()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    model.eval()

    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            prediction = (outputs > 0.5).float()
            correct += (prediction == labels).sum().item()
            total += labels.size(0)

            val_loss += criterion(outputs, labels).item()
    val_loss /= len(test_loader)
    
    print(f"Epoch {epoch + 1} / {num_epochs}, Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}, Accuracy: {float(correct) / float(total) * 100:.4f}")

Epoch 1 / 50, Loss: 31.5605, Validation Loss: 36.6210, Accuracy: 62.1423
Epoch 2 / 50, Loss: 58.6207, Validation Loss: 45.9562, Accuracy: 53.4965
Epoch 3 / 50, Loss: 55.1724, Validation Loss: 47.4891, Accuracy: 52.3293
Epoch 4 / 50, Loss: 48.2759, Validation Loss: 46.3556, Accuracy: 53.2280
Epoch 5 / 50, Loss: 51.9932, Validation Loss: 45.7640, Accuracy: 53.6618
Epoch 6 / 50, Loss: 44.8276, Validation Loss: 46.0418, Accuracy: 53.6515
Epoch 7 / 50, Loss: 44.8276, Validation Loss: 45.0252, Accuracy: 54.4159
Epoch 8 / 50, Loss: 51.7241, Validation Loss: 46.2333, Accuracy: 53.5172
Epoch 9 / 50, Loss: 44.8276, Validation Loss: 45.3558, Accuracy: 54.2609
Epoch 10 / 50, Loss: 37.9310, Validation Loss: 45.5692, Accuracy: 54.0647
Epoch 11 / 50, Loss: 34.4828, Validation Loss: 45.1963, Accuracy: 54.2816
Epoch 12 / 50, Loss: 41.3793, Validation Loss: 39.2092, Accuracy: 60.3760
Epoch 13 / 50, Loss: 48.2759, Validation Loss: 45.2203, Accuracy: 54.4882
Epoch 14 / 50, Loss: 31.0345, Validation Loss: 