In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *

# Preprocessing

In [74]:
data = pd.read_csv('smaller_sample.csv')
events = pd.read_csv('Event Definitions.csv')
# data.customer_id = list(range(0, len(data)))
df = fingerhut_data_cleaner(data, events)

In [75]:
idxs = list(df[df['event_name'] == 'promotion_created'].index)
df.drop(idxs, inplace=True)
df.reset_index(drop=True, inplace=True)

In [76]:
def add_downpayment_cleared(df):
    downpaymen_cleared = df.groupby('customer_id')['ed_id'].apply(lambda x: 27 in x.values).reset_index(name='downpayment_cleared')
    return pd.merge(df, downpaymen_cleared, on='customer_id')

Add some other features. Will need more in the future.

In [77]:
df = add_n_accounts(df)
df = add_has_discover(df)
df = add_downpayment_cleared(df)

Filter customers with at least 10 events and downpayment not cleared in the first 10 events. Want to see if models can predict whether or not a customer clears downpayment by only looking at first 10 actions.

In [78]:
df = df.sort_values(
        ['customer_id', 'journey_steps_until_end']
    ).groupby(['customer_id'], sort=False).agg(
        {
            'ed_id': lambda x: list(x)[:10],
            'journey_steps_until_end': lambda x: list(x)[-1],
            'has_discover': 'first',
            'downpayment_cleared': 'first',
            'n_accounts': 'first',
        }
)

df = df[df.ed_id.apply(lambda x: len(x) == 10)]

In [79]:
df = df.reset_index()

In [80]:
idx = df[df.ed_id.apply(lambda x: 27 in x)].index
df.drop(idx, inplace=True)

In [436]:
df_X = df.drop(columns=['downpayment_cleared', 'customer_id'])
target = df.downpayment_cleared.astype(int)

# Model training

In [437]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [438]:
class Embedding(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.fc1 = nn.Linear(10, embedding_dim, bias=False)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(embedding_dim, embedding_dim, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [439]:
# Embed event id sequence into smaller dimension
event_id = df_X.ed_id
event_id = torch.tensor(event_id.to_list()).float()
emb = Embedding(5)
event_id = emb(event_id)

## Regression / Boost

In [454]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [449]:
event_id_df = pd.DataFrame(event_id.detach().numpy())
ori_dfx = df_X.drop(columns='ed_id').reset_index(drop=True)
new_dfx = pd.concat([ori_dfx, event_id_df], axis=1)

In [450]:
X_train, X_test, y_train, y_test = train_test_split(new_dfx, target, train_size=.8, random_state=123)

In [453]:
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
prediction = log_clf.predict(X_test)
accuracy_score(y_test, prediction)

0.7217759731662106

In [428]:
clf = XGBClassifier(n_estimators=200, 
                    max_depth=10, 
                    learning_rate=0.1, 
                    tree_method='approx',
                    objective='binary:logistic')

clf.fit(X_train, y_train)

preds = clf.predict(X_test)

(accuracy_score(y_test, preds))

0.7234530850030895

## NN

In [455]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [456]:
feature_tensors = [torch.tensor(df[feature].values).float().unsqueeze(1) for \
                   feature in ['journey_steps_until_end', 'has_discover', 'n_accounts']]

res = torch.cat([event_id] + feature_tensors, dim=1).to(device)
target = torch.tensor(target, dtype=torch.float32).view(-1, 1).to(device)

X_train, X_test, y_train, y_test = train_test_split(res, target, train_size=.8, random_state=42)
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

In [457]:
class Classifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [458]:
# torch.manual_seed(1)
model = Classifier(input_dim=8).to(device)

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs.detach_()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    model.eval()

    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            prediction = (outputs > 0.5).float()
            correct += (prediction == labels).sum().item()
            total += labels.size(0)

            val_loss += criterion(outputs, labels).item()
    val_loss /= len(test_loader)
    
    print(f"Epoch {epoch + 1} / {num_epochs}, Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}, Accuracy: {float(correct) / float(total) * 100:.4f}")

Epoch 1 / 10, Loss: 0.3899, Validation Loss: 0.5520, Accuracy: 72.6807
Epoch 2 / 10, Loss: 0.5274, Validation Loss: 0.5716, Accuracy: 70.3063
Epoch 3 / 10, Loss: 0.1835, Validation Loss: 0.5574, Accuracy: 72.6631
Epoch 4 / 10, Loss: 0.4919, Validation Loss: 0.5472, Accuracy: 72.5925
Epoch 5 / 10, Loss: 0.8179, Validation Loss: 0.5706, Accuracy: 72.6807
Epoch 6 / 10, Loss: 0.6930, Validation Loss: 0.5803, Accuracy: 68.7792
Epoch 7 / 10, Loss: 0.6653, Validation Loss: 0.5925, Accuracy: 69.0794
Epoch 8 / 10, Loss: 0.5114, Validation Loss: 0.5547, Accuracy: 71.3126
Epoch 9 / 10, Loss: 0.2207, Validation Loss: 0.5662, Accuracy: 72.6631
Epoch 10 / 10, Loss: 0.3752, Validation Loss: 0.5426, Accuracy: 72.6013
