# LSTM Building Notebook for Next Event Prediction

LSTM (long short-term memory) is a neural network architecture built on top of RNNs that allows us to predict the next event based on a given history of events, similar to predicting the next word given a sequence of previous words (one of the most popular applications of LSTMs). This notebook will process our data, then train and evaluate an LSTM model for predicting the next event of a customer's journey.

### I. Data Processing

In [1]:
import pandas as pd
import numpy as np

# Import initial data: the sample data file and the event definitions file.
# Both paths are relative, pointing at the parent of the working directory.
data = pd.read_csv('../smaller_sample.csv')
event_defs = pd.read_csv('../event_definitions.csv')

In [2]:
# Initial data processing here:
# fingerhut_data_cleaner comes from the project-local utils module; its exact
# cleaning steps are not visible in this notebook. The df.head() output further
# down shows the columns that are present after cleaning.
from utils import fingerhut_data_cleaner
df = fingerhut_data_cleaner(data, event_defs)

In [None]:
#! TODO: Add journey_id column, customer_quit event, and time since last event variable

In [27]:
# Count the distinct event ids: 26 unique events originally; this becomes 27
# once the customer_quit event is added (not yet implemented).
df.ed_id.nunique()

26

In [32]:
# Preview the first rows of the cleaned frame to check its columns.
df.head()

Unnamed: 0,customer_id,account_id,ed_id,event_name,event_timestamp,journey_steps_until_end,stage,event_encoded
0,278713037,1812321640,2,campaign_click,2021-05-31 06:00:00,1,Discover,13
1,278713037,1812321640,19,application_web_view,2021-05-31 23:11:03,2,Apply for Credit,10
2,278713037,1812321640,3,application_web_submit,2021-05-31 23:11:51,3,Apply for Credit,9
3,278713037,1812321640,19,application_web_view,2021-05-31 23:11:51,4,Apply for Credit,10
4,278713037,1812321640,19,application_web_view,2021-05-31 23:11:54,5,Apply for Credit,10


In [29]:
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# nn.CrossEntropyLoss expects integer targets in [0, num_classes). The raw
# ed_id values are used as targets when the sequences are built, and they are
# not guaranteed to be contiguous from zero, so size the output layer from the
# largest id rather than from the count of distinct ids (26 at the moment).
num_classes = int(df.ed_id.max()) + 1

In [None]:
def create_sequences(df, seq_length=10, event_col='ed_id',
                     step_col='journey_steps_until_end',
                     group_cols=('customer_id', 'journey_id')):
    """Build sliding windows of (event, step) features and next-event targets.

    Rows are grouped by ``group_cols`` and, within each group, taken in the
    order they appear in ``df`` (sort the frame beforehand if that order is
    not chronological). Every run of ``seq_length`` consecutive rows becomes
    one sample whose target is the event of the row that follows the window.
    Groups with ``seq_length`` rows or fewer contribute no samples.

    Args:
        df: DataFrame containing ``group_cols``, ``event_col`` and ``step_col``.
        seq_length: Number of time steps per window; must be >= 1.
        event_col: Column holding the event identifier, used both as a
            feature and as the prediction target.
        step_col: Column holding the step counter used as the second feature.
        group_cols: Columns that together identify one journey.

    Returns:
        Tuple ``(X, y)`` of numpy arrays. ``X`` has shape
        ``(n_samples, seq_length, 2)`` and ``y`` has shape ``(n_samples,)``.
        When no group is long enough, ``X`` is an empty array that still has
        shape ``(0, seq_length, 2)``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
        KeyError: If one of the requested columns is missing from ``df``.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length!r}")

    X = []
    y = []

    # groupby treats a tuple as a single key, so pass the columns as a list.
    for _, group in df.groupby(list(group_cols)):
        events = group[event_col].tolist()
        steps = group[step_col].tolist()

        # Pair each event with its step once, then slice windows out of it.
        features = [[event, step] for event, step in zip(events, steps)]
        for start in range(len(events) - seq_length):
            stop = start + seq_length
            X.append(features[start:stop])
            y.append(events[stop])

    if not X:
        # np.array([]) would collapse to shape (0,); keep the feature axes so
        # downstream code can rely on a 3-D array.
        return np.empty((0, seq_length, 2)), np.empty((0,))
    return np.array(X), np.array(y)

# Window length: number of past events handed to the model per sample.
seq_length = 3  # Adjust based on your data
X, y = create_sequences(df, seq_length)

# Split the dataset: 80% train / 20% test, seeded so the split is repeatable.
# NOTE(review): windows are shuffled individually, so overlapping windows from
# the same journey can land in both sets — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
class EventDataset(Dataset):
    """Expose window features and next-event labels as tensors for a DataLoader.

    Args:
        X: Array-like of features, one entry per sample; stored as a float32
            tensor.
        y: Array-like of integer class labels, one per sample; stored as an
            int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  # Ensure correct data type
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail fast: a length mismatch would otherwise only show up as an
        # IndexError in the middle of an epoch.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

# Create Dataset objects
train_dataset = EventDataset(X_train, y_train)
test_dataset = EventDataset(X_test, y_test)

# Create DataLoader objects
# Training batches are reshuffled every epoch; the test loader keeps a fixed
# order so its predictions line up with y_test.
batch_size = 64  # Adjust as per your requirements
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### II. Building the Model

In [None]:
class LSTMModelWithSteps(nn.Module):
    """LSTM classifier that scores the next event from a window of features.

    Args:
        input_dim: Number of features per time step (2 when each step carries
            an event and a step value).
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of output classes (length of the logit vector).
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # Input dim is 2 because we now include event and step
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch shaped (batch, seq_len, input_dim)."""
        # nn.LSTM starts from all-zero hidden and cell states when none are
        # passed, created on the input's device and dtype, so there is no need
        # to allocate them by hand on every call.
        out, _ = self.lstm(x)
        # We only use the output of the last time step
        return self.fc(out[:, -1, :])


In [None]:
# Model instantiation
input_dim = 2  # Number of features per time step (event and step), not the sequence length
hidden_dim = 100  # Adjust the hidden layer dimension
layer_dim = 1  # Number of LSTM layers
output_dim = num_classes  # Number of output classes

model = LSTMModelWithSteps(input_dim, hidden_dim, layer_dim, output_dim)

### III. Train the Model

In [None]:
# Pick the training device: CUDA when it is available, otherwise the CPU,
# and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

In [None]:
# Move the model's parameters to the selected device (the GPU when one was
# found above, otherwise the CPU).
model = model.to(device)

In [None]:
from tqdm import tqdm  

# Assuming model and train_loader are already defined
criterion = nn.CrossEntropyLoss()  # Multi-class loss applied to the model's raw outputs
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
NUM_EPOCHS = 3  # Number of full passes over train_loader

# Your test_loader will act as the validation_loader in this context
# NOTE(review): the held-out test split doubles as the validation set here, so
# no separate untouched test set remains — confirm this is intended.
validation_loader = test_loader

def validate(model, validation_loader, criterion, device):
    """Evaluate ``model`` on ``validation_loader`` without updating weights.

    The model is switched to evaluation mode and is left in that mode when the
    function returns.

    Args:
        model: Module mapping a batch of sequences to per-class scores.
        validation_loader: Iterable of ``(sequences, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(val_loss, val_accuracy)``: the mean of the per-batch loss
        values and the accuracy as a percentage in ``[0, 100]``. Both are
        ``nan`` when the loader yields no samples.
    """
    model.eval()  # Set the model to evaluation mode
    loss_sum = 0.0
    num_batches = 0
    correct = 0
    total = 0
    with torch.no_grad():  # No need to track gradients during validation
        for sequences, labels in validation_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences)
            loss_sum += criterion(outputs, labels).item()
            # Count batches while iterating so loaders without __len__ work too.
            num_batches += 1
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if num_batches == 0 or total == 0:
        # Nothing was evaluated: the metrics are undefined, so report nan
        # instead of dividing by zero.
        return float("nan"), float("nan")

    val_loss = loss_sum / num_batches
    val_accuracy = 100 * correct / total
    return val_loss, val_accuracy

# Training loop: one pass over train_loader per epoch, then a validation pass.
for epoch in range(NUM_EPOCHS):
    model.train()  # validate() leaves the model in eval mode, so switch back every epoch
    loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=True)
    for i, (sequences, labels) in loop:
        sequences, labels = sequences.to(device), labels.to(device)
        optimizer.zero_grad()  # Clear gradients left over from the previous step
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Progress-bar bookkeeping: show the epoch counter and the latest batch loss
        loop.set_description(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
        loop.set_postfix(loss=loss.item())

    # Validation phase (once per epoch)
    val_loss, val_accuracy = validate(model, validation_loader, criterion, device)
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

In [None]:
# Save only the model parameters (recommended)
# Only the state_dict is written, so the model class has to be defined again
# before these weights can be loaded back.
torch.save(model.state_dict(), 'lstm_predictor.pth')

### IV. Inference

In [None]:
# Rerun Part II (Building the Model) to define the model class if necessary
# model = LSTMModelWithSteps(input_dim, hidden_dim, layer_dim, output_dim).to(device)

In [None]:
# Load weights into model (only necessary if you started a new session)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a GPU session can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('lstm_predictor.pth', map_location=device))

In [None]:
def predict(model, test_loader, device):
    """Return the predicted class index for every sample in ``test_loader``.

    The model is put into evaluation mode and run without gradient tracking.
    Labels yielded by the loader are ignored.

    Args:
        model: Module mapping a batch of sequences to per-class scores.
        test_loader: Iterable of ``(sequences, labels)`` batches.
        device: Device each batch is moved to before the forward pass.

    Returns:
        A flat list with one predicted class index per sample, in the order
        the loader produced them.
    """
    model.eval()
    with torch.no_grad():
        # One numpy array of winning class indices per batch, moved to the CPU.
        per_batch = [
            torch.max(model(sequences.to(device)).data, 1)[1].cpu().numpy()
            for sequences, _ in test_loader
        ]
    # Flatten the per-batch arrays into a single list of predictions.
    return [label for batch in per_batch for label in batch]

In [None]:
# Make predictions: one predicted class index per test sample. test_loader is
# built with shuffle=False, so the order matches the test split.
predictions = predict(model, test_loader, device)

In [None]:
# Convert predictions list to a numpy array
predictions_array = np.array(predictions)

# Convert numerical labels back to original event names.
# The model is trained on raw ed_id values and no label encoder object is
# defined in this notebook, so build the id -> name lookup from the cleaned
# frame instead.
id_to_name = df.drop_duplicates('ed_id').set_index('ed_id')['event_name']
# reindex (rather than .loc) yields NaN instead of raising for a predicted id
# that never occurs in df.
event_names = id_to_name.reindex(predictions_array).to_numpy()