In [1]:
import os

import numpy as np
import pandas as pd
from keras.utils import set_random_seed
from matplotlib import pyplot as plt

# Wildcard import kept last so any names it shadows stay the same as before.
from feature_scaler import *

# Model 2


This notebook tests Model 2: a feed-forward neural network (FNN) with an embedding layer for the customer ID.

In [None]:
# Which prepared dataset variant to use; every split is read from its folder.
dataset_nr = 2
dataset_dir = f"Inputs/Dataset_{dataset_nr}"

# Each split consists of a feature file (X_*) and a target file (y_*).
X_train, y_train = (pd.read_csv(f"{dataset_dir}/{part}_train.csv") for part in ("X", "y"))
X_val, y_val = (pd.read_csv(f"{dataset_dir}/{part}_val.csv") for part in ("X", "y"))
X_test, y_test = (pd.read_csv(f"{dataset_dir}/{part}_test.csv") for part in ("X", "y"))

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import mean_absolute_error
from math import sqrt
import time


# Size of the customer embedding table.
# Keras' Embedding needs every index to satisfy 0 <= index < input_dim, i.e.
# input_dim must be at least (largest index + 1). The number of distinct
# training IDs only equals that when the encoding is a dense 0..n-1 range and
# no larger ID shows up in validation/test, so take the bigger of the two.
# For a dense encoding this is identical to the plain nunique() count.
n_train_customers = X_train['customer_id_enc'].nunique()
max_customer_id = int(max(split['customer_id_enc'].max() for split in (X_train, X_val, X_test)))
n_unique_customers = max(n_train_customers, max_customer_id + 1)

# Total number of input columns (still including customer_id_enc)
n_features = X_train.shape[1]

# Collects one (run, MAE, run time) tuple per training run
results = []

# Split every dataset into the customer ID (fed to the embedding) and the
# remaining feature columns (fed directly to the dense layers).
X_train_customer_id = X_train['customer_id_enc'].values
X_train_other_features = X_train.drop(columns=['customer_id_enc']).values
X_val_customer_id = X_val['customer_id_enc'].values
X_val_other_features = X_val.drop(columns=['customer_id_enc']).values

# Same split for the test data
X_test_customer_id = X_test['customer_id_enc'].values
X_test_other_features = X_test.drop(columns=['customer_id_enc']).values

# Embedding dimension; the square root of the vocabulary size is a commonly used rule of thumb
embedding_dim = int(sqrt(n_unique_customers))

# Define Model 2:
def model_2(n_unique_customers, n_features, embedding_dim=None):
    """Build and compile Model 2: an FNN with an embedding for the customer ID.

    The network has two inputs: the encoded customer ID (shape ``(1,)``), which
    is looked up in an embedding table and flattened, and the remaining
    ``n_features - 1`` feature columns. Both are concatenated and passed
    through Dense(128, relu) -> Dense(64, relu) -> Dense(1, linear).

    Parameters
    ----------
    n_unique_customers : int
        Size of the embedding table (``input_dim`` of the Embedding layer).
        Every customer ID fed to the model must be smaller than this value.
    n_features : int
        Total number of input columns including ``customer_id_enc``.
    embedding_dim : int, optional
        Width of the customer embedding. Defaults to
        ``int(sqrt(n_unique_customers))`` so the model no longer depends on a
        notebook-level global of the same name.

    Returns
    -------
    keras.Model
        Model compiled with the Adam optimizer, mean squared error loss and
        mean absolute error as metric. It expects its inputs as
        ``[customer_ids, other_features]``.
    """
    if embedding_dim is None:
        embedding_dim = int(sqrt(n_unique_customers))

    # Input layers
    customer_id_input = Input(shape=(1,), name='customer_id_input')  # Input for customer_id_enc
    other_features_input = Input(shape=(n_features - 1,), name='other_features_input')  # Input for other features

    # Embedding layer for customer_id_enc. The sequence length is already fixed
    # by the (1,) input shape; the `input_length` argument is deprecated in
    # Keras 3 and is therefore not passed.
    customer_id_embedding = Embedding(input_dim=n_unique_customers, output_dim=embedding_dim)(customer_id_input)
    customer_id_embedding = Flatten()(customer_id_embedding)  # (batch, 1, dim) -> (batch, dim)

    # Concatenate embedding with other features
    concatenated = Concatenate()([customer_id_embedding, other_features_input])

    # Hidden dense layers
    x = Dense(128, activation="relu")(concatenated)
    x = Dense(64, activation="relu")(x)

    # Output layer with 1 neuron (regression target)
    output = Dense(1, activation="linear")(x)

    # Create the model
    model = Model(inputs=[customer_id_input, other_features_input], outputs=output)

    # Configure the model
    model.compile(optimizer='adam', loss="mean_squared_error", metrics=["mean_absolute_error"])

    return model



# Train and evaluate the model 5 times
model_dir = 'learned_models/Model_2'
results_dir = 'Results_df/Model_2'

# DataFrame.to_csv does not create missing folders; create the output folders
# up front so a missing directory cannot discard the results after training.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

for run in range(1, 6):
    print(f"Run {run}")

    # A different but fixed seed per run: the runs still differ from each other
    # (so the spread of the MAE is meaningful) while the experiment as a whole
    # can be repeated.
    set_random_seed(run)

    start_time = time.time()

    model = model_2(n_unique_customers, n_features)
    model_path = f'{model_dir}/data_{dataset_nr}_run_{run}.keras'

    # Checkpoint: saves the model at the epoch with the best validation loss
    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, mode='min')
    # Early stopping after 5 epochs without improvement
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit([X_train_customer_id, X_train_other_features], y_train.values,
              validation_data=([X_val_customer_id, X_val_other_features], y_val.values),
              epochs=20, batch_size=256, callbacks=[checkpoint, early_stopping])

    # Reload the best checkpoint of this run for prediction
    best_model = load_model(model_path)

    # Evaluate the best model of each run on the test set
    y_pred = best_model.predict([X_test_customer_id, X_test_other_features])
    mae = mean_absolute_error(y_test.values, y_pred)

    # The measured run time covers training, reloading the checkpoint and prediction
    end_time = time.time()
    run_time = end_time - start_time

    results.append((run, mae, run_time))

# Store results in a DataFrame
results_df = pd.DataFrame(results, columns=['Run', 'MAE', 'Run Time'])

# Save the DataFrame to a CSV file in the results folder
results_df.to_csv(f'{results_dir}/data_{dataset_nr}.csv', index=False)

# Compute average MAE, standard deviation, and average running time
average_mae = results_df['MAE'].mean()
std_mae = results_df['MAE'].std()
average_run_time = results_df['Run Time'].mean()

print("Results DataFrame:")
print(results_df)
print(f"Average MAE: {round(average_mae, ndigits=2)}")
print(f"Standard Deviation of MAE: {round(std_mae, ndigits=2)}")
print(f"Average Running Time: {round(average_run_time, ndigits=2)} seconds")

In [None]:
# Report the embedding width used for the customer ID, followed by the
# layer-by-layer architecture. `model` is the instance left over from the last
# training run.
embedding_label = "Embedding dimensions Customer ID: "
print(embedding_label, embedding_dim)
model.summary()

## Testing performance on the test set with new customers

In [None]:
from keras.models import load_model
from sklearn.metrics import mean_absolute_error, median_absolute_error

# Evaluate one trained Model 2 on a test set that also contains customers
# never seen during training, and compare against the regular test set.
dataset_nr = 5
eval_run = 2  # which training run's checkpoint to load; update as needed


def _split_model_inputs(X):
    """Return [customer IDs, remaining feature matrix] in the order the model expects."""
    return [X['customer_id_enc'].values, X.drop(columns=['customer_id_enc']).values]


def _compare_predictions(model, X, y):
    """Predict on X and return a DataFrame with 'Actual' and rounded 'Predicted' columns."""
    predicted_values = model.predict(_split_model_inputs(X))
    comparison = pd.concat(
        [pd.DataFrame(y.values, columns=['Actual']),
         pd.DataFrame(predicted_values, columns=['Predicted'])],
        axis=1,
    )
    # Predictions are rounded to whole numbers before the errors are computed
    comparison['Predicted'] = np.round(comparison['Predicted'])
    return comparison


# Test set including new customers
X_test_all = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_test_all_customers.csv")
y_test_all = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_test_all_customers.csv")

# Test set without new customers, for comparison
X_test_subset = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_test.csv")
y_test_subset = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_test.csv")

# Checkpoint path is derived from dataset_nr so the model and the data cannot drift apart
best_model = load_model(f'learned_models/Model_2/data_{dataset_nr}_run_{eval_run}.keras')

# NOTE(review): new customers need a customer_id_enc that lies inside the
# embedding table of the loaded model; how unseen customers are encoded is not
# visible here - confirm against the preprocessing.

### Results for dataset with new customers
df_comparison = _compare_predictions(best_model, X_test_all, y_test_all)

### Results for dataset without new customers, for comparison
df_comparison_subset = _compare_predictions(best_model, X_test_subset, y_test_subset)

print("Shape of X_test without new customers: ", X_test_subset.shape)
MAE_subset = mean_absolute_error(df_comparison_subset['Actual'], df_comparison_subset['Predicted'])
MedAE_subset = median_absolute_error(df_comparison_subset['Actual'], df_comparison_subset['Predicted'])
print(f"Mean Absolute Error without new customers: {round(MAE_subset, ndigits=2)}")
print(f"Median Absolute Error without new customers: {round(MedAE_subset, ndigits=2)}")
print('------------------------------------------------')
print("Shape of X_test with new customers: ", X_test_all.shape)
MAE = mean_absolute_error(df_comparison['Actual'], df_comparison['Predicted'])
MedAE = median_absolute_error(df_comparison['Actual'], df_comparison['Predicted'])
print(f"Mean Absolute Error with new customers: {round(MAE, ndigits=2)}")
print(f"Median Absolute Error with new customers: {round(MedAE, ndigits=2)}")
print('------------------------------------------------')