In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from feature_scaler import *

# Model 3

In [8]:
# Which prepared dataset variant to load.
dataset_nr = 5

# Read the six split files (features and targets for train, validation and
# test). The names are unpacked in the same order as the file stems below.
X_train, y_train, X_val, y_val, X_test, y_test = [
    pd.read_csv(f"Inputs/Dataset_{dataset_nr}/{split_name}.csv")
    for split_name in ("X_train", "y_train", "X_val", "y_val", "X_test", "y_test")
]

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import mean_absolute_error
from math import sqrt
import time

# ---- Model dimensions ----
# Column count of the training frame; this still includes the two id columns
# that are split off into their own inputs below.
n_features = X_train.shape[1]

# Cardinality of each categorical id in the training data, and the embedding
# width derived from it (integer part of the square root).
n_unique_customers = X_train['customer_id_enc'].nunique()
embedding_dim = int(sqrt(n_unique_customers))

n_unique_countries = X_train['country_enc'].nunique()
embedding_dim_country = int(sqrt(n_unique_countries))

# Collects one (run, mae, run_time) tuple per training run.
results = []


def _split_model_inputs(frame):
    """Split a feature frame into the three arrays the model is fed with.

    Returns a tuple ``(customer_ids, country_ids, other_features)``: the
    ``customer_id_enc`` column, the ``country_enc`` column, and all remaining
    columns, each obtained through ``.values``.
    """
    customer_ids = frame['customer_id_enc'].values
    country_ids = frame['country_enc'].values
    remaining = frame.drop(columns=['customer_id_enc', 'country_enc']).values
    return customer_ids, country_ids, remaining


# ---- Model inputs for the train, validation and test splits ----
X_train_customer_id, X_train_country, X_train_other_features = _split_model_inputs(X_train)
X_val_customer_id, X_val_country, X_val_other_features = _split_model_inputs(X_val)
X_test_customer_id, X_test_country, X_test_other_features = _split_model_inputs(X_test)


#Define Model 3:
def model_3(n_unique_customers, n_unique_countries, n_features,
            embedding_dim=None, embedding_dim_country=None):
    """Build and compile Model 3: two id embeddings plus dense features.

    The network takes three inputs (customer id, country id, remaining
    features), embeds the two ids, concatenates the flattened embeddings with
    the remaining features and passes them through Dense(128) -> Dense(64)
    -> Dense(1, linear).

    Args:
        n_unique_customers: number of distinct customer ids; the customer
            embedding table gets ``n_unique_customers + 1`` rows.
        n_unique_countries: number of distinct country ids; the country
            embedding table gets ``n_unique_countries + 1`` rows.
        n_features: total number of feature columns *including* the two id
            columns; the dense input therefore has ``n_features - 2`` columns.
        embedding_dim: width of the customer embedding. Defaults to
            ``int(sqrt(n_unique_customers))`` (at least 1).
        embedding_dim_country: width of the country embedding. Defaults to
            ``int(sqrt(n_unique_countries))`` (at least 1).

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, MSE loss, MAE metric).
    """
    # Derive the embedding widths from the arguments instead of reading
    # module-level globals, so the function is self-contained and stays
    # consistent with the cardinalities it is actually called with.
    if embedding_dim is None:
        embedding_dim = max(1, int(sqrt(n_unique_customers)))
    if embedding_dim_country is None:
        embedding_dim_country = max(1, int(sqrt(n_unique_countries)))

    # Input layers
    customer_id_input = Input(shape=(1,), name='customer_id_input')  # Input for customer_id_enc
    country_input = Input(shape=(1,), name='country_input')  # Input for country_enc
    # Input for other features, excluding customer_id_enc and country_enc
    other_features_input = Input(shape=(n_features - 2,), name='other_features_input')

    # Embedding layer for customer_id_enc; input_dim + 1 leaves one extra row
    # intended for categories not seen during training.
    # NOTE(review): valid indices are 0..n_unique_customers, so this assumes
    # ids are encoded contiguously and unseen ids map to n_unique_customers
    # -- confirm against the encoder.
    customer_id_embedding = Embedding(input_dim=n_unique_customers + 1, output_dim=embedding_dim)(customer_id_input)
    customer_id_embedding = Flatten()(customer_id_embedding)  # Flatten the embedding (from 2D to 1D)

    # Embedding layer for country_enc (same + 1 convention as above)
    country_embedding = Embedding(input_dim=n_unique_countries + 1, output_dim=embedding_dim_country)(country_input)
    country_embedding = Flatten()(country_embedding)  # Flatten the embedding (from 2D to 1D)

    # Concatenate embeddings with other features
    concatenated = Concatenate()([customer_id_embedding, country_embedding, other_features_input])

    # Add Dense layers
    x = Dense(128, activation="relu")(concatenated)
    x = Dense(64, activation="relu")(x)

    # Output layer with 1 neuron
    output = Dense(1, activation="linear")(x)

    # Create the model
    model = Model(inputs=[customer_id_input, country_input, other_features_input], outputs=output)

    # Configure the model
    model.compile(optimizer='adam', loss="mean_squared_error", metrics=["mean_absolute_error"])

    return model

# Train Model 3 several times from scratch so that the mean and standard
# deviation of the test MAE across runs can be reported.
import os

n_runs = 5
model_dir = 'learned_models/Model_3'
results_path = f'Results_df/Model_3/Data_{dataset_nr}.csv'

# Create the output folders before any training starts: DataFrame.to_csv does
# not create missing parent directories, so without this a missing folder
# would only surface after all runs have finished and lose their results.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(os.path.dirname(results_path), exist_ok=True)

for run in range(1, n_runs + 1):
    print(f"Run {run}")
    start_time = time.time()

    # Single source of truth for the checkpoint location of this run; used
    # both for saving during training and for reloading afterwards.
    model_path = f'{model_dir}/data_{dataset_nr}_run_{run}.keras'

    model = model_3(n_unique_customers, n_unique_countries, n_features)
    checkpoint = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit([X_train_customer_id, X_train_country, X_train_other_features], y_train,
              validation_data=([X_val_customer_id, X_val_country, X_val_other_features], y_val),
              epochs=20, batch_size=256, callbacks=[checkpoint, early_stopping])

    # Reload the checkpoint with the lowest validation loss of this run
    best_model = load_model(model_path)

    # Evaluate the best model of each run at the end
    y_pred = best_model.predict([X_test_customer_id, X_test_country, X_test_other_features])
    mae = mean_absolute_error(y_test.values, y_pred)

    # Note: the measured time covers training, reloading and test prediction
    end_time = time.time()
    run_time = end_time - start_time

    results.append((run, mae, run_time))

# Store results in a DataFrame
results_df = pd.DataFrame(results, columns=['Run', 'MAE', 'Run Time'])

# Save the DataFrame to a CSV file (folder created above)
results_df.to_csv(results_path, index=False)

# Compute average MAE, standard deviation, and average running time
average_mae = results_df['MAE'].mean()
std_mae = results_df['MAE'].std()
average_run_time = results_df['Run Time'].mean()

print("Results DataFrame:")
print(results_df)
print(f"Average MAE: {round(average_mae, ndigits=2)}")
print(f"Standard Deviation of MAE: {round(std_mae, ndigits=2)}")
print(f"Average Running Time: {round(average_run_time, ndigits=2)} seconds")

In [None]:
# Report the embedding configuration, one "label value" line per entry.
for label, value in (
    ("Number unique customers: ", n_unique_customers),
    ("Embedding dimensions Customer ID: ", embedding_dim),
    ("Number unique countries: ", n_unique_countries),
    ("Embedding dimensions Country: ", embedding_dim_country),
):
    print(label, value)

# Layer overview of `model` as last assigned by the training loop.
model.summary()

## Testing performance on the test set that also includes new customers

In [None]:
from keras.models import load_model
from sklearn.metrics import mean_absolute_error, median_absolute_error

# Evaluate a saved Model 3 checkpoint on the test set that also contains new
# customers, and on the regular test set for comparison.
dataset_nr = 5
model_run = 3  # training run whose saved checkpoint is evaluated

# Testing with new customers:
X_test_all = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_test_all_customers.csv")
y_test_all = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_test_all_customers.csv")


# Comparison without new customers
X_test_subset = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_test.csv")
y_test_subset = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_test.csv")


# Build the checkpoint path from dataset_nr / model_run so that changing the
# dataset above cannot silently evaluate a model trained on another dataset.
best_model = load_model(f'learned_models/Model_3/data_{dataset_nr}_run_{model_run}.keras')


### Results for Dataset with new customers

# NOTE(review): the ids of new customers must be valid indices of the model's
# embedding layers -- confirm how unseen customers are encoded.
X_test_customer_id = X_test_all['customer_id_enc'].values
X_test_country = X_test_all['country_enc'].values
X_test_other_features = X_test_all.drop(columns=['customer_id_enc', 'country_enc']).values


# Make predictions on X_test_all
predicted_values = best_model.predict([X_test_customer_id, X_test_country, X_test_other_features])

# Convert the predicted values to a DataFrame
predicted_df = pd.DataFrame(predicted_values, columns=['Predicted'])

# Put the actual values next to the predictions for comparison
actual_df = pd.DataFrame(y_test_all.values, columns=['Actual'])
df_comparison = pd.concat([actual_df, predicted_df], axis=1)
# Round the predicted values. The errors below are therefore computed on
# rounded predictions, unlike the per-run MAE of the training loop.
df_comparison['Predicted'] = np.round(df_comparison['Predicted'])


### Results for Dataset without new customers for comparison

X_test_customer_id_subset = X_test_subset['customer_id_enc'].values
X_test_country_subset = X_test_subset['country_enc'].values
X_test_other_features_subset = X_test_subset.drop(columns=['customer_id_enc', 'country_enc']).values

# Make predictions on X_test_subset
predicted_values_subset = best_model.predict([X_test_customer_id_subset, X_test_country_subset, X_test_other_features_subset])

predicted_df_subset = pd.DataFrame(predicted_values_subset, columns=['Predicted'])

actual_df_subset = pd.DataFrame(y_test_subset.values, columns=['Actual'])
df_comparison_subset = pd.concat([actual_df_subset, predicted_df_subset], axis=1)
df_comparison_subset['Predicted'] = np.round(df_comparison_subset['Predicted'])


### Report mean / median absolute error for both test sets

print("Shape of X_test without new customers: ", X_test_subset.shape)
MAE_subset = mean_absolute_error(df_comparison_subset['Actual'], df_comparison_subset['Predicted'])
MedAE_subset = median_absolute_error(df_comparison_subset['Actual'], df_comparison_subset['Predicted'])
print(f"Mean Absolute without new customers Error: {round(MAE_subset, ndigits=2)}")
print(f"Median Absolute without new customers Error: {round(MedAE_subset, ndigits=2)}")
print('------------------------------------------------')
print("Shape of X_test with new customers: ", X_test_all.shape)
MAE = mean_absolute_error(df_comparison['Actual'], df_comparison['Predicted'])
MedAE = median_absolute_error(df_comparison['Actual'], df_comparison['Predicted'])
print(f"Mean Absolute with new customers Error: {round(MAE, ndigits=2)}")
print(f"Median Absolute with new customers Error: {round(MedAE, ndigits=2)}")
print('------------------------------------------------')