In [4]:
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Flatten, Input, Embedding, Concatenate
from keras.models import Model, load_model
from keras.optimizers import Adam
from keras_tuner import  RandomSearch
from sklearn.metrics import mean_absolute_error

In [6]:
dataset_nr = 5

# Columns that are fed through embedding layers instead of being used as plain numeric features
embedded_columns = ['customer_id_enc', 'country_enc']


def split_model_inputs(features):
    """Split a feature frame into the three arrays the model takes as inputs.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame holding 'customer_id_enc', 'country_enc' and the remaining feature columns.

    Returns
    -------
    tuple of numpy.ndarray
        (customer ids, country ids, all remaining columns), in that order.
    """
    return (
        features['customer_id_enc'].values,
        features['country_enc'].values,
        features.drop(columns=embedded_columns).values,
    )


X_train = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_train.csv")
y_train = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_train.csv")
X_val = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_val.csv")
y_val = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_val.csv")
X_test = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/X_test.csv")
y_test = pd.read_csv(f"Inputs/Dataset_{dataset_nr}/y_test.csv")

# The "other features" are handed to the model as a bare array, so every split must have
# exactly the same columns in the same order as the training split.
for split_name, split in (("validation", X_val), ("test", X_test)):
    if list(split.columns) != list(X_train.columns):
        raise ValueError(
            f"Columns of the {split_name} features differ from the training features "
            f"(names or order); the model inputs would be misaligned."
        )

# Define input shape
n_features = X_train.shape[1]

# Number of unique customers
n_unique_customers = X_train['customer_id_enc'].nunique()
embedding_dim = int(sqrt(n_unique_customers))

# Number unique countries
n_unique_countries = X_train['country_enc'].nunique()
embedding_dim_country = int(sqrt(n_unique_countries))

# The embedding tables are created with (number of unique training ids + 1) rows, so the
# only valid encoded ids are 0 .. number of unique training ids. Fail early with a clear
# message if any split contains an id outside that range, instead of hitting an
# out-of-bounds embedding lookup during training or prediction (which, depending on
# backend and device, may raise a hard-to-read error or not raise at all).
for column, n_known in (('customer_id_enc', n_unique_customers),
                        ('country_enc', n_unique_countries)):
    for split_name, split in (("train", X_train), ("validation", X_val), ("test", X_test)):
        lowest, highest = split[column].min(), split[column].max()
        if lowest < 0 or highest > n_known:
            raise ValueError(
                f"'{column}' in the {split_name} split has values in [{lowest}, {highest}], "
                f"but the embedding only covers ids 0..{n_known}."
            )

# Prepare the data
X_train_customer_id, X_train_country, X_train_other_features = split_model_inputs(X_train)
X_val_customer_id, X_val_country, X_val_other_features = split_model_inputs(X_val)

# Extract the test features
X_test_customer_id, X_test_country, X_test_other_features = split_model_inputs(X_test)

In [None]:
def build_model(hp):
    """Build and compile one candidate regression network for the tuner.

    The network has three inputs: the encoded customer id, the encoded country and the
    remaining features. Both ids go through their own embedding layer; the embeddings are
    concatenated with the remaining features and passed through a tunable stack of
    Dense + Dropout layers ending in a single linear output unit.

    Tuned hyperparameters:
      - 'num_layers': number of hidden layers, 1 to 5
      - 'units_<i>': width of hidden layer i, one of 32, 64, 128, 256
      - 'dropout_<i>': dropout rate after hidden layer i, between 0 and 0.5
      - 'lr': Adam learning rate, one of 1e-5, 1e-4, 1e-3, 1e-2, 1e-1

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner.

    Returns
    -------
    keras.Model
        Model compiled with mean squared error loss and mean absolute error as metric.
    """

    def embed_id(id_input, vocabulary_size, width):
        # One vector per id; Flatten removes the length-1 axis left by the (1,)-shaped input
        vectors = Embedding(input_dim=vocabulary_size, output_dim=width)(id_input)
        return Flatten()(vectors)

    # One input per id column plus one for everything else (all features minus the two ids)
    customer_in = Input(shape=(1,), name='customer_id_input')
    country_in = Input(shape=(1,), name='country_input')
    dense_in = Input(shape=(n_features - 2,), name='other_features_input')

    # Embedding tables get one extra row on top of the ids counted in the training data,
    # reserved for categories that only appear outside the training set
    customer_vec = embed_id(customer_in, n_unique_customers + 1, embedding_dim)
    country_vec = embed_id(country_in, n_unique_countries + 1, embedding_dim_country)

    hidden = Concatenate()([customer_vec, country_vec, dense_in])

    # Tunable depth: every hidden layer gets its own width and its own dropout rate
    n_hidden_layers = hp.Int('num_layers', min_value=1, max_value=5)
    for layer_idx in range(n_hidden_layers):
        layer_width = hp.Choice(f'units_{layer_idx}', values=[32, 64, 128, 256], ordered=True)
        hidden = Dense(units=layer_width, activation='relu')(hidden)

        drop_rate = hp.Float(f"dropout_{layer_idx}", min_value=0, max_value=0.5)
        hidden = Dropout(rate=drop_rate)(hidden)

    prediction = Dense(1, activation='linear')(hidden)

    network = Model(inputs=[customer_in, country_in, dense_in], outputs=prediction)

    # With log sampling, `step` is the multiplier between neighbouring candidates, so
    # step=10 yields exactly 1e-5, 1e-4, 1e-3, 1e-2 and 1e-1
    step_size = hp.Float("lr", min_value=1e-5, max_value=1e-1, step=10, sampling="log")

    network.compile(
        optimizer=Adam(learning_rate=step_size),
        loss="mean_squared_error",
        metrics=["mean_absolute_error"],
    )

    return network

# Select tuner class to run search with RandomSearch

# Fixed seed so the same hyperparameter configurations are drawn on every run of the
# notebook. This only pins the search itself; weight initialisation and batch shuffling
# of the individual trainings are not seeded here.
search_seed = 42

tuner = RandomSearch(
    hypermodel = build_model,
    objective = "val_mean_absolute_error",
    max_trials = 100, # Total number of trials to run during hyperparameter search; 1 trial has 1 type of hyperparameter configuration; 100 configurations will be evaluated
    executions_per_trial = 3, # Number of models to train per trial, to account for randomness in model initialization
    seed = search_seed, # Makes the random draw of configurations reproducible
    overwrite = True, # Overwrite the results of the previous tuning run
    directory = "tuning_dir", # Directory to store tuning results
    project_name = "DSO_predictor")


# Print the summary of the search space
tuner.search_space_summary()

In [None]:
# Early stopping: end a training run once the validation mean absolute error has gone
# 3 epochs without improving, restoring the weights of the best epoch
early_stopping = EarlyStopping(
    monitor = "val_mean_absolute_error",
    patience = 3,
    restore_best_weights = True,
)

# Inputs are passed in the same order as the model's input list:
# customer id, country, remaining features
train_inputs = [X_train_customer_id, X_train_country, X_train_other_features]
val_inputs = [X_val_customer_id, X_val_country, X_val_other_features]

tuner.search(
    train_inputs,
    y_train,
    validation_data = (val_inputs, y_val),
    epochs = 10,
    batch_size = 256,
    callbacks = [early_stopping],
)

In [None]:
# Retrieve the two best models found by the search (ranked by the tuner objective, the
# validation mean absolute error). Only the first, i.e. the best one, is used from here
# on; `models` keeps both so the runner-up can still be inspected.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
best_model.summary()

In [6]:
# Save the best model. The target folder is created first so that saving also works
# when the folder does not exist yet (e.g. on a fresh checkout).
model_path = "learned_models/tuned/final_model.keras"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
best_model.save(model_path)

In [None]:
# Print the tuner's summary of the finished search (best trials with their hyperparameters and scores)
tuner.results_summary()

Test the model's performance after training it further on the merged training and validation data

In [None]:
# Reload the saved best model. It comes back with the weights it had when it was saved,
# so the fit below continues training from those weights rather than from a fresh
# initialisation.
best_model = load_model("learned_models/tuned/final_model.keras")

# Merge the training and validation splits into one training set
Train_customer_all = np.concatenate((X_train_customer_id, X_val_customer_id))
Train_country_all = np.concatenate((X_train_country, X_val_country))
Train_other_features_all = np.concatenate((X_train_other_features, X_val_other_features))
y_train_all = np.concatenate((y_train, y_val))

best_model.fit([Train_customer_all, Train_country_all, Train_other_features_all], y_train_all, epochs = 15, batch_size = 256)

# Test inputs in the model's input order: customer id, country, remaining features
test_inputs = [X_test_customer_id, X_test_country, X_test_other_features]

# evaluate() returns the compiled loss (mean squared error) followed by the compiled
# metric (mean absolute error)
test_loss, test_mae = best_model.evaluate(test_inputs, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test MAE: {test_mae}")

# Make predictions on the test set
predicted_values = best_model.predict(test_inputs)

# Convert the predicted values to a DataFrame
predicted_df = pd.DataFrame(predicted_values, columns=['Predicted'])

# Put the actual values next to the predictions for comparison
actual_df = pd.DataFrame(y_test.values, columns=['Actual'])
df_comparison = pd.concat([actual_df, predicted_df], axis=1)

print('------------------------------------------------')
# Recompute the MAE from the comparison table (mean_absolute_error is imported in the
# notebook's import cell)
MAE = mean_absolute_error(df_comparison['Actual'], df_comparison['Predicted'])
print(f"Mean Absolute Error: {round(MAE, ndigits= 2)}")
print('------------------------------------------------')