In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, ShuffleSplit
from tensorflow import keras
from keras import layers
from tensorflow_addons.metrics import RSquare
import keras_tuner

from umlaut_lte import write_result, get_data_csv, plot_loss

# File locations. None of these names is referenced by the cells of this notebook itself.
# NOTE(review): presumably the umlaut_lte helpers (write_result, get_data_csv, plot_loss)
# use equivalent paths internally - confirm, otherwise these assignments have no effect.
result_file = './results.txt'
input_data_file = 'data/lte.csv'
dnn_loss_fig_path = './dnn-loss-fig.png'

# Values dependent on the data availability and time frame.
# NOTE(review): threshold_min_days_per_user and test_split_size are not passed to
# get_data_csv() below, which is called without arguments - confirm whether they are meant
# to take effect.
threshold_min_days_per_user = 20
test_split_size = 0.3  # fraction of the data reserved for testing
validation_split_size = 0.3  # fraction of the training data held out for validation (ShuffleSplit / Keras validation_split)
dnn_batch_size = 32  # batch size used when fitting the final DNN

2023-01-20 15:19:47.478161: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-20 15:19:47.478201: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-20 15:19:47.509877: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-20 15:19:49.554182: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-20 15:19:49.554299: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [2]:
def run_random_forest(X_train, X_test, y_train, y_test):
    """Tunes, cross-validates and tests a random forest regressor.

    A randomized hyperparameter search is run on the training data. The best parameter
    set found is then cross-validated on the training data, and finally a model fitted on
    the complete training set is scored (R^2) on the test set. All results are reported
    through ``write_result``.

    :param X_train: The data to train
    :type X_train: numpy.ndarray or pandas.DataFrame
    :param X_test: The data to test the model
    :type X_test: numpy.ndarray or pandas.DataFrame
    :param y_train: The labels to train the model
    :type y_train: numpy.ndarray, pandas.Series or pandas.DataFrame
    :param y_test: The labels for the given test data
    :type y_test: numpy.ndarray, pandas.Series or pandas.DataFrame
    """
    # scikit-learn expects 1-d targets. np.asarray accepts pandas objects as well as plain
    # arrays, so the function works with both (the former `.values` only exists on pandas).
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()

    # Number of trees in the forest
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    # Maximum number of levels in tree
    max_depth = [4, 5, 6, 8, 10, 12, 15, 20, 25, 30, 35, 40, 50, 60, 80, 100, 120]
    # Fraction of the features considered when looking for the best split
    max_features = [0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_depth': max_depth,
                   'max_features': max_features,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

    # Seed the forest itself as well; otherwise the seeded search and the seeded
    # ShuffleSplit below still produce different scores on every run.
    rf = RandomForestRegressor(random_state=0)
    # Random search of parameters, using 3 fold cross validation,
    # search across 1200 sampled combinations, and use all available cores
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=1200,
        cv=3,
        verbose=1,
        random_state=0,
        n_jobs=-1  # use all available CPU cores
    )
    rf_random.fit(X_train, y_train_flat)

    write_result(f'BEST PARAMS: {rf_random.best_params_}')

    # create a fresh regressor with the found parameters
    reg_rf = RandomForestRegressor(random_state=0, **rf_random.best_params_)

    # VALIDATE

    cv = ShuffleSplit(n_splits=5, test_size=validation_split_size, random_state=0)
    scores = cross_val_score(reg_rf, X_train, y_train_flat, cv=cv)
    write_result("RANDOM FOREST cross validation: %0.5f mean R^2 with a standard deviation of %0.5f" % (scores.mean(), scores.std()))

    # TEST: fit on the complete training set, score on the held-out test set
    reg_rf.fit(X_train, y_train_flat)
    test_score = reg_rf.score(X_test, y_test_flat)

    write_result(f'RANDOM FOREST test score: {test_score}')

In [3]:
def build_dnn_model(hp):
    """Assembles and compiles the tunable feed-forward regression network.

    The network consists of a tunable ReLU layer, an optional tunable dropout layer, a
    second tunable ReLU layer and a single linear output unit.

    :param hp: The HyperParameters to use
    :type hp: keras_tuner.HyperParameters
    :return: Compiled keras model
    :rtype: keras.Model
    """
    # First hidden layer: width is a tunable hyperparameter.
    first_width = hp.Int("units_0", min_value=8, max_value=80, step=8)
    stack = [layers.Dense(units=first_width, activation="relu")]

    # Optional dropout; the rate is only requested when dropout is switched on.
    if hp.Boolean("dropout"):
        drop_rate = hp.Float("dropout_rate", min_value=0.1, max_value=0.3, step=0.05)
        stack.append(layers.Dropout(rate=drop_rate))

    # Second hidden layer: width is a tunable hyperparameter.
    second_width = hp.Int("units_1", min_value=2, max_value=16, step=4)
    stack.append(layers.Dense(units=second_width, activation="relu"))

    # Single output unit for the regression target.
    stack.append(layers.Dense(1))

    net = keras.Sequential(stack)
    net.compile(
        optimizer="sgd",
        loss="mae",
        metrics=["mae", 'mean_squared_error', RSquare()],
    )
    return net


In [4]:
def run_dnn(X_train, X_test, y_train, y_test):
    """Performs a random hyperparameter search for the DNN, then retrains and tests the best model.

    The best hyperparameter set found by the tuner is used to build a fresh model, which
    is trained on the training data, has its training history handed to ``plot_loss`` and
    is evaluated on the test data. Results are reported through ``write_result``.

    :param X_train: The data to train
    :type X_train: numpy.Array
    :param X_test: The data to test the model
    :type X_test: numpy.Array
    :param y_train: The labels to train the model
    :type y_train: numpy.Array
    :param y_test: The labels for the given test data
    :type y_test: numpy.Array
    """
    # Build once with default hyperparameters so that an error in the model definition
    # surfaces before the (long) search starts; the resulting model is discarded.
    build_dnn_model(keras_tuner.HyperParameters())

    tuner = keras_tuner.RandomSearch(
        hypermodel=build_dnn_model,
        objective="val_mae",
        max_trials=100,
        executions_per_trial=2,
        overwrite=True,
        directory=None,
        project_name="insurance-dnn-tuner",
    )

    # Extra keyword arguments are forwarded to model.fit for every trial. The batch size
    # is passed explicitly so that the search trains under the same batch size as the
    # final fit below, even if dnn_batch_size is changed.
    tuner.search(
        X_train,
        y_train,
        epochs=100,
        batch_size=dnn_batch_size,
        validation_split=validation_split_size,
    )

    # Only the single best hyperparameter set is needed.
    best_hp = tuner.get_best_hyperparameters(1)[0]
    best_model = build_dnn_model(best_hp)
    # Build explicitly so that summary() can report layer shapes before training.
    best_model.build(input_shape=(None, X_train.shape[1]))

    write_result('BEST DNN model architecture:')
    best_model.summary(print_fn=write_result)

    history = best_model.fit(
        X_train,
        y_train,
        batch_size=dnn_batch_size,
        shuffle=True,
        validation_split=validation_split_size,
        epochs=100)
    plot_loss(history)

    # evaluate() returns the loss followed by the compiled metrics.
    scores = best_model.evaluate(X_test, y_test)
    write_result('Testing trained DNN (mae_loss, mae, mse, r_square):')
    write_result(str(scores))

In [5]:
def run_lin_reg(X_train, X_test, y_train, y_test):
    """Cross-validates and tests an ordinary linear regression model on the given data.

    No hyperparameter search is performed. The model is cross-validated on the training
    data with 5 shuffled splits, then fitted on the complete training set and scored
    (R^2) on the test set. The results are reported through ``write_result``.

    :param X_train: The data to train
    :type X_train: numpy.ndarray or pandas.DataFrame
    :param X_test: The data to test the model
    :type X_test: numpy.ndarray or pandas.DataFrame
    :param y_train: The labels to train the model
    :type y_train: numpy.ndarray, pandas.Series or pandas.DataFrame
    :param y_test: The labels for the given test data
    :type y_test: numpy.ndarray, pandas.Series or pandas.DataFrame
    """
    # np.asarray accepts pandas objects as well as plain arrays, so both input kinds work
    # (the former `.values` only exists on pandas objects).
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()

    # VALIDATE
    cv = ShuffleSplit(n_splits=5, test_size=validation_split_size, random_state=0)
    reg_lr = LinearRegression()
    scores = cross_val_score(reg_lr, X_train, y_train_flat, cv=cv)
    write_result("LINEAR REGRESSION cross validation: %0.5f mean R^2 with a standard deviation of %0.5f" % (scores.mean(), scores.std()))

    # TEST: fit on the complete training set, score on the held-out test set
    reg_lr.fit(X_train, y_train_flat)
    test_score = reg_lr.score(X_test, y_test_flat)
    write_result(f'LINEAR REGRESSION test score: {test_score}')

In [6]:
# All the functions are run in the cells below because this notebook has also been used as
# a script on another machine.
# The results shown here are not comparable to the results found by Umlaut, because this
# notebook runs on a very small amount of dummy data. The settings found by Umlaut are used
# in lte_federated.ipynb.
# Note: get_data_csv() is called without arguments, so the split it returns is determined
# by that helper itself.
write_result('Loading data...')
X_train, X_test, y_train, y_test = get_data_csv()

Loading data...
running with threshold 0,data point threshold 10,and test split 0.2...
target variable: radius_activity_user
Ignoring velocity? True
before filtering data: 864 rows in data set.
after filtering data: 864 rows in data set.


In [7]:
# Random forest: randomized hyperparameter search, cross validation and test scoring.
write_result('\nStarting RANDOM FOREST hyperparameter search...')
run_random_forest(X_train, X_test, y_train, y_test)
write_result('DONE (RANDOM FOREST)\n')


Starting RANDOM FOREST hyperparameter search...
Fitting 3 folds for each of 1200 candidates, totalling 3600 fits
BEST PARAMS: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 0.9, 'max_depth': 80, 'bootstrap': True}
RANDOM FOREST cross validation: 0.99465 mean R^2 with a standard deviation of 0.00087
RANDOM FOREST test score: 0.9956233164232386
DONE (RANDOM FOREST)



In [8]:
# DNN: KerasTuner random search followed by retraining and testing the best model.
# NOTE(review): the output saved with this cell ends in a KeyboardInterrupt, i.e. the
# recorded run was stopped during the search - re-run to obtain DNN results.
write_result('Starting DNN hyperparameter search...')
run_dnn(X_train, X_test, y_train, y_test)
write_result('DONE (DNN)\n')

Trial 22 Complete [00h 00m 17s]
val_mae: 5677.7119140625

Best val_mae So Far: 5475.080810546875
Total elapsed time: 00h 06m 22s

Search: Running Trial #23

Value             |Best Value So Far |Hyperparameter
56                |56                |units_0
False             |False             |dropout
2                 |10                |units_1
0.2               |0.2               |dropout_rate

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
 1/16 [>................

KeyboardInterrupt: 

In [None]:
# Linear regression baseline: cross validation and test scoring (no hyperparameter search).
write_result('Starting LINEAR REGRESSION...')
run_lin_reg(X_train, X_test, y_train, y_test)
write_result('DONE (LINEAR REGRESSION)\n')