In [0]:
# Enable autoreload so that edits to the imported workspace modules are picked up
# without having to re-import them manually.
# See https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
%load_ext autoreload
# mode 2: reload all modules (except those excluded with %aimport) before running code
%autoreload 2

In [0]:
import data_processing as dpr
import training as tr 
import tuning as tu

In [0]:
# parameters for data processing
path = '' # path to the csv with data (left empty here; presumably has to be filled in before running - TODO confirm)
train_fraction = 0.95 # full dataset contains ~55 million records, so 5% for evaluation is sufficient
chronological_split = False # random split by default

# parameters for hyperparameter tuning
max_evals = 3 # number of trials to perform
parallelism = 1 # number of trials to run in parallel (set to a value greater than 1 to distribute tuning runs)

# parameters for training
# number_of_epochs is taken from the training module; if this value is overwritten here,
# the number of epochs will differ between hyperparameter tuning and final training
number_of_epochs = tr.number_of_epochs
# share of available memory to be used for storing partitions of the train dataset during
# training, value determined empirically
# NOTE(review): 0.1 looks like a fraction (i.e. 10%), not a percent value - confirm against tr.train_model
utilisation = 0.1
model_name = 'NYC_taxi_fare_prediction_model' # name passed to tr.save_model for the trained model

In [0]:
# process data
# None of these calls returns a value that is used in this notebook, so their results
# presumably reach the tuning and training cells through storage managed by the
# data_processing module - TODO confirm.
dpr.clean_and_split(path, train_fraction, chronological_split)
dpr.compute_mean_stddev()
dpr.prepare_sample_for_tuning()

In [0]:
# tune hyperparameters
# Uses max_evals and parallelism from the parameters cell; the result is kept in
# best_params and passed to tr.train_model in the training cell.
best_params = tu.tune_hyperparameters(max_evals, parallelism)

In [0]:
# train and save model
# train_model is called with the tuned hyperparameters and returns two values: the object
# that is later handed to save_model, and the value reported below as the validation RMSE.
learn, rmse_val = tr.train_model(best_params, utilisation, number_of_epochs)
print(f'RMSE on validation set: {rmse_val}')
# persist the returned object under model_name
tr.save_model(learn, model_name)