In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the rainfall.csv
rainfall_df = pd.read_csv('data/rainfall.csv')
rainfall_df.head()

Unnamed: 0,Bureau of Meteorology station number,Location,Latitude,Longitude,Product code,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,1,0.0,0.0,Y,2012-01-01,0
1,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,2,0.0,0.0,Y,2012-01-02,0
2,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,3,0.0,0.0,Y,2012-01-03,0
3,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,4,4.2,1.0,Y,2012-01-04,1
4,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,5,1.0,1.0,Y,2012-01-05,0


In [2]:
# Drop the non-beneficial columns, 'Year', 'Month', 'Day', 'Bureau of Meteorology station number' and 'Product code'.
rainfall_df.drop(['Year','Month','Day','Bureau of Meteorology station number','Product code'], axis = 1, inplace = True)
rainfall_df.head()

Unnamed: 0,Location,Latitude,Longitude,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-01,0
1,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-02,0
2,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-03,0
3,Monbulk,-37.88,145.42,4.2,1.0,Y,2012-01-04,1
4,Monbulk,-37.88,145.42,1.0,1.0,Y,2012-01-05,0


In [3]:
# Drop the 'Rainfall amount (millimetres)' and 'Period over which rainfall was measured (days)' as the amount of rain does not matter, due to utilising Rainfall Category.
rainfall_df.drop(['Rainfall amount (millimetres)','Period over which rainfall was measured (days)'], axis = 1, inplace = True)
rainfall_df.head()

Unnamed: 0,Location,Latitude,Longitude,Quality,Date,Rainfall category
0,Monbulk,-37.88,145.42,Y,2012-01-01,0
1,Monbulk,-37.88,145.42,Y,2012-01-02,0
2,Monbulk,-37.88,145.42,Y,2012-01-03,0
3,Monbulk,-37.88,145.42,Y,2012-01-04,1
4,Monbulk,-37.88,145.42,Y,2012-01-05,0


In [4]:
# Determine the number of unique values in each column.
rainfall_df.nunique()

Location                4
Latitude                4
Longitude               4
Quality                 2
Date                 4272
Rainfall category       2
dtype: int64

In [5]:
# Convert categorical data to numeric with `pd.get_dummies`
rainfall_df = pd.get_dummies(rainfall_df,dtype=float)
rainfall_df.head()

Unnamed: 0,Latitude,Longitude,Rainfall category,Location_Dandenong,Location_Monbulk,Location_Phillip Island,Location_Warburton,Quality_N,Quality_Y,Date_2012-01-01,...,Date_2023-09-02,Date_2023-09-03,Date_2023-09-04,Date_2023-09-05,Date_2023-09-06,Date_2023-09-07,Date_2023-09-08,Date_2023-09-09,Date_2023-09-10,Date_2023-09-11
0,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-37.88,145.42,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Split our preprocessed data into our features and target arrays
y = rainfall_df['Rainfall category'].values
X = rainfall_df.drop(columns='Rainfall category').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [15]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=50,
        step=10), activation=activation, input_dim=len(X_train[0])))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=100,
            step=10),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [16]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=50,
    hyperband_iterations=2)

In [17]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 01m 33s]
val_accuracy: 0.8671310544013977

Best val_accuracy So Far: 0.8717750310897827
Total elapsed time: 00h 52m 19s
INFO:tensorflow:Oracle triggered exit


In [18]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'tanh', 'first_units': 41, 'num_layers': 1, 'units_0': 21, 'units_1': 51, 'units_2': 81, 'units_3': 81, 'units_4': 81, 'tuner/epochs': 17, 'tuner/initial_epoch': 6, 'tuner/bracket': 3, 'tuner/round': 2, 'tuner/trial_id': '0036', 'units_5': 11}
{'activation': 'tanh', 'first_units': 21, 'num_layers': 1, 'units_0': 41, 'units_1': 41, 'units_2': 91, 'units_3': 1, 'units_4': 61, 'units_5': 21, 'tuner/epochs': 6, 'tuner/initial_epoch': 2, 'tuner/bracket': 3, 'tuner/round': 1, 'tuner/trial_id': '0009'}
{'activation': 'tanh', 'first_units': 31, 'num_layers': 1, 'units_0': 51, 'units_1': 11, 'units_2': 11, 'units_3': 51, 'units_4': 91, 'units_5': 81, 'tuner/epochs': 17, 'tuner/initial_epoch': 6, 'tuner/bracket': 3, 'tuner/round': 2, 'tuner/trial_id': '0034'}


In [19]:
# Get second best model hyperparameters
second_hyper = tuner.get_best_hyperparameters(2)[1]
second_hyper.values

{'activation': 'tanh',
 'first_units': 21,
 'num_layers': 1,
 'units_0': 41,
 'units_1': 41,
 'units_2': 91,
 'units_3': 1,
 'units_4': 61,
 'units_5': 21,
 'tuner/epochs': 6,
 'tuner/initial_epoch': 2,
 'tuner/bracket': 3,
 'tuner/round': 1,
 'tuner/trial_id': '0009'}

In [20]:
# Compare the performance to the second-best model
second_model = tuner.get_best_models(2)[1]
model_loss, model_accuracy = second_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

122/122 - 0s - loss: 0.4451 - accuracy: 0.8715 - 370ms/epoch - 3ms/step
Loss: 0.4451388716697693, Accuracy: 0.87151700258255
