In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Read the rainfall observations from CSV and preview the first five rows.
rainfall_df = pd.read_csv(filepath_or_buffer='data/rainfall.csv')
rainfall_df.head(5)

Unnamed: 0,Bureau of Meteorology station number,Location,Latitude,Longitude,Product code,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,1,0.0,0.0,Y,2012-01-01,0
1,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,2,0.0,0.0,Y,2012-01-02,0
2,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,3,0.0,0.0,Y,2012-01-03,0
3,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,4,4.2,1.0,Y,2012-01-04,1
4,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,5,1.0,1.0,Y,2012-01-05,0


In [2]:
# Remove the columns that are not used as model inputs: the split date parts
# ('Year', 'Month', 'Day'), the station number and the product code.
rainfall_df.drop(
    columns=[
        'Year',
        'Month',
        'Day',
        'Bureau of Meteorology station number',
        'Product code',
    ],
    inplace=True,
)
rainfall_df.head(5)

Unnamed: 0,Location,Latitude,Longitude,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-01,0
1,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-02,0
2,Monbulk,-37.88,145.42,0.0,0.0,Y,2012-01-03,0
3,Monbulk,-37.88,145.42,4.2,1.0,Y,2012-01-04,1
4,Monbulk,-37.88,145.42,1.0,1.0,Y,2012-01-05,0


In [3]:
# The target is 'Rainfall category', so the raw rainfall amount and the
# measurement period are removed rather than kept as features.
rainfall_df.drop(
    columns=[
        'Rainfall amount (millimetres)',
        'Period over which rainfall was measured (days)',
    ],
    inplace=True,
)
rainfall_df.head(5)

Unnamed: 0,Location,Latitude,Longitude,Quality,Date,Rainfall category
0,Monbulk,-37.88,145.42,Y,2012-01-01,0
1,Monbulk,-37.88,145.42,Y,2012-01-02,0
2,Monbulk,-37.88,145.42,Y,2012-01-03,0
3,Monbulk,-37.88,145.42,Y,2012-01-04,1
4,Monbulk,-37.88,145.42,Y,2012-01-05,0


In [4]:
# Count the distinct values held by each column (missing values are not counted).
rainfall_df.nunique(axis=0, dropna=True)

Location                4
Latitude                4
Longitude               4
Quality                 2
Date                 4272
Rainfall category       2
dtype: int64

In [5]:
# One-hot encode the categorical columns with `pd.get_dummies`.
# No `columns` argument is passed, so every string column is expanded:
# 'Location', 'Quality' and also 'Date' (one indicator column per distinct date).
rainfall_df = pd.get_dummies(data=rainfall_df, dtype=float)
rainfall_df.head(5)

Unnamed: 0,Latitude,Longitude,Rainfall category,Location_Dandenong,Location_Monbulk,Location_Phillip Island,Location_Warburton,Quality_N,Quality_Y,Date_2012-01-01,...,Date_2023-09-02,Date_2023-09-03,Date_2023-09-04,Date_2023-09-05,Date_2023-09-06,Date_2023-09-07,Date_2023-09-08,Date_2023-09-09,Date_2023-09-10,Date_2023-09-11
0,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-37.88,145.42,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-37.88,145.42,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Separate the encoded frame into the feature matrix (X) and the target vector (y).
X = rainfall_df.drop(columns=['Rainfall category'])
y = rainfall_df['Rainfall category']

# Hold out a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [7]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Learn the scaling parameters from the training features only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [8]:
# Define the deep neural network: the input width comes from the scaled
# training matrix, followed by two ReLU hidden layers and a sigmoid output unit.
number_input_features = X_train_scaled.shape[1]
hidden_nodes_layer1 = 4282 #number of columns + one for a bias term
hidden_nodes_layer2 = 2141

nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input dimension)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        activation="relu",
        input_dim=number_input_features,
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Output layer: a single sigmoid unit for the binary rainfall category
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Print the layer-by-layer structure and parameter counts
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4282)              18331242  
                                                                 
 dense_1 (Dense)             (None, 2141)              9169903   
                                                                 
 dense_2 (Dense)             (None, 1)                 2142      
                                                                 
Total params: 27,503,287
Trainable params: 27,503,287
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [10]:
# Fit the network on the scaled training data for 50 epochs and keep the
# returned training history.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [11]:
# Score the trained network on the held-out test data and report loss/accuracy.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

122/122 - 2s - loss: 3.8420 - accuracy: 0.8266 - 2s/epoch - 16ms/step
Loss: 3.8420112133026123, Accuracy: 0.826625406742096


In [22]:
# Read the rainfall CSV again into a separate frame for the second model
# and preview the first five rows.
rainfall2_df = pd.read_csv(filepath_or_buffer='data/rainfall.csv')
rainfall2_df.head(5)

Unnamed: 0,Bureau of Meteorology station number,Location,Latitude,Longitude,Product code,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,Date,Rainfall category
0,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,1,0.0,0.0,Y,2012-01-01,0
1,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,2,0.0,0.0,Y,2012-01-02,0
2,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,3,0.0,0.0,Y,2012-01-03,0
3,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,4,4.2,1.0,Y,2012-01-04,1
4,86072,Monbulk,-37.88,145.42,IDCJAC0009,2012,1,5,1.0,1.0,Y,2012-01-05,0


In [23]:
# Remove the unused columns in one pass: the split date parts, station number
# and product code, plus the rainfall amount and measurement period (the
# target is 'Rainfall category', so the raw amount is not kept as a feature).
rainfall2_df.drop(
    columns=[
        'Year',
        'Month',
        'Day',
        'Bureau of Meteorology station number',
        'Product code',
        'Rainfall amount (millimetres)',
        'Period over which rainfall was measured (days)',
    ],
    inplace=True,
)

# Replace the date strings with a single integer per day (the ordinal of the
# daily period), so 'Date' stays one numeric column.
rainfall2_df['Date'] = (
    pd.to_datetime(rainfall2_df['Date'])
    .dt.to_period('D')
    .apply(lambda period: period.ordinal)
)
rainfall2_df.head(5)

Unnamed: 0,Location,Latitude,Longitude,Quality,Date,Rainfall category
0,Monbulk,-37.88,145.42,Y,15340,0
1,Monbulk,-37.88,145.42,Y,15341,0
2,Monbulk,-37.88,145.42,Y,15342,0
3,Monbulk,-37.88,145.42,Y,15343,1
4,Monbulk,-37.88,145.42,Y,15344,0


In [24]:
# One-hot encode only 'Location' and 'Quality' with `pd.get_dummies`;
# the remaining columns are passed through unchanged.
rainfall2_df = pd.get_dummies(
    data=rainfall2_df,
    columns=['Location', 'Quality'],
    prefix=['Location', 'Quality'],
    dtype=float,
)
rainfall2_df.head(5)

Unnamed: 0,Latitude,Longitude,Date,Rainfall category,Location_Dandenong,Location_Monbulk,Location_Phillip Island,Location_Warburton,Quality_N,Quality_Y
0,-37.88,145.42,15340,0,0.0,1.0,0.0,0.0,0.0,1.0
1,-37.88,145.42,15341,0,0.0,1.0,0.0,0.0,0.0,1.0
2,-37.88,145.42,15342,0,0.0,1.0,0.0,0.0,0.0,1.0
3,-37.88,145.42,15343,1,0.0,1.0,0.0,0.0,0.0,1.0
4,-37.88,145.42,15344,0,0.0,1.0,0.0,0.0,0.0,1.0


In [25]:
# Separate the encoded frame into the feature matrix (X) and the target vector (y).
X = rainfall2_df.drop(columns=['Rainfall category'])
y = rainfall2_df['Rainfall category']

# Hold out a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [26]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Learn the scaling parameters from the training features only
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [27]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
# The input width is read from X_train_scaled, the scaled training matrix
# built from rainfall2_df, so this cell runs on a fresh kernel (Restart & Run All).
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 11 #number of columns + one for a bias term
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input dimension)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single sigmoid unit for the binary rainfall category
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 11)                110       
                                                                 
 dense_7 (Dense)             (None, 5)                 60        
                                                                 
 dense_8 (Dense)             (None, 1)                 6         
                                                                 
Total params: 176
Trainable params: 176
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Configure training: Adam optimiser, binary cross-entropy loss, accuracy metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [29]:
# Train the model
# Fit on X_train_scaled / y_train, the training split derived from
# rainfall2_df. These are the counterparts of the X_test_scaled / y_test
# arrays used for evaluation, so training and evaluation share one split.
fit_model = nn.fit(X_train_scaled,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [30]:
# Score the trained network on the held-out test data and report loss/accuracy.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

122/122 - 0s - loss: 0.5658 - accuracy: 0.7420 - 205ms/epoch - 2ms/step
Loss: 0.5657573938369751, Accuracy: 0.7420020699501038
