# Load and Examine the Dataset

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load and Preview Data

In [4]:
# Load the CSV file
dataset_path = 'C:/Users/ibrah/Downloads/Met dataset - 2015-to-2022_12months.csv'
data = pd.read_csv(dataset_path)
print("Dataset Preview:")
# Display the first few rows of the dataset
data.head()

Dataset Preview:


Unnamed: 0,x_coord,y_coord,year,groundfrost_1,groundfrost_2,groundfrost_3,groundfrost_4,groundfrost_5,groundfrost_6,groundfrost_7,...,rainfall_3,rainfall_4,rainfall_5,rainfall_6,rainfall_7,rainfall_8,rainfall_9,rainfall_10,rainfall_11,rainfall_12
0,1,1,1,,,,,,,,...,,,,,,,,,,
1,1,1,1,,,,,,,,...,,,,,,,,,,
2,1,1,1,,,,,,,,...,,,,,,,,,,
3,1,1,1,,,,,,,,...,,,,,,,,,,
4,1,1,1,,,,,,,,...,,,,,,,,,,


# Handling Missing Values
We'll fill missing values with the mean of each column.

In [6]:
# Fill missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
data_imputed

Unnamed: 0,x_coord,y_coord,year,groundfrost_1,groundfrost_2,groundfrost_3,groundfrost_4,groundfrost_5,groundfrost_6,groundfrost_7,...,rainfall_3,rainfall_4,rainfall_5,rainfall_6,rainfall_7,rainfall_8,rainfall_9,rainfall_10,rainfall_11,rainfall_12
0,1.0,1.0,1.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
1,1.0,1.0,1.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
2,1.0,1.0,1.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
3,1.0,1.0,1.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
4,1.0,1.0,1.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33230,23.0,23.0,8.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
33231,23.0,23.0,8.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
33232,23.0,23.0,8.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469
33233,23.0,23.0,8.0,7.967022,7.896864,7.882327,7.836455,7.69597,7.654871,7.630654,...,95.995318,95.486136,95.844339,95.301693,95.216766,94.560505,94.549497,95.647443,96.745711,96.063469


# Define feature columns for the model

In [8]:
# Define feature and target columns
features_columns = [f'{feature}_{month}' for feature in ['groundfrost', 'hurs', 'psl', 'sun', 'pv', 'sfcWind', 'tas', 'snowLying', 'rainfall'] for month in range(1, 13)]
# The target variable is the relative humidity for the next month
# Here we assume it's hurs_1 (next month after the 12th month)
target_column = 'hurs_1'

# Select Features and Target

In [10]:
# Selecting features and target
features = data_imputed[features_columns]
target = data_imputed[target_column]
# Select the most recent 12 months of data
recent_data = data_imputed[features_columns].iloc[-1].values.reshape(1, -1)

In [11]:
# Split data into train and test sets

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [12]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
import os

# Ensure the directory exists
save_dir = 'C:/Users/ibrah/Downloads/data/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [14]:
# Save processed data to avoid repeated preprocessing
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=features_columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=features_columns)

X_train_scaled_df.to_csv(os.path.join(save_dir, 'X_train_scaled.csv'), index=False)
X_test_scaled_df.to_csv(os.path.join(save_dir, 'X_test_scaled.csv'), index=False)
y_train.to_csv(os.path.join(save_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(save_dir, 'y_test.csv'), index=False)

# Build and Train the MLP Model
Build and train the MLP model using TensorFlow/Keras.

In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [17]:
# Define the model
model = Sequential([
    tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [18]:
# Display model summary
model.summary()

In [19]:
# Train the model (early stopping to prevent overfitting)
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
history = model.fit(X_train_scaled, y_train, epochs=50, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping])

Epoch 1/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - loss: 3755.1121 - mae: 52.4635 - val_loss: 25.5089 - val_mae: 2.4007
Epoch 2/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 64.1646 - mae: 5.7353 - val_loss: 9.0271 - val_mae: 1.2536
Epoch 3/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 46.3869 - mae: 5.1930 - val_loss: 8.6881 - val_mae: 1.4304
Epoch 4/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 41.1154 - mae: 4.9609 - val_loss: 5.0326 - val_mae: 1.2500
Epoch 5/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 38.0712 - mae: 4.8174 - val_loss: 7.4545 - val_mae: 1.0856
Epoch 6/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 32.8528 - mae: 4.4983 - val_loss: 6.2498 - val_mae: 1.0050
Epoch 7/50
[1m831/831[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 

In [20]:
# Display training history
history_df = pd.DataFrame(history.history)
display(history_df.head())

Unnamed: 0,loss,mae,val_loss,val_mae
0,1698.807373,28.107025,25.508904,2.400707
1,57.123936,5.540397,9.027093,1.253639
2,44.899055,5.123977,8.688098,1.430443
3,40.886936,4.938976,5.032596,1.250011
4,37.046677,4.742568,7.454462,1.085601


In [21]:
# Evaluate the model
test_loss = model.evaluate(X_test_scaled, y_test)
# Print the results
print(f"Evaluation Loss: {test_loss[0]:.4f}")

[1m208/208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.9508 - mae: 1.2538
Evaluation Loss: 5.0326


In [22]:
# Save the model
model.save('relative_humidity_forecast_model.keras')

# Load the model
loaded_model = tf.keras.models.load_model('relative_humidity_forecast_model.keras')

In [41]:
# Load the scaler used during training
scaler_fit = StandardScaler()
X_train_scaled = pd.read_csv('C:/Users/ibrah/Downloads/data/X_train_scaled.csv')
scaler_fit.fit(X_train_scaled)  # Re-fit the scaler with training data

In [43]:
# Transform the recent data
recent_data_scaled = scaler.transform(recent_data)
display(recent_data_scaled)



array([[7.96702165e+00, 7.89686383e+00, 7.88232651e+00, 7.83645465e+00,
        7.69596962e+00, 7.65487103e+00, 7.63065383e+00, 7.63068824e+00,
        7.64555404e+00, 7.65010731e+00, 7.65511434e+00, 7.81018989e+00,
        8.26272479e+01, 8.25987880e+01, 8.25202254e+01, 8.24718923e+01,
        8.24944778e+01, 8.24750084e+01, 8.24446102e+01, 8.23961418e+01,
        8.24027257e+01, 8.24337494e+01, 8.24600718e+01, 8.24818662e+01,
        1.01377579e+03, 1.01378433e+03, 1.01386686e+03, 1.01384323e+03,
        1.01379201e+03, 1.01381578e+03, 1.01383211e+03, 1.01389076e+03,
        1.01389862e+03, 1.01382270e+03, 1.01365848e+03, 1.01364422e+03,
        1.20370605e+02, 1.20551623e+02, 1.21618347e+02, 1.22179224e+02,
        1.21509594e+02, 1.21777405e+02, 1.21653630e+02, 1.22232766e+02,
        1.21903138e+02, 1.21336267e+02, 1.20925937e+02, 1.21099494e+02,
        1.00820338e+01, 1.00902638e+01, 1.00996445e+01, 1.01108122e+01,
        1.01417015e+01, 1.01724939e+01, 1.02002191e+01, 1.021340

In [45]:
# Predict the relative humidity for the next month
predicted_humidity = loaded_model.predict(recent_data_scaled)
print(f"Predicted Relative Humidity for next month: {predicted_humidity[0][0]:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step
Predicted Relative Humidity for next month: 8369.19%
