In [17]:
import pandas as pd

# Load the combined climate and device data
climate_data = pd.read_csv('training/clean/combined_climate_data.csv')
device_data = pd.read_csv('training/clean/combined_device_data.csv')

# Convert the 'Time' and 'time' columns to datetime format
climate_data['Time'] = pd.to_datetime(climate_data['Time'])
device_data['time'] = pd.to_datetime(device_data['time'])

# Merge the datasets on the timestamp
merged_df = pd.merge(device_data, climate_data, left_on='time', right_on='Time', how='inner')

# Drop the redundant 'Time' column
merged_df.drop(columns=['Time'], inplace=True)

# Add time-based features
merged_df['hour'] = merged_df['time'].dt.hour
merged_df['day'] = merged_df['time'].dt.day
merged_df['month'] = merged_df['time'].dt.month

# Check for NaN values
print("NaN values in merged_df:\n", merged_df.isna().sum())

# Drop rows with NaN values or handle them appropriately
merged_df = merged_df.dropna()

# Print the merged dataframe
print(merged_df.head())
print("Column names:", merged_df.columns)

# Select features and target variable
features = ['Temperature', 'Humidity', 'hour', 'day', 'month', 'temperature']  # Include device temperature in features
target = 'temperature'


NaN values in merged_df:
 time           0
temperature    1
Temperature    0
Humidity       0
hour           0
day            0
month          0
dtype: int64
                       time  temperature  Temperature  Humidity  hour  day  \
0 2023-03-15 15:00:00+08:00    20.238870           31      0.52    15   15   
1 2023-03-15 16:00:00+08:00    20.248299           31      0.52    16   15   
2 2023-03-15 17:00:00+08:00    20.249218           30      0.55    17   15   
3 2023-03-15 18:00:00+08:00    20.248273           30      0.59    18   15   
4 2023-03-15 19:00:00+08:00    20.172554           29      0.58    19   15   

   month  
0      3  
1      3  
2      3  
3      3  
4      3  
Column names: Index(['time', 'temperature', 'Temperature', 'Humidity', 'hour', 'day',
       'month'],
      dtype='object')


In [18]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Scale the features and target separately
feature_scaler = MinMaxScaler(feature_range=(0, 1))
target_scaler = MinMaxScaler(feature_range=(0, 1))

scaled_features = feature_scaler.fit_transform(merged_df[features])
scaled_target = target_scaler.fit_transform(merged_df[[target]])

# Check for NaN values in scaled features and target
print("NaN values in scaled_features:", np.isnan(scaled_features).sum())
print("NaN values in scaled_target:", np.isnan(scaled_target).sum())

# Prepare sequences for LSTM
def create_sequences(features, target, seq_length):
    xs = []
    ys = []
    for i in range(len(features) - seq_length):
        x = features[i:i+seq_length]
        y = target[i+seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 10
X, y = create_sequences(scaled_features, scaled_target, seq_length)

# Check for NaN values in sequences
print("NaN values in X:", np.isnan(X).sum())
print("NaN values in y:", np.isnan(y).sum())

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape input to be [samples, time steps, features]
X_train = X_train.reshape((X_train.shape[0], seq_length, len(features)))
X_test = X_test.reshape((X_test.shape[0], seq_length, len(features)))

# Check for NaN values in training and testing sets
print("NaN values in X_train:", np.isnan(X_train).sum())
print("NaN values in X_test:", np.isnan(X_test).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())
print("NaN values in y_test:", np.isnan(y_test).sum())


NaN values in scaled_features: 0
NaN values in scaled_target: 0
NaN values in X: 0
NaN values in y: 0
NaN values in X_train: 0
NaN values in X_test: 0
NaN values in y_train: 0
NaN values in y_test: 0


In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define a more complex LSTM model
model = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(seq_length, len(features))),  # Increased number of units
    Dropout(0.3),  # Increased dropout rate
    LSTM(64, activation='relu'),  # Second LSTM layer with more units
    Dropout(0.3),  # Increased dropout rate
    Dense(32, activation='relu'),  # Added a Dense layer
    Dense(1)  # Output layer
])

# Compile the model with a different optimizer and learning rate
optimizer = Adam(learning_rate=0.001)  # Specifying learning rate
model.compile(optimizer=optimizer, loss='mse')

# Train the model with modified batch size and number of epochs
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [20]:
# Evaluate the model on the test data
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss with all features: {test_loss}')

# Predict on test data
y_pred = model.predict(X_test)

# Inverse scale the predictions and target values
y_pred = target_scaler.inverse_transform(y_pred)
y_test = target_scaler.inverse_transform(y_test)

# Check for NaN values in predictions
print(f"NaN in y_pred: {np.isnan(y_pred).sum()}")

# Remove NaN values
mask = ~np.isnan(y_pred).flatten()
y_test_filtered = y_test[mask]
y_pred_filtered = y_pred[mask]

# Debugging: Check shapes of y_test_filtered and y_pred_filtered
print("y_test_filtered shape:", y_test_filtered.shape)
print("y_pred_filtered shape:", y_pred_filtered.shape)

# Calculate mean absolute error (MAE)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test_filtered, y_pred_filtered)
print(f'Mean Absolute Error with all features: {mae}')


Test Loss with all features: 0.020013101398944855
NaN in y_pred: 0
y_test_filtered shape: (146, 1)
y_pred_filtered shape: (146, 1)
Mean Absolute Error with all features: 0.6287282043726254


In [21]:
# Select features without past device temperature
features_no_device_temp = ['Temperature', 'Humidity', 'hour', 'day', 'month']
scaled_features_no_device_temp = feature_scaler.fit_transform(merged_df[features_no_device_temp])

# Prepare sequences for LSTM without device temperature
X_no_device_temp, y_no_device_temp = create_sequences(scaled_features_no_device_temp, scaled_target, seq_length)

# Split the data
X_train_no_device_temp, X_test_no_device_temp, y_train_no_device_temp, y_test_no_device_temp = train_test_split(
    X_no_device_temp, y_no_device_temp, test_size=0.2, random_state=42)

# Reshape input
X_train_no_device_temp = X_train_no_device_temp.reshape((X_train_no_device_temp.shape[0], seq_length, len(features_no_device_temp)))
X_test_no_device_temp = X_test_no_device_temp.reshape((X_test_no_device_temp.shape[0], seq_length, len(features_no_device_temp)))

# Define and compile the model
model_no_device_temp = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(seq_length, len(features_no_device_temp))),
    Dropout(0.3),
    LSTM(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)
])

optimizer = Adam(learning_rate=0.001)
model_no_device_temp.compile(optimizer=optimizer, loss='mse')

# Train the model
history_no_device_temp = model_no_device_temp.fit(X_train_no_device_temp, y_train_no_device_temp, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss_no_device_temp = model_no_device_temp.evaluate(X_test_no_device_temp, y_test_no_device_temp)
print(f'Test Loss without past device temperature: {test_loss_no_device_temp}')

# Predict on test data
y_pred_no_device_temp = model_no_device_temp.predict(X_test_no_device_temp)

# Inverse scale the predictions and target values
y_pred_no_device_temp = target_scaler.inverse_transform(y_pred_no_device_temp)
y_test_no_device_temp = target_scaler.inverse_transform(y_test_no_device_temp)

# Calculate mean absolute error (MAE)
mae_no_device_temp = mean_absolute_error(y_test_no_device_temp, y_pred_no_device_temp)
print(f'Mean Absolute Error without past device temperature: {mae_no_device_temp}')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss without past device temperature: 0.07238898426294327
Mean Absolute Error without past device temperature: 1.9734119480575436


In [22]:
# Select features without climate temperature
features_no_climate_temp = ['hour', 'day', 'month', 'temperature']
scaled_features_no_climate_temp = feature_scaler.fit_transform(merged_df[features_no_climate_temp])

# Prepare sequences for LSTM without climate temperature
X_no_climate_temp, y_no_climate_temp = create_sequences(scaled_features_no_climate_temp, scaled_target, seq_length)

# Split the data
X_train_no_climate_temp, X_test_no_climate_temp, y_train_no_climate_temp, y_test_no_climate_temp = train_test_split(
    X_no_climate_temp, y_no_climate_temp, test_size=0.2, random_state=42)

# Reshape input
X_train_no_climate_temp = X_train_no_climate_temp.reshape((X_train_no_climate_temp.shape[0], seq_length, len(features_no_climate_temp)))
X_test_no_climate_temp = X_test_no_climate_temp.reshape((X_test_no_climate_temp.shape[0], seq_length, len(features_no_climate_temp)))

# Define and compile the model
model_no_climate_temp = Sequential([
    LSTM(128, activation='relu', return_sequences=True, input_shape=(seq_length, len(features_no_climate_temp))),
    Dropout(0.3),
    LSTM(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)
])

optimizer = Adam(learning_rate=0.001)
model_no_climate_temp.compile(optimizer=optimizer, loss='mse')

# Train the model
history_no_climate_temp = model_no_climate_temp.fit(X_train_no_climate_temp, y_train_no_climate_temp, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model
test_loss_no_climate_temp = model_no_climate_temp.evaluate(X_test_no_climate_temp, y_test_no_climate_temp)
print(f'Test Loss without climate temperature: {test_loss_no_climate_temp}')

# Predict on test data
y_pred_no_climate_temp = model_no_climate_temp.predict(X_test_no_climate_temp)

# Inverse scale the predictions and target values
y_pred_no_climate_temp = target_scaler.inverse_transform(y_pred_no_climate_temp)
y_test_no_climate_temp = target_scaler.inverse_transform(y_test_no_climate_temp)

# Calculate mean absolute error (MAE)
mae_no_climate_temp = mean_absolute_error(y_test_no_climate_temp, y_pred_no_climate_temp)
print(f'Mean Absolute Error without climate temperature: {mae_no_climate_temp}')


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test Loss without climate temperature: 0.025045743212103844
Mean Absolute Error without climate temperature: 1.0664309207319869
