# **Time-to-Cool AI Model**
### **Import library**
Import required libraries and suppress warnings

In [494]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import joblib

from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.filterwarnings('ignore')

### **Data Loading**
File path and constants

In [495]:
PATH = 'dataset.csv'
TARGET_TEMP = 23.0
TARGET_BAND = 0.5   
RESTART_HYST = 0.3  
MAX_HORIZON_MIN = 240 

### **Data Overview**
Load the dataset, convert timestamps to proper datetime format and sort the rows by time

In [496]:
df = pd.read_csv(PATH, sep=',')
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df = df.sort_values('timestamp').reset_index(drop=True)

Read the first few rows of the dataset to quickly check the data

In [497]:
df.head()

Unnamed: 0,timestamp,hour_of_day,day_of_week,day_of_year,outside_temp,outside_humidity,weather_condition,occupancy_count,is_occupied,room_temp,power_kw,fan_speed,ac_temp_setting,ac_control_reason
0,2024-01-01 00:00:00,0,0,1,24.593342,60.0,cloudy,0.0,0,26.006587,0.0,off,,SYSTEM OFF: Room unoccupied
1,2024-01-01 00:05:00,0,0,1,24.661599,60.0,rainy,0.0,0,26.03766,0.0,off,,SYSTEM OFF: Room unoccupied
2,2024-01-01 00:10:00,0,0,1,24.612982,60.0,cloudy,0.0,0,25.949107,0.0,off,,SYSTEM OFF: Room unoccupied
3,2024-01-01 00:15:00,0,0,1,24.970486,65.615747,cloudy,0.0,0,25.933911,0.0,off,,SYSTEM OFF: Room unoccupied
4,2024-01-01 00:20:00,0,0,1,24.346744,62.063288,cloudy,0.0,0,25.852349,0.0,off,,SYSTEM OFF: Room unoccupied


Data info

In [498]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105121 entries, 0 to 105120
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   timestamp          105121 non-null  datetime64[ns]
 1   hour_of_day        105121 non-null  int64         
 2   day_of_week        105121 non-null  int64         
 3   day_of_year        105121 non-null  int64         
 4   outside_temp       105121 non-null  float64       
 5   outside_humidity   105121 non-null  float64       
 6   weather_condition  105121 non-null  object        
 7   occupancy_count    105121 non-null  float64       
 8   is_occupied        105121 non-null  int64         
 9   room_temp          105121 non-null  float64       
 10  power_kw           105121 non-null  float64       
 11  fan_speed          105121 non-null  object        
 12  ac_temp_setting    46119 non-null   float64       
 13  ac_control_reason  105121 non-null  object  

Data description

In [499]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
timestamp,105121.0,,,,2024-07-01 11:59:59.999999744,2024-01-01 00:00:00,2024-04-01 06:00:00,2024-07-01 12:00:00,2024-09-30 18:00:00,2024-12-31 00:00:00,
hour_of_day,105121.0,,,,11.499891,0.0,5.0,11.0,17.0,23.0,6.922277
day_of_week,105121.0,,,,2.991762,0.0,1.0,3.0,5.0,6.0,2.003414
day_of_year,105121.0,,,,183.001741,1.0,92.0,183.0,274.0,366.0,105.36754
outside_temp,105121.0,,,,27.999405,21.941441,25.432435,28.001006,30.565238,34.087714,3.026859
outside_humidity,105121.0,,,,75.823283,60.0,61.944477,75.003321,88.059324,95.0,12.949781
weather_condition,105121.0,3.0,cloudy,52623.0,,,,,,,
occupancy_count,105121.0,,,,3.611885,0.0,0.0,0.0,5.0,20.0,5.879752
is_occupied,105121.0,,,,0.449891,0.0,0.0,0.0,1.0,1.0,0.497485
room_temp,105121.0,,,,24.266641,22.29994,23.49014,24.00035,24.809658,29.8584,1.016505


Check total missing values

In [500]:
df.isnull().sum()

timestamp                0
hour_of_day              0
day_of_week              0
day_of_year              0
outside_temp             0
outside_humidity         0
weather_condition        0
occupancy_count          0
is_occupied              0
room_temp                0
power_kw                 0
fan_speed                0
ac_temp_setting      59002
ac_control_reason        0
dtype: int64

### **Data Preprocessing**
Clean and prepare key columns: convert numeric columns to numbers, fill missing values, and ensure text columns are strings

In [501]:
df['power_kw'] = pd.to_numeric(df['power_kw'], errors='coerce').fillna(0.0)
df['fan_speed'] = df['fan_speed'].fillna('off').astype(str)
df['ac_control_reason'] = df['ac_control_reason'].fillna('').astype(str)
df['room_temp'] = pd.to_numeric(df['room_temp'], errors='coerce')

Calculate the time difference between consecutive measurements in minutes, find the typical interval (median cadence) to understand how often data is recorded and determine how many rows roughly correspond to a 5-minute period for later use

In [502]:
df['dt_min'] = df['timestamp'].diff().dt.total_seconds().div(60.0)
median_dt = df['dt_min'].median() if not df['dt_min'].isna().all() else 5.0
print('median cadence (min):', median_dt)
lag_5min = max(1, int(round(5.0 / median_dt))) 

median cadence (min): 5.0


Create a flag 'ac_on' that infers AC is running if power > 0.05, fan not off, reason includes 'cool', or set temp < room temp

In [503]:
df['ac_on'] = (df['power_kw'] > 0.05) | (df['fan_speed'].str.lower() != 'off') \
              | df['ac_control_reason'].str.contains('cool', case=False, na=False) \
              | ((pd.to_numeric(df['ac_temp_setting'], errors='coerce').notnull()) &
                 (pd.to_numeric(df['ac_temp_setting'], errors='coerce') < df['room_temp']))

Get room temperature values as an array

In [504]:
temps = df['room_temp'].values
n = len(df)

Start with empty results (NaN for time, False for censored)

In [505]:
time_to_cool = np.full(n, np.nan)
censored = np.zeros(n, dtype=bool)

Find all rows where room already <= target temperature

In [506]:
target_idxs = np.where(temps <= TARGET_TEMP)[0]

Label how long it takes for the room temperature to reach the target range, marking cases as censored if the target isn't reached or data is missing

In [507]:
temps = df['room_temp'].values
n = len(df)
time_to_cool = np.full(n, np.nan)
censored = np.zeros(n, dtype=bool)

# Track whether a cooling episode is currently active
episode_active = True 

for i in range(n):

     # Skip if temperature is missing
    if np.isnan(temps[i]):
        time_to_cool[i] = np.nan
        censored[i] = True
        continue

    # If temperature is already within target band, set time to 0
    if temps[i] <= TARGET_TEMP + TARGET_BAND:
        time_to_cool[i] = 0.0
        episode_active = False
        continue

    # Restart a new episode if temperature drifts above target + hysteresis
    if not episode_active and temps[i] > TARGET_TEMP + TARGET_BAND + RESTART_HYST:
        episode_active = True

    # Skip if no episode is active
    if not episode_active:
        time_to_cool[i] = np.nan
        censored[i] = True
        continue

    # Look ahead for the first time temperature enters target band
    j = i + 1
    while j < n and temps[j] > TARGET_TEMP + TARGET_BAND:
        j += 1

    # Target not reached within dataset
    if j >= n:
        time_to_cool[i] = float(MAX_HORIZON_MIN)
        censored[i] = True
        continue

    delta_min = (df.loc[j, 'timestamp'] - df.loc[i, 'timestamp']).total_seconds() / 60.0
    
    # Cap at maximum horizon
    if delta_min > MAX_HORIZON_MIN:
        time_to_cool[i] = float(MAX_HORIZON_MIN)
        censored[i] = True

    # If AC was on, record actual time
    elif df.loc[i:j, 'ac_on'].any():
        time_to_cool[i] = float(delta_min)

    # Otherwise, treat as censored
    else:
        time_to_cool[i] = float(MAX_HORIZON_MIN)
        censored[i] = True

# Save results to dataframe
df['time_to_cool_min'] = time_to_cool
df['censored'] = censored
print('Labeled rows:', (~df['censored']).sum(), 'Censored rows:', df['censored'].sum())


Labeled rows: 55182 Censored rows: 49939


### **Feature Engineering**
#### Temperature Differences & Cooling Rates
Generate temperature features that capture how far the room is from the target, what it was 5 minutes ago, the time elapsed and how quickly it has been cooling

In [508]:
df['temp_diff'] = df['room_temp'] - TARGET_TEMP
df['temp_lag_5'] = df['room_temp'].shift(lag_5min)
df['time_lag_5_min'] = (df['timestamp'] - df['timestamp'].shift(lag_5min)).dt.total_seconds().div(60.0)
df['cooling_rate_5min'] = (df['temp_lag_5'] - df['room_temp']) / df['time_lag_5_min']
df['cooling_rate_5min'] = df['cooling_rate_5min'].fillna(0.0)

#### Rolling Statistics (15-Minute Window)
Compute rolling 15-minute averages to capture recent trends in room temperature, power usage and AC activity

In [509]:
win = max(1, int(round(15.0 / median_dt)))
df['room_temp_roll_mean_15'] = df['room_temp'].rolling(window=win, min_periods=1).mean()
df['power_kw_roll_15'] = df['power_kw'].rolling(window=win, min_periods=1).mean()
df['ac_on_frac_15'] = df['ac_on'].rolling(window=win, min_periods=1).mean()

#### Fan Speed Encoding
Convert fan speed from text labels to numeric values for easier analysis and modeling

In [510]:
fan_map = {'off': 0, 'low': 1, 'medium': 2, 'med': 2, 'high': 3}
df['fan_speed_num'] = df['fan_speed'].str.lower().map(fan_map)
df['fan_speed_num'] = pd.to_numeric(df['fan_speed_num'], errors='coerce').fillna(0)

#### Cyclical Time Feature
Encode the hour of the day as cyclical features using sine and cosine to capture daily patterns

In [511]:
df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24.0)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24.0)

#### One-Hot Encoded Weather Conditions
Convert the categorical weather condition into one-hot encoded columns and remove the original column

In [512]:
weather_dummies = pd.get_dummies(df['weather_condition'].fillna('unknown').astype(str), prefix='weather')
df = pd.concat([df, weather_dummies], axis=1)
df = df.drop(columns=['weather_condition'])

Define the list of features to use for modeling, including temperature, cooling, occupancy, power, fan, time, AC activity, and weather indicators

In [513]:
FEATURES = [
    'room_temp', 'temp_diff', 'cooling_rate_5min', 'room_temp_roll_mean_15',
    'outside_temp', 'outside_humidity', 'occupancy_count', 'is_occupied',
    'power_kw', 'power_kw_roll_15', 'fan_speed_num', 'ac_temp_setting',
    'hour_sin', 'hour_cos', 'ac_on_frac_15'
] + [c for c in df.columns if c.startswith('weather_')]

Create a new DataFrame with only the selected features and target columns and remove rows where the target 'time_to_cool_min' is missing

In [514]:
df_features = df[FEATURES + ['time_to_cool_min', 'censored']].copy()
df_features = df_features.dropna(subset=['time_to_cool_min'])

Select only the rows where the cooling time is known (non-censored) to use for training the model

In [515]:
trainable = df_features[~df_features['censored']].copy()
print('Trainable rows (non-censored):', len(trainable))

Trainable rows (non-censored): 55182


### **Train/Validation/Test Split**
Reset the index of the trainable DataFrame for a clean, sequential order

In [516]:
trainable = trainable.reset_index(drop=True)

Get the total number of trainable (non-censored) samples.

In [517]:
ntr = len(trainable)

Determine cutoff indices for 70% train, 15% validation, and 15% test.

In [518]:
train_end = int(ntr * 0.70)
val_end = int(ntr * 0.85)
print("Train end index:", train_end, "Validation end index:", val_end)

Train end index: 38627 Validation end index: 46904


Split the trainable data into training, validation, and test sets for model training and evaluation

In [519]:
X_train = trainable.loc[:train_end-1, FEATURES]
y_train = trainable.loc[:train_end-1, 'time_to_cool_min']

X_val = trainable.loc[train_end:val_end-1, FEATURES]
y_val = trainable.loc[train_end:val_end-1, 'time_to_cool_min']

X_test = trainable.loc[val_end:, FEATURES]
y_test = trainable.loc[val_end:, 'time_to_cool_min']

Confirm the sample sizes for each split.

In [520]:
print("Train/Val/Test sizes:", len(X_train), len(X_val), len(X_test))

Train/Val/Test sizes: 38627 8277 8278


Convert target columns to NumPy arrays of type float for compatibility with machine learning models

In [521]:
y_train = y_train.to_numpy(dtype=float)
y_val   = y_val.to_numpy(dtype=float)
y_test  = y_test.to_numpy(dtype=float)

### **Model Training & Evaluation**
Train a LightGBM regression model to predict time-to-cool, using early stopping on the validation set and then save the trained model to a file for later use

In [522]:
model = lgb.LGBMRegressor(n_estimators=2000, learning_rate=0.05)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='l1',
    early_stopping_rounds=50,
    verbose=50
)

MODEL_PATH = "time_to_cool_model.pkl"
joblib.dump(model, MODEL_PATH)
print("Model saved at:", MODEL_PATH)

[50]	valid_0's l1: 17.7235	valid_0's l2: 1038.59
[100]	valid_0's l1: 15.1791	valid_0's l2: 1044.41
Model saved at: time_to_cool_model.pkl


Use the trained model to predict on the test set and evaluate its performance with MAE and RMSE

In [523]:
y_pred = model.predict(X_test)
print('MAE (min):', mean_absolute_error(y_test, y_pred))
print('RMSE:', mean_squared_error(y_test, y_pred))

MAE (min): 21.672360970179625
RMSE: 1486.2599139462154


### **Helper Functions & Example Predictions**
Predict the time-to-cool for a single row of data using the trained model

In [524]:
def predict_time_to_cool_row(row_df, model, features=FEATURES):
    x = row_df[features].values.reshape(1, -1)
    return float(model.predict(x)[0])

Recommend a simple pre-cooling start time by predicting the time-to-cool and subtracting it from the desired event time

In [525]:
def recommend_start_simple(event_time, now_row, model):
    t_pred = predict_time_to_cool_row(now_row, model)
    start_time = pd.to_datetime(event_time) - pd.Timedelta(minutes=t_pred)
    return start_time, t_pred

Example usage: predict time-to-cool from the latest data and suggest a simple pre-cooling start time

In [526]:
now_row = df_features.iloc[-1:][FEATURES]
event_time = pd.to_datetime('2024-01-01 09:00')
start_time_guess, pred_minutes = recommend_start_simple(event_time, now_row, model)
print('If started now predicted minutes to cool:', pred_minutes)
print('Suggested start time (simple):', start_time_guess)

If started now predicted minutes to cool: 82.49141500633803
Suggested start time (simple): 2024-01-01 07:37:30.515099620


Find the latest feasible pre-cooling start time by checking backward in steps until the predicted cooling time fits before the event

In [527]:
def find_latest_feasible_start(event_time, model, features_template, now_features, max_horizon=MAX_HORIZON_MIN, step_min=5):
    event_time = pd.to_datetime(event_time)
    for offset in range(0, max_horizon+1, step_min):
        candidate_start = event_time - pd.Timedelta(minutes=offset)
        candidate_features = now_features.copy()
        pred = predict_time_to_cool_row(candidate_features, model)
        if candidate_start + pd.Timedelta(minutes=pred) <= event_time:
            return candidate_start, pred
    return None, None

Use the simulation function to find the latest feasible pre-cooling start time under current conditions

In [528]:
cand_start, cand_pred = find_latest_feasible_start(event_time, model, FEATURES, now_row)
print('latest feasible start (simulation, assume current conditions):', cand_start, 'pred min:', cand_pred)

latest feasible start (simulation, assume current conditions): 2024-01-01 07:35:00 pred min: 82.49141500633803
