In [1]:
# === Section 1: Imports & logging setup ===

import os
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Create fresh log folders
os.makedirs("logs/main", exist_ok=True)
os.makedirs("logs/data", exist_ok=True)
os.makedirs("logs/model", exist_ok=True)

# Reset any existing logging handlers (important in Colab)
for h in logging.root.handlers[:]:
    logging.root.removeHandler(h)

LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"

# Root logger ‚Üí console only (high-level)
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler()]
)
root_logger = logging.getLogger("energy_pipeline")
root_logger.info("üöÄ Logging system initialized.")

# -------- DATA LOGGER --------
data_logger = logging.getLogger("energy.data")
data_logger.setLevel(logging.INFO)

if not data_logger.handlers:
    # File
    data_fh = logging.FileHandler("logs/data/data_steps.log")
    data_fh.setFormatter(logging.Formatter(LOG_FORMAT))
    data_logger.addHandler(data_fh)

    # Console
    data_ch = logging.StreamHandler()
    data_ch.setFormatter(logging.Formatter(LOG_FORMAT))
    data_logger.addHandler(data_ch)

data_logger.propagate = False  # avoid double logging in root


# -------- MODEL LOGGER --------
model_logger = logging.getLogger("energy.model")
model_logger.setLevel(logging.INFO)

if not model_logger.handlers:
    # File
    model_fh = logging.FileHandler("logs/model/model_steps.log")
    model_fh.setFormatter(logging.Formatter(LOG_FORMAT))
    model_logger.addHandler(model_fh)

    # Console
    model_ch = logging.StreamHandler()
    model_ch.setFormatter(logging.Formatter(LOG_FORMAT))
    model_logger.addHandler(model_ch)

model_logger.propagate = False

root_logger.info("‚úÖ Separate data_logger and model_logger configured.")


2025-11-18 05:09:02,187 | energy_pipeline | INFO | üöÄ Logging system initialized.
2025-11-18 05:09:02,191 | energy_pipeline | INFO | ‚úÖ Separate data_logger and model_logger configured.


In [2]:
# === Section 2: Upload & load dataset ===

from google.colab import files

root_logger.info("Waiting for user to upload energydata_complete.csv ...")
uploaded = files.upload()   # choose energydata_complete.csv

try:
    data_logger.info("Attempting to read 'energydata_complete.csv'...")
    df = pd.read_csv("energydata_complete.csv")
    data_logger.info(f"Dataset loaded successfully. Shape: {df.shape}")
    data_logger.info(f"Columns: {list(df.columns)}")
    display(df.head())
except Exception as e:
    data_logger.error(f"‚ùå Failed to load dataset: {e}")


2025-11-18 05:09:13,485 | energy_pipeline | INFO | Waiting for user to upload energydata_complete.csv ...


2025-11-18 05:09:29,305 | energy.data | INFO | Attempting to read 'energydata_complete.csv'...
2025-11-18 05:09:29,471 | energy.data | INFO | Dataset loaded successfully. Shape: (19735, 29)
2025-11-18 05:09:29,472 | energy.data | INFO | Columns: ['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2']


Saving energydata_complete.csv to energydata_complete.csv


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
# === Section 3: Data quality checks & cleaning ===

data_logger.info("Starting data quality checks and cleaning...")

try:
    # Convert 'date' column to datetime
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    invalid_dates = df['date'].isna().sum()
    if invalid_dates > 0:
        data_logger.warning(f"{invalid_dates} rows have invalid dates after conversion.")

    # Summary of missing values
    na_counts = df.isna().sum()
    data_logger.info("Missing values per column:\n" + na_counts.to_string())

    # Drop rows where target Appliances is missing
    before_rows = len(df)
    df = df.dropna(subset=['Appliances'])
    after_rows = len(df)
    dropped = before_rows - after_rows
    if dropped > 0:
        data_logger.warning(f"Dropped {dropped} rows with missing 'Appliances'.")

    # Basic sanity check: negative values in Appliances
    neg_count = (df['Appliances'] < 0).sum()
    if neg_count > 0:
        data_logger.warning(f"Found {neg_count} rows with negative 'Appliances' values.")
    else:
        data_logger.info("No negative values in 'Appliances'.")

    data_logger.info(f"Cleaning complete. Final shape: {df.shape}")

except Exception as e:
    data_logger.error(f"‚ùå Error during data quality / cleaning: {e}")


2025-11-18 05:09:58,479 | energy.data | INFO | Starting data quality checks and cleaning...
2025-11-18 05:09:58,504 | energy.data | INFO | Missing values per column:
date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
2025-11-18 05:09:58,511 | energy.data | INFO | No negative values in 'Appliances'.
2025-11-18 05:09:58,512 | energy.data | INFO | Cleaning complete. Final shape: (19735, 29)


In [4]:
# === Section 4: Feature engineering ===

data_logger.info("Starting feature engineering...")

try:
    # Time-based features
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Rolling mean of Appliances (window = 60 rows)
    df['appliances_roll_mean'] = df['Appliances'].rolling(60, min_periods=1).mean()

    # Safe ratio feature: lights / (Appliances + small_constant)  ‚Üí avoids division by zero
    df['lights_to_appliances_ratio'] = df['lights'] / (df['Appliances'] + 1e-3)

    engineered_cols = [
        'hour', 'day_of_week', 'is_weekend',
        'appliances_roll_mean', 'lights_to_appliances_ratio'
    ]
    data_logger.info(f"Created engineered features: {engineered_cols}")

    # Choose feature columns (all REAL columns from dataset)
    feature_cols = engineered_cols + ['T1', 'RH_1', 'T_out', 'RH_out', 'Windspeed']
    target_col = 'Appliances'

    X = df[feature_cols]
    y = df[target_col]

    data_logger.info(f"Feature matrix shape: {X.shape}, target length: {len(y)}")

except Exception as e:
    data_logger.error(f"‚ùå Error during feature engineering: {e}")


2025-11-18 05:10:24,390 | energy.data | INFO | Starting feature engineering...
2025-11-18 05:10:24,405 | energy.data | INFO | Created engineered features: ['hour', 'day_of_week', 'is_weekend', 'appliances_roll_mean', 'lights_to_appliances_ratio']
2025-11-18 05:10:24,411 | energy.data | INFO | Feature matrix shape: (19735, 10), target length: 19735


In [5]:
# === Section 5: Train/test split ===

model_logger.info("Splitting data into train and test sets...")

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model_logger.info(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
except Exception as e:
    model_logger.error(f"‚ùå Error during train/test split: {e}")


2025-11-18 05:10:36,325 | energy.model | INFO | Splitting data into train and test sets...
2025-11-18 05:10:36,335 | energy.model | INFO | Train shape: (15788, 10), Test shape: (3947, 10)


In [6]:
# === Section 6: Model training & evaluation ===

try:
    model_logger.info("Starting RandomForestRegressor training...")
    rf = RandomForestRegressor(
        n_estimators=150,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    model_logger.info("Model training completed successfully.")

    # Evaluation
    model_logger.info("Starting model evaluation...")
    preds = rf.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    model_logger.info(f"Evaluation metrics ‚Äì MAE: {mae:.2f}, R2: {r2:.3f}")
    print("MAE:", mae)
    print("R2 :", r2)

except Exception as e:
    model_logger.error(f"‚ùå Error during model training/evaluation: {e}")


2025-11-18 05:10:45,044 | energy.model | INFO | Starting RandomForestRegressor training...
2025-11-18 05:11:05,163 | energy.model | INFO | Model training completed successfully.
2025-11-18 05:11:05,164 | energy.model | INFO | Starting model evaluation...
2025-11-18 05:11:05,315 | energy.model | INFO | Evaluation metrics ‚Äì MAE: 26.04, R2: 0.674


MAE: 26.036065084593083
R2 : 0.6738515179228146


In [7]:
# === Section 7: Simple anomaly check (no intentional errors) ===

data_logger.info("Running simple anomaly check for very high energy usage...")

try:
    high_usage_mask = df['Appliances'] > 5000  # arbitrary high threshold
    count_high = high_usage_mask.sum()

    if count_high > 0:
        data_logger.warning(
            f"Found {count_high} rows with 'Appliances' > 5000 (potential anomalies)."
        )
    else:
        data_logger.info("No extreme high 'Appliances' values detected (> 5000).")
except Exception as e:
    data_logger.error(f"Unexpected issue during anomaly check: {e}")


2025-11-18 05:11:50,156 | energy.data | INFO | Running simple anomaly check for very high energy usage...
2025-11-18 05:11:50,162 | energy.data | INFO | No extreme high 'Appliances' values detected (> 5000).


In [8]:
# === Section 8: Inspect log files ===

print("------ main pipeline log (logs/main/pipeline.log) ------")
!tail -n 20 logs/main/pipeline.log || echo "No main log found."

print("\n------ data log (logs/data/data_steps.log) ------")
!tail -n 30 logs/data/data_steps.log || echo "No data log found."

print("\n------ model log (logs/model/model_steps.log) ------")
!tail -n 30 logs/model/model_steps.log || echo "No model log found."


------ main pipeline log (logs/main/pipeline.log) ------
tail: cannot open 'logs/main/pipeline.log' for reading: No such file or directory
No main log found.

------ data log (logs/data/data_steps.log) ------
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
2025-11-18 05:09:58,511 | energy.data | INFO | No negative values in 'Appliances'.
2025-11-18 05:09:58,512 | energy.data | INFO | Cleaning complete. Final shape: (19735, 29)
2025-11-18 05:10:24,390 | energy.data | INFO | Starting feature engineering...
2025-11-18 05:10:24,405 | energy.data | INFO | Created engineered features: ['hour', 'day_of_week', 'is_weekend', 'appliances_roll_mean', 'lights_t