In [50]:
import os, logging

# Make sure all folders exist
os.makedirs("logs/main", exist_ok=True)
os.makedirs("logs/data", exist_ok=True)
os.makedirs("logs/model", exist_ok=True)

# Reset any old handlers
for h in logging.root.handlers[:]:
    logging.root.removeHandler(h)

LOG_FORMAT = "%(asctime)s | %(name)s | %(levelname)s | %(message)s"

# Root logger ‚Üí console only (high-level)
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    handlers=[logging.StreamHandler()]
)
root_logger = logging.getLogger("energy_pipeline")
root_logger.info("üöÄ Logging system initialized.")

# -------- DATA LOGGER --------
data_logger = logging.getLogger("energy.data")
data_logger.setLevel(logging.INFO)

if not data_logger.handlers:
    # File
    data_fh = logging.FileHandler("logs/data/data_steps.log")
    data_fh.setFormatter(logging.Formatter(LOG_FORMAT))
    data_logger.addHandler(data_fh)

    # Console
    data_ch = logging.StreamHandler()
    data_ch.setFormatter(logging.Formatter(LOG_FORMAT))
    data_logger.addHandler(data_ch)

data_logger.propagate = False  # don‚Äôt duplicate to root


# -------- MODEL LOGGER --------
model_logger = logging.getLogger("energy.model")
model_logger.setLevel(logging.INFO)

if not model_logger.handlers:
    # File
    model_fh = logging.FileHandler("logs/model/model_steps.log")
    model_fh.setFormatter(logging.Formatter(LOG_FORMAT))
    model_logger.addHandler(model_fh)

    # Console
    model_ch = logging.StreamHandler()
    model_ch.setFormatter(logging.Formatter(LOG_FORMAT))
    model_logger.addHandler(model_ch)

model_logger.propagate = False


2025-11-18 04:47:13,408 | energy_pipeline | INFO | üöÄ Logging system initialized.


In [51]:
# === Section 2: Upload & load dataset with logging ===

from google.colab import files

root_logger.info("Waiting for user to upload energydata_complete.csv...")
uploaded = files.upload()  # Choose energydata_complete.csv

try:
    data_logger.info("Attempting to read energydata_complete.csv")
    df = pd.read_csv("energydata_complete.csv")
    data_logger.info(f"Dataset loaded successfully. Shape: {df.shape}")
    data_logger.info(f"Columns: {list(df.columns)}")
    display(df.head())
except Exception:
    data_logger.exception("‚ùå Failed to load dataset.")
    raise


2025-11-18 04:47:15,781 | energy_pipeline | INFO | Waiting for user to upload energydata_complete.csv...


Saving energydata_complete.csv to energydata_complete (1).csv


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [52]:
# === Section 3: Data quality checks & cleaning ===

data_logger.info("Starting data quality checks and basic cleaning...")

try:
    # Convert date column
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    invalid_dates = df['date'].isna().sum()
    if invalid_dates > 0:
        data_logger.warning(f"{invalid_dates} rows have invalid dates after conversion.")

    # Missing values summary
    na_counts = df.isna().sum()
    data_logger.info(f"Missing values per column:\n{na_counts.to_string()}")

    # Drop rows with missing target
    before_rows = len(df)
    df = df.dropna(subset=['Appliances'])
    after_rows = len(df)
    dropped = before_rows - after_rows
    if dropped > 0:
        data_logger.warning(f"Dropped {dropped} rows with missing 'Appliances'.")

    # Simple sanity check on target
    if (df['Appliances'] < 0).any():
        data_logger.warning("Detected negative values in 'Appliances'.")

    data_logger.info(f"Cleaning complete. Final shape: {df.shape}")

except Exception:
    data_logger.exception("‚ùå Error during data quality / cleaning.")
    raise


In [53]:
# === Section 4: Feature engineering with logging ===

data_logger.info("Starting feature engineering...")

try:
    # Time-based features
    df['hour'] = df['date'].dt.hour
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Rolling mean on Appliances (window=60 rows)
    df['appliances_roll_mean'] = df['Appliances'].rolling(60, min_periods=1).mean()

    engineered_cols = ['hour', 'day_of_week', 'is_weekend', 'appliances_roll_mean']
    data_logger.info(f"Created engineered features: {engineered_cols}")

    feature_cols = engineered_cols + ['T1', 'RH_1']
    target_col = 'Appliances'

    X = df[feature_cols]
    y = df[target_col]

    data_logger.info(f"Feature matrix shape: {X.shape}, target length: {len(y)}")

except Exception:
    data_logger.exception("‚ùå Error during feature engineering.")
    raise


In [54]:
# === Section 5: Train/test split ===

model_logger.info("Splitting data into train and test sets...")

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model_logger.info(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
except Exception:
    model_logger.exception("‚ùå Error during train/test split.")
    raise


In [55]:
# === Section 6: Model training ===

from sklearn.ensemble import RandomForestRegressor

try:
    model_logger.info("Starting RandomForestRegressor training...")
    rf = RandomForestRegressor(
        n_estimators=150,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    model_logger.info("Model training completed successfully.")
except Exception:
    model_logger.exception("‚ùå Error during model training.")
    raise


In [56]:
# === Section 7: Evaluation ===

try:
    model_logger.info("Starting model evaluation...")
    preds = rf.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    model_logger.info(f"Evaluation metrics ‚Äì MAE: {mae:.2f}, R2: {r2:.3f}")
    print("MAE:", mae)
    print("R2 :", r2)
except Exception:
    model_logger.exception("‚ùå Error during evaluation.")
    raise


MAE: 31.371792387299102
R2 : 0.5420251484708565


In [57]:
# === Section 8: Exception logging demo (deliberate error) ===

data_logger.info("Demonstrating exception logging with a deliberate bug...")

def create_buggy_feature(df, logger):
    try:
        # Intentional bug: column does not exist
        df['non_existent_ratio'] = df['Appliances'] / df['NotARealColumn']
        logger.info("This line should not be reached.")
    except Exception:
        logger.exception("Expected failure: attempted to use missing column 'NotARealColumn'.")

create_buggy_feature(df, data_logger)


In [58]:
# === Section 9: Inspect log files ===

print("------ main pipeline log (logs/main/pipeline.log) ------")
!tail -n 15 logs/main/pipeline.log || echo "No main log found."

print("\n------ data log (logs/data/data_steps.log) ------")
!tail -n 20 logs/data/data_steps.log || echo "No data log found."

print("\n------ model log (logs/model/model_steps.log) ------")
!tail -n 20 logs/model/model_steps.log || echo "No model log found."


------ main pipeline log (logs/main/pipeline.log) ------
2025-11-18 04:40:45,925 | energy_pipeline | INFO | üöÄ Logging system initialized.
2025-11-18 04:40:45,929 | energy_pipeline | INFO | Separate data_logger and model_logger configured.
2025-11-18 04:41:24,239 | energy_pipeline | INFO | Waiting for user to upload energydata_complete.csv...

------ data log (logs/data/data_steps.log) ------
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'NotARealColumn'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/