In [8]:
import pandas as pd
import numpy as np
import gc # Garbage collection to free up memory
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Create the output directory if it doesn't exist.
# The notebook reads from '../data/raw' and writes to '../data/processed', so the
# directory has to be created relative to the parent folder, not the current one.
os.makedirs('../data/processed', exist_ok=True)
print("Environment set up.")

Environment set up.


In [9]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only plain NumPy integer and float columns are touched. Booleans, objects,
    categoricals, datetimes and pandas extension dtypes are left unchanged.
    A column is never converted to a wider dtype than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    use_float16 : bool, default True
        Whether float columns may be stored as float16. The check is range-based
        only, and float16 keeps roughly three significant decimal digits, so pass
        ``False`` when that rounding is not acceptable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Skip everything that is not a native NumPy int/uint/float column
        # (bool, object, category, datetime, nullable extension dtypes, ...).
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        for candidate in candidates:
            # Candidates are ordered by size: once they stop being smaller than
            # the current dtype there is nothing left to gain.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: the extreme representable values do fit.
            # Comparisons with NaN (empty or all-NaN column) are False, so such
            # columns simply keep their dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    if start_mem > 0:
        print(f'Decreased by {100 * (start_mem - end_mem) / start_mem:.1f}%')

    return df

In [10]:
class CyclicalTimeEncoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds sine/cosine encodings of the timestamp.

    Expects a ``timestamp`` column. The returned copy gains ``hour_sin``,
    ``hour_cos`` (24-hour period) and ``month_sin``, ``month_cos`` (12-month
    period). The input frame is not modified.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four cyclical columns appended."""
        encoded = X.copy()

        # Parse the column only when it is not already a datetime dtype.
        already_datetime = pd.api.types.is_datetime64_any_dtype(encoded['timestamp'])
        if not already_datetime:
            encoded['timestamp'] = pd.to_datetime(encoded['timestamp'])

        parts = encoded['timestamp'].dt

        # Position on the daily circle (period of 24 hours).
        hour_angle = 2 * np.pi * parts.hour / 24
        encoded['hour_sin'] = np.sin(hour_angle)
        encoded['hour_cos'] = np.cos(hour_angle)

        # Position on the yearly circle (period of 12 months).
        month_angle = 2 * np.pi * parts.month / 12
        encoded['month_sin'] = np.sin(month_angle)
        encoded['month_cos'] = np.cos(month_angle)

        return encoded

class WeatherFeatureEngineer(BaseEstimator, TransformerMixin):
    """Add trailing rolling mean/std of ``air_temperature`` per site.

    For every ``window`` in ``window_sizes`` the transformer adds
    ``temp_mean_lag_{window}h`` and ``temp_std_lag_{window}h``. Each window
    spans the last ``window`` hours of the site's temperature series and
    includes the current hour (no shift is applied).

    Expects the columns ``site_id``, ``timestamp`` and ``air_temperature``.

    Parameters
    ----------
    window_sizes : iterable of int, default (3, 24)
        Window lengths in hours.
    """

    def __init__(self, window_sizes=(3, 24)):
        # Immutable default: a list default would be shared between instances.
        self.window_sizes = window_sizes

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` sorted by site and time with the rolling columns added.

        The original index labels are kept; only the row order changes.
        """
        print("Calculating Weather Lags (this may take a moment)...")
        X_copy = X.copy()

        # Upcast float16 to float32: pandas' rolling/fillna backends are
        # unreliable with float16.
        float16_cols = X_copy.select_dtypes(include=['float16']).columns
        if len(float16_cols) > 0:
            X_copy[float16_cols] = X_copy[float16_cols].astype('float32')

        # Use float64 for the rolling computation itself.
        X_copy['air_temperature'] = X_copy['air_temperature'].astype('float64')

        # Time-based windows need a real datetime column.
        if not pd.api.types.is_datetime64_any_dtype(X_copy['timestamp']):
            X_copy['timestamp'] = pd.to_datetime(X_copy['timestamp'])

        keys = ['site_id', 'timestamp']
        X_copy = X_copy.sort_values(by=keys)

        # The frame may hold many rows per site and hour (one per building and
        # meter). Rolling over those rows would make a "3h" window cover three
        # near-identical rows, so collapse to one temperature per site and hour
        # first. The result is sorted by site, then time.
        hourly_temp = X_copy.groupby(keys)['air_temperature'].mean()
        by_site = hourly_temp.reset_index(level='site_id').groupby('site_id')['air_temperature']

        features = {}
        for window in self.window_sizes:
            # Window measured in elapsed time, so missing hours do not stretch it.
            rolling = by_site.rolling(pd.Timedelta(hours=window), min_periods=1)
            # Mean: back-fill leading gaps, but only within the same site.
            features[f'temp_mean_lag_{window}h'] = (
                rolling.mean().groupby(level='site_id').bfill()
            )
            # Std is undefined with fewer than two observations; use 0 for
            # every window size, not just the last one.
            features[f'temp_std_lag_{window}h'] = rolling.std().fillna(0)

        if features:
            feature_frame = pd.DataFrame(features)
            # Drop stale copies so the transform can be re-applied to its own output.
            stale = [name for name in feature_frame.columns if name in X_copy.columns]
            X_copy = X_copy.drop(columns=stale)
            # join(on=...) keeps X_copy's own index and row order.
            X_copy = X_copy.join(feature_frame, on=keys)

        # Kept from the original contract: remaining gaps in any column are
        # back-filled in row order.
        # NOTE(review): this crosses site boundaries for the other columns.
        X_copy = X_copy.bfill()

        return X_copy

In [11]:
print("Loading data...")
# Read the three raw tables (meter readings, weather, building metadata).
train_df, weather_df, building_df = (
    pd.read_csv('../data/raw/train.csv'),
    pd.read_csv('../data/raw/weather_train.csv'),
    pd.read_csv('../data/raw/building_metadata.csv'),
)

# Shrink numeric dtypes right away, in the same order the tables were read.
train_df, weather_df, building_df = map(
    reduce_mem_usage, (train_df, weather_df, building_df)
)

# Both tables are joined on the timestamp, so parse it to datetime in each.
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'])

# Left-join building metadata first (it supplies site_id), then the weather
# of that site and hour. Hours absent from the weather table yield NaN and are
# filled later.
train_df = (
    train_df
    .merge(building_df, on='building_id', how='left')
    .merge(weather_df, on=['site_id', 'timestamp'], how='left')
)

print(f"Merged Data Shape: {train_df.shape}")

# The source tables are no longer needed; release them.
del weather_df, building_df
gc.collect()

Loading data...
Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.1%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 3.07 MB
Decreased by 68.1%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.03 MB
Decreased by 60.3%
Merged Data Shape: (20216100, 16)


0

In [12]:
print("Handling missing values...")


def _interpolate_weather_by_site(df, column):
    """Return ``df[column]`` with gaps linearly interpolated along each site's timeline.

    Interpolating the merged frame row by row would mix readings of different
    sites and ramp values across the many rows that share one hour. The column
    is instead reduced to one value per (site_id, timestamp), interpolated
    within each site in time order, and mapped back onto the rows.
    The result keeps ``df``'s index and the column's original dtype.
    """
    keys = ['site_id', 'timestamp']
    original_dtype = df[column].dtype

    values = df[column]
    if values.dtype == 'float16':
        # Aggregate in float32; float16 is poorly supported by pandas kernels.
        values = values.astype('float32')

    # One value per site and hour, sorted by site then time. The mean skips
    # NaN, so an hour is missing only when every one of its rows is missing.
    hourly = df[keys].assign(_value=values).groupby(keys)['_value'].mean()
    hourly = hourly.groupby(level='site_id').transform(
        lambda series: series.interpolate(method='linear', limit_direction='both')
    )

    filled = df[keys].join(hourly, on=keys)['_value']
    return filled.astype(original_dtype)


# 1. Weather interpolation (fill gaps linearly, per site)
# limit_direction='both' inside the helper also fills the start/end of each series.
train_df['air_temperature'] = _interpolate_weather_by_site(train_df, 'air_temperature')
train_df['cloud_coverage'] = train_df['cloud_coverage'].fillna(0) # Assume clear if unknown
train_df['dew_temperature'] = _interpolate_weather_by_site(train_df, 'dew_temperature')

# 2. Building metadata filling
# year_built and floor_count are often missing; fill them with the column median.
train_df['year_built'] = train_df['year_built'].fillna(train_df['year_built'].median())
train_df['floor_count'] = train_df['floor_count'].fillna(train_df['floor_count'].median())

# Drop rows where the target (meter_reading) is not positive.
# In ASHRAE, 0 often means the meter was off or broken.
print(f"Rows before dropping zero-readings: {len(train_df)}")
# .copy() detaches the result from the unfiltered frame, so later column
# assignments do not trigger SettingWithCopyWarning.
train_df = train_df[train_df['meter_reading'] > 0].copy()
print(f"Rows after dropping zero-readings: {len(train_df)}")

Handling missing values...
Rows before dropping zero-readings: 20216100
Rows after dropping zero-readings: 18342124


In [13]:
# Step 1: replace the categorical primary_use values with integer codes
print("Encoding Primary Use...")
le = LabelEncoder()
le.fit(train_df['primary_use'])
train_df['primary_use'] = le.transform(train_df['primary_use'])

# Step 2: build and run the feature-engineering pipeline
# (cyclical time encodings first, then the rolling temperature features)
pipeline = Pipeline(
    steps=[
        ('cyclical_time', CyclicalTimeEncoder()),
        ('weather_lags', WeatherFeatureEngineer(window_sizes=[3, 24])),
    ]
)

print("Running Feature Engineering Pipeline...")
train_df = pipeline.fit_transform(train_df)

# Step 3: shrink dtypes once more, since the new feature columns are wide floats
train_df = reduce_mem_usage(train_df)

Encoding Primary Use...
Running Feature Engineering Pipeline...
Calculating Weather Lags (this may take a moment)...
Memory usage of dataframe is 2448.94 MB
Memory usage after optimization is: 1102.02 MB
Decreased by 55.0%


In [15]:
print("Saving processed data to pickle...")
# Single source of truth for the output location, so the success message
# always reports the path that was actually written.
processed_path = '../data/processed/train_processed.pkl'
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)
# Pickle keeps the exact dtypes (datetime and downcast numeric columns).
train_df.to_pickle(processed_path)

print(f"Success! Data saved to {processed_path}")
print("You can now proceed to 03_modeling.ipynb")

Saving processed data to pickle...
Success! Data saved to data/processed/train_processed.pkl
You can now proceed to 03_modeling.ipynb
