In [1]:
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Directory that holds the three input CSVs. Set the
# BUILDING_ENERGY_DATA_DIR environment variable to point somewhere else;
# the default is the location the notebook was originally run from, so
# existing runs behave exactly as before.
DATA_DIR = Path(
    os.environ.get("BUILDING_ENERGY_DATA_DIR", "/Users/mac/Downloads/Building Energy")
)

# Load the three input tables from DATA_DIR.
train = pd.read_csv(DATA_DIR / "train.csv")
building = pd.read_csv(DATA_DIR / "building_metadata.csv")
weather = pd.read_csv(DATA_DIR / "weather_train.csv")

In [3]:
# Left-join `building` onto `train` by building_id, then `weather` by
# (site_id, timestamp). Left joins keep every row of `train` even when
# the other table has no matching key.
train = (
    train
    .merge(building, how="left", on="building_id")
    .merge(weather, how="left", on=["site_id", "timestamp"])
)


In [4]:
# Parse the timestamp column once, store the parsed values back, and
# expose each calendar part as its own column.
timestamps = pd.to_datetime(train["timestamp"])
train["timestamp"] = timestamps
for part in ("hour", "day", "month", "year"):
    train[part] = getattr(timestamps.dt, part)

In [5]:
# The raw timestamp and the building identifier are not used as features
# from here on; rebind `train` to a frame without them.
train = train.drop(columns=["timestamp", "building_id"])

In [6]:
# Columns handled as numeric features.
num_cols = [
    "meter", "site_id",
    "square_feet", "year_built", "floor_count",
    "air_temperature", "cloud_coverage", "dew_temperature",
    "precip_depth_1_hr", "sea_level_pressure",
    "wind_direction", "wind_speed",
    "hour", "day", "month", "year",
]

# Columns handled as categorical features.
cat_cols = ["primary_use"]

# Replace missing numeric values with each column's median.
# NOTE(review): the medians are computed on the whole frame, before any
# train/test split, so held-out rows influence the fill values.
column_medians = train[num_cols].median()
train[num_cols] = train[num_cols].fillna(column_medians)

In [7]:
# Continue with a fixed-size random subset of the rows; the seed makes
# the draw repeatable.
SAMPLE_SIZE = 50_000
SAMPLE_SEED = 42
train = train.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [8]:
# Separate the regression target from the remaining feature columns.
target_col = "meter_reading"
y = train[target_col]
X = train.drop(columns=[target_col])

In [9]:
# Numeric branch: fill missing values with the column median, then
# standardise. Keeping the imputer inside the pipeline means its medians
# are learned from the rows passed to fit() and are stored with the
# pickled model, so predict() can accept rows that still contain missing
# numeric values instead of failing in the downstream estimator.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [10]:
# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode. With handle_unknown="ignore", a category that was
# not seen during fit does not raise at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

In [11]:
# Send each column group through its own transformer.
column_routes = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(column_routes)

In [12]:
# Full estimator: column preprocessing followed by a linear regression.
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LinearRegression()),
])


In [13]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [14]:
# fit() returns the fitted pipeline itself, so the held-out predictions
# can be chained directly onto it.
pred = pipeline.fit(X_train, y_train).predict(X_test)


In [15]:
# Held-out error: mean squared error first, then its square root (RMSE).
mse = mean_squared_error(y_true=y_test, y_pred=pred)
rmse = np.sqrt(mse)

for metric in (mse, rmse):
    print(metric)

75594646582.51198
274944.80642942135


In [16]:
# Serialise the pipeline object to disk with pickle, then confirm.
model_path = "building_pipeline.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)

print("Pipeline model saved")

Pipeline model saved
