In [68]:
import os
import pandas as pd
import pickle
import numpy as np

# modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Data load

Read the data from a CSV file.

In [69]:
# Directory that holds the raw dataset, relative to the working directory.
path = "./data/"
csv_file = os.path.join(path, "energy_efficiency_data.csv")
data = pd.read_csv(csv_file)

In [70]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [71]:
# Preview the last rows of the dataframe as well.
data.tail()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
763,0.64,784.0,343.0,220.5,3.5,5,0.4,5,17.88,21.4
764,0.62,808.5,367.5,220.5,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.5,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.5,3.5,4,0.4,5,16.48,16.61
767,0.62,808.5,367.5,220.5,3.5,5,0.4,5,16.64,16.03


Rename the columns for a better overview of the features.

In [72]:
# Column names, non-null counts and dtypes of the loaded dataframe.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        768 non-null    float64
 1   surface_area               768 non-null    float64
 2   wall_area                  768 non-null    float64
 3   roof_area                  768 non-null    float64
 4   overall_height             768 non-null    float64
 5   orientation                768 non-null    int64  
 6   glazing_area               768 non-null    float64
 7   glazing_area_distribution  768 non-null    int64  
 8   heating_load               768 non-null    float64
 9   cooling_load               768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [73]:
# Number of distinct values in each column.
data.nunique()

relative_compactnes           12
surface_area                  12
wall_area                      7
roof_area                      4
overall_height                 2
orientation                    4
glazing_area                   4
glazing_area_distribution      6
heating_load                 586
cooling_load                 636
dtype: int64

In [74]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# of the dataframe.
data.describe()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307201,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090196,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [75]:
# Treat the two integer-coded columns as categorical by casting them to the
# pandas "string" dtype, then re-check the dtypes.
data = data.astype(
    {"orientation": "string", "glazing_area_distribution": "string"}
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        768 non-null    float64
 1   surface_area               768 non-null    float64
 2   wall_area                  768 non-null    float64
 3   roof_area                  768 non-null    float64
 4   overall_height             768 non-null    float64
 5   orientation                768 non-null    string 
 6   glazing_area               768 non-null    float64
 7   glazing_area_distribution  768 non-null    string 
 8   heating_load               768 non-null    float64
 9   cooling_load               768 non-null    float64
dtypes: float64(8), string(2)
memory usage: 60.1 KB


In [76]:
# Only "heating_load" is modelled from here on; "cooling_load" is left out.
target = "heating_load"

# Input columns used to predict the target.
features = [
    "relative_compactnes",
    "surface_area",
    "wall_area",
    "roof_area",
    "overall_height",
    "orientation",
    "glazing_area",
    "glazing_area_distribution",
]

### Train- / Validation- / Test Sets

In [77]:
# Train / validation / test split: 20 % of the rows are held out for testing,
# then 25 % of the remaining rows are held out for validation.
model_columns = features + [target]
data_train_full, data_test = train_test_split(
    data[model_columns], test_size=0.2, random_state=42
)
data_train, data_val = train_test_split(
    data_train_full[model_columns], test_size=0.25, random_state=42
)

In [78]:
# Give every split a fresh 0..n-1 index; the shuffled original row labels are
# dropped.
data_train_full, data_train, data_val, data_test = (
    frame.reset_index(drop=True)
    for frame in (data_train_full, data_train, data_val, data_test)
)

In [79]:
# Report the number of rows in each split.
for split_name, split_frame in (
    ("train full", data_train_full),
    ("train", data_train),
    ("val", data_val),
    ("test", data_test),
):
    print(f"{split_name} data length {len(split_frame)}")

train full data length 614
train data length 460
val data length 154
test data length 154


In [80]:
# Check the columns and dtypes of the training split.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        460 non-null    float64
 1   surface_area               460 non-null    float64
 2   wall_area                  460 non-null    float64
 3   roof_area                  460 non-null    float64
 4   overall_height             460 non-null    float64
 5   orientation                460 non-null    string 
 6   glazing_area               460 non-null    float64
 7   glazing_area_distribution  460 non-null    string 
 8   heating_load               460 non-null    float64
dtypes: float64(7), string(2)
memory usage: 32.5 KB


In [81]:
# Continuous-valued input columns.
numerical = [
    "relative_compactnes",
    "surface_area",
    "wall_area",
    "roof_area",
    "overall_height",
    "glazing_area",
]

# Columns that were cast to the "string" dtype and are handled as categories.
categorical = ["orientation", "glazing_area_distribution"]

In [82]:
# Standardise the numerical columns: the scaler is fitted on the training
# split only and then applied unchanged to the validation split.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(data_train[numerical])
X_val_num = scaler.transform(data_val[numerical])

# One-hot encode the categorical columns via per-row dictionaries; the
# vectorizer is likewise fitted on the training records only.
train_dicts = data_train[categorical].to_dict(orient="records")
val_dicts = data_val[categorical].to_dict(orient="records")

dv = DictVectorizer(sparse=False)
X_train_cat = dv.fit_transform(train_dicts)
X_val_cat = dv.transform(val_dicts)

In [83]:
# Build the design matrices column-wise: scaled numerical block first,
# one-hot categorical block second.
train_blocks = (X_train_num, X_train_cat)
val_blocks = (X_val_num, X_val_cat)
X_train = np.concatenate(train_blocks, axis=1)
X_val = np.concatenate(val_blocks, axis=1)

In [84]:
# Vectorize categorical AND numerical columns together with the DictVectorizer.
# NOTE(review): this re-fits `dv` and overwrites the `X_train` / `X_val`
# matrices built from the StandardScaler output in the preceding cells, so the
# scaled features are never used by the model — confirm which variant is the
# intended one.
train_dicts = data_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# The validation records are only transformed with the vectorizer fitted above.
val_dicts = data_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [85]:
# Target vectors that match the rows of the train / validation matrices.
y_train, y_val = data_train[target], data_val[target]

In [86]:
# Baseline model: linear regression fitted on the training matrix.
lr = LinearRegression().fit(X_train, y_train)
lr

In [87]:
y_pred = lr.predict(X_val)

# Validation RMSE, computed as sqrt(MSE). The `squared=False` argument of
# `mean_squared_error` is deprecated since scikit-learn 1.4 and removed in
# 1.6, so the root is taken explicitly; float() keeps the cell output a plain
# Python number.
float(mean_squared_error(y_val, y_pred) ** 0.5)



2.8997977233806895

In [88]:
def dump_pickle(obj, filename: str):
    """Serialize ``obj`` to ``filename`` with pickle.

    The parent directory of ``filename`` is created when it does not exist
    yet, so writing into a folder such as ``./models/`` also works on a fresh
    checkout.

    Args:
        obj: Any picklable Python object.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # a bare file name has no directory to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f_out:
        pickle.dump(obj, f_out)

In [89]:
# Persist the fitted DictVectorizer and the (features, target) pair of each
# split so that later steps can reload them from disk.
for artifact, destination in (
    (dv, "./models/dv.pkl"),
    ((X_train, y_train), "./models/train.pkl"),
    ((X_val, y_val), "./models/val.pkl"),
):
    dump_pickle(artifact, destination)

# Experiment Tracking

In [90]:
import mlflow

# Address of the MLflow tracking server used by this notebook.
tracking_server_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_server_uri)

In [91]:
# Echo the tracking URI that MLflow currently reports.
current_tracking_uri = mlflow.get_tracking_uri()
print(f"tracking URI: '{current_tracking_uri}'")

tracking URI: 'http://127.0.0.1:5000'


In [92]:
# List the experiments available on the tracking server.
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1721112880487, experiment_id='1', last_update_time=1721112880487, lifecycle_stage='active', name='energy-efficiency-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1721112849931, experiment_id='0', last_update_time=1721112849931, lifecycle_stage='active', name='Default', tags={}>]

In [93]:
# Make "energy-efficiency-experiment" the active experiment for the runs below.
mlflow.set_experiment("energy-efficiency-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1721112880487, experiment_id='1', last_update_time=1721112880487, lifecycle_stage='active', name='energy-efficiency-experiment', tags={}>

In [94]:
def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the stored object.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The object that was serialized into the file.
    """
    with open(filename, mode="rb") as handle:
        loaded = pickle.load(handle)
    return loaded

In [100]:
with mlflow.start_run():
    # Reload the matrices from the pickle files on disk.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook wrote itself.
    X_train, y_train = load_pickle("./models/train.pkl")
    X_val, y_val = load_pickle("./models/val.pkl")

    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_val)

    # RMSE on the validation split, computed as sqrt(MSE): the `squared=False`
    # argument of `mean_squared_error` is deprecated since scikit-learn 1.4
    # and removed in 1.6.
    rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
    mlflow.log_metric("rmse", rmse)
    print(rmse)

    # Store the fitted model under the run's "models" artifact path.
    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



2.8997977233806895
default artifacts URI: 'mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts'




# Model Registry

In [101]:
# Name of the MLflow experiment whose runs are queried below.
EXPERIMENT_NAME="energy-efficiency-experiment"

In [103]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
# Client object used below to search runs and list registered models.
client = MlflowClient()

In [107]:
# Fetch up to five active runs of the experiment, best (lowest) "rmse" first,
# and print the run info of each one.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
search_options = dict(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"],
)
runs = client.search_runs(**search_options)

for run in runs:
    print(run.info)

<RunInfo: artifact_uri='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts', end_time=1721114470398, experiment_id='1', lifecycle_stage='active', run_id='020f73f4c44f461eb337f1c05d7837d4', run_name='flawless-stork-843', run_uuid='020f73f4c44f461eb337f1c05d7837d4', start_time=1721114468953, status='FINISHED', user_id='codespace'>


In [110]:
# Select the run with the lowest validation RMSE (the "rmse" metric is logged
# from predictions on the validation split, not on the test split).
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
best_run = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"],
)[0]
# `run_id` holds the id string of that run, not the whole Run object.
run_id = best_run.info.run_id
print(best_run)

<Run: data=<RunData: metrics={'rmse': 2.8997977233806895}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "020f73f4c44f461eb337f1c05d7837d4", '
                             '"artifact_path": "models", "utc_time_created": '
                             '"2024-07-16 07:21:09.054815", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"predict_fn": "predict", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.9.19", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "1.5.1", '
                             '"serialization_format": "cloudpickle", "code": '
                             'null}}, "model_uuid": '
                             '"3f8492b2fb644c7a8502d06ce75acecd"

In [111]:
# Take the id of the best run from the search results (`runs` is ordered by
# ascending "rmse") instead of hard-coding an id that only exists in one
# particular tracking store.
run_id = runs[0].info.run_id

In [112]:
# Register the model stored under the selected run's "models" artifact path.
model_uri = f"runs:/{run_id}/models"
mlflow.register_model(model_uri=model_uri, name='energy-efficiency-model')

Successfully registered model 'energy-efficiency-model'.
2024/07/16 07:30:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: energy-efficiency-model, version 1
Created version '1' of model 'energy-efficiency-model'.


<ModelVersion: aliases=[], creation_timestamp=1721115037930, current_stage='None', description='', last_updated_timestamp=1721115037930, name='energy-efficiency-model', run_id='020f73f4c44f461eb337f1c05d7837d4', run_link='', source='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [113]:
# List the registered models to confirm that the registration succeeded.
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1721115037878, description='', last_updated_timestamp=1721115037930, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1721115037930, current_stage='None', description='', last_updated_timestamp=1721115037930, name='energy-efficiency-model', run_id='020f73f4c44f461eb337f1c05d7837d4', run_link='', source='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>], name='energy-efficiency-model', tags={}>]