In [64]:
import os
import pandas as pd
import pickle
import numpy as np

# modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Data load

Read the data from a csv file.

In [2]:
path = "./data/"
data = pd.read_csv(os.path.join(path, "energy_efficiency_data.csv"))

In [3]:
data.head()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [4]:
data.tail()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
763,0.64,784.0,343.0,220.5,3.5,5,0.4,5,17.88,21.4
764,0.62,808.5,367.5,220.5,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.5,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.5,3.5,4,0.4,5,16.48,16.61
767,0.62,808.5,367.5,220.5,3.5,5,0.4,5,16.64,16.03


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        768 non-null    float64
 1   surface_area               768 non-null    float64
 2   wall_area                  768 non-null    float64
 3   roof_area                  768 non-null    float64
 4   overall_height             768 non-null    float64
 5   orientation                768 non-null    int64  
 6   glazing_area               768 non-null    float64
 7   glazing_area_distribution  768 non-null    int64  
 8   heating_load               768 non-null    float64
 9   cooling_load               768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB


In [7]:
# how many unique values contains each column
data.nunique()

relative_compactnes           12
surface_area                  12
wall_area                      7
roof_area                      4
overall_height                 2
orientation                    4
glazing_area                   4
glazing_area_distribution      6
heating_load                 586
cooling_load                 636
dtype: int64

In [8]:
# general statistic information about all columns in the dataframe
data.describe()

Unnamed: 0,relative_compactnes,surface_area,wall_area,roof_area,overall_height,orientation,glazing_area,glazing_area_distribution,heating_load,cooling_load
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.764167,671.708333,318.5,176.604167,5.25,3.5,0.234375,2.8125,22.307201,24.58776
std,0.105777,88.086116,43.626481,45.16595,1.75114,1.118763,0.133221,1.55096,10.090196,9.513306
min,0.62,514.5,245.0,110.25,3.5,2.0,0.0,0.0,6.01,10.9
25%,0.6825,606.375,294.0,140.875,3.5,2.75,0.1,1.75,12.9925,15.62
50%,0.75,673.75,318.5,183.75,5.25,3.5,0.25,3.0,18.95,22.08
75%,0.83,741.125,343.0,220.5,7.0,4.25,0.4,4.0,31.6675,33.1325
max,0.98,808.5,416.5,220.5,7.0,5.0,0.4,5.0,43.1,48.03


In [9]:
# change data type from integer to string for categorical features
data[["orientation", "glazing_area_distribution"]] = data[
    ["orientation", "glazing_area_distribution"]
].astype("string")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        768 non-null    float64
 1   surface_area               768 non-null    float64
 2   wall_area                  768 non-null    float64
 3   roof_area                  768 non-null    float64
 4   overall_height             768 non-null    float64
 5   orientation                768 non-null    string 
 6   glazing_area               768 non-null    float64
 7   glazing_area_distribution  768 non-null    string 
 8   heating_load               768 non-null    float64
 9   cooling_load               768 non-null    float64
dtypes: float64(8), string(2)
memory usage: 60.1 KB


In [10]:
# Define features and target variables (from now on, I will only consider "heating_load" as target)
features = [
    "relative_compactnes",
    "surface_area",
    "wall_area",
    "roof_area",
    "overall_height",
    "orientation",
    "glazing_area",
    "glazing_area_distribution",
]
target = "heating_load"

### Train- / Validation- / Test Sets

In [11]:
# trian-/ val-/ test-split
data_train_full, data_test = train_test_split(
    data[features + [target]], test_size=0.2, random_state=42
)
data_train, data_val = train_test_split(
    data_train_full[features + [target]], test_size=0.25, random_state=42
)

In [12]:
# reset indices
data_train_full = data_train_full.reset_index(drop=True)
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

In [13]:
# check dataframes
print(f"train full data length {len(data_train_full)}")
print(f"train data length {len(data_train)}")
print(f"val data length {len(data_val)}")
print(f"test data length {len(data_test)}")

train full data length 614
train data length 460
val data length 154
test data length 154


In [14]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 460 entries, 0 to 459
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   relative_compactnes        460 non-null    float64
 1   surface_area               460 non-null    float64
 2   wall_area                  460 non-null    float64
 3   roof_area                  460 non-null    float64
 4   overall_height             460 non-null    float64
 5   orientation                460 non-null    string 
 6   glazing_area               460 non-null    float64
 7   glazing_area_distribution  460 non-null    string 
 8   heating_load               460 non-null    float64
dtypes: float64(7), string(2)
memory usage: 32.5 KB


In [15]:
# define numerical and categorical features
categorical = ["orientation", "glazing_area_distribution"]

numerical = [
    "relative_compactnes",
    "surface_area",
    "wall_area",
    "roof_area",
    "overall_height",
    "glazing_area",
]

In [17]:
dv =DictVectorizer()

In [18]:
train_dicts = data_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = data_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [19]:
y_train = data_train[target]
y_val = data_val[target]

In [20]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [21]:
y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)



2.899749871345434

In [22]:
def dump_pickle(obj, filename: str):
    with open(filename, "wb") as f_out:
        return pickle.dump(obj, f_out)

In [23]:
# Save DictVectorizer and datasets
dump_pickle(dv, "./models/dv.pkl")
dump_pickle((X_train, y_train), "./models/train.pkl")
dump_pickle((X_val, y_val), "./models/val.pkl")

#Experiment Tracking

In [24]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [25]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [26]:
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1721112880487, experiment_id='1', last_update_time=1721112880487, lifecycle_stage='active', name='energy-efficiency-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1721112849931, experiment_id='0', last_update_time=1721112849931, lifecycle_stage='active', name='Default', tags={}>]

In [27]:
mlflow.set_experiment("energy-efficiency-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1721112880487, experiment_id='1', last_update_time=1721112880487, lifecycle_stage='active', name='energy-efficiency-experiment', tags={}>

In [28]:
def load_pickle(filename: str):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)

In [63]:
with mlflow.start_run():

    X_train, y_train = load_pickle("./models/train.pkl")
    X_val, y_val = load_pickle("./models/val.pkl")
    
    params = {"C": 1.0, "random_state": 0}
    mlflow.log_params(params)
    
    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)
    print(rmse)
    
    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



2.899749871345434
default artifacts URI: 'mlflow-artifacts:/1/85c6ef169aab42f18aee42542ffb912a/artifacts'




#Model Registry

In [30]:
EXPERIMENT_NAME="energy-efficiency-experiment"

In [31]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
client = MlflowClient()

In [32]:
# Retrieve the top 5 model runs and log the models
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(run.info)

<RunInfo: artifact_uri='mlflow-artifacts:/1/ac5268480eb047d2bdf0403f738a7609/artifacts', end_time=1721120877025, experiment_id='1', lifecycle_stage='active', run_id='ac5268480eb047d2bdf0403f738a7609', run_name='bittersweet-bat-782', run_uuid='ac5268480eb047d2bdf0403f738a7609', start_time=1721120873900, status='FINISHED', user_id='codespace'>
<RunInfo: artifact_uri='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts', end_time=1721114470398, experiment_id='1', lifecycle_stage='active', run_id='020f73f4c44f461eb337f1c05d7837d4', run_name='flawless-stork-843', run_uuid='020f73f4c44f461eb337f1c05d7837d4', start_time=1721114468953, status='FINISHED', user_id='codespace'>


In [33]:
# Select the model with the lowest test RMSE
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
run_id = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"]
)[0]
print(run_id)

<Run: data=<RunData: metrics={'rmse': 2.899749871345434}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "ac5268480eb047d2bdf0403f738a7609", '
                             '"artifact_path": "models", "utc_time_created": '
                             '"2024-07-16 09:07:54.007333", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"predict_fn": "predict", "loader_module": '
                             '"mlflow.sklearn", "python_version": "3.9.19", '
                             '"env": {"conda": "conda.yaml", "virtualenv": '
                             '"python_env.yaml"}}, "sklearn": '
                             '{"pickled_model": "model.pkl", '
                             '"sklearn_version": "1.5.1", '
                             '"serialization_format": "cloudpickle", "code": '
                             'null}}, "model_uuid": '
                             '"2af572beb83d459d99a3ad1c91d61a65",

In [34]:
run_id = "ac5268480eb047d2bdf0403f738a7609"

In [35]:
mlflow.register_model(
        model_uri=f"runs:/{run_id}/models",
        name='energy-efficiency-model'
    )

Registered model 'energy-efficiency-model' already exists. Creating a new version of this model...
2024/07/16 09:08:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: energy-efficiency-model, version 2
Created version '2' of model 'energy-efficiency-model'.


<ModelVersion: aliases=[], creation_timestamp=1721120927319, current_stage='None', description='', last_updated_timestamp=1721120927319, name='energy-efficiency-model', run_id='ac5268480eb047d2bdf0403f738a7609', run_link='', source='mlflow-artifacts:/1/ac5268480eb047d2bdf0403f738a7609/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>

In [48]:
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1721115037878, description='', last_updated_timestamp=1721121059120, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1721115037930, current_stage='Production', description='', last_updated_timestamp=1721121055927, name='energy-efficiency-model', run_id='020f73f4c44f461eb337f1c05d7837d4', run_link='', source='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>,
  <ModelVersion: aliases=[], creation_timestamp=1721120927319, current_stage='Staging', description='', last_updated_timestamp=1721121059120, name='energy-efficiency-model', run_id='ac5268480eb047d2bdf0403f738a7609', run_link='', source='mlflow-artifacts:/1/ac5268480eb047d2bdf0403f738a7609/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>], name='energy-efficiency-model', tags={}>]

In [37]:
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name="energy-efficiency-model",
    version=1,
    stage="Production"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1721115037930, current_stage='Production', description='', last_updated_timestamp=1721121055927, name='energy-efficiency-model', run_id='020f73f4c44f461eb337f1c05d7837d4', run_link='', source='mlflow-artifacts:/1/020f73f4c44f461eb337f1c05d7837d4/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [38]:
client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name="energy-efficiency-model",
    version=2,
    stage="Staging"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1721120927319, current_stage='Staging', description='', last_updated_timestamp=1721121059120, name='energy-efficiency-model', run_id='ac5268480eb047d2bdf0403f738a7609', run_link='', source='mlflow-artifacts:/1/ac5268480eb047d2bdf0403f738a7609/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>

In [40]:
import mlflow.pyfunc

model_name = "energy-efficiency-model"
stage = 'Production'

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

y_pred = model.predict(X_val)
print(y_pred)

  latest = client.get_latest_versions(name, None if stage is None else [stage])


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

[13.38897632 27.80723313 40.20732803 34.16646466 30.31901437 31.80854113
  7.33171925 29.65826259 30.25600292 15.88601406 35.20572576 17.46163242
 15.90688653 39.52271857 14.72696021 39.65597375 13.86710198  9.55839366
 31.88435016 13.29959963 12.59913296 39.8114509  31.16018398 13.24596222
 28.30369697 14.00657837 33.14615228 16.71064805  9.84259011 27.06831044
 15.20584927 15.74483171 13.7235686  14.78839764 11.56134629 13.15034369
 32.1260731  15.21831232 12.9572299  35.33898094 15.73053691 16.98508558
  8.82243433 28.57023183 34.59397135 30.74892905 30.69738249 15.34456225
 30.34788932 10.80073014 11.70111881 10.30937803 35.37231646 13.75118156
 17.98461409 36.74848963 31.45684348 32.47275952 28.49085433 26.92529635
 20.96095368 34.72722652  4.84714457 32.53270732 29.35977134 14.29107095
 14.03991389 30.92527867 10.56053509 13.53369639 30.41148007 10.97707976
 27.19732588 31.81646831 12.48264195 24.36267418 33.00143222 29.08533381
 28.39021696 17.66254624 11.80518129 13.56809145 33

In [53]:
X_val

<154x16 sparse matrix of type '<class 'numpy.float64'>'
	with 1232 stored elements in Compressed Sparse Row format>

In [77]:
def preprocess(data):
    """Preprocessing of the data"""
    # turn json input to dataframe
    data = pd.DataFrame([data])
    print(data)
    # define numerical and categorical features
    categorical = ["orientation", "glazing_area_distribution"]
    
    numerical = ["relative_compactnes", "surface_area", "wall_area", "roof_area", "overall_height", "glazing_area"]
    
    train_dicts = data[categorical + numerical].to_dict(orient='records')
    X = dv.transform(train_dicts)

    return X

In [78]:
input_data = {
    "relative_compactnes": 0.68,
    "surface_area": 423.40,
    "wall_area": 205.00,
    "roof_area": 115.55,
    "overall_height": 6.00,
    "orientation": 1,
    "glazing_area": 0.00,
    "glazing_area_distribution": 0
}

In [79]:
features = preprocess(input_data)

   relative_compactnes  surface_area  wall_area  roof_area  overall_height  \
0                 0.68         423.4      205.0     115.55             6.0   

   orientation  glazing_area  glazing_area_distribution  
0            1           0.0                          0  


In [80]:
print(features)

  (0, 0)	0.0
  (0, 11)	6.0
  (0, 12)	0.68
  (0, 13)	115.55
  (0, 14)	423.4
  (0, 15)	205.0


In [81]:
def predict(X):
    """make predictions"""
    pred = model.predict(X)
    print('prediction', pred[0])
    return float(pred[0])

In [82]:
pred = predict(features)
result = {'heat_load': pred, 'model_version': RUN_ID}

prediction 40.21897884760737


In [83]:
print(result)

{'heat_load': 40.21897884760737, 'model_version': '020f73f4c44f461eb337f1c05d7837d4'}


In [87]:
def predict_endpoint(input_data):
    """request input, preprocess it and make prediction"""
    features = preprocess(input_data)
    pred = predict(features)
    result = {'heat_load': pred, 'model_version': RUN_ID}
    
    return result

In [88]:
predict_endpoint(input_data)

   relative_compactnes  surface_area  wall_area  roof_area  overall_height  \
0                 0.68         423.4      205.0     115.55             6.0   

   orientation  glazing_area  glazing_area_distribution  
0            1           0.0                          0  
prediction 40.21897884760737


{'heat_load': 40.21897884760737,
 'model_version': '020f73f4c44f461eb337f1c05d7837d4'}