# Workshop MLflow

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pdp
import mlflow
import mlflow.sklearn
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

  env = yaml.load(_conda_header)


In [2]:
# Show every column when a DataFrame is displayed (no truncation with "...")
pd.set_option("display.max_columns", None)

## Import data set

In [64]:
# Load the full data set from the local data folder
df_all = pd.read_csv('./data/energydata_complete.csv')

# Report the size of the frame: rows are observations, columns are features
n_obs, n_features = df_all.shape
print('nb observations: {} - nb features: {}'.format(n_obs, n_features))

nb observations: 19735 - nb features: 29


## Information about the data set :
1. date time year-month-day hour:minute:second
2. lights, energy use of light fixtures in the house in Wh 
3. T1, Temperature in kitchen area, in Celsius 
4. RH_1, Humidity in kitchen area, in % 
5. T2, Temperature in living room area, in Celsius 
6. RH_2, Humidity in living room area, in % 
7. T3, Temperature in laundry room area 
8. RH_3, Humidity in laundry room area, in % 
9. T4, Temperature in office room, in Celsius 
10. RH_4, Humidity in office room, in % 
11. T5, Temperature in bathroom, in Celsius 
12. RH_5, Humidity in bathroom, in % 
13. T6, Temperature outside the building (north side), in Celsius 
14. RH_6, Humidity outside the building (north side), in % 
15. T7, Temperature in ironing room , in Celsius 
16. RH_7, Humidity in ironing room, in % 
17. T8, Temperature in teenager room 2, in Celsius 
18. RH_8, Humidity in teenager room 2, in % 
19. T9, Temperature in parents room, in Celsius 
20. RH_9, Humidity in parents room, in % 
21. To, Temperature outside (from Chievres weather station), in Celsius 
22. Pressure (from Chievres weather station), in mm Hg 
23. RH_out, Humidity outside (from Chievres weather station), in % 
24. Wind speed (from Chievres weather station), in m/s 
25. Visibility (from Chievres weather station), in km 
26. Tdewpoint (from Chievres weather station), in °C 
27. rv1, Random variable 1, nondimensional 
28. rv2, Random variable 2, nondimensional 

``Output variable (desired target)``:
29. Appliances, energy use in Wh


We will create a report named `report-all-data.html` in the `./analysis` directory.
This report helps us understand the distributions and correlations in the data set. You can go into that directory and open the report in your browser.

In [65]:
# Drop the timestamp and the two random variables (rv1, rv2) so they are not
# used as features.
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError because the columns are already gone.
df_all = df_all.drop(columns=['date', 'rv1', 'rv2'], errors='ignore')

## Get report analysis

In [11]:
# Build the profiling report for the whole frame and write it out as HTML
report_path = "./analysis/report-all-data.html"
profile = pdp.ProfileReport(df_all)
profile.to_file(outputfile=report_path)

We will use a first ML model to see what kind of information we need to record in order to evaluate it, for example to assess the capacity of the model or to tell whether we suffer from overfitting or underfitting. From that we will understand why `mlflow` is a great tool for tracking metrics and saving artifacts.

If you looked at the report you've seen that we have some categorical features. We encode them first

## Preprocess

In [66]:
from sklearn.model_selection import train_test_split

target_column = "Appliances" # "y"

# Split data.
# A fixed random_state makes the split reproducible: without it every
# execution of this cell produces a different train/test partition, so the
# metrics of the models below change from one run of the notebook to the next.
train, test = train_test_split(df_all, random_state=0)

# Separate the features from the target for both partitions
train_x = train.drop([target_column], axis=1)
test_x = test.drop([target_column], axis=1)
train_y = train[target_column]
test_y = test[target_column]

In [67]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def eval_metrics(actual, pred):
    """Compute the regression metrics used throughout the notebook.

    Parameters
    ----------
    actual : true target values.
    pred : predicted target values.

    Returns
    -------
    tuple
        ``(rmse, mae, r2)``: root mean squared error, mean absolute error
        and R^2 score, in that order.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )


def scatter_plot_result(y_actual, y_pred, model_name):
    """Save a scatter plot of predicted vs. true targets, annotated with metrics.

    The figure is written to ``./scatter_results-<model_name>.png`` and then
    closed; nothing is returned.

    Parameters
    ----------
    y_actual : true target values (x axis).
    y_pred : predicted target values (y axis).
    model_name : used as the plot title and as the suffix of the file name.
    """
    # Reuse the notebook's metric helper instead of recomputing each metric here
    rmse, mae, r2 = eval_metrics(y_actual, y_pred)

    # Work on a dedicated figure so leftover pyplot state from another cell
    # can never end up in the saved image.
    fig, ax = plt.subplots()
    ax.scatter(y_actual, y_pred)
    ax.set_xlabel('True Target')
    ax.set_ylabel('Target predicted')
    ax.set_title(model_name)

    # Anchor the annotation in axes coordinates (fractions of the plot area)
    # rather than at a fixed data point, so it stays visible whatever the
    # range of the targets is. Only R^2 is typeset as math.
    ax.text(0.95, 0.05,
            r'RMSE=%.2f, $R^2$=%.2f, MAE=%.2f' % (rmse, r2, mae),
            transform=ax.transAxes, ha='right', va='bottom')

    fig.savefig('./scatter_results-{}.png'.format(model_name))
    plt.close(fig)

## Build our first model

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training partition
rfp = RandomForestRegressor(n_estimators=100, random_state=0)
rfp.fit(train_x, train_y)
model = rfp  # fit() returns the fitted estimator itself

# Predict on the held-out partition and save the diagnostic scatter plot
pred_test = model.predict(test_x)
# To print the metrics as well, uncomment:
# print('rmse: {} - mae: {} - r2: {}'.format(*eval_metrics(test_y, pred_test)))
scatter_plot_result(y_actual=test_y, y_pred=pred_test, model_name='RandomForest')

-> `Retrain your model with another set of parameters and compare results`

## Build a second model

Often, we use a second model in order to challenge the first one...

In [15]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import QuantileTransformer, quantile_transform


# Fit an ElasticNet on the training partition
lr = ElasticNet(alpha=0.5, l1_ratio=0.2, random_state=0)
lr.fit(train_x, train_y)
model = lr  # fit() returns the fitted estimator itself

# Predict on the held-out partition and save the diagnostic scatter plot
pred_test = model.predict(test_x)
# To print the metrics as well, uncomment:
# print('rmse: {} - mae: {} - r2: {}'.format(*eval_metrics(test_y, pred_test)))
scatter_plot_result(y_actual=test_y, y_pred=pred_test, model_name='ElasticNet')

-> `Retrain your model with another set of parameters and compare results`

At this point you may want to draw more visualizations to compare your models (performance, feature importance, or other metrics). You can see that we will have to repeat this process EVERY TIME we want to compare or analyse any model or ML code. Also, if your data changes, your metrics can change. It would be great to have the history of the data ATTACHED to the code's history.

This is where tracking with MLflow is useful. 

Same exercise in `train.py`. 

In [60]:
from sklearn.compose import TransformedTargetRegressor

# If you wish to try on classification problem
def log_metrics_classification(y_true, y_prediction):
    """Exercise skeleton: log the per-class classification metrics.

    Builds the classification report as a dict and walks over the metrics of
    the classes ``'0'`` and ``'1'``. For each metric a name of the form
    ``<class>_<metric>`` is prepared in ``log_name``; the actual logging
    call is left to be filled in.
    """
    report = classification_report(y_true, y_prediction, output_dict=True)
    for class_ in ('0', '1'):
        for metric in report[class_].keys():
            log_name = '{}_{}'.format(class_, metric)
            # insert your code here ~ 1 line
         
        
def log_metrics_regression(y_true, y_prediction):
    """Exercise skeleton: log the regression metrics of a prediction.

    Computes ``rmse``, ``mae`` and ``r2`` with ``eval_metrics``; the logging
    calls themselves are left to be filled in.
    """
    rmse, mae, r2 = eval_metrics(actual=y_true, pred=y_prediction)
    # log metrics here ~ 3 lines


def run_experiment(df, alpha, l1_ratio):
    """Train and evaluate an ElasticNet inside an MLflow run.

    Workshop skeleton: the MLflow logging calls (parameters, artifact, model)
    are intentionally left as exercises and marked with placeholder comments.

    Parameters
    ----------
    df : DataFrame holding the features and the column named by the
        module-level ``target_column``.
    alpha : ``alpha`` passed to ``ElasticNet``.
    l1_ratio : ``l1_ratio`` passed to ``ElasticNet``.
    """
    # Split data.
    # The seed is fixed so that every call evaluates on the same partition.
    # With an unseeded split, the metrics of two runs would differ because of
    # the split as well as because of alpha / l1_ratio, which makes the runs
    # impossible to compare.
    train, test = train_test_split(df, random_state=0)

    train_x = train.drop([target_column], axis=1)
    test_x = test.drop([target_column], axis=1)
    train_y = train[target_column]
    test_y = test[target_column]

    with mlflow.start_run():
        print("Running with alpha: {} - l1_ratio: {}".format(alpha, l1_ratio))

        # fit models
        lr = ElasticNet(random_state=0, alpha=alpha, l1_ratio=l1_ratio)
        lr.fit(train_x, train_y)

        prediction_test = lr.predict(test_x)

        # log parameters
        # Your code here ~ 2 lines

        # log artifact
        # Path under which scatter_plot_result(..., 'ElasticNet') saves its figure
        scatter_name = './scatter_results-ElasticNet.png'
        # save scatter plot as artifact here ~ 2 lines

        # log metrics
        log_metrics_regression(test_y, prediction_test)

        # log sklearn model
        # log the sklearn model here  ~ 1 line
        

In [68]:
# play yourself with parameters
# ! both parameters have min 0 and max 1 ! 


# Candidate values for each hyper-parameter
alphas = np.arange(0.1, 1, 0.2)
l1_ratios = np.arange(0.1, 1, 0.2)

# Only the first (alpha, l1_ratio) pair is run as long as both `break`
# statements are present. Remove them to see all runs.
for alpha in alphas:
    for l1_ratio in l1_ratios:
        run_experiment(df_all, alpha, l1_ratio)
        break
    break

Running with alpha: 0.1 - l1_ratio: 0.1
