Anticipez les besoins en consommation électrique de bâtiments
=============================================================

![logo-seattle](https://www.seattle.gov/Documents/Departments/Arts/Downloads/Logo/Seattle_logo_landscape_blue-black.png)


Explication des variables:
[City of Seattle](https://data.seattle.gov/dataset/2015-Building-Energy-Benchmarking/h7rm-fz6m)



On cherche ici à déterminer quel modèle est le plus adapté.
Les modèles de régression possibles sont :
   
   * **Linéaires** :
      * LinearRegression (Overfitting)
      * Ridge
      * Lasso
      * Elastic-Net
      * *LARS* (context : number of features >> number of samples [1])
      
   * **Support Vector Machine (SVM)**
      * Support Vector Regression (SVR)
      
   * **Stochastic Gradient Descent**
      * SGDRegressor
     
   * **Nearest Neighbors**
      * Nearest Neighbors Regression (poor results on sparse data [2])
   
   * **Gaussian Processes**
      * *Gaussian Process Regression (GPR)*
   

   * **Decision Trees**
      * DecisionTreeRegressor
      
   * **Ensemble methods**
      * RandomForestRegressor
      * *ExtraTreesRegressor*
      * GradientBoostingRegressor
      * *VotingRegressor*
      
   * **Multiclass and multilabel algorithms**
      * *Regressor Chain* (Intéressant si on cherche à prévoir des sorties multiples corrélées)
      
   * **Neural Network**
      * Multi Layer Perceptron - MLPRegressor

[1] [Scikit-learn documentation](https://scikit-learn.org/stable/modules/linear_model.html#least-angle-regression).

[2] Müller, A. C., & Guido, S. (2017). Introduction to machine learning with Python: A guide for data scientists.

In [None]:
from importlib import import_module
import os
from pathlib import Path
import pickle
from shutil import rmtree
from tempfile import mkdtemp
from time import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# https://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import statsmodels.api as sm
# Plot styling first: seaborn defaults, then the "notebook" context at scale 1.0.
sns.set()
sns.set_context(context="notebook", font_scale=1.0)

# Default size of every figure created afterwards.
matplotlib.rcParams['figure.figsize'] = (10, 6)

# Fresh temporary directory, handed to the pipelines further down as their
# `memory` location.
# NOTE(review): nothing in the visible code removes this directory again
# (`rmtree` is imported but unused) -- confirm whether cleanup is wanted.
cache_dir = mkdtemp()

On recharge les données

In [None]:
# Load the modelling table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this project itself.
data = pd.read_pickle('../data/processed/model_data_percentV2.pickle')

In [None]:
# Summary statistics of the loaded table, as a quick sanity check.
data.describe()

In [None]:
# Peek at the first rows of the table.
data.head()

Variable à prédire

In [None]:
# Column to predict. Kept as a one-element list, so `data[target]` selects a
# single-column DataFrame rather than a Series.
target = ['SiteEnergyUseWN_kBtu']

In [None]:
# Reversible log scaling: log1p forwards, expm1 backwards. Used further down
# both on a feature column and on the regression target.
log_transform = FunctionTransformer(np.log1p, inverse_func=np.expm1)

In [None]:
# Alternative kept for reference: a split by year, training on
# `data.loc[2015]` and testing on `data.loc[2016]`.
# A random 80/20 split over all rows is used instead, to reduce overfitting by
# making more observations available for training.
# The seed is pinned so that the scores reported below, and the model that
# ends up being saved, are the same from one run of the notebook to the next.
# NOTE(review): if one building contributes several rows, a random split can
# place it on both sides of the split -- confirm this is acceptable.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Smoke test of the modelling pipeline with a 4-nearest-neighbours regressor.

# Feature routing: the last feature column goes through the log transform,
# every other column is passed through unchanged (a FunctionTransformer
# without a function is the identity).
preprocessor = ColumnTransformer(transformers=[
    ('log_transform', log_transform, [-1]),
    ('others', FunctionTransformer(), slice(0, -1)),
])

# The regressor is trained on the log-transformed target; its predictions are
# mapped back with the inverse transform.
knn_on_log_target = TransformedTargetRegressor(
    regressor=KNeighborsRegressor(n_neighbors=4),
    transformer=log_transform,
)

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', knn_on_log_target),
])

X_train = data_train.drop(columns=target)
y_train = data_train[target]
clf.fit(X_train, y_train)

In [None]:
# Candidate regressors as (display name, sklearn sub-module, class name).
# The order of this table is the order in which the models are benchmarked.
_catalogue = [
    ('LinearRegression', 'linear_model', 'LinearRegression'),
    ('Ridge', 'linear_model', 'Ridge'),
    ('Lasso', 'linear_model', 'Lasso'),
    ('Elastic-Net', 'linear_model', 'ElasticNet'),
    ('SGDRegressor', 'linear_model', 'SGDRegressor'),
    ('KNNRegressor', 'neighbors', 'KNeighborsRegressor'),
    ('DecisionTreeRegressor', 'tree', 'DecisionTreeRegressor'),
    ('GradientBoostingRegressor', 'ensemble', 'GradientBoostingRegressor'),
    ('RandomForestRegressor', 'ensemble', 'RandomForestRegressor'),
    ('SVR', 'svm', 'SVR'),
    ('MLP', 'neural_network', 'MLPRegressor'),
]

# Display name -> "<sub-module>.<class>" path relative to the `sklearn`
# package; the benchmark loops resolve these paths with `import_module`.
models = {
    label: f"{module}.{estimator}"
    for label, module, estimator in _catalogue
}

In [None]:
# Display the registry of candidate models.
models

In [None]:
# First pass: every candidate is fitted with its default hyper-parameters.
# Per display name we record the test-set `score()`, the train-set `score()`
# and the wall-clock fit time in seconds.
scores, scores_train, times = {}, {}, {}

X_train, y_train = data_train.drop(target, axis=1), data_train[target]
X_test, y_test = data_test.drop(target, axis=1), data_test[target]

for label, dotted_path in models.items():
    # 'linear_model.Ridge' -> class `Ridge` from module `sklearn.linear_model`.
    module_name, class_name = dotted_path.split('.')
    estimator_cls = getattr(import_module(f"sklearn.{module_name}"), class_name)

    # Same routing as the smoke test: log transform on the last feature
    # column, identity on all the others.
    preprocessor = ColumnTransformer(transformers=[
        ('log_transform', log_transform, [-1]),
        ('others', FunctionTransformer(), slice(0, -1)),
    ])
    wrapped = TransformedTargetRegressor(estimator_cls(), transformer=log_transform)
    clf = Pipeline([('preprocessor', preprocessor), ('regressor', wrapped)])

    # Only the fit itself is timed.
    started = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started

    score = clf.score(X_test, y_test)
    print("score -- %25s : %5f (%3f s)" % (label, score, elapsed))

    scores[label] = score
    times[label] = elapsed
    scores_train[label] = clf.score(X_train, y_train)

In [None]:
# Results of the default-parameter pass, one row per model.
dataframe = pd.DataFrame(pd.Series(scores, name='score'))
dataframe['time'] = pd.Series(times)
dataframe['score train'] = pd.Series(scores_train)

# `sort_values` returns a new frame: the result has to be assigned, otherwise
# the table stays in insertion order.
dataframe = dataframe.sort_values('score', ascending=False)

# Flag the models that score better on the data they were trained on than on
# the held-out data.
dataframe['overfit'] = dataframe['score train'] > dataframe['score']

# Kept under its own name: the later result tables read the baseline scores
# from `default_score` (matched on the model-name index, so row order does not
# matter there).
default_score = dataframe
dataframe

In [None]:
# Number of points in each numeric grid below.
NUM = 20

# Hyper-parameter grids, keyed by the display names of `models`.
# Parameter names carry the `regressor__regressor__` prefix: first the
# 'regressor' step of the pipeline, then the `regressor` attribute of the
# TransformedTargetRegressor wrapped in that step.
# `None` means "no search": the model is fitted with its default parameters.
model_params = {
    'Ridge': {'regressor__regressor__alpha': np.logspace(-3, 0, num=NUM),
              'regressor__regressor__tol': [0.001]},

    'Lasso': {'regressor__regressor__alpha': np.logspace(-5, -3, num=NUM),
              'regressor__regressor__tol': [0.001]},

    'Elastic-Net': {'regressor__regressor__alpha': np.logspace(-5, -3, num=NUM),
                    'regressor__regressor__tol': [0.001]},

    # NUM evenly spaced gamma values in [0.1, 1.0]. (The former
    # `np.arange(0.1, 1.0, NUM)` used NUM as the *step* and therefore produced
    # the single value 0.1.) This makes the SVR search NUM times larger.
    'SVR': {"regressor__regressor__C": [5, 10, 15],
            "regressor__regressor__gamma": np.linspace(0.1, 1.0, num=NUM),
            "regressor__regressor__kernel": ['rbf', 'linear']},

    'SGDRegressor': {'regressor__regressor__alpha': np.logspace(-6, -3, num=NUM)},

    'KNNRegressor': {'regressor__regressor__n_neighbors': np.arange(1, 10)},

    'DecisionTreeRegressor': None,

    'RandomForestRegressor': {'regressor__regressor__n_estimators': [250, 275, 300],
                              'regressor__regressor__max_depth': [25, 50, 100]},

    'GradientBoostingRegressor': None,

    'MLP': {'regressor__regressor__hidden_layer_sizes': [(50, 50, 50),
                                                         (50, 100, 50),
                                                         (100,)],
            # Currently left out of the search:
            # 'regressor__regressor__activation': ['tanh', 'logistic', 'relu'],
            # 'regressor__regressor__solver': ['sgd', 'adam'],
            'regressor__regressor__alpha': [0.00005, 0.0001, 0.0005],
            'regressor__regressor__learning_rate': ['constant', 'adaptive'],
            },
}

In [None]:
%%time
scores = dict()
scores_train = dict()
times = dict()
models_ = dict()

for model, cls in models.items():
    mod = import_module(f"sklearn.{cls.split('.')[0]}")
    cls = getattr(mod, cls.split('.')[1])
    preprocessor = ColumnTransformer(
        transformers=[
            ('log_transform', log_transform, [-1]),
            ('others', FunctionTransformer(), slice(0, -1))
        ]
    )
    clf = Pipeline(memory=cache_dir, 
                   steps=[
                            ('preprocessor', preprocessor),
                            ('regressor', TransformedTargetRegressor(cls(), 
                                          transformer=log_transform))])
    params = model_params.get(model)
    if params:
        clf = GridSearchCV(clf, params, n_jobs=-1)
    else:
        pass
    t1 = time()
    clf.fit(data_train.drop(target, axis=1), data_train[target])
    t2 = time()
    score = clf.score(data_test.drop(target, axis=1), data_test[target])
    print("score -- %25s : %5f (%3f s)" % (model, score, t2 - t1))
    scores[model] = score
    times[model] = t2 - t1
    models_[model] = clf
    scores_train[model] = clf.score(data_train.drop(target, axis=1), 
                data_train[target])

In [None]:
# Results of the grid-search pass, best test score first, indexed by model
# name (the index is labelled 'model').
dataframe = (
    pd.Series(scores, name='score')
    .to_frame()
    .assign(**{'score train': pd.Series(scores_train),
               'time': pd.Series(times)})
    .sort_values('score', ascending=False)
    .rename_axis('model')
)

# Baseline score of the same model (matched on the model name) and the
# improvement brought by the search.
dataframe['old score'] = default_score['score']
dataframe['gain'] = dataframe['score'] - dataframe['old score']

# True when the model scores better on the train set than on the test set.
dataframe['overfit'] = dataframe['score train'] > dataframe['score']
dataframe

In [None]:
# Move the index back into a regular column (the plot below reads the model
# names from the 'model' column), then draw one grey horizontal bar per model.
dataframe = dataframe.reset_index()
sns.barplot(data=dataframe, x='score', y='model', facecolor=(0.6, 0.6, 0.6, 1))

In [None]:
# Best parameter set found for every model that went through a grid search.
# Models without a grid were fitted as plain pipelines, which have no
# `best_params_`, hence the filter.
best_params = {
    label: fitted.best_params_
    for label, fitted in models_.items()
    if model_params.get(label)
}
best_params

In [None]:
# Drop the first `<step>__` segment of every parameter name, i.e. keep what
# follows the first double underscore
# ('regressor__regressor__alpha' -> 'regressor__alpha'), so that the names can
# be applied directly to the 'regressor' step of a pipeline.
# NOTE: running this cell twice strips a second segment.
best_params = {
    label: {name.partition('__')[2]: value for name, value in found.items()}
    for label, found in best_params.items()
}

In [None]:
%%time
scores = dict()
score_train = dict()
times = dict()
models_ = dict()

for model, cls in models.items():
    mod = import_module(f"sklearn.{cls.split('.')[0]}")
    cls = getattr(mod, cls.split('.')[1])
    preprocessor = ColumnTransformer(
        transformers=[
            ('log_transform', log_transform, [-1]),
            ('others', FunctionTransformer(), slice(0, -1))
        ]
    )
    clf = Pipeline(memory=cache_dir, 
                   steps=[
                            ('preprocessor', preprocessor),
                            ('regressor', TransformedTargetRegressor(cls(), 
                                          transformer=log_transform))])
    params = best_params.get(model)
    if params:
        clf.named_steps['regressor'].set_params(**params)
    t1 = time()
    clf.fit(data_train.drop(target, axis=1), data_train[target])
    t2 = time()
    score = clf.score(data_test.drop(target, axis=1), data_test[target])
    print("score -- %25s : %5f (%3f s)" % (model, score, t2 - t1))
    scores[model] = score
    times[model] = t2 - t1
    models_[model] = clf
    scores_train[model] = clf.score(data_train.drop(target, axis=1), 
                data_train[target])

In [None]:
# Results of the refit pass, best test score first, indexed by model name.
dataframe = (
    pd.Series(scores, name='score')
    .to_frame()
    .assign(**{'score train': pd.Series(scores_train),
               'time': pd.Series(times)})
    .sort_values('score', ascending=False)
    .rename_axis('model')
)

# Baseline score of the same model (matched on the model name) and the
# difference with the refitted score.
dataframe['old score'] = default_score['score']
dataframe['gain'] = dataframe['score'] - dataframe['old score']

# True when the model scores better on the train set than on the test set.
dataframe['overfit'] = dataframe['score train'] > dataframe['score']

# The model names become a regular 'model' column again: both the plot below
# and the selection of the best model read them from there.
dataframe = dataframe.reset_index()

sns.barplot(data=dataframe, x='score', y='model', facecolor=(0.6, 0.6, 0.6, 1))
plt.show()
dataframe

In [None]:
# Fitted pipeline of the model with the highest test score, together with its
# predictions and the observed values on the test set.
top_row = dataframe['score'].idxmax()
best_model = models_[dataframe.loc[top_row, 'model']]

X_test = data_test.drop(target, axis=1)
y_pred = best_model.predict(X_test)
y_true = data_test[target].values

In [None]:
# Observed (x) against predicted (y) values of the best model, train and test
# sets drawn on the same axes.
fig, ax = plt.subplots(1, figsize=(12, 8))

# Train set: faint crosses.
train_observed = data_train[target].values.ravel()
train_predicted = best_model.predict(data_train.drop(target, axis=1)).ravel()
sns.scatterplot(x=train_observed, y=train_predicted,
                marker='+', alpha=0.4, ax=ax)

# Test set: default markers, passed as a frame with named columns.
test_points = pd.DataFrame({'y_true': y_true.ravel(),
                            'y_pred': y_pred.ravel()})
sns.scatterplot(data=test_points, x='y_true', y='y_pred', alpha=0.5, ax=ax)

plt.show()

In [None]:
# Absolute path of the parent of the current working directory.
base_path = os.path.abspath('..')

# Display name of the model with the highest test score.
best_row = dataframe['score'].idxmax()
model_name = dataframe.loc[best_row, 'model']

In [None]:
# Serialise the best fitted pipeline to <base_path>/models/<model name>V1.pickle.
# The 'models' directory is not created here and has to exist already.
path = os.path.join(base_path, 'models', model_name + 'V1.pickle')
with open(path, mode='wb') as f:
    pickle.dump(best_model, f)