Anticipez les besoins en consommation électrique de bâtiments
=============================================================

![logo-seattle](https://www.seattle.gov/Documents/Departments/Arts/Downloads/Logo/Seattle_logo_landscape_blue-black.png)


Explication des variables :
[City of Seattle](https://data.seattle.gov/dataset/2015-Building-Energy-Benchmarking/h7rm-fz6m)

On cherche ici à déterminer quelle représentation des données utiliser pour le modèle.
On peut soit utiliser les données brutes (en sélectionnant uniquement les colonnes adéquates), soit transformer
les données dans un autre format (voir `src/features/transform_features.py`).

In [None]:
from tempfile import mkdtemp
from shutil import rmtree

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
import seaborn as sns

# Notebook-wide plotting defaults: seaborn theme, "notebook" context and a
# default figure size of 10 x 6.
sns.set()
sns.set_context(context="notebook", font_scale=1.0)

matplotlib.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Load the interim dataset. Later cells select rows per year with
# `.loc[2015]` / `.loc[2016]`.
data_raw = pd.read_pickle('../data/interim/full_dataV2.pickle')

In [None]:
# Load the two pre-processed variants of the dataset that are compared below.
data_tr, data_tr2 = map(pd.read_pickle, (
    '../data/processed/model_data.pickle',
    '../data/processed/model_data_percent.pickle',
))

## Baseline error

### Données d'origine

In [None]:
# Column to predict. Kept as a one-element list because it is passed as-is to
# `dropna(subset=...)`, `drop(...)` and column selection in later cells.
target = ['SiteEnergyUseWN_kBtu']

# Raw columns used as model inputs.
inputs = [
    'PropertyGFATotal',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
]

In [None]:
# Input columns stored with the pandas 'category' dtype.
categorical_features = [name for name in inputs
                        if data_raw[name].dtype.name == 'category']

# Collect every category value that appears in only one of the two years, so
# that the training (2015) and test (2016) sets share the same categories.
uncommon_cat = []
for col in categorical_features:
    cats_2015 = set(data_raw.loc[2015][col].unique())
    cats_2016 = set(data_raw.loc[2016][col].unique())
    only_2015 = list(cats_2015 - cats_2016)
    only_2016 = list(cats_2016 - cats_2015)
    print(only_2015)
    print(only_2016)
    uncommon_cat.extend(only_2015)
    uncommon_cat.extend(only_2016)
# dict.fromkeys drops duplicates while keeping first-seen order.
uncommon_cat = list(dict.fromkeys(uncommon_cat))

In [None]:
# Drop, in place, every row holding one of the year-exclusive categories in
# any categorical input column, and report how many rows were removed.
total = 0
for col in categorical_features:
    for cat in uncommon_cat:
        matching = data_raw[data_raw[col] == cat].index
        total += len(matching)
        data_raw.drop(matching, axis=0, inplace=True)
print('removed %i rows' % total)

#### Preprocessing

In [None]:
# Shared transformer: log1p forwards, expm1 as its inverse. Reused for the
# numeric features and for the regression target.
log_transform = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

In [None]:
# Keep only the rows where the target column is present.
data_raw = data_raw[data_raw[target].notna().all(axis=1)]

In [None]:
# Feature frame for both years; the targets are flattened to 1-D arrays,
# 2015 for training and 2016 for testing.
X = data_raw.loc[:, inputs]
y_train = np.ravel(data_raw.loc[2015][target].values)
y_test = np.ravel(data_raw.loc[2016][target].values)

In [None]:
# Scratch directory handed to the two sub-pipelines below as their `memory`
# cache.
cache_dir = mkdtemp()

# Every input column whose dtype is not 'category' goes through the numeric
# branch.
numeric_features = [name for name in inputs
                    if data_raw[name].dtype.name != 'category']

# Numeric branch: fill missing values with 0, apply `log_transform`, then
# standardise.
num_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('log_transformer', log_transform),
        ('scaler', StandardScaler()),
    ],
    memory=cache_dir,
)

# Categorical branch: fill missing values with '' and one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ('inputer', SimpleImputer(strategy='constant', fill_value='')),
        ('encoder', OneHotEncoder()),
    ],
    memory=cache_dir,
)

# Route each group of columns to its branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Linear regression fitted on the `log_transform`-ed target; predictions are
# mapped back through the transformer's inverse.
clf = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('regressor', TransformedTargetRegressor(regressor=LinearRegression(),
                                             transformer=log_transform)),
])

#### Train

In [None]:
# Fit on the 2015 rows. The `memory` cache of the sub-pipelines is only used
# while fitting, so the temporary directory created with `mkdtemp()` is
# removed afterwards instead of being left behind on disk.
try:
    clf.fit(X.loc[2015], y_train)
finally:
    rmtree(cache_dir, ignore_errors=True)
# Keep the fitted pipeline as the cell output, as `clf.fit(...)` did.
clf

In [None]:
# Report the pipeline's `score` (R² for scikit-learn regressors) on the
# 2015 training rows and the 2016 test rows.
train_score = clf.score(X.loc[2015], y_train)
print("Score on training set : %5f" % train_score)
test_score = clf.score(X.loc[2016], y_test)
print("Score on testing set : %5f" % test_score)

In [None]:
# True vs. predicted target on the 2016 test rows.
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them from 0.12 on.
sns.scatterplot(x=y_test, y=clf.predict(X.loc[2016]))

In [None]:
# Gather true and predicted values for both years in one long frame, each
# row tagged with the split it belongs to, then plot them together.
test = pd.DataFrame({
    'y true': y_test,
    'y pred': np.ravel(clf.predict(X.loc[2016])),
    'set': ['test'] * X.loc[2016].shape[0],
})

train = pd.DataFrame({
    'y true': y_train,
    'y pred': np.ravel(clf.predict(X.loc[2015])),
    'set': ['train'] * X.loc[2015].shape[0],
})

res = pd.concat([test, train])

sns.scatterplot(x='y true', y='y pred', hue='set', style='set',
                markers=['o', 'X'], data=res)

In [None]:
# Recover the one-hot column names from the fitted encoder of the 'cat'
# branch so that each regression coefficient can be labelled.
encoder = clf.named_steps['preprocessing'].named_transformers_['cat']\
                                          .named_steps['encoder']
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
get_names = (getattr(encoder, 'get_feature_names_out', None)
             or encoder.get_feature_names)
names = get_names(categorical_features)

# Same order as the ColumnTransformer output: numeric columns first, then
# the one-hot columns.
index = numeric_features + list(names)
coef = pd.Series(clf.named_steps['regressor'].regressor_.coef_, index=index)

plt.figure(figsize=(12, 25))
coef.sort_values(inplace=True)
coef.plot(kind='barh')
plt.show()

Le modèle linéaire est en surapprentissage (overfit).

### Données transformées

Voir `src/features/transform_features.py`.

In [None]:
# Display the transformed data ordered by the target column. The result is
# only shown as the cell output; `data_tr` itself is left unchanged.
data_tr.sort_values(by='SiteEnergyUseWN_kBtu')

In [None]:
# 2015 rows for training, 2016 rows for testing, both with a fresh index.
data_train = data_tr.loc[2015].reset_index(drop=True)
data_test = data_tr.loc[2016].reset_index(drop=True)

# `log_transform` is applied to every feature, then a linear regression is
# fitted on the `log_transform`-ed target.
clf = Pipeline(steps=[
    ('log_transformer', log_transform),
    ('regressor', TransformedTargetRegressor(regressor=LinearRegression(),
                                             transformer=log_transform)),
])

clf.fit(data_train.drop(columns=target), data_train[target])

In [None]:
# Summary statistics of the training features (target column excluded).
data_train.drop(columns=target).describe()

In [None]:
# Report the pipeline's `score` (R² for scikit-learn regressors) on the
# training and test frames.
train_score = clf.score(data_train.drop(columns=target), data_train[target])
print("Score on training set : %5f" % train_score)
test_score = clf.score(data_test.drop(columns=target), data_test[target])
print("Score on testing set : %5f" % test_score)

In [None]:
# One coefficient per feature column, sorted so the horizontal bar chart
# reads from the most negative to the most positive weight.
fitted_regressor = clf.named_steps['regressor'].regressor_
coef = pd.Series(fitted_regressor.coef_,
                 index=data_tr.drop(columns=target).columns).sort_values()

plt.figure(figsize=(5, 12))
coef.plot.barh()
plt.show()

In [None]:
# Gather true and predicted values for both splits in one long frame, each
# row tagged with the split it belongs to, then plot them together.
test = pd.DataFrame({
    'y true': np.ravel(data_test[target].values),
    'y pred': np.ravel(clf.predict(data_test.drop(columns=target))),
    'set': ['test'] * data_test.shape[0],
})

train = pd.DataFrame({
    'y true': np.ravel(data_train[target].values),
    'y pred': np.ravel(clf.predict(data_train.drop(columns=target))),
    'set': ['train'] * data_train.shape[0],
})

res = pd.concat([test, train])

sns.scatterplot(x='y true', y='y pred', hue='set', style='set',
                markers=['o', 'X'], data=res)

Mauvais ajustement (fit) et surapprentissage (overfit).

### Données transformées (pourcentages de la surface totale)

In [None]:
# Keep only the rows whose target and `TotalGFA` are both strictly positive.
data_tr2 = data_tr2[(data_tr2['SiteEnergyUseWN_kBtu'] > 0)
                    & (data_tr2['TotalGFA'] > 0)]

In [None]:
# Show `data_tr2` as the cell output for a visual check.
data_tr2

In [None]:
# 2015 rows for training, 2016 rows for testing.
data_train = data_tr2.loc[2015]
data_test = data_tr2.loc[2016]

# Only the last feature column goes through `log_transform`; all the others
# pass through an identity FunctionTransformer. The transformed matrix
# follows the order of the transformers, so the last input column comes
# first in the output.
preprocessor = ColumnTransformer([
    ('log_transform', log_transform, [-1]),
    ('others', FunctionTransformer(), slice(0, -1)),
])

# Linear regression fitted on the `log_transform`-ed target.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', TransformedTargetRegressor(regressor=LinearRegression(),
                                             transformer=log_transform)),
])

clf.fit(data_train.drop(columns=target), data_train[target])

In [None]:
# Report the pipeline's `score` (R² for scikit-learn regressors) on the
# training and test frames.
train_score = clf.score(data_train.drop(columns=target), data_train[target])
print("Score on training set : %5f" % train_score)
test_score = clf.score(data_test.drop(columns=target), data_test[target])
print("Score on testing set : %5f" % test_score)

In [None]:
# The preprocessor of this model emits the log-transformed LAST input column
# first, followed by the other columns in their original order (a
# ColumnTransformer concatenates its outputs in transformer order). The
# coefficients therefore have to be labelled in that order rather than in
# the raw column order; otherwise every bar carries the wrong feature name.
feature_names = list(data_tr2.drop(target, axis=1).columns)
coef_index = feature_names[-1:] + feature_names[:-1]
coef = pd.Series(clf.named_steps['regressor'].regressor_.coef_,
                 index=coef_index)

plt.figure(figsize=(5, 12))
coef.sort_values(inplace=True)
coef.plot(kind='barh')
plt.show()

In [None]:
# Gather true and predicted values for both splits in one long frame, each
# row tagged with the split it belongs to, then plot them together.
test = pd.DataFrame({
    'y true': np.ravel(data_test[target].values),
    'y pred': np.ravel(clf.predict(data_test.drop(columns=target))),
    'set': ['test'] * data_test.shape[0],
})

train = pd.DataFrame({
    'y true': np.ravel(data_train[target].values),
    'y pred': np.ravel(clf.predict(data_train.drop(columns=target))),
    'set': ['train'] * data_train.shape[0],
})

res = pd.concat([test, train])

sns.scatterplot(x='y true', y='y pred', hue='set', style='set',
                markers=['o', 'X'], data=res)

### Conclusions

Il est préférable de transformer les données. L'utilisation des proportions de la surface totale semble être la meilleure représentation des données.

À ce stade, il est clair que la régression linéaire simple n'est pas la plus adaptée (overfit systématique).

In [None]:
# Persist `data_tr2` (restricted earlier in this notebook to rows with a
# strictly positive target and `TotalGFA`) under a new file name.
data_tr2.to_pickle('../data/processed/model_data_percentV2.pickle')