Anticipez les besoins en consommation électrique de bâtiments
=============================================================

![logo-seattle](../reports/figures/logo-seattle.png)


Explication des variables:
[City of Seattle](https://data.seattle.gov/dataset/2015-Building-Energy-Benchmarking/h7rm-fz6m)

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import decomposition
from statsmodels.stats.stattools import medcouple

from src.utils.univar import UnivariateAnalysis
from src.utils.bivar import BivariateAnalysis


In [None]:
# Load the interim dataset. NOTE: unpickling can execute arbitrary code, so
# this file must come from a trusted source.
data = pd.read_pickle('../data/interim/full_data.pickle')

In [None]:
# List the columns available in the loaded frame.
data.columns

Variables à prédire (targets)

   * SiteEnergyUse/WN (WN = weather normalized)
   * TotalGHGEmissions

Variables sélectionnées comme entrée du modèle
    
   * Surface du bâtiment
   * Première utilisation du bâtiment
   * Surface destinée au premier usage du bâtiment
    

## Préparation des variables pour modélisation

Usage des surfaces UseTypeGFA ==> pourcentage de la surface totale

In [None]:
# Express each use-type floor area as a fraction of the building's total area.
data['LargestPropertyUseTypeGFAPercent'] = data['LargestPropertyUseTypeGFA'] / data['PropertyGFATotal']
data['SecondLargestPropertyUseTypeGFAPercent'] = data['SecondLargestPropertyUseTypeGFA'] / data['PropertyGFATotal']
data['ThirdLargestPropertyUseTypeGFAPercent'] = data['ThirdLargestPropertyUseTypeGFA'] / data['PropertyGFATotal']

# Rows without a largest-use share (only a cell's last expression is rendered,
# so this is not displayed).
data[data['LargestPropertyUseTypeGFAPercent'].isna()]

# A missing second/third share is treated as 0. fillna touches only the NaN
# cells, independently of the index labels (safe even if labels repeat).
data['SecondLargestPropertyUseTypeGFAPercent'] = data['SecondLargestPropertyUseTypeGFAPercent'].fillna(0)
data['ThirdLargestPropertyUseTypeGFAPercent'] = data['ThirdLargestPropertyUseTypeGFAPercent'].fillna(0)

data.describe()

# Keep only rows that have both a primary use type and a target value.
data = data.dropna(subset=['LargestPropertyUseType', 'SiteEnergyUseWN_kBtu'])

data.shape

Pour l'usage des surfaces, on se ramène à une variable ordinale. En effet, l'analyse
bivariée (ANOVA) des variables `SiteEnergyUseWN_kBtu`/`SiteEUIWN_kBtu_sf` et `LargestPropertyUseTypeGFA`
montre que seules quelques catégories ont un impact sur la cible. On distingue alors les
bâtiments en trois types : usage normal, usage ayant une demande élevée et usage ayant une demande très élevée.

In [None]:
# low demand use type
low_demand_use_type = ['']

# high demand use type
# SiteEUI between 200 and 400 kBtu/sf
high_demand_use_type = ['Laboratory',
                        'Urgent care/clinic/other outpatient',
                        'Supermarket/grocery store',
                        'Restaurant',
                        'Hospital (general medical & surgical)']

# crazy high demand use type
# SiteEUI > 600 kBtu/sf
crazy_high_demand_use_type = ['Data center']


def mapper(x):
    """Map a property use type to one of three demand classes.

    Parameters
    ----------
    x : object
        A use-type label. Any non-string value (``None``, NaN, ...) is
        treated as missing.

    Returns
    -------
    str or float
        ``'high_demand_use_type'`` if ``x`` is listed in
        ``high_demand_use_type``, ``'very_high_demand_use_type'`` if it is
        listed in ``crazy_high_demand_use_type``,
        ``'normal_demand_use_type'`` for any other string, and ``np.nan``
        when ``x`` is not a string.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_, which would otherwise be mapped to NaN.
    if not isinstance(x, str):
        return np.nan
    if x in high_demand_use_type:
        return 'high_demand_use_type'
    if x in crazy_high_demand_use_type:
        return 'very_high_demand_use_type'
    return 'normal_demand_use_type'

In [None]:
# Collapse the primary use type into its demand class (via ``mapper``) and
# store the result as a pandas categorical column.
data['LargestPropertyUseTypeNorm'] = (
    data['LargestPropertyUseType']
    .apply(mapper)
    .astype('category')
)

In [None]:
# NOTE(review): this cell repeats the same computation as the preceding cell;
# re-running it gives the same column, so it can probably be removed.
data['LargestPropertyUseTypeNorm'] = data['LargestPropertyUseType'].apply(mapper)
data['LargestPropertyUseTypeNorm'] = data['LargestPropertyUseTypeNorm'].astype('category')

In [None]:
# Inspect rows whose largest-use area exceeds the total building area
# (share strictly greater than 1).
data.loc[
    data['LargestPropertyUseTypeGFAPercent'] > 1,
    [
        'PropertyName',
        'LargestPropertyUseTypeGFA',
        'PropertyGFATotal',
        'LargestPropertyUseType',
    ],
]

On passe la cible `SiteEnergyUseWN_kBtu` en log

In [None]:
def log_target(x):
    """Return ``log10(x)`` for a strictly positive ``x``.

    Any value that is not strictly greater than zero (zero, negatives, NaN)
    is handed back unchanged instead of being transformed.
    """
    is_positive = x > 0
    return np.log10(x) if is_positive else x

In [None]:
# log10 of the target; non-positive values are left as-is by log_target.
data['SiteEnergyUseWN_kBtu_log'] = data['SiteEnergyUseWN_kBtu'].apply(log_target)

On passe la variable `PropertyGFATotal` en log

In [None]:
# log10 of the total floor area; non-positive values are left as-is by log_target.
data['log_GFATotal'] = data['PropertyGFATotal'].apply(log_target)

## Quelles sont les variables corrélées avec la cible?

In [None]:
# Bivariate-analysis helper (src.utils.bivar) bound to the prepared frame.
bivar = BivariateAnalysis(data)

In [None]:
# ANOVA of the weather-normalized site EUI against the demand class.
bivar.anova('SiteEUIWN_kBtu_sf', 'LargestPropertyUseTypeNorm')

In [None]:
# ANOVA of the site energy use against the demand class.
# NOTE(review): this uses 'SiteEnergyUse_kBtu' while the rest of the notebook
# works with the weather-normalized 'SiteEnergyUseWN_kBtu' - confirm intended.
bivar.anova('SiteEnergyUse_kBtu', 'LargestPropertyUseTypeNorm')

In [None]:
# Regression helper on the raw (untransformed) floor area and target columns.
bivar.regression(['PropertyGFATotal', 'SiteEnergyUseWN_kBtu'])

In [None]:
# Energy use against total floor area, coloured by demand class
# (one palette colour per class).
sns.scatterplot(
    data=data,
    x='PropertyGFATotal',
    y='SiteEnergyUse_kBtu',
    hue='LargestPropertyUseTypeNorm',
    palette=['#f5aa42', '#69f542', '#f54242'],
)

In [None]:
# Column name and category label used below to select normal-demand buildings.
use_type = "LargestPropertyUseTypeNorm"
norm_ = "normal_demand_use_type"

In [None]:
# Rebind the helper to the normal-demand buildings only.
bivar = BivariateAnalysis(data[data[use_type] == norm_])

In [None]:
# Same raw-scale regression, now on the normal-demand subset bound to bivar.
bivar.regression(['PropertyGFATotal', 'SiteEnergyUseWN_kBtu'])

In [None]:
# Display the current state of the frame.
data

In [None]:
# Log-log view of energy use against floor area, coloured by demand class;
# hovering a point shows the property name.
fig = px.scatter(
    data_frame=data,
    x='log_GFATotal',
    y='SiteEnergyUseWN_kBtu_log',
    color='LargestPropertyUseTypeNorm',
    hover_data=['PropertyName'],
)
fig.show()

In [None]:
# Keep only rows with a strictly positive log-target. log_target leaves
# non-positive readings untransformed, so those are removed here, together
# with any reading whose log10 is <= 0 (raw value of at most 1).
data = data.loc[data['SiteEnergyUseWN_kBtu_log'].gt(0)]

In [None]:
# Same log-log scatter by demand class, redrawn on the filtered frame.
fig = px.scatter(
    data_frame=data,
    x='log_GFATotal',
    y='SiteEnergyUseWN_kBtu_log',
    color='LargestPropertyUseTypeNorm',
    hover_data=['PropertyName'],
)
fig.show()

In [None]:
# Log-log scatter coloured by the detailed primary use type.
fig = px.scatter(
    data_frame=data,
    x='log_GFATotal',
    y='SiteEnergyUseWN_kBtu_log',
    color='LargestPropertyUseType',
    hover_data=['PropertyName'],
)
fig.show()

In [None]:
# Raw-scale scatter (no log transform) coloured by the primary use type.
fig = px.scatter(
    data_frame=data,
    x='PropertyGFATotal',
    y='SiteEnergyUseWN_kBtu',
    color='LargestPropertyUseType',
    hover_data=['PropertyName'],
)
fig.show()

In [None]:
# Log-log scatter coloured by the ComplianceStatus column.
fig = px.scatter(
    data_frame=data,
    x='log_GFATotal',
    y='SiteEnergyUseWN_kBtu_log',
    color='ComplianceStatus',
    hover_data=['PropertyName'],
)
fig.show()

In [None]:
# Rebind the helper to the filtered frame (positive log-target only).
bivar = BivariateAnalysis(data)

In [None]:
# Regression helper on the log-transformed floor area and target.
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

In [None]:
# Log-log regression restricted to rows whose primary use is "Multifamily housing".
bivar = BivariateAnalysis(data.query('LargestPropertyUseType == "Multifamily housing"'))
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

In [None]:
# Log-log regression restricted to rows whose primary use is "Hotel".
bivar = BivariateAnalysis(data.query('LargestPropertyUseType == "Hotel"'))
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

In [None]:
# Log-log regression restricted to rows whose primary use is "K-12 school".
bivar = BivariateAnalysis(data.query('LargestPropertyUseType == "K-12 school"'))
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

In [None]:
# Log-log regression restricted to rows whose primary use is "Office".
bivar = BivariateAnalysis(data.query('LargestPropertyUseType == "Office"'))
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

In [None]:
# Scatter with a fitted regression line on the log-log scale, all buildings.
sns.regplot(
    data=data,
    x='log_GFATotal',
    y='SiteEnergyUseWN_kBtu_log',
    fit_reg=True,
)

In [None]:
# Log-log regression on the union of hotels and multifamily housing
# (the `|` in the query string is a row-wise OR).
query = 'LargestPropertyUseType == "Hotel" | LargestPropertyUseType == "Multifamily housing"'
bivar = BivariateAnalysis(data.query(query))
bivar.regression(['log_GFATotal', 'SiteEnergyUseWN_kBtu_log'])

### Outliers

In [None]:
def asymetric_boxplot(df, col):
    """Display a skewness-adjusted box plot and a histogram of ``df[col]``.

    The whiskers follow the adjusted boxplot of Hubert & Vandervieren (2008).
    With ``MC`` the medcouple of the sample and ``IQR = Q3 - Q1``, the fences
    are:

    * ``MC >= 0``: ``[Q1 - 1.5*exp(-4*MC)*IQR, Q3 + 1.5*exp(3*MC)*IQR]``
    * ``MC < 0``:  ``[Q1 - 1.5*exp(-3*MC)*IQR, Q3 + 1.5*exp(4*MC)*IQR]``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    col : str
        Name of a numeric column of ``df``.

    Returns
    -------
    None
        Both figures are rendered with ``fig.show()``.
    """
    # pandas median/quantile skip NaN by default; drop them up front so the
    # medcouple is computed on exactly the same sample.
    data_ = df[col].dropna()
    med = data_.median()
    q1 = data_.quantile(0.25)
    q3 = data_.quantile(0.75)
    iqr = q3 - q1
    mc = medcouple(data_)

    # Exponent pair (lower fence, upper fence) depends on the skew direction:
    # the fence on the long-tail side is stretched, the other one is shrunk.
    if mc >= 0:
        low_exponent, high_exponent = -4, 3
    else:
        low_exponent, high_exponent = -3, 4
    low_whisker = q1 - 1.5 * np.exp(low_exponent * mc) * iqr
    high_whisker = q3 + 1.5 * np.exp(high_exponent * mc) * iqr

    fig = go.Figure()
    # The sample is passed as a one-row 2D array (a single box) because the
    # quartiles and fences are supplied explicitly below.
    fig.add_trace(go.Box(
        x=[data_], name="Skewed box plot",
        boxpoints='suspectedoutliers'
    ))

    fig.update_traces(q1=[q1], median=[med], q3=[q3],
                      lowerfence=[low_whisker],
                      upperfence=[high_whisker])

    fig.show()

    # Companion histogram of the same column (built from the original frame).
    fig2 = px.histogram(df, x=col)

    fig2.show()

In [None]:
# Outlier view of the raw total floor area.
asymetric_boxplot(data, 'PropertyGFATotal')

In [None]:
# Outlier view of the log-transformed total floor area.
asymetric_boxplot(data, 'log_GFATotal')

In [None]:
# Outlier view of the raw target.
asymetric_boxplot(data, 'SiteEnergyUseWN_kBtu')

In [None]:
# Outlier view of the log-transformed target.
asymetric_boxplot(data, 'SiteEnergyUseWN_kBtu_log')

### PCA (Principal component analysis)