Anticipez les besoins en consommation électrique de bâtiments
=============================================================

![logo-seattle](../reports/figures/logo-seattle.png)


Explication des variables:
[City of Seattle](https://data.seattle.gov/dataset/2015-Building-Energy-Benchmarking/h7rm-fz6m)

## Importation des librairies

In [None]:
import os

import numpy as np
import pandas as pd
# V2.3.0 (Bug in widget rendering in V2.4.0)
from pandas_profiling import ProfileReport
import folium
from folium.plugins import FastMarkerCluster
from IPython.display import display
import ipywidgets as widgets
from ipywidgets import interact

from src.utils.univar import UnivariateAnalysis
from src.utils.bivar import BivariateAnalysis


## Chargement des données

In [None]:
# Read every CSV file of the raw-data folder into a dict of DataFrames,
# keyed by the part of the file name that precedes its first dot.
data_dir = os.path.abspath('../data/raw')
data = {
    file.split('.')[0]: pd.read_csv(os.path.join(data_dir, file))
    for file in os.listdir(data_dir)
    if file.endswith('.csv')
}

In [None]:
# Names used as keys into `data` to select the 2015 and 2016 tables.
_2015 = '2015-building-energy-benchmarking'
_2016 = '2016-building-energy-benchmarking'

In [None]:
# Show the column names that exist in only one of the two yearly tables:
# first those specific to 2016, then those specific to 2015.
col_2015 = data[_2015].columns.tolist()
col_2016 = data[_2016].columns.tolist()
only_in_2016 = set(col_2016) - set(col_2015)
only_in_2015 = set(col_2015) - set(col_2016)
print(only_in_2016)
print(only_in_2015)

In [None]:
# Column names collected for removal (dropped from the 2015 table below).
to_drop = [
    'Zip Codes',
    'City Council Districts',
    'SPD Beats',
    '2010 Census Tracts',
    'Seattle Police Department Micro Community Policing Plan Areas',
]

In [None]:
# Remove the columns listed in `to_drop` from the 2015 table, then rename
# three of its columns (presumably to match the 2016 naming -- confirm
# against the column diff printed above).
columns = {
    'GHGEmissions(MetricTonsCO2e)': 'TotalGHGEmissions',
    'GHGEmissionsIntensity(kgCO2e/ft2)': 'GHGEmissionsIntensity',
    'Comment': 'Comments',
}
data[_2015] = data[_2015].drop(columns=to_drop).rename(columns=columns)

In [None]:
# Parse the textual 'Location' column of the 2015 table into Python objects.
#
# `location` is bound to the *parsed* Series rather than to the raw strings:
# code that later indexes its elements by key must not depend on pandas
# overwriting the original column buffer in place (newer pandas versions
# do not).
#
# NOTE(review): eval() executes any expression found in the CSV cells.
# Confirm the file is trusted, or switch to ast.literal_eval.
location = data[_2015]['Location'].apply(eval)
data[_2015]['Location'] = location

In [None]:
# Extract numeric coordinates from the parsed 'Location' entries.
# The column is re-read from the frame instead of reusing the `location`
# variable, which may still reference the un-parsed strings and would then
# fail on x['latitude'].
parsed_location = data[_2015]['Location']
data[_2015]['Latitude'] = parsed_location.apply(
    lambda x: float(x['latitude']))
data[_2015]['Longitude'] = parsed_location.apply(
    lambda x: float(x['longitude']))

In [None]:
# Each parsed location carries a 'human_address' string that is evaluated
# into a mapping; its fields are then copied into dedicated columns.
# NOTE(review): eval() on CSV content -- confirm the file is trusted.
address_2015 = data[_2015]['Location'].map(
    lambda loc: eval(loc['human_address']))
for field in ['Address', 'State', 'City']:
    # The address keys are the lower-cased column names.
    data[_2015][field] = address_2015.map(
        lambda addr, key=field.lower(): addr[key])
data[_2015]['ZipCode'] = address_2015.map(lambda addr: addr['zip'])

In [None]:
# Re-run the column comparison between the two yearly tables: columns
# found only in 2016 are printed first, then columns found only in 2015.
col_2015 = list(data[_2015].columns)
col_2016 = list(data[_2016].columns)
missing_from_2015 = set(col_2016) - set(col_2015)
missing_from_2016 = set(col_2015) - set(col_2016)
print(missing_from_2015)
print(missing_from_2016)

In [None]:
# Stack the per-file tables into one DataFrame; the dict keys become the
# outer level of the resulting row index. Columns keep their order of
# appearance (sort=False).
data = pd.concat(objs=data, axis=0, sort=False)

In [None]:
# Replace the dataset names in the row index with the integer year.
year_labels = {"2015-building-energy-benchmarking": 2015,
               "2016-building-energy-benchmarking": 2016}
data.rename(index=year_labels, inplace=True)

In [None]:
# 'Location' and 'DataYear' are no longer needed as columns: remove them.
data = data.drop(columns=['Location', 'DataYear'])

In [None]:

def strip_all_string(x):
    """Normalise a string cell; leave any other value untouched.

    Strings have their surrounding whitespace removed and are then
    capitalised (first character upper-cased, the rest lower-cased).

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        The normalised string when ``x`` is a ``str``, otherwise ``x``
        itself.
    """
    if isinstance(x, str):
        # Strip *before* capitalising: capitalize() only upper-cases the
        # very first character, so a leading space would otherwise leave
        # the whole word lower-cased (' Hotel' -> 'hotel').
        return x.strip().capitalize()
    return x


# Run the string normaliser over every cell, one column at a time.
for name in list(data):
    data[name] = data[name].apply(strip_all_string)

### Correction des types de données

In [None]:
# Columns converted to the pandas 'category' dtype.
categorical_fields = [
    'BuildingType',
    'PrimaryPropertyType',
    'Neighborhood',
    'LargestPropertyUseType',
    'SecondLargestPropertyUseType',
    'ThirdLargestPropertyUseType',
]
data = data.astype({field: 'category' for field in categorical_fields})

In [None]:
# List every column next to its dtype.
for name, dtype in data.dtypes.items():
    print(f"col : {name} dtype : {dtype}")

In [None]:
# Write the column/dtype table to a LaTeX file under the report folder.
latex_target = '../reports/latex-report/includes/variables.tex'
data.dtypes.to_latex(latex_target)

In [None]:
# Convert every ZIP code value to float.
data['ZipCode'] = data['ZipCode'].apply(float)

In [None]:
# Name the two levels of the row index: the year and the row position.
data.index = data.index.set_names(['year', 'idx'])

In [None]:
# Report the size of the merged table as (rows, columns).
n_rows, n_cols = data.shape
print((n_rows, n_cols))

### Localisation des bâtiments


In [None]:
year_widget = widgets.Dropdown(options=[2015, 2016])
# Sorted distinct usage types. Missing values are dropped up front instead
# of being removed from the list afterwards: list.remove(np.nan) only works
# when the list holds the very same NaN object and raises when there is none.
usage_type = data['LargestPropertyUseType'].dropna().sort_values()
usage_type = usage_type.drop_duplicates().tolist()
usage_type.insert(0, 'ALL')
# The keyword is `options` (plural); this widget drives the map below.
usage_widget = widgets.Dropdown(options=usage_type)


@interact
def make_map(year=year_widget, usage=usage_widget):
    """Display a clustered map of the buildings recorded for one year.

    Parameters
    ----------
    year : int
        First-level index label of ``data`` selecting the rows to plot.
    usage : str
        Value of 'LargestPropertyUseType' to keep, or 'ALL' to keep every
        building.
    """
    yearly = data.loc[year]
    coords = yearly[['Latitude', 'Longitude']]
    # The map is centred on the mean position of all buildings of the year,
    # whatever usage filter is applied afterwards.
    location = coords.mean(axis=0).values
    if usage != 'ALL':
        coords = coords[yearly['LargestPropertyUseType'] == usage]
    m = folium.Map(location=location,
                   tiles='cartodbpositron',
                   zoom_start=11)

    # Only [latitude, longitude] pairs are handed to the cluster layer, and
    # rows with a missing coordinate are left out.
    mc = FastMarkerCluster(coords.dropna().values.tolist())
    mc.add_to(m)

    display(m)

## Analyses univariées

In [None]:

# Make column names free of '(', ')' and '/':
# '(' and '/' become '_', ')' is removed.
_name_cleanup = str.maketrans({'(': '_', ')': '', '/': '_'})
data.columns = data.columns.map(lambda name: name.translate(_name_cleanup))

dtypes = data.columns.map(lambda name: data[name].dtype.name)

# Variables offered in the univariate-analysis dropdown.
opt = [
    'BuildingType',
    'PrimaryPropertyType',
    'Neighborhood',
    'YearBuilt',
    'NumberofBuildings',
    'NumberofFloors',
    'PropertyGFATotal',
    'PropertyGFAParking',
    'PropertyGFABuilding_s',
    'LargestPropertyUseType',
    'SecondLargestPropertyUseType',
    'ThirdLargestPropertyUseType',
    'ENERGYSTARScore',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseTypeGFA',
    'SiteEUI_kBtu_sf',
    'SiteEUIWN_kBtu_sf',
    'SiteEnergyUse_kBtu',
    'SiteEnergyUseWN_kBtu',
    'SourceEUI_kBtu_sf',
    'SourceEUIWN_kBtu_sf',
    'TotalGHGEmissions',
    'GHGEmissionsIntensity',
    'SteamUse_kBtu',
    'Electricity_kBtu',
    'NaturalGas_kBtu',
]
variable_widget = widgets.Dropdown(options=opt)


@interact
def univariate_analysis(var=variable_widget):
    """Run the univariate analysis of the column selected in the dropdown.

    Parameters
    ----------
    var : str
        Name of the column of ``data`` to analyse.
    """
    analysis = UnivariateAnalysis(data)
    analysis.make_analysis(var, figsize=(8, 12), orient='h')

## Analyses bivariées

### Catégoriel vs Continu

In [None]:

# (column name, dtype name) pairs for every column of the table.
names_dtypes = [(name, data[name].dtype.name)
                for name in data.columns.values.tolist()]
dtypes = [dtype for _, dtype in names_dtypes]

# Numeric columns are candidate outcomes, categorical ones candidate groups.
opt_1 = [name for name, dtype in names_dtypes
         if dtype in ['float64', 'int64']]
opt_2 = [name for name, dtype in names_dtypes if dtype == 'category']
outcome_variable = widgets.Dropdown(options=opt_1)
group = widgets.Dropdown(options=opt_2)
years = widgets.Dropdown(options=['ALL', 2015, 2016])


@interact
def anova(outcome_variable=outcome_variable, group=group, year=years):
    """Run the ANOVA of a numeric column against a categorical one.

    Parameters
    ----------
    outcome_variable : str
        Name of the numeric column.
    group : str
        Name of the categorical column defining the groups.
    year : int or str
        Year to restrict the data to, or 'ALL' to use every row.
    """
    # Pick the rows first so that a single analysis object is built,
    # instead of one on the full table that is immediately discarded.
    subset = data if year == 'ALL' else data.loc[year]
    bivar = BivariateAnalysis(subset)
    bivar.anova(outcome_variable=outcome_variable,
                group=group,
                orient='h',
                figsize=(10, 10),
                label_rotation=0)

### Catégoriel vs Catégoriel

In [None]:

# Recompute the (column name, dtype name) pairs of the table.
dtypes = [data[name].dtype.name for name in data.columns]
names_dtypes = list(zip(data.columns.values.tolist(), dtypes))

# Both dropdowns offer the categorical columns.
variables = [name for name, dtype in names_dtypes if dtype in ['category']]
var_1 = widgets.Dropdown(options=variables)
var_2 = widgets.Dropdown(options=variables)
years_2 = widgets.Dropdown(options=['ALL', 2015, 2016])


@interact
def chi2_test(var_1=var_1, var_2=var_2, year=years_2):
    variables = (var_1, var_2)
    bivar = BivariateAnalysis(data)
    if year != 'ALL':
        bivar = BivariateAnalysis(data.loc[year])
    bivar.chi_square_contingency(variables)

### Continu vs Continu

In [None]:

# Recompute the (column name, dtype name) pairs of the table.
dtypes = [data[name].dtype.name for name in data.columns]
names_dtypes = list(zip(data.columns.values.tolist(), dtypes))

# Both dropdowns offer the int64/float64 columns.
variables = [name for name, dtype in names_dtypes
             if dtype in ['int64', 'float64']]
var_3 = widgets.Dropdown(options=variables)
var_4 = widgets.Dropdown(options=variables)
years_3 = widgets.Dropdown(options=['ALL', 2015, 2016])


@interact
def regression(x=var_3, y=var_4, year=years_3):
    variables = (x, y)
    bivar = BivariateAnalysis(data)
    if year != 'ALL':
        bivar = BivariateAnalysis(data.loc[year])
    bivar.regression(variables=variables)

## Traitement des outliers

## Check point

In [None]:

# Checkpoint: save the merged table as a pickle in the interim-data folder.
checkpoint_path = '../data/interim/full_data.pickle'
data.to_pickle(checkpoint_path)
