### Utilisation de Pipeline et chaînage d'estimateurs

In [None]:
import matplotlib.pyplot as plt
import missingno
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from tempfile import mkdtemp

from src.utils.univar import UnivariateAnalysis

# Switch matplotlib to seaborn's default theme for every figure drawn below.
sns.set()


In [None]:
# Load the interim dataset prepared earlier in the project.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# produced by this project.
full_data_path = '../data/interim/full_data.pickle'
data = pd.read_pickle(full_data_path)

In [None]:
# Column to predict (kept as a one-element list so it can be concatenated
# with `inputs` when selecting columns).
target = ['SiteEnergyUseWN_kBtu']

# Explanatory columns fed to the model: floor areas and declared use types.
inputs = [
    'PropertyGFATotal',
    'PropertyGFAParking',
    'PropertyGFABuilding_s',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'ThirdLargestPropertyUseType',
    'ThirdLargestPropertyUseTypeGFA',
]

In [None]:
# Visualise where values are missing in the model columns before any cleaning.
model_columns = inputs + target
missingno.matrix(data[model_columns])
plt.show()

On supprime toutes les lignes contenant des valeurs manquantes dans les colonnes essentielles.

In [None]:
# Show the rows whose main use type is missing.
largest_use_missing = data['LargestPropertyUseType'].isnull()
data[largest_use_missing]

In [None]:
# Show (again) the rows whose main use type is missing.
no_largest_use = data['LargestPropertyUseType'].isnull()
data[no_largest_use]

In [None]:
# Discard every row that lacks one of the essential values, in a single pass.
# `dropna(subset=...)` removes exactly the rows holding a NaN, whereas dropping
# by index label would also remove any other row sharing that label if the
# index were not unique.
required_columns = [
    'LargestPropertyUseType',
    # No missing values in target allowed
    'SiteEnergyUseWN_kBtu',
    'SiteEUI_kBtu_sf',
]
data.dropna(subset=required_columns, inplace=True)

In [None]:
# Re-draw the missing-value matrix after dropping incomplete rows.
checked_columns = inputs + target
missingno.matrix(data[checked_columns])
plt.show()

Certains usages sont trop peu représentés.

In [None]:
# Univariate analysis of the main use type on the rows labelled 2015.
# Categories are passed in descending 2016 frequency order, the same ordering
# used for the 2016 analysis.
col = 'LargestPropertyUseType'
order_from_2016 = data.loc[2016][col].value_counts().index
univar = UnivariateAnalysis(data.loc[2015])
univar.make_analysis(col, orient='h', order=order_from_2016)

In [None]:
# Univariate analysis of the main use type on the rows labelled 2016,
# with categories ordered by descending 2016 frequency.
col = 'LargestPropertyUseType'
frequency_order = data.loc[2016][col].value_counts().index
univar = UnivariateAnalysis(data.loc[2016])
univar.make_analysis(col, orient='h', order=frequency_order)

In [None]:
# Values of `col` present in the 2015 rows but absent from the 2016 rows.
values_2015 = set(data.loc[2015][col].unique())
values_2016 = set(data.loc[2016][col].unique())
list(values_2015 - values_2016)

In [None]:
# Input columns stored with pandas' categorical dtype.
categorical_features = [name for name in inputs
                        if data[name].dtype.name == 'category']

# For each categorical column, print the categories found only in 2015,
# then the categories found only in 2016.
for col in categorical_features:
    seen_2015 = set(data.loc[2015][col].unique())
    seen_2016 = set(data.loc[2016][col].unique())
    print(list(seen_2015 - seen_2016))
    print(list(seen_2016 - seen_2015))

In [None]:
# Collect every category that shows up in only one of the two years,
# across all categorical columns.
uncommon_cat = []
for col in categorical_features:
    cats_2015 = set(data.loc[2015][col].unique())
    cats_2016 = set(data.loc[2016][col].unique())

    diff_15_16 = list(cats_2015 - cats_2016)  # only in 2015
    print(diff_15_16)
    diff_16_15 = list(cats_2016 - cats_2015)  # only in 2016
    print(diff_16_15)

    uncommon_cat.extend(diff_15_16)
    uncommon_cat.extend(diff_16_15)

# De-duplicate while keeping first-seen order.
uncommon_cat = list(dict.fromkeys(uncommon_cat))

In [None]:
# Display the de-duplicated categories that appear in only one of the two years.
uncommon_cat

In [None]:
# Remove every row in which any categorical column holds a category that is
# present in only one of the two years.
#
# A single vectorised mask replaces one full scan plus one in-place drop per
# (column, category) pair. Boolean selection also removes exactly the matching
# rows, even if index labels are not unique.
#
# NaN is filtered out first: `== NaN` never matches, but `isin([NaN])` would,
# and rows with a missing secondary use type must be kept.
known_uncommon = [cat for cat in uncommon_cat if pd.notna(cat)]
to_remove = data[categorical_features].isin(known_uncommon).any(axis=1)
total = int(to_remove.sum())
data = data.loc[~to_remove].copy()
print(f'removed {total:d} rows')

In [None]:
# Final look at the missing-value pattern once uncommon categories are removed.
remaining_columns = inputs + target
missingno.matrix(data[remaining_columns])
plt.show()

### Pipelines

In [None]:
import atexit
import shutil

# Temporary directory handed to `Pipeline(memory=...)` to cache fitted
# transformers.
cache_dir = mkdtemp()


@atexit.register
def _remove_cache_dir(path=cache_dir):
    """Delete the pipeline cache directory when the interpreter exits.

    `mkdtemp` never cleans up after itself, so without this hook each run
    leaves a directory behind. The path is bound as a default argument so the
    hook removes the directory created by this very cell, even if the cell is
    executed again later.
    """
    shutil.rmtree(path, ignore_errors=True)

In [None]:
def assert_positive(x):
    """Replace every non-positive (or NaN) value by 0 and keep the rest.

    Works element-wise on scalars and on arrays of any shape.

    The result keeps the numeric dtype of the input. Returning the Python int
    ``0`` for a float input let ``np.vectorize`` infer an integer output dtype
    whenever the first element was non-positive, which silently truncated the
    remaining floats.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to clip.

    Returns
    -------
    NumPy scalar for scalar input, ``ndarray`` of the same shape otherwise.
    """
    values = np.asarray(x)
    # `values > 0` is False for NaN, so NaN maps to 0 like any non-positive value.
    clipped = np.where(values > 0, values, 0)
    # Indexing with an empty tuple unwraps 0-d results to a scalar and leaves
    # real arrays untouched.
    return clipped[()]

In [None]:
# Input columns that are not categorical are treated as numeric.
numeric_features = [x for x in inputs if data[x].dtype.name != 'category']

# Numeric preprocessing: fill gaps with 0, clip negatives to 0, log-transform,
# then standardise.
num_transformer = Pipeline(memory=cache_dir, steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    # `otypes=[float]` pins the output dtype. Without it np.vectorize infers
    # the dtype from the first element, and an integer `0` result there would
    # truncate every other value to an integer.
    ('assert_positive',
     FunctionTransformer(np.vectorize(assert_positive, otypes=[float]))),
    # log1p is safe here because the previous step guarantees values >= 0.
    ('log_transformer', FunctionTransformer(np.log1p)),
    # Safety net: replace any NaN left after the log transform by 0.
    ('imput_log_nans', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])
print(numeric_features)

In [None]:
# Categorical preprocessing: missing use types become the empty-string
# category, then every category is one-hot encoded.
categorical_steps = [
    ('inputer', SimpleImputer(strategy='constant', fill_value='')),
    ('encoder', OneHotEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps, memory=cache_dir)
print(categorical_features)

In [None]:
# The target is modelled on a log(1 + y) scale.
log_target_step = ('log_transformer', FunctionTransformer(np.log1p))
target_transformer = Pipeline(steps=[log_target_step])

In [None]:
# Route each group of columns to its own preprocessing pipeline. The target
# is not handled here; it goes through `target_transformer` separately.
column_pipelines = [
    ('num', num_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [None]:
# Temporal split: rows labelled 2015 are used for training, rows labelled
# 2016 for evaluation.
X_train = data.loc[2015][inputs]
y_train = data.loc[2015][target].values
X_test = data.loc[2016][inputs]
y_test = data.loc[2016][target].values

# Fit the target transformer on the training target only, then apply it
# unchanged to the test target. Refitting on test data gives the same numbers
# for today's stateless log1p step, but would leak test statistics as soon as
# a fitted step (e.g. a scaler) is added.
y_train_tr = target_transformer.fit_transform(y_train)
y_test_tr = target_transformer.transform(y_test)

In [None]:
# Full model: column preprocessing followed by a cross-validated Lasso that
# searches 1000 regularisation strengths spread log-uniformly over
# [1e-5, 1e5].
lasso_alphas = np.logspace(-5, 5, num=10 ** 3)
model_steps = [
    ('preprocessing', preprocessor),
    ('regressor', LassoCV(alphas=lasso_alphas)),
]
clf = Pipeline(steps=model_steps)

### Train

In [None]:
# Fit on the 2015 data; the transformed target is flattened to 1-D first.
y_train_flat = y_train_tr.ravel()
clf.fit(X_train, y_train_flat)

### Score

In [None]:
# Score of the fitted pipeline on the training set (log-transformed target).
train_target_flat = y_train_tr.ravel()
clf.score(X_train, train_target_flat)

In [None]:
# Score of the fitted pipeline on the held-out 2016 set (log-transformed target).
test_target_flat = y_test_tr.ravel()
clf.score(X_test, test_target_flat)

In [None]:
# Recover a readable name for every column produced by the preprocessing
# step, in the order the ColumnTransformer emits them: numeric columns first,
# then the one-hot columns.
encoder = (clf.named_steps['preprocessing']
              .named_transformers_['cat']
              .named_steps['encoder'])

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    names = encoder.get_feature_names_out(categorical_features)
else:
    names = encoder.get_feature_names(categorical_features)

index = numeric_features + list(names)
coef = pd.Series(clf.named_steps['regressor'].coef_, index=index)
# Keep only the features the Lasso did not shrink to exactly zero.
coef = coef[coef != 0]

In [None]:
# Horizontal bar chart of the non-zero coefficients, smallest at the bottom.
coef.sort_values(inplace=True)
plt.figure(figsize=(5, 12))
coef.plot.barh()

In [None]:
# Observed vs. predicted target (log scale) on the 2016 test set.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.scatterplot(x=y_test_tr.ravel(), y=clf.predict(X_test))

In [None]:
# Observed vs. predicted target (log scale) on the 2015 training set.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.scatterplot(x=y_train_tr.ravel(), y=clf.predict(X_train))