In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.models.train_model import ModelTrainer
from src.features.data_processing import ProcessingPipeline

# Apply seaborn's default visual theme to the matplotlib figures drawn in this notebook.
sns.set()

In [None]:
# Load the interim dataset that the rest of the notebook works from.
# Note: unpickling can execute arbitrary code, so only load pickle files you trust.
full_data_path = '../data/interim/full_data.pickle'
data = pd.read_pickle(full_data_path)

In [None]:
# Columns handed to the trainer and the processing pipeline as model inputs.
model_inputs = [
    'BuildingType',
    'Neighborhood',
    'LargestPropertyUseType',
    'LargestPropertyUseTypeGFA',
    'SecondLargestPropertyUseType',
    'SecondLargestPropertyUseTypeGFA',
    'NumberofFloors',
    'ENERGYSTARScore',
]

# Target column, kept as a one-element list (later cells read model_target[0]).
model_target = [
    'TotalGHGEmissions',
]

In [None]:
# Select rows by index label: 2015 is used for training, 2016 is held out for testing.
train_data, test_data = (data.loc[year] for year in (2015, 2016))

In [None]:
# Build the trainer from the 2015 rows, naming the input columns and the target column.
trainer = ModelTrainer(
    train_data,
    model_inputs=model_inputs,
    model_target=model_target,
)

In [None]:
# Train the models managed by `trainer` (which was constructed from `train_data`).
# NOTE(review): presumably this populates `trainer.trained_models`, which a later
# cell iterates over — confirm in ModelTrainer.
trainer.train_models()

In [None]:
# Run the 2016 rows through a ProcessingPipeline configured with the same input and
# target columns, then pull out its feature matrix and target.
# NOTE(review): this builds a new pipeline on test_data, while the inverse transforms
# in later cells use trainer.processing_data.scalers — confirm both apply the same scaling.
p = ProcessingPipeline(
    test_data,
    input_=model_inputs,
    target=model_target,
)
X_test = p.X
y_test = p.y

In [None]:
# Ask the trainer for its best model, passing the processed 2016 features and target.
# NOTE(review): if get_best_model ranks models by their score on (X_test, y_test),
# the same data is reused for the evaluation plots later — confirm this is intended.
m = trainer.get_best_model(X_test, y_test)

In [None]:
# Undo the target scaling on the test target using the trainer's scaler for that column.
target_column = model_target[0]
target_scaler = trainer.processing_data.scalers[target_column]
y_true = target_scaler.inverse_transform(y_test)

In [None]:
# Predict on the test features with the best model, then undo the target scaling
# with the trainer's scaler for the target column.
target_scaler = trainer.processing_data.scalers[model_target[0]]
y_pred_scaled = m.predict(X_test)
y_pred = target_scaler.inverse_transform(y_pred_scaled)

In [None]:
# Display the inverse-transformed predictions of the best model for a quick look.
y_pred

In [None]:
# Display the inverse-transformed test target for comparison with the predictions.
y_true

In [None]:
# Reference figure: what the truth-vs-prediction scatter looks like for an ideal model.
# An ideal model predicts y_true exactly, so the ground truth is plotted against itself.
# (Plotting y_pred against itself contradicted the 'y_true' x-axis label and drew the
# diagonal over the prediction range instead of the ground-truth range.)
fig, ax = plt.subplots(1, figsize=(12, 8))
ax.scatter(y_true, y_true)
ax.set_xlabel('y_true')
ax.set_ylabel('y_predict')
ax.set_title('perfect fit (ideal model)')
plt.show()

In [None]:
# For every trained model, print a centred banner with its repr and draw two
# truth-vs-prediction scatter plots: one on the test data, one on the training data.
#
# The scaler lookup and the ground-truth inverse transforms do not depend on the model,
# so they are computed once up front. Dedicated names are used so this cell no longer
# overwrites the notebook-level `y_true` / `y_pred` computed for the best model.
target_scaler = trainer.processing_data.scalers[model_target[0]]

# (label used in the plot title, features to predict on, ground truth with scaling undone)
evaluation_sets = [
    ('test_data',
     X_test,
     target_scaler.inverse_transform(y_test)),
    ('train_data',
     trainer.processing_data.X,
     target_scaler.inverse_transform(trainer.processing_data.y)),
]

for model in trainer.trained_models:
    model_repr = repr(model)
    print('=' * 80)
    # Left-pad so the repr is centred in an 80-column banner; a repr longer than
    # 80 characters yields a non-positive multiplier and therefore no padding.
    print(" " * ((80 - len(model_repr)) // 2) + model_repr)
    print('=' * 80)

    for set_label, X_eval, y_eval_true in evaluation_sets:
        y_eval_pred = target_scaler.inverse_transform(model.predict(X_eval))
        fig, ax = plt.subplots(1, figsize=(12, 8))
        ax.scatter(y_eval_true, y_eval_pred)
        ax.set_xlabel('y_true')
        ax.set_ylabel('y_predict')
        ax.set_title('Compare ground truth with prediction ({})'.format(set_label))
        plt.show()
        # Release the figure so looping over many models does not accumulate open figures.
        plt.close(fig)

In [None]:
# Print the repr of the best model, then call its coefficient plot.
print('{!r}'.format(m))
m.plot_coeff_as_function()