Introduction to model evaluation
---

In [None]:
import pandas as pd

# Load data
# Reads 'three-models.csv' (path relative to the working directory) into a
# DataFrame. Later cells read the columns temp, users, pred_lr, pred_poly3 and
# pred_huber3 from it.
data_df = pd.read_csv('three-models.csv')
data_df.head()  # last expression of the cell: previews the first rows

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Extract variables
# x and y stay in file order: later cells compare y with the prediction
# columns of data_df element by element.
x = data_df.temp.values
y = data_df.users.values

# The model curves are drawn as connected line segments, so their points must
# be ordered by x or the lines zig-zag. Sort a copy used for plotting only;
# mergesort is stable, so a file that is already ordered by temp plots exactly
# as before.
plot_df = data_df.sort_values('temp', kind='mergesort')
x_sorted = plot_df.temp.values

# Plot the models
plt.scatter(x, y, s=10)
plt.plot(x_sorted, plot_df.pred_lr.values, c='C0', label='linear regression')
plt.plot(x_sorted, plot_df.pred_poly3.values, c='C2', label='polyfit(deg=3)')
plt.plot(x_sorted, plot_df.pred_huber3.values, c='C3', label='with Huber loss')
# Label the axes with the column names so the figure can be read on its own
plt.xlabel('temp')
plt.ylabel('users')
plt.legend()
plt.show()

In [None]:
import numpy as np

# Root mean squared error
def RMSE(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : observed target values.
    y_pred : predicted values; anything for which ``y - y_pred`` is defined
        (in this notebook: a scalar, a NumPy array or a DataFrame column).

    Returns
    -------
    The square root of the mean of the squared residuals ``y - y_pred``.
    """
    residuals = y - y_pred
    squared_residuals = np.square(residuals)
    mean_squared_error = np.mean(squared_residuals)
    return np.sqrt(mean_squared_error)

# Score each model's predictions against the observed targets
rmse_lr = RMSE(y, data_df['pred_lr'])
rmse_poly3 = RMSE(y, data_df['pred_poly3'])
rmse_huber3 = RMSE(y, data_df['pred_huber3'])

# Print values (noted for this dataset: 232.53, 210.55, 215.67)
for label, score in [
    ('linear regression:', rmse_lr),
    ('polyfit(deg=3):', rmse_poly3),
    ('with huber loss:', rmse_huber3),
]:
    print(label, score)

In [None]:
import numpy as np

# Mean absolute error
def MAE(y, y_pred):
    """Return the mean absolute error between targets and predictions.

    Parameters
    ----------
    y : observed target values.
    y_pred : predicted values; anything for which ``y - y_pred`` is defined
        (in this notebook: a NumPy array or a DataFrame column).

    Returns
    -------
    The mean of the absolute residuals ``|y - y_pred|``.
    """
    absolute_residuals = np.abs(y - y_pred)
    return np.mean(absolute_residuals)

# Score each model's predictions against the observed targets
mae_lr = MAE(y, data_df['pred_lr'])
mae_poly3 = MAE(y, data_df['pred_poly3'])
mae_huber3 = MAE(y, data_df['pred_huber3'])

# Print values (noted for this dataset: 186.58, 161.72, 164.75)
for label, score in [
    ('linear regression:', mae_lr),
    ('polyfit(deg=3):', mae_poly3),
    ('with huber loss:', mae_huber3),
]:
    print(label, score)

In [None]:
# Compute baseline
# The baseline predicts one constant, the mean of the targets, for every
# data point. RMSE subtracts this scalar from the whole y array.
pred_baseline = np.mean(y) # equal to 674.7218543 for this dataset
rmse_baseline = RMSE(y, pred_baseline)
rmse_baseline # 371.11

In [None]:
# Vector with predictions from the baseline: one copy of mean(y) per target.
# The dtype is the builtin float because the np.float alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError. An explicit
# float dtype is still required: full_like would otherwise reuse y's dtype and
# truncate the mean if y holds integers.
pred_baseline = np.full_like(y, fill_value=np.mean(y), dtype=float)
pred_baseline # Returns: array([ 674.7218543, 674.7218543,  ...

In [None]:
from sklearn.dummy import DummyRegressor

# Create the DummyRegressor object
# Configured with the 'mean' strategy; the following cells fit it and use its
# predictions as the baseline (RMSE noted there: 371.11, the same value as the
# hand-computed mean baseline).
dummy = DummyRegressor(strategy='mean')

In [None]:
# Fit the estimator
# x is one-dimensional, so it is reshaped into a single-column 2-D array
# before being passed as the feature matrix. The trailing ';' suppresses the
# cell's output.
dummy.fit(x.reshape(-1, 1), y);

In [None]:
# Vector with predictions from the baseline
# (x is passed as a single-column 2-D array, one row per data point)
pred_baseline = dummy.predict(x.reshape(-1, 1))

# Score the baseline predictions with the same metric as the models
rmse_baseline = RMSE(y, pred_baseline)
rmse_baseline # Returns: 371.11459394676217

In [None]:
# Bar chart of the RMSE scores: baseline first, then the three models
bar_positions = [1, 2, 3, 4]
bar_heights = [rmse_baseline, rmse_lr, rmse_poly3, rmse_huber3]
bar_labels = ['baseline (mean)', 'linreg', 'poly3', 'huber3']

plt.bar(bar_positions, bar_heights)
plt.xticks(bar_positions, bar_labels)  # one tick label per bar
plt.show()