# 4. Model Validation

In [2]:
import pandas as pd
melbourne_data = pd.read_csv('./resources/melb_data.csv')
filtered_melbourne_data = melbourne_data.dropna(axis=0)
y = filtered_melbourne_data.Price
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_features]

In [3]:
from sklearn.tree import DecisionTreeRegressor
melbourne_model = DecisionTreeRegressor()
melbourne_model.fit(X, y)

- We use <mark>MAE</mark> (mean absolute value) to evaluate the model

In [5]:
from sklearn.metrics import mean_absolute_error

predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

434.71594577146544

## Problem with "In-Sample" Scores
- Don't evaluate the model on the data it is trained on
- It may learn wrong patterns that fit the training data well but don't generalize to other datasets
- Therefore, we use a <mark>validation set</mark>

In [7]:
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
melbourne_model = DecisionTreeRegressor()
melbourne_model.fit(train_X, train_y)
val_predictions = melbourne_model.predict(val_X)
mean_absolute_error(val_y, val_predictions)

264900.84893479664

## Exercises

In [10]:
home_data = pd.read_csv('./resources/home-data-for-ml-course/train.csv')
y = home_data.SalePrice
feature_columns = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[feature_columns]
iowa_model = DecisionTreeRegressor()
iowa_model.fit(X, y)

print("First in-sample predictions:", iowa_model.predict(X.head()))
print("Actual target values for those homes:", y.head().tolist())

First in-sample predictions: [208500. 181500. 223500. 140000. 250000.]
Actual target values for those homes: [208500, 181500, 223500, 140000, 250000]


### 1.

In [12]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

### 2.

In [None]:
iowa_model = DecisionTreeRegressor(random_state=1)
iowa_model.fit(train_X, train_y)

### 3.

In [14]:
val_predictions = iowa_model.predict(val_X)
print(val_predictions)
print(val_y)

[231500. 179500. 122000.  84500. 142000. 325624. 285000. 147576. 195000.
 275000. 175000.  61000. 174000. 385000. 230000.  87000. 125000.  98600.
 260000. 143000. 124000. 122500. 236500. 337500.  76000. 187000. 128000.
 179000. 485000. 122500. 106000. 118000. 127000.  80000. 153000. 360000.
 132000.  85500. 262280. 112000. 131000. 139000.  87000. 135000. 181000.
 163500. 116900. 159895. 244600. 294000.  97000. 295000. 120500. 239500.
 194000. 115000. 119500. 180000. 118000. 178000. 167000. 267000.  82000.
 133900. 167000. 132500. 135000. 248000. 160000. 144500. 200500. 113000.
 354000. 164000. 170000. 224000. 163900. 160000. 466500. 172500. 193500.
 133000. 137000. 167500. 196500. 146000. 159500. 158000. 189000. 172500.
 194201. 181000. 115000. 101800. 100000. 139000. 115000. 139000. 156000.
 158000. 172000. 138000. 125500. 123000. 134500. 163000. 169990. 140000.
 140000. 325000. 157500. 225000. 107000. 185500. 239900. 163990. 201000.
 127000. 172500. 228000. 117000. 232600. 403000. 16

### 4.

In [15]:
val_mae = mean_absolute_error(val_y, val_predictions)
val_mae

31.923287671232877