# Preprocessing

In this notebook, we review preprocessing in scikit-learn.

In [None]:
import seaborn as sns
# Notebook-wide plotting defaults: larger fonts, constrained layout and a
# 10x6 default figure size for every figure created after this cell.
sns.set_theme(
    context="notebook",
    font_scale=1.4,
    rc={
        "figure.figsize": [10, 6],
        "figure.constrained_layout.use": True,
    },
)

In [None]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Fetch OpenML dataset 531, requesting pandas containers (as_frame=True).
boston = fetch_openml(data_id=531, as_frame=True)
# Keep the full frame under its own name; the plotting cells read the feature
# columns and the 'MEDV' column from it.
boston_df = boston.frame

In [None]:
# Show the description text that ships with the fetched dataset.
print(boston.DESCR)

In [None]:
import matplotlib.pyplot as plt

# One scatter panel per feature against the target column 'MEDV'.
# The grid is sized from the number of features: a fixed 3x4 grid holds only
# 12 panels, and zip() would silently drop any feature beyond the 12th.
n_cols = 4
n_features = len(boston.feature_names)
n_rows = max(1, -(-n_features // n_cols))  # ceiling division, at least one row

# Keep the original proportions (20 wide, 10 tall for three rows).
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(20, 10 / 3 * n_rows), squeeze=False
)
flat_axes = axes.ravel()

for name, ax in zip(boston.feature_names, flat_axes):
    sns.scatterplot(x=name, y='MEDV', ax=ax, data=boston_df)

# Hide the leftover panels when the feature count does not fill the grid.
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

In [None]:
# Box plots of the raw feature columns on one shared axis, which makes the
# difference in their ranges visible. The trailing ';' hides the Axes repr.
boston_df[boston.feature_names].plot(kind='box');

## Model without scaling

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a test set using train_test_split's default proportions; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    random_state=42,
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Baseline: k-nearest-neighbours regressor with default settings, fitted on
# the unscaled training features.
knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
# Score on the data the model was fitted on.
knr.score(X_train, y_train)

In [None]:
# Score on the held-out split, still using the unscaled features.
knr.score(X_test, y_test)

## Model with scaling

### Scale first!

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply them
# to that same split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
import pandas as pd
# Wrap the scaled training values in a DataFrame labelled with the feature
# names so they can be plotted by column.
X_train_scaled_df = pd.DataFrame(
    data=X_train_scaled,
    columns=boston.feature_names,
)

In [None]:
# Box plots of the scaled training features on one shared axis. The trailing
# ';' hides the Axes repr.
X_train_scaled_df.plot(kind='box');

### Train model on scaled data

In [None]:
knr = KNeighborsRegressor().fit(X_train_scaled, y_train)
knr.score(X_train_scaled, y_train)

In [None]:
X_test_scaled = scaler.transform(X_test)
knr.score(X_test_scaled, y_test)

## Exercise 1

1. Train a `sklearn.linear_model.Ridge` model on the unscaled training data and evaluate it on the unscaled test data. **Hint:** Be sure to set `random_state` so that the results are comparable.
2. Train the same model on the scaled training data and evaluate it on the scaled test data.
3. Does scaling the data change the performance of the model?

In [None]:
# %load solutions/03-ex1-solution.py

## Tree based models

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Shallow decision tree fitted on the unscaled training features.
tree = DecisionTreeRegressor(max_depth=3, random_state=0)
tree.fit(X_train, y_train)
# Score on the held-out split.
tree.score(X_test, y_test)

In [None]:
# Same tree configuration, fitted on the scaled training features.
tree_scaled = DecisionTreeRegressor(max_depth=3, random_state=0)
tree_scaled.fit(X_train_scaled, y_train)
# Score on the scaled held-out split.
tree_scaled.score(X_test_scaled, y_test)

### Why are the scores the same?

In [None]:
from sklearn.tree import plot_tree
# Restore the rc parameters seaborn changed so the tree is drawn with the
# original matplotlib settings.
sns.reset_orig()
fig, ax = plt.subplots(figsize=(20, 10))
# Draw the tree fitted on the unscaled data, labelling splits by feature name.
_ = plot_tree(
    tree,
    feature_names=boston.feature_names,
    fontsize=16,
    ax=ax,
)

In [None]:
from sklearn.tree import plot_tree
# Restore the rc parameters seaborn changed so the tree is drawn with the
# original matplotlib settings.
sns.reset_orig()
fig, ax = plt.subplots(figsize=(20, 10))
# Draw the tree fitted on the scaled data, labelling splits by feature name.
_ = plot_tree(
    tree_scaled,
    feature_names=boston.feature_names,
    fontsize=16,
    ax=ax,
)