# Missing values in scikit-learn

## SimpleImputer

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np
import sklearn
# Ask scikit-learn to display estimators as diagrams for the rest of the session.
sklearn.set_config(display='diagram')

In [None]:
# Toy feature matrix: five samples (rows) by three features (columns).
# The second and third columns each contain exactly one missing entry (np.nan).
X = np.array([
    (1.0, np.nan, 5.0),
    (2.0, 3.0, np.nan),
    (3.4, 2.0, 4.0),
    (4.0, 3.0, 6.5),
    (4.0, 1.0, 6.5),
])

### Default uses mean

In [None]:
# No arguments: the imputer runs with its default strategy (the mean, per the
# heading above). fit_transform learns the fill values from X and returns the
# filled copy, which the notebook displays.
imputer = SimpleImputer()
imputer.fit_transform(X)

### Add indicator!

In [None]:
# Same default strategy, but with add_indicator=True so the output also
# records which entries were originally missing.
imputer = SimpleImputer(add_indicator=True)
imputer.fit_transform(X)

### Other strategies

In [None]:
# Fill missing entries using the 'median' strategy instead of the default.
imputer = SimpleImputer(strategy='median')
imputer.fit_transform(X)

In [None]:
# Fill missing entries using the 'most_frequent' strategy.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X)

## Categorical data

In [None]:
import pandas as pd

In [None]:
# Single-column frame of string categories; the last entry is missing (NaN).
df = pd.DataFrame(dict(a=['dog', 'cat', 'snake', np.nan]))

In [None]:
# Display the frame built above.
df

In [None]:
# Default-strategy imputer applied to the string column 'a'.
# NOTE(review): presumably included to show that the default (mean) strategy
# does not work on non-numeric data — confirm this cell is meant to fail.
imputer = SimpleImputer()
imputer.fit_transform(df)

In [None]:
# Constant strategy: missing entries are replaced with the literal
# placeholder 'sk_missing'. Note that `imputer` is reused by a later cell.
imputer = SimpleImputer(strategy='constant', fill_value='sk_missing')
imputer.fit_transform(df)

## pandas categorical

In [None]:
# Convert column 'a' in place to the pandas 'category' dtype.
df['a'] = df['a'].astype('category')

In [None]:
# Display the frame again after the dtype conversion.
df

In [None]:
# Show the per-column dtypes to confirm 'a' is now categorical.
df.dtypes

In [None]:
# Re-fit whichever imputer `imputer` currently refers to on the categorical
# frame. When cells are run top to bottom this is the constant-strategy
# imputer with fill_value='sk_missing'.
imputer.fit_transform(df)

## Exercise 1

1. Load the `breast-w` dataset using `sklearn.datasets.fetch_openml` by setting `data_id=15` and `as_frame=True`.
2. Print the description of the dataset.
3. How many samples and features are there in the dataset?
4. Which feature(s) of the dataset have missing values? **Hint**: Use pandas' `isna().sum()`.
5. Use a `SimpleImputer` with `add_indicator=True` to `fit_transform` the dataset. What is the shape of the transformed data?
6. **Extra**: Split the data into training and test datasets. Build a pipeline with the `SimpleImputer`, `StandardScaler`, and `LogisticRegression`. Train the pipeline on the training dataset and evaluate the performance of the model on the test set.

In [42]:
# %load solutions/03-ex01-solutions.py

## HistGradientBoosting: native support for missing values

In [None]:
# NOTE(review): the enable_hist_gradient_boosting import is presumably needed
# only on scikit-learn releases where this estimator is still experimental;
# confirm against the installed version before removing it.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Fit the gradient-boosting classifier with a fixed seed.
# NOTE(review): X_train / y_train are not defined in any cell shown before
# this one; presumably they come from the Exercise 1 solution (the "Extra"
# step), so that cell must be run first — confirm.
hist = HistGradientBoostingClassifier(random_state=42)
hist.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split (X_test / y_test are likewise
# expected to exist from an earlier cell).
hist.score(X_test, y_test)

## Grid searching the imputer

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the iris-with-missing-values CSV; the path is relative to the
# notebook's working directory.
iris = pd.read_csv('data/iris_w_missing.csv')

In [None]:
# Preview the first rows of the loaded frame.
iris.head()

In [None]:
# Separate the label column from the feature columns. These rebind the
# notebook-level X used in the earlier SimpleImputer demos.
y = iris['target']
X = iris.drop(columns='target')

In [None]:
# Imported here because no cell shown above imports train_test_split; without
# it this cell raises NameError unless an earlier solution cell happened to
# import it.
from sklearn.model_selection import train_test_split

# Hold out a test set. stratify=y stratifies the split on the labels, and
# random_state=0 fixes the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)

In [None]:
# Two-step pipeline: impute missing values (keeping indicator columns), then
# fit a random forest. The step names ('imputer', 'rf') are the prefixes used
# to address each step's parameters in a parameter grid.
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(add_indicator=True)),
    ('rf', RandomForestClassifier(random_state=42)),
])

## sklearn's `get_params`

In [None]:
# List every parameter of the pipeline and its steps.
pipe.get_params()

## Is it better to add the indicator?

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over a single switch: whether the 'imputer' step adds indicator
# columns. The `imputer__` prefix routes the value to that pipeline step.
params = {'imputer__add_indicator': [True, False]}

grid_search = GridSearchCV(estimator=pipe, param_grid=params, verbose=1)

In [None]:
# Run the grid search on the training split only.
grid_search.fit(X_train, y_train)

In [None]:
# Which add_indicator setting the search selected.
grid_search.best_params_

In [None]:
# Score the search recorded for the selected setting.
grid_search.best_score_

In [None]:
# Evaluate the fitted search object on the held-out test split.
grid_search.score(X_test, y_test)

## Compare to `make_pipeline`

In [None]:
from sklearn.pipeline import make_pipeline

# Same two estimators as `pipe`, but no step names are passed here;
# make_pipeline assigns them itself (inspect them with get_params below).
pipe2 = make_pipeline(SimpleImputer(add_indicator=True),
                     RandomForestClassifier(random_state=42))

In [None]:
# Compare these parameter names with those of the explicitly named `pipe`.
pipe2.get_params()

## Which imputer to use?

In [None]:
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): the enable_* import is presumably what makes the experimental
# IterativeImputer importable, so it should stay before the import below —
# confirm against the installed scikit-learn version.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# The grid key 'imputer' is the name of a pipeline step, so each candidate
# below replaces that entire step rather than tuning one of its parameters.
# All candidates keep add_indicator=True.
params = {
    'imputer': [
        SimpleImputer(strategy='median', add_indicator=True),
        SimpleImputer(strategy='mean', add_indicator=True),
        KNNImputer(add_indicator=True),
        IterativeImputer(estimator=RandomForestRegressor(random_state=42),
                         random_state=42, add_indicator=True)]
}

# n_jobs=-1 lets the search run candidates in parallel.
search_cv = GridSearchCV(pipe, param_grid=params, verbose=1, n_jobs=-1)

In [None]:
# Fit every imputer candidate on the training split.
search_cv.fit(X_train, y_train)

In [None]:
# Which imputer the search selected.
search_cv.best_params_

In [None]:
# Score the search recorded for the selected imputer.
search_cv.best_score_

In [None]:
# Evaluate the fitted search object on the held-out test split.
search_cv.score(X_test, y_test)