# Cross-Validation in scikit-learn

In [None]:
import seaborn as sns
import sklearn
# Global plot styling: notebook context, slightly enlarged fonts, 10x6 figures.
sns.set_theme(
    context="notebook",
    font_scale=1.2,
    rc={"figure.figsize": [10, 6]},
)
# Ask scikit-learn to display estimators as diagrams.
sklearn.set_config(display="diagram")

## Load sample data

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load iris and separate the feature matrix from the class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Hold out a test set. Stratifying on y is meant to keep the class proportions
# similar in both parts; the fixed seed makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## Cross validation for model selection

### Try DummyClassifier

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
import numpy as np

In [None]:
# Baseline model configured with the "prior" strategy; it gives a reference
# score that any real model should beat.
dummy_clf = DummyClassifier(strategy="prior")
# Cross-validate the baseline on the training split only.
dummy_scores = cross_val_score(estimator=dummy_clf, X=X_train, y=y_train)

In [None]:
# Show the individual cross-validation scores of the dummy baseline.
dummy_scores

In [None]:
# Average cross-validation score of the dummy baseline.
np.mean(dummy_scores)

### Try KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then classify with k-nearest neighbors. Keeping the
# scaler inside the pipeline means it is cross-validated together with the model.
knc = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)
knc_scores = cross_val_score(estimator=knc, X=X_train, y=y_train)

In [None]:
# Show the individual cross-validation scores of the k-nearest-neighbors pipeline.
knc_scores

In [None]:
# Average cross-validation score of the k-nearest-neighbors pipeline.
np.mean(knc_scores)

### Try LogisticRegression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# Standardize the features, then fit a logistic regression with a fixed seed.
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=0),
)
# Ending the cell with the pipeline displays it.
log_reg

In [None]:
# Cross-validate the logistic-regression pipeline on the training split.
log_reg_scores = cross_val_score(estimator=log_reg, X=X_train, y=y_train)

In [None]:
# Show the individual cross-validation scores of the logistic-regression pipeline.
log_reg_scores

In [None]:
# Average cross-validation score of the logistic-regression pipeline.
np.mean(log_reg_scores)

### Which model do we choose?

1. Dummy
2. KNeighborsClassifier
3. LogisticRegression

## Exercise 1

1. Is the target, `y_train`, balanced? (**Hint**: `np.bincount`)
2. Train the best model on the training set and evaluate on the test data.
3. **Extra**: Pass `scoring='roc_auc_ovr'` to `cross_val_score` so that it returns a one-vs-rest ROC AUC score instead. Which model performs the best in this case?

In [None]:
# %load solutions/01-ex01-solutions.py

## Cross validation Strategies

### KFold

In [None]:
from sklearn.model_selection import KFold

# Pass an explicit splitter instead of relying on the default: plain KFold
# with three folds.
cross_val_score(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    cv=KFold(n_splits=3),
)

### RepeatedKFold

In [None]:
from sklearn.model_selection import RepeatedKFold

# Repeat 3-fold cross-validation four times, reshuffling the data on each repeat.
# The splitter is seeded so the folds (and therefore the scores) are identical
# every time the notebook is re-run; without a seed each run gives new numbers.
scores = cross_val_score(
    log_reg, X_train, y_train,
    cv=RepeatedKFold(n_splits=3, n_repeats=4, random_state=42),
)

In [None]:
# Show the scores from the repeated K-fold run: one entry per split, across all repeats.
scores

In [None]:
# Number of scores returned: n_splits * n_repeats = 3 * 4 = 12.
scores.shape

### StratifiedKFold

In [None]:
from sklearn.model_selection import StratifiedKFold

# Six-fold cross-validation with an explicit stratified splitter.
scores = cross_val_score(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    cv=StratifiedKFold(n_splits=6),
)

In [None]:
# Show the six scores from the stratified K-fold run.
scores

The problem is a multiclass problem:

In [None]:
# List the distinct class labels present in the target.
np.unique(y)

In [None]:
# With a classifier and a multiclass target, an integer `cv` selects stratified
# K-fold, so these scores should match the explicit StratifiedKFold(n_splits=6)
# run above.
cross_val_score(estimator=log_reg, X=X_train, y=y_train, cv=6)

### RepeatedStratifiedKFold

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

# Repeat stratified 4-fold cross-validation five times, reshuffling on each repeat.
# The splitter is seeded so the folds (and therefore the scores) are identical
# every time the notebook is re-run; without a seed each run gives new numbers.
scores = cross_val_score(
    log_reg, X_train, y_train,
    cv=RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=42),
)

In [None]:
# Show the scores from the repeated stratified run: one entry per split, across all repeats.
scores

In [None]:
# Number of scores returned: n_splits * n_repeats = 4 * 5 = 20.
scores.shape

## Exercise 2

1. Use `sklearn.model_selection.cross_validate` instead of `cross_val_score` with `cv=4`.
2. What additional information does `cross_validate` provide?
3. **Extra**: Set `scoring=['f1_macro', 'accuracy']` in `cross_validate` to evaluate on multiple metrics.

In [None]:
# %load solutions/01-ex02-solutions.py