# Quick Review of scikit-learn

In [None]:
import seaborn as sns
import sklearn
# Plot defaults for the whole notebook: seaborn's "notebook" context with
# slightly enlarged fonts and a 10x6 default figure size.
sns.set_theme(
    context="notebook",
    font_scale=1.2,
    rc={"figure.figsize": [10, 6]},
)

# Show estimators as diagrams (rather than plain text) in notebook output.
sklearn.set_config(display="diagram")

In [None]:
from sklearn.datasets import fetch_openml

# Fetch OpenML dataset 1504; as_frame=True asks for pandas containers,
# which the later cells rely on (e.g. `steel.data.hist`).
steel = fetch_openml(
    data_id=1504,
    as_frame=True,
)

In [None]:
# Print the dataset description; print() renders the embedded newlines,
# whereas a bare expression would show the escaped string repr.
print(steel.DESCR)

In [None]:
# Draw one histogram per numeric feature column on a 5x8 grid of subplots.
# The trailing semicolon hides the returned array-of-Axes repr. Assigning the
# result to `_` instead would shadow IPython's last-output variable and stop
# `_`, `__` and `___` from being updated for the rest of the session.
steel.data.hist(figsize=(30, 15), layout=(5, 8));

### Split Data

In [None]:
from sklearn.model_selection import train_test_split
# Feature table and target labels from the fetched dataset.
X = steel.data
y = steel.target

# Hold out a test set. Stratifying on y keeps the class proportions the same
# in both subsets; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

### Train DummyClassifier

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline model: ignores the features and predicts from the class prior
# learned on the training labels.
dc = DummyClassifier(strategy='prior')
dc.fit(X_train, y_train)

# Test-set accuracy of the baseline; real models should beat this number.
dc.score(X_test, y_test)

### Train KNN-based model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier

# Chain feature standardization with a k-nearest-neighbors classifier so the
# scaler is fitted on the training data only and reapplied at predict time.
knc = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Fit both steps; as the last expression, the fitted pipeline is displayed.
knc.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted KNN pipeline on the held-out test set, for
# comparison with the DummyClassifier baseline score above.
knc.score(X_test, y_test)

## Exercise 1

1. Load the Wisconsin breast cancer dataset from `sklearn.datasets.load_breast_cancer`.
2. Are the labels imbalanced? (**Hint**: `np.bincount`)
3. Split the data into a training and test set.
4. Create a pipeline with a `StandardScaler` and `LogisticRegression` and fit on the training set.
5. Evaluate the pipeline on the test set.
6. **Extra**: Use `sklearn.metrics.f1_score` to compute the f1 score on the test set.

In [None]:
# Write your solution in this cell. To see the reference solution instead,
# uncomment the %load line below and run the cell.
# %load solutions/00-ex01-solutions.py