# Supervised learning with scikit-learn

In this notebook, we review scikit-learn's API for training a model.

In [None]:
from sklearn.datasets import fetch_openml

# Download the blood-transfusion dataset from OpenML as pandas objects.
# The version is pinned so every run loads exactly the same data; without it
# scikit-learn falls back to "the oldest active version", which may warn when
# several versions share this name.
blood = fetch_openml(
    'blood-transfusion-service-center', version=1, as_frame=True
)

In [None]:
# Preview the first rows of the combined frame (features and target together).
blood.frame.head()

In [None]:
# Preview the feature columns only.
blood.data.head()

In [None]:
# Preview the target values only.
blood.target.head()

In [None]:
# Class balance of the whole dataset: normalize=True reports fractions
# instead of raw counts.
blood.target.value_counts(normalize=True)

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for testing. random_state=0 fixes the shuffle so
# the split is the same on every run. No stratification is requested here, so
# the class proportions of the two parts are not guaranteed to match.
X_train, X_test, y_train, y_test = train_test_split(
    blood.data, blood.target, random_state=0
)

In [None]:
# Preview the training features produced by the split.
X_train.head()

In [None]:
# Class proportions in the training part of the split.
y_train.value_counts(normalize=True)

In [None]:
# Class proportions in the test part, to compare with the training part.
y_test.value_counts(normalize=True)

### Stratify!

In [None]:
# Redo the split with stratify=blood.target so that both parts keep the class
# proportions of the full target. This rebinds X_train, X_test, y_train and
# y_test; every later cell uses this stratified split.
X_train, X_test, y_train, y_test = train_test_split(
    blood.data, blood.target, random_state=0, stratify=blood.target
)

In [None]:
# Class proportions in the training part of the stratified split.
y_train.value_counts(normalize=True)

In [None]:
# Class proportions in the test part of the stratified split.
y_test.value_counts(normalize=True)

## scikit-learn API

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
# Step 1 of the estimator API: create an unfitted model with default
# hyperparameters.
percept = Perceptron()

In [None]:
# Step 2: learn from the training features and their labels.
percept.fit(X_train, y_train)

In [None]:
# Step 3: predict a class label for each training sample.
percept.predict(X_train)

In [None]:
# The true training labels, for comparison with the predictions.
y_train

In [None]:
# Mean accuracy on the data the model was trained on.
percept.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test data.
percept.score(X_test, y_test)

## Another estimator

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forests are randomized (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted forest, and the scores computed from
# it, identical on every run, matching the seeded train/test split.
rf = RandomForestClassifier(random_state=0)

In [None]:
# Same API as before: fit on the training features and labels.
rf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the data the forest was trained on.
rf.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test data.
rf.score(X_test, y_test)

## Exercise 1 

1. Import and evaluate the performance of `sklearn.linear_model.LogisticRegression` on the above dataset
2. How does the test performance compare to the ones we already looked at?

In [None]:
# %load solutions/02-ex1-solution.py

## Exercise 2

1. Load the wine dataset from the `sklearn.datasets` module using the `load_wine` function.
2. Split it into a training and test set using `train_test_split`.
3. Train and evaluate `sklearn.neighbors.KNeighborsClassifier`, `sklearn.ensemble.RandomForestClassifier` and `sklearn.linear_model.LogisticRegression` on the wine dataset.
4. How do they perform on the training and test set?
5. Which one is best on the training set and which one is best on the test set?

In [None]:
# %load solutions/02-ex2-solution.py