# Validation - All the scoring we've done so far is wrong

# Machine Learning only cares how well you do on unseen data

### First idea: hold out some data for testing

In [24]:
import pandas as pd
import numpy as np
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation notices;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Load the heart-disease table from a path relative to this notebook.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a row
# index that was saved into the CSV -- presumably `index_col=0` would drop it; confirm.
heart = pd.read_csv('../data/heart.csv')
# Preview the first five rows.
heart.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,fbs,rest_ecg,max_hr,exang,old_peak,slope,ca,thal,disease
0,63,Male,typical,145,233,1,left ventricular hypertrophy,150,0,2.3,3,0.0,fixed,0
1,67,Male,asymptomatic,160,286,0,left ventricular hypertrophy,108,1,1.5,2,3.0,normal,1
2,67,Male,asymptomatic,120,229,0,left ventricular hypertrophy,129,1,2.6,2,2.0,reversable,1
3,37,Male,nonanginal,130,250,0,normal,187,0,3.5,3,0.0,normal,0
4,41,Female,nontypical,130,204,0,left ventricular hypertrophy,172,0,1.4,1,0.0,normal,0


In [2]:
# Feature matrix: the `max_hr` and `rest_bp` columns as a 2-D NumPy array.
predictor_columns = ['max_hr', 'rest_bp']
X = heart.loc[:, predictor_columns].values

# Target vector: the `disease` column (0/1 in the preview) as a 1-D NumPy array.
y = heart.loc[:, 'disease'].values

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Hold out a test set. No test_size is given; the split shown in this notebook
# is 227 training rows and 76 test rows. random_state pins the shuffle so the
# same rows land in each set on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

In [5]:
X_train.shape

(227, 2)

In [6]:
y_train.shape

(227,)

In [7]:
X_test.shape

(76, 2)

In [8]:
y_test.shape

(76,)

### Once data is split into train and test, learn on the training set and evaluate on the test set

In [9]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()

In [10]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf='deprecated', min_samples_split=2,
            min_weight_fraction_leaf='deprecated', presort=False,
            random_state=None, splitter='best')

In [11]:
dtc.score(X_test, y_test)

0.5263157894736842

### Let's see how our Decision Tree did when evaluating it on the same data that it was trained on
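The training score (about 0.96) is much higher than the test score (about 0.53)! The tree has largely memorized the training data, so its training accuracy tells us very little about how it will do on unseen data.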

In [12]:
dtc.score(X_train, y_train)

0.960352422907489

# Cross Validation

![][1]

The **`cross_val_score`** function automates the process of doing cross validation. Simply pass it an estimator, the data, and the number of folds to use (the `cv` argument). The estimator does NOT need to be trained beforehand. **`cross_val_score`** will fit and score a fresh copy on each fold for you.

### `cross_val_score` gives a good indication of how your model will perform on unseen data

Let's use it now:

[1]: images/kf.png

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs -- and comparisons between different
# cross-validation splitters -- are not confounded by the model's own randomness.
# Uses the same seed value (123) as the splits elsewhere in this notebook.
rfc = RandomForestClassifier(random_state=123)

In [25]:
cross_val_score(rfc, X, y, cv=5)

array([0.59016393, 0.60655738, 0.60655738, 0.59016393, 0.54237288])

## Exercise
Use **`cross_val_score`** to see how other machine learning models are likely to fare when used on unseen data.

In [15]:
# your code here

In [20]:
from sklearn.tree import DecisionTreeClassifier
# Fresh, unfitted tree for cross-validation. Seeded so that the random
# tie-breaking between equally good splits is the same on every run.
dtc = DecisionTreeClassifier(random_state=123)

In [21]:
cross_val_score(dtc, X, y, cv = 5)

array([0.49180328, 0.59016393, 0.59016393, 0.63934426, 0.52542373])

In [26]:
from sklearn.linear_model import LogisticRegression
logist = LogisticRegression()
cross_val_score(logist, X, y, cv = 5)

array([0.60655738, 0.67213115, 0.80327869, 0.73770492, 0.61016949])

In [27]:
from sklearn.ensemble import RandomForestClassifier
# Seeded so the fold scores are reproducible from run to run.
rf = RandomForestClassifier(random_state=123)
# Six folds here (rather than five), so six scores are returned.
cross_val_score(rf, X, y, cv = 6)

array([0.61538462, 0.62745098, 0.62      , 0.54      , 0.68      ,
       0.58      ])

# Use KFold specifically from scikit-learn
When `cv` is given as an integer, `cross_val_score` splits the rows in their existing order without shuffling them. To shuffle, you must pass a splitter object as `cv` instead. One of the objects capable of doing this is `KFold` from the `model_selection` module. There are several other cross validation objects there.

### This is not an estimator
We will import and instantiate it like an estimator but it doesn't fit or predict. It is used to split and shuffle the data.

* step 1: import
* step 2: instantiate it (here with shuffling turned on and a fixed `random_state`, rather than the defaults)

In [28]:
from sklearn.model_selection import KFold

In [33]:
# Five folds, shuffling the rows before they are divided; random_state fixes
# the shuffle so fold membership is the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state = 123)
# Display the configured splitter.
kf

KFold(n_splits=5, random_state=123, shuffle=True)

In [32]:
cross_val_score(rfc, X, y, cv=kf)

array([0.63934426, 0.70491803, 0.55737705, 0.56666667, 0.5       ])

In [34]:
# Example 1

In [35]:
from sklearn.model_selection import StratifiedKFold

In [36]:
# Stratified splitter: five shuffled folds with a fixed seed. Per the
# scikit-learn documentation, each fold keeps roughly the same class
# proportions as `y`.
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123)
# Display the configured splitter.
skf

StratifiedKFold(n_splits=5, random_state=123, shuffle=True)

In [38]:
cross_val_score(rfc, X , y, cv = skf)

array([0.59016393, 0.52459016, 0.50819672, 0.60655738, 0.47457627])

In [None]:
# Example 2

In [39]:
from sklearn.model_selection import LeaveOneOut

In [40]:
loo = LeaveOneOut()
loo

LeaveOneOut()

In [41]:
# Leave-one-out: one fit per row, each scored on the single held-out row, so
# every entry in the result is 0.0 (miss) or 1.0 (hit).
# NOTE(review): the full per-row array is long; taking `.mean()` of it would
# give a single accuracy estimate that is easier to compare with the k-fold runs.
cross_val_score(rfc, X, y, cv = loo)

array([0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1.,
       1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1.,
       0., 0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1.,
       1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1.,
       1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 1., 1., 0., 1.

In [42]:
# Example 3

In [43]:
from sklearn.model_selection import LeavePOut

In [47]:
# Leave-2-out: every possible pair of rows is held out once.
lpo = LeavePOut(2)
# Presumably left commented out because of cost: with the 303 rows used here
# (227 train + 76 test) there are C(303, 2) = 45,753 splits, i.e. that many fits.
#cross_val_score(rfc, X, y, cv = lpo)

# Exercise
Practice using KFold along with the other [splitter classes from the API][1].

[1]: https://scikit-learn.org/stable/modules/classes.html#splitter-classes