# K-fold Cross Validation

Here we will look at the stratified K-Folds cross-validator from Scikit-learn.

Instead of validating our machine learning algorithm with `train_test_split` and running one iteration of the train/test procedure, we split our data `K` times (below `K=5`) and run `K` iterations of the train/test cycle. By the end of the process, all of our data has been used as test data once (and has had a prediction made about it), and has been used as training data `K-1` times.

![](https://upload.wikimedia.org/wikipedia/commons/thumb/b/b5/K-fold_cross_validation_EN.svg/500px-K-fold_cross_validation_EN.svg.png)

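The class `StratifiedKFold` provides train/test indices to split data into train/test sets for each iteration. "Stratified" means that each fold keeps roughly the same proportion of each class as the full data set. With `shuffle=True` (used below), the samples are shuffled before being assigned to folds.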

We can gather all of the predictions and compare them to a reordered version of the original labels at the end to get overall scores.

In [None]:
# Download libraries as needed:
# (Replace pip with conda where applicable)

# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install scikit-learn

In [None]:
# Download data

import urllib.request
import os

def download_data(path):
    """Download a data file to ``path`` unless it is already there.

    The file is fetched from the ualberta-rcg/python-machine-learning
    GitHub repository, where it is expected under ``notebooks/<path>``.

    Args:
        path: Relative path of the data file. It is used both as the
            local destination and as the tail of the remote URL.

    Returns:
        None. Prints a confirmation line after a successful download.
    """
    if os.path.exists(path):
        # Already downloaded; avoid hitting the network again.
        return

    # Create the directory that will actually hold the file, rather than
    # a hard-coded one, so the function works for any relative path.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    url = 'https://raw.githubusercontent.com/ualberta-rcg/python-machine-learning/main/notebooks/' + path
    urllib.request.urlretrieve(url, path)
    print("Downloaded " + path)

# Fetch the Titanic training CSV used by the cells below; download_data
# returns immediately if the file already exists locally.
download_data('data/titanic/train.csv')

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Seed NumPy's global random generator. Neither the splitter nor the
# classifier below is given an explicit random_state, so this seed is
# what makes reruns of this cell repeatable.
np.random.seed(1337)


def _print_scores(true_labels, predicted):
    """Print accuracy, precision and recall for one set of predictions."""
    print('Accuracy: {}'.format(accuracy_score(true_labels, predicted)))
    print('Precision: {}'.format(precision_score(true_labels, predicted)))
    print('Recall: {}'.format(recall_score(true_labels, predicted)))


# Load data
train_df = pd.read_csv('data/titanic/train.csv')

# Choose features and labels.
# get_dummies one-hot encodes the non-numeric columns; drop_first=True
# drops one redundant indicator column per encoded feature.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_df[features], drop_first=True)
y = train_df['Survived']

skf = StratifiedKFold(n_splits=5, shuffle=True)

# Accumulators for the true labels and the predictions of every fold.
# After the loop, `labels` holds every value of y exactly once (in fold
# order rather than the original order), lined up with `predicted_labels`.
labels = []
predicted_labels = []

# Run one train/test cycle per fold
for fold_index, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    print('\nBatch {}:'.format(fold_index))
    # Uncomment the following to get a lot more output.
    # print('  Indices used for testing: {}'.format(test_indices))

    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    # A fresh, shallow tree is trained on each fold's training portion
    model = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    predictions = model.predict(X_test)

    _print_scores(y_test, predictions)

    # Keep this fold's true labels and predictions for the overall scores
    labels.extend(y_test)
    predicted_labels.extend(predictions)

print('\nOverall:')
_print_scores(labels, predicted_labels)