# Chapter 16: Introduction to Machine Learning 
## Exercises solutions
The dataset used in this chapter can be imported with the *Sklearn* *laod_iris()* function.


In [18]:
# import all the modules
import sklearn
from sklearn.datasets import load_iris 
import numpy as np
import random
import time
import warnings

Load the dataset and show its attributes:

In [19]:
# load the dataset
iris = load_iris() 
# we can see that this dataset is a class 
print(f'Dataset type: \n{type(iris)}\n')
# and it has some functions and attributes that describe it
print(f'Dataset attributes: \n{iris.keys()}\n')
# these are our features (X)
print(f'Feaures names: \n{iris.feature_names}\n')
# these are the targets/labels (y)
print(f'Target names: \n{iris.target_names}\n')
# number of samples
print(f'Number of samples: \n{len(iris.data)}\n')

Dataset type: 
<class 'sklearn.utils.Bunch'>

Dataset attributes: 
dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

Feaures names: 
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

Target names: 
['setosa' 'versicolor' 'virginica']

Number of samples: 
150



## Exercise 1 
Implement your own version of *train_test_split*, so that it generates three sets (train, validation, test). You should accept two arguments, train_size and validation_size. <br>
You then need to internally determine the test_size. 

There would be many ways to resolve this problem. In this implementation we use the inputs to generate random indexes to slice the arrays (sampling) and return *shuffled* train, validation and test sets:

In [538]:
def train_test_split(X, y, train_size, validation_size):
    '''
    Function: 

            Generates train, validation and test sets from arrays of features and targets.

    Inputs: 

            X: array containing the features 
            y: array containing the targets/labels
            train_size: percentage of data to be used for training (takes values beween 0-1)
            validation_size: percentage of the remaining data (1 - tain_size) to be used for validating (takes values beween 0-1)

    Outputs: 

             1. array of features for training of size: 'train_size' * len(data)
             2. array of targets for training of size: 'train_size' * len(data)
             3. array of features for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             4. array of targets for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             5. array of features for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
             6. array of targets for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
    '''
    
    # create a random index of integers 
    ind = random.sample(range(0, len(X)), int(len(X)*train_size))
    #create a mask 
    mask = np.ones(len(X), dtype=bool)
    mask[ind] = False
    # slice the features for the trainig set by the indexes generated
    X_train = X[ind]
    # slice the features for the test set by the mask generated
    X_test = X[mask] 
    # slice the tragets for the train set by the indexes generated
    y_train = y[ind]
    # slice the targets for the test set by the mask generated
    y_test = y[mask] 
    # create index of integers 
    ind_2 = random.sample(range(0, len(X_test)), int(len(X_test)*validation_size)) 
    #create second mask 
    mask_2 = np.ones(len(X_test), dtype=bool)
    mask_2[ind_2] = False
    # slice the features for the validation set by the indexes generated
    X_val = X_test[ind_2]
    # slice the features for the test set by the mask generated
    X_test = X_test[mask_2] 
    # slice the targets for the validation set by the indexes generated
    y_val = y_test[ind_2]
    # slice the targets for the test set by the mask generated
    y_test = y_test[mask_2] 
    # return the sets
    return X_train, y_train, X_val, y_val, X_test, y_test
    
    

Indexing, complemented by the use of *masks* to extract the 'not-indexes', is computationally efficient, as demonstrated below:

In [539]:
start = time.time()
# unpack the function varibales in the train, validation and test sets
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(iris.data, iris.target, train_size = 0.7, validation_size = 0.5)
print(f'Function running time: {(time.time() - start):.10f}\n')

Function running time: 0.0009377003



We print the number of samples in the training, validation and test sets to make sure we obtained the desired splits:

In [540]:
# print the number of samples per dataset
print(f'Number of features for training: {len(X_train)}')
print(f'Number of targets for training: {len(y_train)}')
print(f'Number of features for validating: {len(X_val)}')
print(f'Number of targets for validating: {len(y_val)}')
print(f'Number of features for testing: {len(X_test)}')
print(f'Number of targets for testing: {len(y_test)}')

Number of features for training: 105
Number of targets for training: 105
Number of features for validating: 22
Number of targets for validating: 22
Number of features for testing: 23
Number of targets for testing: 23


Below we notice that, despite our dataset is originally balanced in its classes (33% of labels for each of the 3 target classes), in our test set we end up having just 4 samples of the first class, against 11 samples of class 2. This due to the randomness of our sampling, and usually something we would like to keep an eye on (or control) as it could negatively affect the learning of an algorythm. 

In [532]:
np.unique(y_test, return_counts=True)

(array([0, 1, 2]), array([ 4, 11,  8], dtype=int64))

# Exercise 2
What is a necessary condition for the arguments to the function created in execise 1 to be valid? Implement a test and print an error message if the arguments are not valid. 

In our case, given the dataset, we want to make sure that the received inputs are numpy arrays, that their features and targets arrays are of the same length and that the inputs sizes would generate reasonable sets. We present how to handle these basic exceptions, but bear in mind that in a practical context (i.e., we wanted to use this function in production), where we may receive all sort of data, we would have to better handle data types and other unique exceptions. 

In [259]:
def train_test_split(X, y, train_size, validation_size):
    '''
    Function: 

            Generates train, validation and test sets from arrays of features and targets.

    Inputs: 

            X: array containing the features 
            y: array containing the targets/labels
            train_size: percentage of data to be used for training (takes values beween 0-1)
            validation_size: percentage of the remaining data (1 - tain_size) to be used for validating (takes values beween 0-1)

    Outputs: 

             1. array of features for training of size: 'train_size' * len(data)
             2. array of targets for training of size: 'train_size' * len(data)
             3. array of features for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             4. array of targets for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             5. array of features for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
             6. array of targets for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
    '''
    
    # check conditions 
    allowed_dtypes = [np.ndarray] 
    # check the input data types
    if type(X) not in allowed_dtypes:
        raise TypeError ('Features should be a numpy.ndarray')
    if type(y) not in allowed_dtypes:
        raise TypeError ('Targets should be a numpy.ndarray')
    # check that th arrays are of the same lenght 
    if len(X) != len(y):
        raise ValueError (f'Targets and features should have the same length. Received sizes ({len(X)}, {len(y)}).')
    # check the splits
    if train_size == 0:
        warnings.warn(f'Only validation and test sets were created.', stacklevel=2)
    if validation_size == 0:
        warnings.warn(f'No validation and test sets were created.', stacklevel=2)

    # create a random index of integers 
    ind = random.sample(range(0, len(X)), int(len(X)*train_size))
    #create a mask 
    mask = np.ones(len(X), dtype=bool)
    mask[ind] = False
    # slice the features for the trainig set by the indexes generated
    X_train = X[ind]
    # slice the features for the test set by the mask generated
    X_test = X[mask] 
    # slice the tragets for the train set by the indexes generated
    y_train = y[ind]
    # slice the targets for the test set by the mask generated
    y_test = y[mask] 
    # create second index of integers 
    ind_2 = random.sample(range(0, len(X_test)), int(len(X_test)*validation_size)) 
    #create second mask 
    mask_2 = np.ones(len(X_test), dtype=bool)
    mask_2[ind_2] = False
    # slice the features for the validation set by the indexes generated
    X_val = X_test[ind_2]
    # slice the features for the test set by the mask generated
    X_test = X_test[mask_2] 
    # slice the targets for the validation set by the indexes generated
    y_val = y_test[ind_2]
    # slice the targets for the test set by the mask generated
    y_test = y_test[mask_2] 
    # return the sets
    return X_train, y_train, X_val, y_val, X_test, y_test
    
    

### Testing the errors

Below we pass some 'wrong' arguments to the function and check its responses:

In [257]:
# testing a wrong type of features input
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split('string', iris.target, 0, 1)

TypeError: Features should be a numpy.ndarray

In [260]:
# testing features and target arrays of different sizes
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(np.arange(100), iris.target, 0.7, 0.5)

ValueError: Targets and features should have the same length. Received sizes (100, 150).

In [250]:
# testing an input == 0 for training_size
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(iris.data, iris.target, 0, 1)

  X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(iris.data, iris.target, 0, 1)


In [251]:
# testing an input == 0 for validation_size
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(iris.data, iris.target, 0.7, 0)

  X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(iris.data, iris.target, 0.7, 0)


# Exercise 3

Implement train_test_split version with stratified sampling, i.e. sample such that you have an approximately equal distribution of class labels in each output set.  

In exercise 1 we saw that, by just randomly sample the data, we may create imbalances in the classes. In this exercise we build a function that deals with that and returns training, validation and test sets containing the same percentage of target class as in the dataset provided. 
We will use a slightly different approach in this case and start by creating a unique dataset containing features and targets. <br> We also count the number of target classes (unique labels) and their percentage in the data:

In [508]:
# let us start by creating one array with features and targets together
all = np.c_[iris.data, iris.target]
# we count the number of unique classes (targets) we have
n_classes = len(np.unique(iris.target))
print(f'Number of unique classes: {n_classes}') 

# return the number of instances per each class
data_per_class = np.unique(iris.target, return_counts=True)[1]
# create a dictionary that stores 'class': '% of that class in the data'
classes = {}
for i in range(n_classes):
    classes[i] = data_per_class[i]/len(iris.data)

print(f'Classes: {classes}')

Number of unique classes: 3
Classes: {0: 0.3333333333333333, 1: 0.3333333333333333, 2: 0.3333333333333333}


Then we gather those labels together and extract a dataset for each target class, in this case, one for setosa, one for virginica and one for versicolor. <br> We store each 'unique label' dataset in a list of datasets (lists), which we call *list_classes*: 

In [509]:
# convert the array to a list
all = list(all)
# sort by class
all.sort(key = lambda x: x[-1])
# create a list of lists containing each class 
list_classes = []
# start and finish index to slice the data 
s = 0
f = 0
#for each class in the data slice its data and append it to list_classes
for c in classes.keys():
    f+= int(classes[c]*len(iris.data))
    class_inst = all[s:f]
    s+= int(classes[c]*len(iris.data))
    list_classes.append(class_inst)

Now that we have separated the classes (which in most cases will have a different number of samples), each one in one dataset, we can use the same *train_size* and *validation_size* to sample from those datasets, one at the time, and this will allow us to keep the same representation of classes we had in the original data (*stratification*). <br> We append the setosa, viginica and versicolor samples to *train_array*, *val_array*, *test_array*, each class having its specific train, validation and test splits in each of these lists:    

In [514]:
train_size = 0.7
validation_size = 0.5
train_array = []
val_array = []
test_array = []
for c in list_classes:  
    # convert list to array
    c = np.array(c)  
    # create a random index of integers 
    ind = random.sample(range(0, len(c)), int(len(c)*train_size))
    #create a mask 
    mask = np.ones(len(c), dtype=bool)
    mask[ind] = False
    # slice the features for the trainig set by the indexes generated
    X_train = c[ind]
    # slice the features for the test set by the mask generated
    X_test = c[mask] 
    # create second index of integers 
    ind_2 = random.sample(range(0, len(X_test)), int(len(X_test)*validation_size)) 
    #create second mask 
    mask_2 = np.ones(len(X_test), dtype=bool)
    mask_2[ind_2] = False
    # slice the features for the validation set by the indexes generated
    X_val = X_test[ind_2]
    # slice the features for the test set by the mask generated
    X_test = X_test[mask_2] 
    
    # appending the train, validation and test sets for each class
    train_array.append(X_train)
    val_array.append(X_val)
    test_array.append(X_test)

# this will be equal to the number of classes, since we are generating a dataset for each one
print(len(train_array))

3


We concatenate those arrays to create a single train, validation and test array and we extract the first to the penultimate column to gather the features and the last column to separate the targets (remember we had previously combined our X and y along the first axis to generate a single array):

In [522]:
# concatenate the datasets and separates the features from the targets
train = np.concatenate(train_array)
X_train = train[:, :-1]
y_train = train[:, -1]
validation = np.concatenate(val_array)
X_val = validation[:, :-1]
y_val = validation[:, -1]
test = np.concatenate(test_array)
X_test = test[:, :-1]
y_test = test[:, -1]

## All together

The above scripts result in the below function:

In [24]:
def train_test_split_strat(X, y, train_size, validation_size):
        '''
    Function: 

            Generates stratfied train, validation and test sets from arrays of features and targets.

    Inputs: 

            X: array containing the features 
            y: array containing the targets/labels
            train_size: percentage of data to be used for training (takes values beween 0-1)
            validation_size: percentage of the remaining data (1 - tain_size) to be used for validating (takes values beween 0-1)

    Outputs: 

             1. array of features for training of size: 'train_size' * len(data)
             2. array of targets for training of size: 'train_size' * len(data)
             3. array of features for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             4. array of targets for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             5. array of features for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
             6. array of targets for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
    '''

        all = np.c_[X, y]
        # we count the number of unique classes (targets) we have
        n_classes = len(np.unique(y))

        # return the number of instances per each class
        data_per_class = np.unique(y, return_counts=True)[1]
        # create a dictionary that stores 'class': '% of that class in the data'
        classes = {}
        for i in range(n_classes):
                classes[i] = data_per_class[i]/len(X)
        train_array = []
        val_array = []
        test_array = []

        # convert the array to a list
        all = list(all)
        # sort by class
        all.sort(key = lambda x: x[-1])
        # create a list of lists contianing each class 
        list_classes = []
        # start and finish index to slice the data 
        s = 0
        f = 0
        #for each class in the data slice its data and append it to list_classes
        for c in classes.keys():
                f+= int(classes[c]*len(X))
                class_inst = all[s:f]
                s+= int(classes[c]*len(X))
                list_classes.append(class_inst)

        for c in list_classes:  
                # convert list to array
                c = np.array(c)  
                # create a random index of integers 
                ind = random.sample(range(0, len(c)), int(len(c)*train_size))
                #create a mask 
                mask = np.ones(len(c), dtype=bool)
                mask[ind] = False
                # slice the features for the trainig set by the indexes generated
                X_train = c[ind]
                # slice the features for the test set by the mask generated
                X_test = c[mask] 
                # create second index of integers 
                ind_2 = random.sample(range(0, len(X_test)), int(len(X_test)*validation_size)) 
                #create second mask 
                mask_2 = np.ones(len(X_test), dtype=bool)
                mask_2[ind_2] = False
                # slice the features for the validation set by the indexes generated
                X_val = X_test[ind_2]
                # slice the features for the test set by the mask generated
                X_test = X_test[mask_2] 
                # appending the train, validation and test sets for each class
                train_array.append(X_train)
                val_array.append(X_val)
                test_array.append(X_test)

        # concatenate the datasets and separates the features from the targets
        train = np.concatenate(train_array)
        X_train = train[:, :-1]
        y_train = train[:, -1]
        validation = np.concatenate(val_array)
        X_val = validation[:, :-1]
        y_val = validation[:, -1]
        test = np.concatenate(test_array)
        X_test = test[:, :-1]
        y_test = test[:, -1]

        return X_train, y_train, X_val, y_val, X_test, y_test

It looks quite intimidating when put it all together, but hopefully now we know what each line does. Also, the function still runs very fast:

In [26]:
start = time.time()
# unpack the function varibales in the train, validation and test sets
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split_strat(iris.data, iris.target, train_size = 0.7, validation_size = 0.4)
print(f'Function running time: {(time.time() - start):.10f}\n')

Function running time: 0.0009734631



In [27]:
# print the number of samples per dataset
print(f'Number of features for training: {len(X_train)}')
print(f'Number of targets for training: {len(y_train)}')
print(f'Number of features for validating: {len(X_val)}')
print(f'Number of targets for validating: {len(y_val)}')
print(f'Number of features for testing: {len(X_test)}')
print(f'Number of targets for testing: {len(y_test)}')

Number of features for training: 105
Number of targets for training: 105
Number of features for validating: 18
Number of targets for validating: 18
Number of features for testing: 27
Number of targets for testing: 27


Below we print the number of unique labels in the test set and, as desired, this time we have the same number of labels in this set, in line with the original dataset splits:

In [28]:
# count of unique labels in the test dataset
np.unique(y_test, return_counts=True)

(array([0., 1., 2.]), array([9, 9, 9], dtype=int64))

## Our final function
This is the final train_test_split function with stratification and (some) errors handling:

In [12]:
def train_test_split_strat(X, y, train_size, validation_size):
        '''
    Function: 

            Generates startified train, validation and test sets from arrays of features and targets.

    Inputs: 

            X: array containing the features 
            y: array containing the targets/labels
            train_size: percentage of data to be used for training (takes values beween 0-1)
            validation_size: percentage of the remaining data (1 - tain_size) to be used for validating (takes values beween 0-1)

    Outputs: 

             1. array of features for training of size: 'train_size' * len(data)
             2. array of targets for training of size: 'train_size' * len(data)
             3. array of features for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             4. array of targets for validating of size: (1 - 'train_size') * 'validation_size' * len(data) 
             5. array of features for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
             6. array of targets for testing of size: (1 - 'train_size') * (1 - 'validation_size') * len(data) 
    '''

        # check conditions 
        allowed_dtypes = [np.ndarray] 
        # check the input data types
        if type(X) not in allowed_dtypes:
                raise TypeError ('Features should be a numpy.ndarray')
        if type(y) not in allowed_dtypes:
                raise TypeError ('Targets should be a numpy.ndarray')
         # check that th arrays are of the same lenght 
        if len(X) != len(y):
                raise ValueError (f'Targets and features should have the same length. Received sizes ({len(X)}, {len(y)}).')
        # check the splits
        if train_size == 0:
                warnings.warn(f'Only validation and test sets were created.', stacklevel=2)
        if validation_size == 0:
                warnings.warn(f'No validation and test sets were created.', stacklevel=2)

        all = np.c_[X, y]
        # we count the number of unique classes (targets) we have
        n_classes = len(np.unique(y))

        # return the number of instances per each class
        data_per_class = np.unique(y, return_counts=True)[1]
        # create a dictionary that stores 'class': '% of that class in the data'
        classes = {}
        for i in range(n_classes):
                classes[i] = data_per_class[i]/len(X)
        train_array = []
        val_array = []
        test_array = []

        # convert the array to a list
        all = list(all)
        # sort by class
        all.sort(key = lambda x: x[-1])
        # create a list of lists contianing each class 
        list_classes = []
        # start and finish index to slice the data 
        s = 0
        f = 0
        #for each class in the data slice its data and append it to list_classes
        for c in classes.keys():
                f+= int(classes[c]*len(X))
                class_inst = all[s:f]
                s+= int(classes[c]*len(X))
                list_classes.append(class_inst)

        for c in list_classes:  
                # convert list to array
                c = np.array(c)  
                # create a random index of integers 
                ind = random.sample(range(0, len(c)), int(len(c)*train_size))
                #create a mask 
                mask = np.ones(len(c), dtype=bool)
                mask[ind] = False
                # slice the features for the trainig set by the indexes generated
                X_train = c[ind]
                # slice the features for the test set by the mask generated
                X_test = c[mask] 
                # create second index of integers 
                ind_2 = random.sample(range(0, len(X_test)), int(len(X_test)*validation_size)) 
                #create second mask 
                mask_2 = np.ones(len(X_test), dtype=bool)
                mask_2[ind_2] = False
                # slice the features for the validation set by the indexes generated
                X_val = X_test[ind_2]
                # slice the features for the test set by the mask generated
                X_test = X_test[mask_2] 
                # appending the train, validation and test sets for each class
                train_array.append(X_train)
                val_array.append(X_val)
                test_array.append(X_test)

        # concatenate the datasets and separates the features from the targets
        train = np.concatenate(train_array)
        X_train = train[:, :-1]
        y_train = train[:, -1]
        validation = np.concatenate(val_array)
        X_val = validation[:, :-1]
        y_val = validation[:, -1]
        test = np.concatenate(test_array)
        X_test = test[:, :-1]
        y_test = test[:, -1]

        return X_train, y_train, X_val, y_val, X_test, y_test