# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [5]:
print ('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
print ('Обучающая выборка:\n', train_data[:5])
print ('\n')
print ('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[ 5.5  4.2  1.4  0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 5.6  2.5  3.9  1.1]
 [ 6.9  3.1  5.1  2.3]
 [ 5.9  3.   5.1  1.8]]


Тестовая выборка:
 [[ 6.4  2.7  5.3  1.9]
 [ 6.9  3.2  5.7  2.3]
 [ 5.7  2.8  4.5  1.3]
 [ 5.4  3.7  1.5  0.2]
 [ 6.4  3.2  4.5  1.5]]


In [7]:
print ('Метки классов на обучающей выборке:\n', train_labels)
print ('\n')
print ('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [0 0 1 2 2 0 2 2 0 1 0 1 1 1 1 1 0 2 0 1 1 2 2 1 1 0 1 0 0 1 0 2 1 1 0 0 1
 2 0 0 0 2 1 2 1 2 2 0 0 2 0 2 2 0 0 1 0 1 1 1 2 2 2 1 0 1 1 1 2 0 0 1 1 0
 2 0 0 2 1 0 1 0 1 0 1 2 2 1 2 2 2 1 0 0 0 1 1 2 0 2 0 0 1 2 1]


Метки классов на тестовой выборке:
 [2 2 1 0 1 0 2 2 0 0 2 1 2 2 0 2 2 2 2 1 1 2 1 2 0 2 2 0 1 0 0 2 1 0 1 2 2
 0 1 2 1 2 0 0 1]


### Стратегии проведения кросс-валидации

In [21]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,50)

#### KFold

In [22]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print( train_indices, test_indices)

[10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49] [0 1 2 3 4 5 6 7 8 9]
[ 0  1  2  3  4  5  6  7  8  9 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49] [10 11 12 13 14 15 16 17 18 19]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 30 31 32 33 34
 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49] [20 21 22 23 24 25 26 27 28 29]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 40 41 42 43 44 45 46 47 48 49] [30 31 32 33 34 35 36 37 38 39]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] [40 41 42 43 44 45 46 47 48 49]


In [23]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print( train_indices, test_indices)

[ 0  4  7  8  9 10 12 15 17 21 23 25 27 28 29 30 32 33 35 37 39 40 44 45 47] [ 1  2  3  5  6 11 13 14 16 18 19 20 22 24 26 31 34 36 38 41 42 43 46 48 49]
[ 1  2  3  5  6 11 13 14 16 18 19 20 22 24 26 31 34 36 38 41 42 43 46 48 49] [ 0  4  7  8  9 10 12 15 17 21 23 25 27 28 29 30 32 33 35 37 39 40 44 45 47]


In [24]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print( train_indices, test_indices)

[ 0  1  4  5  6  7  8  9 10 11 12 14 15 16 18 20 25 28 30 33 34 37 43 44 47] [ 2  3 13 17 19 21 22 23 24 26 27 29 31 32 35 36 38 39 40 41 42 45 46 48 49]
[ 2  3 13 17 19 21 22 23 24 26 27 29 31 32 35 36 38 39 40 41 42 45 46 48 49] [ 0  1  4  5  6  7  8  9 10 11 12 14 15 16 18 20 25 28 30 33 34 37 43 44 47]


#### StratifiedKFold

In [25]:
y = np.array([0] * 5 + [1] * 5)
print( y)

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print( train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]


ValueError: Found input variables with inconsistent numbers of samples: [50, 10]

In [26]:
target = np.array([0, 1] * 5)
print( target)

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print( train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]


ValueError: Found input variables with inconsistent numbers of samples: [50, 10]

#### ShuffleSplit

In [27]:
ss = model_selection.ShuffleSplit(n_splits = 50, test_size = 10)

for train_indices, test_indices in ss.split(X):
    print( train_indices, test_indices)

[23  7 40 31 28 27  4 35 12 36 21  1 49 15 33 47 43 42  8 17 24 38  5 19  3
 30  2 41 14  9 34  6 20 44 10 37 39  0 48 11] [26 13 29 32 45 25 46 18 16 22]
[ 9 39 42 15 12 44 20 24 26 36 13 11 47 32 38 10 37 17 27 30  2 19 25  7 34
  5 46 49 48 33  6 21 41 23 28  4 14 29  8 31] [40  0 22 35 16 18  1  3 45 43]
[22 10  3 12  5 18  2 13 27  9 35 11 29 41  7 17 38  4 45 36 24 16 26 44 40
 48  8 46 39 31 20  6 47 49 30 33 43 23 15 42] [28 37 25 34  1 14 19 21 32  0]
[ 5 14 47  8 49 35  6 36 19 12 21 43  2 40  0 46  9 34 18 26 10 13 15 45 28
 17 32 31 20 22 27 42 44 38 30  3 29 41 37  7] [24  4 48 16  1 39 25 11 33 23]
[24 40 33 10 35 17  7  2 28 23 19 48  6 37 14 29 13 32 12 43 16 47  3 45  8
 41 25 11 22 18 34  4 42  9 36 39 20 49 15 30] [21  5 38 46 27  0 44 26 31  1]
[47 41 39 43 12 11 22 13 14 15 19 45 23 46 27 36 35 10  8 25 32 24 49  9 16
  3 48 20 42 29 31 30 34 18  6 21 44 40  1  2] [ 5 28 17  4 38 33  7 26 37  0]
[13 45 21 24 35 19  9 47 44  3  4 17  6 31 38  7 11 18 37  5 29 20 42 

In [32]:
ss = model_selection.ShuffleSplit(n_splits = 50, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print( train_indices, test_indices)

[ 2 41 20 47 16 38 40 49 10  8 37  4 32 35 29 43 46 23  0  3 48 45 15 39 25
 36 11 30 24 22 42  6 26  5 44  1  7 31 14 28] [17 19 27 13 18 21 34 12 33  9]
[21 34 36 42  4 41 10 31 33 17 11 35 12 46 30 26 47 44 28 25  7 49 32 38  8
 15 43  9 20 40 16 24 19 22 45 18 29 27  2 13] [37 48  0  1 23  6  5 14  3 39]
[ 6 17 45 47 42 19  4 38 34 43  0  7 20 48  9 30  2 18 27 41 14 24 46 37 40
 16 29 32 49 33 35 39 25 10 15 36 44 28 13  8] [ 3 12 22  5 31 23 11 21  1 26]
[ 8 11 21 40 18 41  3 20 15 23 30  9 32 33 47 38 26 24 43 36 46  5 22 16  4
 34  2 29 31 19  6 45 25  0 48  7 44 35 12 17] [49 28 13 37 10 39 42  1 27 14]
[36  3 13 25 17 30  7 42 27 11 40 43 32 39  2 28 18 22 48  1 21  9  8 47 46
 41 37 14 19 15 23 20 24 31 49 35 29 12 34 33] [ 6 16 26  0 44 45 38  4  5 10]
[18  7 42 48 40 29 28 20 47 33 46 37 14  2 15 45 19 41  0  6 32 27 23  1 22
 16  4 38  8 26 24 34 43 31 11  9 39 12 17  5] [30  3 21 35 10 49 13 44 36 25]
[ 1 17 41 44 13 10 40  2 26 22 31 21 12 16 34 37 24 48  9 45 20  8 11 

#### StratifiedShuffleSplit

In [17]:
target = np.array([0] * 5 + [1] * 5)
print( target)

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print( train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[8 2 0 6 5 7 4 3] [1 9]
[7 3 8 4 1 9 5 0] [2 6]
[7 3 0 9 2 5 4 8] [6 1]
[7 6 4 5 8 1 0 2] [3 9]


#### Leave-One-Out

In [18]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print( train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators