# Sklearn

## sklearn.model_selection

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [5]:
print 'Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [6]:
print 'Обучающая выборка:\n', train_data[:5]
print '\n'
print 'Тестовая выборка:\n', test_data[:5]

Обучающая выборка:
[[ 7.4  2.8  6.1  1.9]
 [ 7.7  2.6  6.9  2.3]
 [ 6.7  3.1  4.4  1.4]
 [ 5.1  3.5  1.4  0.3]
 [ 5.7  2.8  4.1  1.3]]


Тестовая выборка:
[[ 4.8  3.   1.4  0.3]
 [ 5.5  4.2  1.4  0.2]
 [ 6.3  2.9  5.6  1.8]
 [ 7.1  3.   5.9  2.1]
 [ 5.3  3.7  1.5  0.2]]


In [7]:
print 'Метки классов на обучающей выборке:\n', train_labels
print '\n'
print 'Метки классов на тестовой выборке:\n', test_labels

Метки классов на обучающей выборке:
[2 2 1 0 1 1 2 2 0 2 2 0 2 0 0 0 2 2 1 1 1 0 1 2 1 2 0 2 1 0 0 2 2 0 1 2 1
 0 1 0 2 2 1 1 1 0 1 2 1 1 0 2 2 0 1 2 2 2 1 0 2 0 1 0 0 1 0 1 2 2 2 0 2 1
 0 1 1 1 0 2 0 2 1 1 0 0 1 1 0 1 2 0 2 2 0 1 2 0 0 2 1 1 2 1 1]


Метки классов на тестовой выборке:
[0 0 2 2 0 2 0 1 0 0 2 2 1 1 1 0 2 0 2 0 1 2 1 0 2 0 0 2 2 0 1 0 0 1 0 2 1
 1 0 2 1 2 1 1 0]


### Стратегии проведения кросс-валидации

In [18]:
#сгенерируем короткое подобие датасета, где элементы совпадают с порядковым номером
X = range(0,50)

#### KFold

In [9]:
kf = model_selection.KFold(n_splits = 5)
for train_indices, test_indices in kf.split(X):
    print train_indices, test_indices

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [15]:
kf = model_selection.KFold(n_splits = 2, shuffle = True)
for train_indices, test_indices in kf.split(X):
    print train_indices, test_indices

[0 1 2 6 8] [3 4 5 7 9]
[3 4 5 7 9] [0 1 2 6 8]


In [17]:
kf = model_selection.KFold(n_splits = 2, shuffle = True, random_state = 1)
for train_indices, test_indices in kf.split(X):
    print train_indices, test_indices

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [12]:
y = np.array([0] * 5 + [1] * 5)
print y

skf = model_selection.StratifiedKFold(n_splits = 2, shuffle = True, random_state = 0)
for train_indices, test_indices in skf.split(X, y):
    print train_indices, test_indices

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [13]:
target = np.array([0, 1] * 5)
print target

skf = model_selection.StratifiedKFold(n_splits = 2,shuffle = True)
for train_indices, test_indices in skf.split(X, target):
    print train_indices, test_indices

[0 1 0 1 0 1 0 1 0 1]
[0 2 3 7] [1 4 5 6 8 9]
[1 4 5 6 8 9] [0 2 3 7]


#### ShuffleSplit

In [20]:
ss = model_selection.ShuffleSplit(n_splits = 10, test_size = 0.2)

for train_indices, test_indices in ss.split(X):
    print train_indices, test_indices

[14 23  7 12 27 30 36 19  8  6 48 45 25 37  1 28 15 44  3 46 22 21 43 32 38
 41 26 39  0  9 40 24  5 33 47 10 17 13 18 29] [16  2 35 42  4 20 31 34 11 49]
[ 9 13 18 29 49 38 11 31 15 26  3 47 22 12 10  6  8 33 30 36 14 27 41 23 20
 21 19 43 17  5 28 39  7 32 48 44 25 35 24 34] [45 42 40 46  4  2  1  0 16 37]
[42  2 31 47 39  4 15 46 19 24 45 29  3 21 35 41 32  0 49 44 36 20 38  8 43
 23 10 26 13 28 16 25 40 37 17 27 33  5 14 12] [48  7 18 11 22 30  9  1 34  6]
[21 29 14 19 45  3 32 47 25 18  1  5 31 33 24  9  0 23 12 38  4 22 10 41 28
 15 39  7 26 34 20 27  8 17 42 49 43 30 11  2] [46 37 13 16 40  6 35 48 36 44]
[10 11  9 12 25 40 33 48 44 46 21  3 13 42 29 49 23 16 27 18  4 47 15  2 26
 32 28 22 39 43 36  8  5  0  1 41 24  7 35 30] [17 38 45  6 20 37 31 14 19 34]
[ 5 19 43 34 44 37 45 27 49 46 21 15 48  1  9 32 13 28 36  8 26 24 29 10  3
 12 23 33 16 30 42 38 47  6  2  7  4  0 11 31] [20 35 22 25 41 14 39 40 17 18]
[29 43 14 39 21  8  5 20 24 48 46 37  3  2 23 28 45 16 13 44 30 26 27 

#### StratifiedShuffleSplit

In [15]:
target = np.array([0] * 5 + [1] * 5)
print target

sss = model_selection.StratifiedShuffleSplit(n_splits = 4, test_size = 0.2)
for train_indices, test_indices in sss.split(X, target):
    print train_indices, test_indices

[0 0 0 0 0 1 1 1 1 1]
[4 9 3 8 1 2 7 5] [6 0]
[9 7 3 1 4 6 2 5] [0 8]
[9 2 1 6 7 4 3 5] [8 0]
[3 8 9 2 6 4 1 5] [7 0]


#### Leave-One-Out

In [16]:
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    print train_indices, test_index

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators