# Sklearn

## sklearn.model_selection

Документация: https://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
# Load the Iris dataset via sklearn.datasets; the cells below read its `data` and `target` attributes.
iris = datasets.load_iris()

In [3]:
# Hold out 30% of the Iris objects as a test set.
# A fixed random_state pins the shuffle, so the split (and everything printed
# from it in the following cells) is reproducible after a kernel restart.
train_data, test_data, train_labels, test_labels = model_selection.train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=0)

In [4]:
# Class labels of the held-out test objects.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(test_labels)

[0 2 2 0 0 0 2 1 1 0 2 2 2 1 0 1 0 0 1 2 2 1 1 2 1 0 1 0 0 1 1 0 0 1 1 0 0
 2 0 1 1 0 0 0 2]


In [5]:
# Sanity check: the test part should make up 0.3 of the whole dataset.
len(test_labels) / float(len(iris.data))

0.3

In [6]:
# Report how many objects ended up in the train and test parts.
# The message is passed as one parenthesised argument, so the printed text is
# the same on Python 2 and Python 3.
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(
    len(train_data), len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [7]:
# Show the first five feature rows of each part.
# The header and the array are joined into one string: this prints the same
# text on Python 2 and 3 and avoids the separator space that a two-argument
# print() would insert right after the header's newline.
print('Обучающая выборка:\n' + str(train_data[:5]))
print('\n')
print('Тестовая выборка:\n' + str(test_data[:5]))

Обучающая выборка:
[[ 5.7  2.5  5.   2. ]
 [ 6.7  3.3  5.7  2.5]
 [ 5.1  2.5  3.   1.1]
 [ 6.4  2.8  5.6  2.1]
 [ 6.5  3.   5.5  1.8]]


Тестовая выборка:
[[ 5.4  3.9  1.7  0.4]
 [ 6.4  3.2  5.3  2.3]
 [ 7.1  3.   5.9  2.1]
 [ 5.4  3.4  1.7  0.2]
 [ 4.8  3.4  1.9  0.2]]


In [8]:
# Show the class labels of each part.
# The header and the array are joined into one string: this prints the same
# text on Python 2 and 3 and avoids the separator space that a two-argument
# print() would insert right after the header's newline.
print('Метки классов на обучающей выборке:\n' + str(train_labels))
print('\n')
print('Метки классов на тестовой выборке:\n' + str(test_labels))

Метки классов на обучающей выборке:
[2 2 1 2 2 0 1 1 1 1 2 2 1 1 0 2 2 2 0 2 2 1 1 0 2 0 2 0 1 1 0 1 0 2 2 0 1
 2 1 2 1 1 2 1 1 1 2 2 1 1 0 0 2 0 1 1 0 2 2 2 1 2 0 1 0 0 1 2 0 2 1 1 1 2
 0 1 2 1 2 2 0 0 0 0 2 1 0 0 1 2 0 2 0 0 2 0 2 1 1 2 0 0 2 2 0]


Метки классов на тестовой выборке:
[0 2 2 0 0 0 2 1 1 0 2 2 2 1 0 1 0 0 1 2 2 1 1 2 1 0 1 0 0 1 1 0 0 1 1 0 0
 2 0 1 1 0 0 0 2]


### Стратегии проведения кросс-валидации

In [9]:
# Build a tiny stand-in dataset whose elements equal their own positions (0..9).
X = range(10)

#### KFold

In [10]:
# Plain K-fold with 5 splits and no shuffling: print the train and test
# indices produced for every split.
kf = model_selection.KFold(n_splits=5)
for train_indices, test_indices in kf.split(X):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [11]:
# K-fold with shuffling and no random_state: the indices are shuffled before
# splitting, and the partition is not pinned to a seed.
kf = model_selection.KFold(n_splits=2, shuffle=True)
for train_indices, test_indices in kf.split(X):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[0 3 4 5 9] [1 2 6 7 8]
[1 2 6 7 8] [0 3 4 5 9]


In [12]:
# Shuffled K-fold with random_state=1: the seed pins the shuffle, so the same
# partition is produced on every run.
kf = model_selection.KFold(n_splits=2, shuffle=True, random_state=1)
for train_indices, test_indices in kf.split(X):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [13]:
# Labels laid out in two blocks: five objects of class 0, then five of class 1.
y = np.array([0] * 5 + [1] * 5)
print(y)

# Stratified K-fold takes the labels into account when building the folds;
# random_state=0 pins the shuffle.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
for train_indices, test_indices in skf.split(X, y):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [14]:
# Labels that alternate between class 0 and class 1.
target = np.array([0, 1] * 5)
print(target)

# Same stratified split as before, but on alternating labels and without a
# fixed seed.
skf = model_selection.StratifiedKFold(n_splits=2, shuffle=True)
for train_indices, test_indices in skf.split(X, target):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[0 1 0 1 0 1 0 1 0 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


#### ShuffleSplit

In [20]:
# Use a separate, larger index set for this demo instead of rebinding X.
# Later cells pair X with a 10-element label array, so overwriting X with 50
# elements here would make them fail (or print 50 rows) on a fresh
# Restart & Run All.
X_large = range(0, 50)

# 10 independent random splits; an integer test_size is an absolute number of
# test objects (10 out of 50 here).
ss = model_selection.ShuffleSplit(n_splits=10, test_size=10)

for train_indices, test_indices in ss.split(X_large):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

[42  5 40  8 43  1 14 29 17  0 12 26  7 10  2  9 13 46  3 24  6 48 30 27 19
 36 35 47 20 44 31 18 23 25 37 49 41 21 28 32] [39 22  4 38 33 45 11 15 34 16]
[23 40 39 41 17 24 48 32 20  9 49 29 21  6  3 46 31 12 26 25 35 18 47 36 11
  1 43  4 22 30 44 34 45  2 19 13 15 14 33 42] [37 10 38  8  5 28  0  7 27 16]
[ 2 26 14 22 27  6 18 40 29 41 49 25 44 48 39 32 33 46  5 31  0 30 45 12 19
  9  8 10 34 42 21 24 47 36 13 20 17 35  4 23] [16 15  7 11 37 38  3 43  1 28]
[45  5 28 40 42  6 12  3 35  9 36 27 15 32 13 29 22 16 43  0 21 24 34  8 31
 44  2 38 10 39  7 25 14 17  1 48 49  4 23 19] [11 26 33 37 20 30 18 41 47 46]
[29  1 14 25 32 13 16 28 40  4  6 49 36 43 24 34 23 41  3  7  9 46 15 39 33
  8 18 27 42 30 11 31 44  2 21 48 19 45 38 37] [17 10 35 22  5 20  0 26 47 12]
[28 30 40 26 37 31  4 42  7 34  0 10 49 18 21 19 24  2 22 25 45 46 32 23 39
 47  6 43 11 38 29  1 13 17 20  3 48 16 33  9] [35 12  5 27 41 14 44 36  8 15]
[22  7 48 11 32 44 28 23 15 35 26 13 12 30  5 33  4 16  2 14 43 34 36 

#### StratifiedShuffleSplit

In [None]:
# Ten labels: five objects of class 0 followed by five of class 1.
target = np.array([0] * 5 + [1] * 5)
print(target)

# 4 stratified random splits with 20% of the objects in the test part.
# NOTE: X must contain exactly as many elements as target (10); scikit-learn
# rejects inputs with inconsistent numbers of samples.
sss = model_selection.StratifiedShuffleSplit(n_splits=4, test_size=0.2)
for train_indices, test_indices in sss.split(X, target):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_indices))

#### Leave-One-Out

In [25]:
# Leave-one-out: each split holds out a single object as the test set.
loo = model_selection.LeaveOneOut()

for train_indices, test_index in loo.split(X):
    # One formatted argument gives identical output on Python 2 and Python 3.
    print('{} {}'.format(train_indices, test_index))

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators