### 测试集与训练集

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [3]:
# Feature matrix and label vector, unpacked in one step.
X, y = iris.data, iris.target

In [4]:
X.shape # 150 rows -> 150 samples; 4 feature columns

(150, 4)

In [5]:
y.shape # 150 labels, one per sample

(150,)

### 分离测试数据集与训练数据集

In [6]:
# Display every label: they are grouped by class (all 0s, then all 1s, then all 2s).
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

不能简单地取前 100 个样本作为训练数据集，因为此数据集的 y 是按类别排好序的：前 100 个样本只包含类别 0 和 1，训练集中将完全没有类别 2。

乱序方法1：<br>
把 X 和 y 先合并成一个矩阵，乱序处理后再分离。

In [7]:
# Glue the labels on as an extra last column so that each row carries its own
# label and features/labels stay aligned when the rows are shuffled.
z = np.hstack((X, y[:, np.newaxis]))
z[:5]

array([[5.1, 3.5, 1.4, 0.2, 0. ],
       [4.9, 3. , 1.4, 0.2, 0. ],
       [4.7, 3.2, 1.3, 0.2, 0. ],
       [4.6, 3.1, 1.5, 0.2, 0. ],
       [5. , 3.6, 1.4, 0.2, 0. ]])

In [8]:
# Shuffle the rows of z in place. No seed is set, so the order differs on every run
# and re-running this cell reshuffles the already shuffled matrix.
np.random.shuffle(z)
z[:5]

array([[7.6, 3. , 6.6, 2.1, 2. ],
       [4.9, 3.1, 1.5, 0.1, 0. ],
       [6.4, 2.8, 5.6, 2.1, 2. ],
       [7.7, 3.8, 6.7, 2.2, 2. ],
       [6.4, 3.2, 5.3, 2.3, 2. ]])

In [9]:
# Separate the shuffled matrix back into features (all but the last column)
# and labels (the last column).
# NOTE(review): despite the names, these hold all 150 shuffled rows, not a train
# subset; and because z is a single float array the labels come back as floats.
X_train, y_train = z[:, :-1], z[:, -1]

In [10]:
# First five shuffled feature rows.
X_train[:5]

array([[7.6, 3. , 6.6, 2.1],
       [4.9, 3.1, 1.5, 0.1],
       [6.4, 2.8, 5.6, 2.1],
       [7.7, 3.8, 6.7, 2.2],
       [6.4, 3.2, 5.3, 2.3]])

In [11]:
# Matching first five labels (floats, since they were sliced out of the float matrix z).
y_train[:5]

array([2., 0., 2., 2., 2.])

乱序方法2：<br>
对索引做随机排列，再用打乱后的索引分别从 X 和 y 中取出测试集与训练集。

In [12]:
# Random permutation of the row indices 0..len(X)-1; X and y themselves are left untouched.
# No seed is set, so the permutation differs on every run.
shuffle_indexes = np.random.permutation(len(X))
shuffle_indexes

array([ 28, 113,  88,  94,  18,  86, 100, 118, 129,  73,  39,  75, 147,
        58, 128, 112, 136,  57, 123,  47, 140,  31, 122,   5, 121, 143,
        32,   3,  17,  10, 134,  82,  46,  62, 139,   9, 116, 102, 117,
        65, 108, 130,  34,  68, 138,  71,  77,  60,  66,  89, 144,  63,
        72,  69,  15,  38,  61, 137,  20,  11,  48,  21,  27,  37,  78,
       125,   7, 133,  22,  52,  36,  53, 106, 127, 132,  85, 114,  45,
        24,  50,  79, 119,  29,  49,  19,   6,  97,  12, 146, 145,  83,
        51, 104,  30,  55,  44, 142,  35,   1,  64, 107,  54,  42,  43,
        98, 109, 103,  81,  56, 126,  16, 115,  41,   2,   4,  92, 135,
        95,  76, 141,   8,  13,  25,  90,  84, 124,  87,  40,  26,  91,
         0,  67,  23, 131,  99,  70, 149,  80,  33,  14, 111, 120,  96,
        93, 110, 101,  59, 148,  74, 105])

In [13]:
# Fraction of the samples to hold out for testing.
test_ratio = 0.2
test_size = int(len(X) * test_ratio) # the product may be a float, so convert it to int
test_size

30

In [14]:
# The first test_size shuffled indices form the test set; the remainder form the training set.
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]

In [15]:
# Pick the rows by index; X and y use the same index arrays, so samples and labels stay aligned.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [16]:
# Sanity check: print the shape of each split, one per line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)


### 使用我们的封装算法

In [17]:
# Project-local implementation of the split.
from model_selection.train_test_split_enc import train_test_split

# Called with its defaults; the shapes printed in the next cell show a 120/30 split.
# NOTE(review): no seed is passed here -- confirm whether the helper supports one.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [18]:
# Sanity check: print the shape of each split, one per line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)


### 测试我们的算法

In [20]:
# Project-local kNN classifier.
from model_selection.KNN_enc import KNNClassifier

# k=3 is presumably the number of neighbours that vote -- confirm in KNN_enc.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
# Predict a label for every held-out sample.
y_predict = my_knn_clf.predict(X_test)

y_predict

array([2, 2, 1, 2, 0, 1, 0, 2, 0, 0, 1, 1, 1, 2, 1, 0, 0, 2, 2, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1])

In [21]:
# True labels of the held-out samples, for comparison with y_predict.
y_test

array([2, 2, 1, 2, 0, 1, 0, 1, 0, 0, 1, 1, 1, 2, 1, 0, 0, 2, 2, 2, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1])

In [22]:
# Number of correct predictions. The comparison yields a boolean array;
# np.sum counts the True entries in one vectorised reduction instead of
# iterating over the array element by element with the builtin sum.
np.sum(y_predict == y_test)

28

In [23]:
# Accuracy = fraction of correct predictions. The mean of the boolean mask is
# exactly (number of True) / (number of samples), computed in one vectorised step.
np.mean(y_predict == y_test)

0.9333333333333333

### sklearn 中的train_test_split

In [24]:
# This rebinds the name train_test_split: from here on it refers to scikit-learn's
# function, not the project-local one imported earlier in this notebook.
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)
# random_state is the random seed, which makes the split reproducible

In [26]:
# Sanity check: print the shape of each split, one per line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep="\n")

(120, 4)
(120,)
(30, 4)
(30,)
