In [1]:
import numpy as np
import matplotlib as mpl
from sklearn import datasets

In [2]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [3]:
# Feature matrix and label vector taken from the loaded dataset.
X, y = iris.data, iris.target

In [4]:
# Dimensions of the feature matrix.
X.shape

(150, 4)

In [5]:
# Dimensions of the label vector.
y.shape

(150,)

In [6]:
# Number of samples (length of the first axis of X).
len(X)

150

## train_test_split

In [7]:
# Random permutation of the integers 0 .. len(X) - 1, used as shuffled row indices.
# NOTE(review): no seed is set in the visible cells, so the order differs on every run.
shuffle_indexes = np.random.permutation(len(X))

In [8]:
# Display the shuffled index array.
shuffle_indexes

array([ 29, 122, 144,  44,  74, 117,  20,  82, 131,  55,  93,  78, 143,
        51,  53,   5,  84,  30, 115, 105, 124,  90,  35,  45, 139,  34,
       136,  94,  46,  48,  72,  39,  85, 100,  92,  62,  87, 145,  80,
        73,  25, 140, 103,  40,  75,  69,  13, 110,  99,  36,   9,  12,
        22,  96,  16,  86,  10,  65,  32, 129,  63,  56,  95, 120, 106,
        64, 137,  67, 112,  61,  27,  50, 113,   7,   6, 121, 102,  28,
        31,  89, 128,  14, 104,  33,  19,  57,  91, 126,  23,  54, 107,
        15,  11, 101,   1,  47,  21,  70,  49, 138,  38,  41,  58,  81,
        17, 127,  71,  97, 141, 123,   4,  37,  18,  83,  98, 148,   8,
       135, 116,   2,  76, 118,  66, 134,  52,  42,   3, 147, 109, 149,
       125, 130, 114, 132,  68, 146, 133,  60,   0, 119, 108, 111,  59,
        77,  79,  43, 142,  88,  26,  24])

In [9]:
# Fraction of the samples held out for testing.
test_ratio = 0.2
# Number of test samples; int() truncates the product toward zero.
test_len = int(len(X) * test_ratio)

In [10]:
# The first test_len shuffled indices form the test set.
test_indexes = shuffle_indexes[:test_len]
test_indexes

array([ 29, 122, 144,  44,  74, 117,  20,  82, 131,  55,  93,  78, 143,
        51,  53,   5,  84,  30, 115, 105, 124,  90,  35,  45, 139,  34,
       136,  94,  46,  48])

In [11]:
# Select the test rows of the features and labels with the same index array.
X_test, y_test = X[test_indexes], y[test_indexes]

In [12]:
# Shapes of the test features and test labels.
X_test.shape, y_test.shape

((30, 4), (30,))

In [13]:
# The remaining shuffled indices (everything after the first test_len) form the training set.
train_indexes = shuffle_indexes[test_len:]

In [14]:
# Select the training rows of the features and labels with the same index array.
X_train, y_train = X[train_indexes], y[train_indexes]

In [15]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((120, 4), (120,))

## 使用我们的算法

In [22]:
from playML.model_selection import train_test_split

In [23]:
# Redo the split with playML's train_test_split, relying on its default arguments.
# NOTE(review): the default test ratio and any seeding live in playML and are not
# visible here — confirm there if a reproducible split is needed.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [24]:
# Shapes of all four arrays returned by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

In [25]:
from playML.KNN import KNN_Classify

In [26]:
# Instantiate playML's KNN classifier with k=3.
my_knn_clf = KNN_Classify(k=3)

In [27]:
# Fit the classifier on the training split; the cell echoes whatever fit() returns.
my_knn_clf.fit(X_train, y_train)

KNN(k=3)

In [28]:
# Predict a label for every sample in the held-out test set.
y_predict = my_knn_clf.predict(X_test)

In [29]:
# Display the predicted labels.
y_predict

array([0, 2, 0, 1, 2, 0, 1, 1, 0, 2, 0, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 0,
       0, 1, 0, 0, 1, 1, 1, 1])

In [30]:
# Display the true labels for comparison with y_predict.
y_test

array([0, 2, 0, 1, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 0,
       0, 1, 0, 0, 1, 1, 1, 1])

In [31]:
# Number of test samples whose predicted label equals the true label.
# np.sum counts the True entries in one vectorized pass instead of iterating
# over the array element by element with the builtin sum().
np.sum(y_predict == y_test)

29

In [32]:
# Accuracy: fraction of test samples that were predicted correctly.
# np.sum is the vectorized replacement for the builtin sum() over an ndarray.
np.sum(y_predict == y_test) / len(y_test)

0.9666666666666667

## sklearn 中的 train_test_split

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# Hold out 20% of the samples; a fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

In [39]:
# Print the training feature and label shapes, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [41]:
# Print the test feature and label shapes, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


In [42]:
from sklearn.neighbors import KNeighborsClassifier  # 使用sklearn中的算法

In [43]:
# scikit-learn k-nearest-neighbours classifier that votes among the 3 nearest samples.
knn_clf = KNeighborsClassifier(n_neighbors=3)

In [44]:
# Fit the scikit-learn classifier on the training split.
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [45]:
# Predict a label for every sample in the held-out test set.
y_predict = knn_clf.predict(X_test)

In [46]:
# Display the predicted labels.
y_predict

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0,
       2, 0, 1, 1, 0, 1, 2, 2])

In [47]:
# Display the true labels for comparison with y_predict.
y_test

array([1, 2, 1, 2, 0, 1, 1, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0,
       2, 0, 1, 1, 0, 1, 2, 2])

In [48]:
# Number of test samples whose predicted label equals the true label.
# np.sum is the vectorized replacement for the builtin sum() over an ndarray.
np.sum(y_predict == y_test)

30

In [49]:
# Accuracy: fraction of test samples that were predicted correctly.
# np.sum is the vectorized replacement for the builtin sum() over an ndarray.
np.sum(y_predict == y_test) / len(y_test)

1.0