In [1]:
from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
iris = datasets.load_iris()

# Keep only the first 10 rows so the printed index arrays stay readable:
# X is the (samples, features) matrix and y holds the matching targets.
X, y = iris.data[:10], iris.target[:10]

# Ten independent shuffles, each holding out 20% of the rows for testing.
# The fixed random_state keeps the printed splits identical between runs.
cvsplt = StratifiedShuffleSplit(test_size=0.2, n_splits=10, random_state=0)

for train_index, test_index in cvsplt.split(X, y):
  print("train indices:", train_index, "/ test indices:", test_index)
  # Materialise the train and test partitions for the current split
  X_train, y_train = X[train_index], y[train_index]
  X_test, y_test = X[test_index], y[test_index]

train indices: [9 8 4 3 6 2 7 1] / test indices: [5 0]
train indices: [1 6 8 3 0 2 4 5] / test indices: [9 7]
train indices: [8 7 3 5 1 9 4 6] / test indices: [0 2]
train indices: [5 0 9 8 6 2 3 1] / test indices: [4 7]
train indices: [5 4 6 0 1 2 7 3] / test indices: [8 9]
train indices: [8 6 2 0 7 3 1 5] / test indices: [4 9]
train indices: [7 8 5 3 9 6 4 2] / test indices: [1 0]
train indices: [4 8 5 1 6 7 9 3] / test indices: [2 0]
train indices: [8 9 0 4 6 1 5 7] / test indices: [3 2]
train indices: [6 2 4 1 7 8 9 5] / test indices: [3 0]


In [4]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [7]:
%matplotlib inline
iris = datasets.load_iris()

# X is the (samples, features) matrix, y the corresponding class targets
X = iris.data
y = iris.target

# Hold out 20% of the samples for testing. The fixed random_state makes the
# shuffle, and therefore the printed results, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define knn classifier, with 10 neighbours and use the euclidean distance
knn = KNeighborsClassifier(n_neighbors=10, metric="euclidean")

# Fit the classifier on the training portion only
knn.fit(X_train, y_train)

# Predict labels for the held-out test samples
y_pred = knn.predict(X_test)

print(y_test) # True values
print(y_pred) # Predicted values

[0 2 2 2 2 1 0 1 0 1 1 2 1 0 0 2 0 0 0 1 1 1 2 2 1 0 0 1 0 0]
[0 2 2 2 2 1 0 1 0 1 1 2 1 0 0 2 0 0 0 1 1 1 2 2 1 0 0 1 0 0]


In [8]:
def myCrossVal(X, y, foldK):
  '''
  This function performs k-fold cross validation on the sklearn
  KNeighborsClassifier algorithm (10 neighbours, euclidean distance).

  The sample indices are shuffled with a fixed seed (0), split into foldK
  nearly equal bins, and every bin is used exactly once as the test set
  while the remaining bins form the training set.

  [ inputs ]
      X: a data matrix of size (samples, features)
      y: a label array of size (samples,)
      foldK: number of folds, with 2 <= foldK <= number of samples

  [ outputs ]
      accuracy_fold: a list of foldK accuracy values, one per fold

  [ raises ]
      ValueError: if foldK is smaller than 2 or larger than the number of samples
  '''

  from sklearn.neighbors import KNeighborsClassifier
  from sklearn import metrics

  # Accept array-likes (e.g. nested lists) as well as numpy arrays
  X = np.asarray(X)
  y = np.asarray(y)
  n_samples = len(X)

  # With fewer than 2 folds there is no training data, and with more folds
  # than samples some test bins would be empty; fail early with a clear message
  if not 2 <= foldK <= n_samples:
    raise ValueError(
        "foldK must be between 2 and the number of samples (%d), got %r"
        % (n_samples, foldK))

  # Shuffle the sample indices with a private, fixed-seed generator so the
  # folds are reproducible without reseeding numpy's global random state
  rng = np.random.RandomState(0)
  indices = rng.permutation(np.arange(0, n_samples, 1))

  # Split the indices to k different bins
  bins = np.array_split(indices, foldK)

  accuracy_fold = [] # List to store the accuracy of every fold

  for i in range(foldK):
    # Take bin i for testing, all the other bins (in order) for training
    foldTest = bins[i]
    foldTrain = np.concatenate(bins[:i] + bins[i + 1:])

    # Train kNN classifier
    knn = KNeighborsClassifier(n_neighbors=10, metric="euclidean")
    knn.fit(X[foldTrain, :], y[foldTrain])

    # Test on test data
    y_pred = knn.predict(X[foldTest, :])

    # Append the new accuracy to accuracy_fold list
    accuracy_fold.append(metrics.accuracy_score(y[foldTest], y_pred))

  return accuracy_fold

# Evaluate the classifier with 5-fold cross validation on X and y,
# then show the accuracy obtained on each fold
accuracy_fold = myCrossVal(X, y, foldK=5)
print(accuracy_fold)

[1.0, 0.8666666666666667, 1.0, 0.9666666666666667, 0.9666666666666667]


In [9]:
# Summarise the per-fold accuracies: mean and standard deviation (np.std default settings)
print("average accuracy: %.3f (std. %.3f)"
      % (np.mean(accuracy_fold), np.std(accuracy_fold)))

average accuracy: 0.960 (std. 0.049)
