## Example 1:  K-Nearest Neighbor with Iris Data

In [1]:
# Import the required libraries
import numpy as np
from sklearn import datasets

#Import the learning algorithm
from sklearn.neighbors import KNeighborsClassifier

# Fetch the Iris dataset and pull out the feature matrix and the label vector
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

# Display (n_samples, n_features) as the cell output
iris_X.shape

(150, 4)

### Step 1,2: Data Preprocessing, Feature Engineering

In [6]:
# No preprocessing or feature selection is needed for this dataset

### Step 3: Train/Test Data Splitting

In [41]:
# Split the iris data into a training set and a test set.
# train_test_split is the most common way to do this.
from sklearn.model_selection import train_test_split

# test_size=0.5 holds out half of the samples for testing;
# random_state=0 fixes the shuffle so the split is reproducible.
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=0.5,
    random_state=0,
)

In [42]:
# Sanity-check the split: training feature shape next to test label shape
train_shape, test_label_shape = iris_X_train.shape, iris_y_test.shape
print(train_shape, test_label_shape)

# Confirm which container type holds the split data
print(type(iris_X_train))

(75, 4) (75,)
<class 'numpy.ndarray'>


In [38]:
# Another way of splitting: shuffle the row indices with a seeded random
# permutation and hold out the last 10 shuffled rows as the test set.
#
# The result is stored under its own *_perm names so that this demonstration
# does not overwrite iris_X_train / iris_X_test / iris_y_train / iris_y_test
# from train_test_split, which the model training and evaluation cells below
# use (their recorded outputs correspond to the 75-sample test set).
np.random.seed(0)  # fixed seed -> the same permutation on every run
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]

iris_X_train_perm = iris_X[train_idx]
iris_y_train_perm = iris_y[train_idx]
iris_X_test_perm = iris_X[test_idx]
iris_y_test_perm = iris_y[test_idx]

print(iris_X_train_perm.shape, iris_y_test_perm.shape)

(140, 4) (10,)


### Step 4: Model Creation and Training

In [51]:
# Build a k-nearest-neighbors vote classifier with the library defaults
knn = KNeighborsClassifier()

# Train it: the training features are X, the training labels are the targets y.
# fit() is the last expression, so the fitted estimator is shown as cell output.
knn.fit(X=iris_X_train, y=iris_y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### Step 5: Model Evaluation

In [52]:
# Predict the class label of every test sample with the trained model
test_predictions = knn.predict(iris_X_test)
print(test_predictions)

# Probability estimates for the test samples.
# The value is neither printed nor the cell's last expression, so it is not displayed.
knn.predict_proba(iris_X_test)

# Mean accuracy on the test data and labels (shown as the cell output)
knn.score(iris_X_test, iris_y_test)

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2 1 1 2 0 2 0 0 1 2 2 1 2 1 2 1 1 2 2 1 2 1 2 1 0 2 1 1 1 1 2 0 0 2 1 0 0
 1]


0.96

### Advanced Analysis for Model Selection and Evaluation 

#### Cross Validation 

In [62]:
# Import CV from model selection section
from sklearn.model_selection import cross_val_score


# A new, unfitted KNN model (7 neighbors) used only for cross-validation
knn_cv = KNeighborsClassifier(n_neighbors=7)

# Evaluate the model with 10-fold cross-validation on the full dataset;
# cross_val_score returns one accuracy score per fold.
cv_scores = cross_val_score(knn_cv, iris_X, iris_y, cv=10)

# Print each fold's score (accuracy) and their average
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[1.         0.93333333 1.         1.         0.86666667 0.93333333
 0.93333333 1.         1.         1.        ]
cv_scores mean:0.9666666666666668


#### Tuning model Parameters using GridSearch

In [72]:
from sklearn.model_selection import GridSearchCV


# Create a new KNN estimator for the grid search
kNN_with_GS = KNeighborsClassifier()

# Candidate values to try for the model parameter: n_neighbors = 1 .. 49
param_KNN = {'n_neighbors': np.arange(1,50)}

# Exhaustive search over the candidates, scoring each with cv=10 on the training data
kNN_GS = GridSearchCV(estimator=kNN_with_GS, param_grid=param_KNN, cv=10)
kNN_GS.fit(iris_X_train, iris_y_train)

# Best parameter setting found by the search
print(kNN_GS.best_params_)

# Cross-validated score achieved with the best parameters (shown as the cell output)
kNN_GS.best_score_

{'n_neighbors': 3}




0.9733333333333334