# Iris flower data set


The dataset contains 150 records with 5 attributes:
- Petal Length
- Petal Width
- Sepal Length
- Sepal Width
- Class (Species) → `Label`

![flower_anatomy](./res/flower_anatomy.jpg)


### Import Libraries and take a general look at the data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the iris table; the literal string 'NA' is treated as a missing value.
iris = pd.read_csv("./res/iris.csv", na_values=['NA'])

# First rows, as a quick sanity check of the columns and values.
print(iris.head())
print('\n')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
iris.info()
print('\n')

# Number of records per species.
print(iris['species'].value_counts())

### Visualizing the distribution of a dataset

In [None]:
# Overall distribution of sepal length (histogram + KDE + rug marks).
# NOTE: distplot is deprecated in recent seaborn releases in favour of
# histplot/displot; it is kept here so the figure stays the same.
sns.distplot(a=iris["sepal_length"], rug=True)

# One KDE curve of sepal length per species on a shared facet.
# `height` is the current name of the facet-size keyword (formerly `size`).
sns.FacetGrid(iris, hue="species", height=6)\
    .map(sns.kdeplot, "sepal_length")\
    .add_legend()
plt.show()

In [None]:
# Scatter of sepal length vs. sepal width with marginal distributions.
# `height` is the current name of the figure-size keyword (formerly `size`).
sns.jointplot(x="sepal_length", y="sepal_width", data=iris, height=5, kind="scatter")
plt.show()

In [None]:
# Kept so that the '3d' projection is registered on older matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D

# set colors to keep consistency between Axes3D and seaborn chart
colors = {'setosa':'blue', 'versicolor':'orange', 'virginica':'green',\
          'new0':'red', 'new1':'purple'}  #hardcode new color here

# Create the 3D axes through add_subplot: passing keyword arguments such as
# `projection` to Figure.gca() is no longer supported by newer matplotlib.
threedee = plt.figure().add_subplot(111, projection='3d')

# One point per record, colored by species through the `colors` lookup.
point_colors = iris['species'].map(colors)
threedee.scatter(iris['sepal_length'], iris['sepal_width'],
                 iris['petal_length'], c=point_colors)
threedee.set_xlabel('sepal_length')
threedee.set_ylabel('sepal_width')
threedee.set_zlabel('petal_length')
plt.show()

### Visualizing pairwise relationships in a dataset

In [None]:
# Scatter matrix of every feature pair, colored by species,
# with a KDE curve on the diagonal instead of a histogram.
sns.pairplot(
    data=iris,
    hue="species",
    diag_kind="kde",
)
plt.show()

### Divide the dataset into Training Data and Test Data
![Holdout-validation-method](./res/Holdout-validation-method.png)

The split between training data and test data is usually around 80/20, 70/30, or 2/3 to 1/3.

In [None]:
from sklearn import model_selection

# Divide the dataset: 30% of the rows are held out as test data.
# random_state is fixed so the split is reproducible between runs.
iris_training, iris_test = model_selection.train_test_split(
    iris,
    test_size=0.30,
    random_state=42,
)

# Separate each partition into features (x) and the label column (y).
iris_training_x = iris_training.drop("species", axis=1)
iris_training_y = iris_training["species"]
iris_test_x = iris_test.drop("species", axis=1)
iris_test_y = iris_test["species"]

# Preview the first rows of the training labels and training features.
print('Training Y', iris_training_y.head(), '\n', sep='\n')
print('Training X', iris_training_x.head(), sep='\n')

### K-Nearest Neighbor Classifier

In [None]:
from sklearn import neighbors

# Baseline k-NN model with the library's default number of neighbors
# (the default K is 5).
clf = neighbors.KNeighborsClassifier()
clf.fit(iris_training_x, iris_training_y)

# Accuracy of the fitted model on the held-out test data.
baseline_score = clf.score(iris_test_x, iris_test_y)
print("KNN classifier score: ", baseline_score)

### Loss Functions and Optimization

![Optimization Function](./res/Optimization%20Function.png)

In [None]:
# Optimization K
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Candidate K values: the odd numbers from 1 to 49.
odd_neighbors = list(range(1, 50, 2))

# Mean 10-fold cross-validated accuracy on the training data for each K.
cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        iris_training_x,
        iris_training_y,
        cv=10,
        scoring='accuracy',
    ).mean()
    for k in odd_neighbors
]

# Loss function: misclassification error = 1 - accuracy.
misclassification_error = [1 - score for score in cv_scores]
# Legacy alias kept for any code that still reads `MSE`; note that these
# values are misclassification errors, not mean squared errors.
MSE = misclassification_error

# determining best k (the first K that reaches the minimum error)
optimal_k = odd_neighbors[misclassification_error.index(min(misclassification_error))]
print("The optimal number of neighbors is %d" % optimal_k)

# plot misclassification error vs k
plt.plot(odd_neighbors, misclassification_error)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.show()

### Update the model with the optimal K value

In [None]:
# Refit the classifier with the K selected by the cross-validation search
# (`optimal_k`) instead of a hard-coded value, so this cell stays in sync
# with the search if the data or the split changes.
clf = neighbors.KNeighborsClassifier(n_neighbors=optimal_k)
clf.fit(X=iris_training_x, y=iris_training_y)
print("KNN classifier score: ", clf.score(iris_test_x, iris_test_y))

### Predict the new points

In [None]:
# Two unseen flowers to classify, one tuple of four measurements each.
# NOTE(review): the values must follow the column order of the training
# features - presumably sepal_length, sepal_width, petal_length, petal_width.
new_points = [
    (5.2, 3.7, 2.2, 1.1),
    (7.0, 3.3, 5.0, 1.5),
]

# Predicted class and per-class probabilities for each new point.
prediction = clf.predict(new_points)
prediction_proba = clf.predict_proba(new_points)

# Show the predictions, the species names present in the data, and the
# probabilities, one after another.
for shown in (prediction, iris['species'].unique(), prediction_proba):
    print(shown)

### Check the nearest points

In [None]:
# kneighbors(X, n_neighbors=N, return_distance=True)
# Find the 5 training samples closest to the first new point.
nearests = clf.kneighbors([new_points[0]], n_neighbors=5, return_distance=True)
print(nearests)  # Returns (distances array, indices array)

# The returned indices are positions within the data passed to fit()
# (the shuffled training set), not row labels of the full `iris` frame,
# so they must be looked up positionally in `iris_training`.
iris_training.iloc[nearests[1][0]]

### Visualize the new points

In [None]:
# Build one row per new point, labelled 'new0', 'new1', ... so that each
# point gets its own color in the plots.
new_rows = pd.DataFrame(
    [
        {
            'sepal_length': sepal_length,
            'sepal_width': sepal_width,
            'petal_length': petal_length,
            'petal_width': petal_width,
            'species': 'new' + str(position),
        }
        for position, (sepal_length, sepal_width, petal_length, petal_width)
        in enumerate(new_points)
    ]
)

# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# combined frame in one step and leaves `iris` itself untouched.
new_iris = pd.concat([iris, new_rows], ignore_index=True)
new_iris.tail()

In [None]:
# Create the 3D axes through add_subplot: passing keyword arguments such as
# `projection` to Figure.gca() is no longer supported by newer matplotlib.
new_threedee = plt.figure().add_subplot(111, projection='3d')

# Same 3D scatter as before, now including the new points, which are
# colored through their 'new0'/'new1' entries in `colors`.
new_point_colors = new_iris['species'].map(colors)
new_threedee.scatter(new_iris['sepal_length'], new_iris['sepal_width'],
                     new_iris['petal_length'], c=new_point_colors)
new_threedee.set_xlabel('sepal_length')
new_threedee.set_ylabel('sepal_width')
new_threedee.set_zlabel('petal_length')
plt.show()

# Legend for the two new points: marker color -> predicted species.
print('Prediction:\nRed:', prediction[0] + '\nPurple:', prediction[1])

In [None]:
import warnings

# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Circles for the three original species, pixel markers for the two new points.
species_markers = ["o"] * 3 + [","] * 2
sns.pairplot(
    data=new_iris,
    hue="species",
    diag_kind="kde",
    markers=species_markers,
)

# Legend for the two new points: marker color -> predicted species.
print('Prediction:\nRed:', prediction[0] + '\nPurple:', prediction[1])
plt.show()