# About this Jupyter Notebook

@author: Yingding Wang\
@updated: 26.09.2022

This notebook builds an iris classifier with the iris dataset.

In [77]:
import sys

### About iris dataset

This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.

Reference:
* https://www.tensorflow.org/datasets/catalog/iris
* https://archive.ics.uci.edu/ml/datasets/iris

In [78]:
from collections import namedtuple

# Package versions pinned for this notebook, kept in one immutable record.
# Optional base-image fields are left disabled; the candidate images come from
# Docker Hub (https://hub.docker.com/_/python):
#   base_tf_image="tensorflow/tensorflow:2.10.0"
#   base_python_image="python:3.8.14"
Settings = namedtuple(
    'Settings',
    'tf_io tf_datasets tf_version pandas_version numpy_version',
)

settings = Settings(**{
    'tf_io': "0.27.0",
    'tf_datasets': "4.6.0",
    'tf_version': "2.10.0",
    'pandas_version': "1.5.0",
    'numpy_version': "1.23.3",  # "1.21.6"
})
print(settings)

Settings(tf_io='0.27.0', tf_datasets='4.6.0', tf_version='2.10.0', pandas_version='1.5.0', numpy_version='1.23.3')


In [79]:
# Install the versions pinned in `settings` with the same interpreter that runs
# this kernel (sys.executable), as a per-user (--user) install.
# NOTE(review): settings.tf_io is pinned but tensorflow-io is not installed here - confirm whether it is needed.
!{sys.executable} -m pip install --upgrade --user tensorflow-datasets=="{settings.tf_datasets}" tensorflow=="{settings.tf_version}" pandas=="{settings.pandas_version}" numpy=="{settings.numpy_version}"



In [80]:
# optional clean up the local installation
# !{sys.executable} -m pip uninstall -y tensorflow-datasets tensorflow

In [81]:
# Show the numpy version that pip reports for this kernel's interpreter
# (the `pip list` output is filtered through grep, so a POSIX shell is assumed).
!{sys.executable} -m pip list | grep numpy

numpy                        1.23.3


### TensorFlow Iris Flower Dataset
* Training Deep Learning Model with Iris: https://medium.com/@nutanbhogendrasharma/tensorflow-deep-learning-model-with-iris-dataset-8ec344c49f91
* TensorFlow Dataset mnist example:  https://www.tensorflow.org/datasets/keras_example
* Get feature and label from TF Dataset: https://stackoverflow.com/questions/56226621/how-to-extract-data-labels-back-from-tensorflow-dataset/56226958#56226958

Note:\
iris dataset has only the 'train' split (https://www.tensorflow.org/datasets/catalog/iris)


In [82]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [83]:
# Load iris as (features, label) pairs together with the dataset metadata.
# Only the 'train' split is requested because the catalog offers no other split.
ds_train, ds_info = tfds.load(
    name='iris',
    split=tfds.Split.TRAIN,
    as_supervised=True,
    with_info=True,
    shuffle_files=True,
)
assert isinstance(ds_train, tf.data.Dataset)

### Explore Dataset

In [84]:
# Printing the dataset shows its element_spec: each element is a pair of a
# float32 vector of shape (4,) and a scalar int64 label.
print(ds_train)

<PrefetchDataset element_spec=(TensorSpec(shape=(4,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>


In [85]:
# take(1) does not return the element itself; it returns another dataset
# (a TakeDataset) that still has to be iterated or unpacked.
ds_train.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(4,), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [86]:
# Unpack the one-element dataset to get the element itself:
# a (features, label) tuple of tensors.
[iris_sample] = ds_train.take(1)
print(iris_sample)

(<tf.Tensor: shape=(4,), dtype=float32, numpy=array([5.1, 3.4, 1.5, 0.2], dtype=float32)>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


In [87]:
# Unpack the single element and split it into its feature and label tensors.
[(iris_feature_1, iris_label_1)] = ds_train.take(1)
print(iris_feature_1, iris_label_1, sep="\n")

tf.Tensor([5.1 3.4 1.5 0.2], shape=(4,), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)


### Visualize the dataset

In [88]:
# Convert the dataset into a pandas DataFrame, passing the dataset info along.
# API reference: https://www.tensorflow.org/datasets/api_docs/python/tfds/as_dataframe
size = ds_train.cardinality().numpy()
df = tfds.as_dataframe(
    ds=ds_train.take(size),
    ds_info=ds_info,
)

In [89]:
# Display the frame built by tfds.as_dataframe; labels are rendered together
# with their class names, e.g. "0 (Iris-setosa)".
df

Unnamed: 0,features,label
0,"ndarray(shape=(4,), dtype=float32)",0 (Iris-setosa)
1,"ndarray(shape=(4,), dtype=float32)",2 (Iris-virginica)
2,"ndarray(shape=(4,), dtype=float32)",1 (Iris-versicolor)
3,"ndarray(shape=(4,), dtype=float32)",2 (Iris-virginica)
4,"ndarray(shape=(4,), dtype=float32)",0 (Iris-setosa)
5,"ndarray(shape=(4,), dtype=float32)",1 (Iris-versicolor)
6,"ndarray(shape=(4,), dtype=float32)",1 (Iris-versicolor)
7,"ndarray(shape=(4,), dtype=float32)",1 (Iris-versicolor)
8,"ndarray(shape=(4,), dtype=float32)",0 (Iris-setosa)
9,"ndarray(shape=(4,), dtype=float32)",2 (Iris-virginica)


In [90]:
# Look at the first five rows; here the feature vectors show their raw values.
df.head(5)

Unnamed: 0,features,label
0,"[5.1, 3.4, 1.5, 0.2]",0
1,"[7.7, 3.0, 6.1, 2.3]",2
2,"[5.7, 2.8, 4.5, 1.3]",1
3,"[6.8, 3.2, 5.9, 2.3]",2
4,"[5.2, 3.4, 1.4, 0.2]",0


In [91]:
output_path = "./iris.csv"
# Hand the path to pandas instead of a pre-opened text handle: pandas then
# opens the file itself, applies the requested encoding (it is ignored for an
# already-open text handle) and writes correct line endings on every platform
# (a text handle opened without newline="" doubles them on Windows).
df.to_csv(output_path, index=False, header=True, encoding="utf-8")

## Open the output file
* Right-click "iris.csv" -> Open With -> CSVTable (shows the content in the CSV viewer)
* Right-click "iris.csv" -> Open With -> **Editor** (shows the raw file content)

![image](./screens/csvOpenWith.png)


## Reload the CSV file

In [142]:
# Human-readable column names for the four iris measurements, keyed by the
# position (as a string) of the value inside each feature vector.
feature_names = {
    "0": "sepal length (cm)",
    "1": "sepal width (cm)",
    "2": "petal length (cm)",  # was misspelled "pedal length (cm)"
    "3": "petal width (cm)",
}
feature_names["0"]

'sepal length (cm)'

In [164]:
import pandas as pd

# Read the exported file back in: comma separated, first row is the header,
# and no column is used as the index.
df = pd.read_csv(
    filepath_or_buffer=output_path,
    sep=",",
    header=0,
    index_col=None,
)

In [165]:
# Inspect the reloaded frame: after the CSV round trip the features column
# holds strings such as "[5.1 3.4 1.5 0.2]" rather than arrays.
df

Unnamed: 0,features,label
0,[5.1 3.4 1.5 0.2],0
1,[7.7 3. 6.1 2.3],2
2,[5.7 2.8 4.5 1.3],1
3,[6.8 3.2 5.9 2.3],2
4,[5.2 3.4 1.4 0.2],0
...,...,...
145,[5.1 3.8 1.6 0.2],0
146,[4.9 2.4 3.3 1. ],1
147,[6.7 3.1 5.6 2.4],2
148,[5.5 2.4 3.8 1.1],1


In [167]:
# Sanity check: the reloaded frame must have exactly two columns (features, label).
assert len(df.columns) == 2

In [145]:
import numpy as np

# Each feature vector was stored in the CSV as text like "[5.1 3.4 1.5 0.2]".
# Delete the square brackets and let NumPy parse the space-separated numbers
# back into an array, kept in a helper column named 'numpy'.
# Approach taken from:
# https://stackoverflow.com/questions/45704999/how-to-convert-vector-wrapped-as-string-to-numpy-array-in-pandas-dataframe
df['numpy'] = df['features'].apply(
    lambda text: np.fromstring(
        text.translate(str.maketrans('', '', '[]')),
        sep=' ',
    )
)

In [146]:
# Spread the parsed vectors over one named column per feature position.
for i in range(4):
    column_name = feature_names[str(i)]
    # apply() runs right away, so the lambda reads the current value of i.
    df[column_name] = df["numpy"].apply(lambda vector: vector[i])

In [148]:
# Remove the raw string column and the intermediate array column now that the
# values live in their own named columns. Rebinding the name instead of
# dropping in place, together with errors="ignore", keeps this cell safe to
# re-run: a second execution no longer raises KeyError for missing columns.
df = df.drop(columns=['features', 'numpy'], errors="ignore")

In [156]:
# Split the frame into inputs and target: every column except 'label' goes to
# df_x, while df_y keeps 'label' as a single-column DataFrame.
df_x = df.loc[:, df.columns != 'label']
df_y = df[['label']]

In [157]:
# Feature matrix as a 2-D NumPy array: one row per sample, one column per feature.
df_x.to_numpy()

array([[5.1, 3.4, 1.5, 0.2],
       [7.7, 3. , 6.1, 2.3],
       [5.7, 2.8, 4.5, 1.3],
       [6.8, 3.2, 5.9, 2.3],
       [5.2, 3.4, 1.4, 0.2],
       [5.6, 2.9, 3.6, 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [5.5, 2.4, 3.7, 1. ],
       [4.6, 3.4, 1.4, 0.3],
       [7.7, 2.8, 6.7, 2. ],
       [7. , 3.2, 4.7, 1.4],
       [4.6, 3.2, 1.4, 0.2],
       [6.5, 3. , 5.2, 2. ],
       [5.5, 4.2, 1.4, 0.2],
       [5.4, 3.9, 1.3, 0.4],
       [5. , 3.5, 1.3, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [4.8, 3. , 1.4, 0.1],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [6.7, 3.3, 5.7, 2.1],
       [7.9, 3.8, 6.4, 2. ],
       [6.7, 3. , 5.2, 2.3],
       [5.8, 4. , 1.2, 0.2],
       [6.3, 2.5, 5. , 1.9],
       [5. , 3. , 1.6, 0.2],
       [6.9, 3.1, 5.1, 2.3],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.7, 4.1, 1. ],
       [5.2, 2.7, 3.9, 1.4],
       [6.7, 3. , 5. , 1.7],
       [5.7, 2.6, 3.5, 1. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.5, 3.2, 5.1, 2. ],
       [5.7, 3

In [160]:
# Labels as a 2-D column array (one value per row), because df_y is a
# single-column DataFrame rather than a Series.
df_y.to_numpy()

array([[0],
       [2],
       [1],
       [2],
       [0],
       [1],
       [1],
       [1],
       [0],
       [2],
       [1],
       [0],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [2],
       [0],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [2],
       [0],
       [0],
       [1],
       [2],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [2],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [0],
       [2],
       [1],
       [0],
       [0],
       [1],
       [2],
       [1],
       [1],
       [2],
       [0],
    

In [161]:
# flatten() collapses the column array into a 1-D label vector.
df_y.to_numpy().flatten()

array([0, 2, 1, 2, 0, 1, 1, 1, 0, 2, 1, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 2, 0, 0, 1, 2, 1, 0, 0, 1, 0, 1, 1, 1, 2, 0, 0, 1, 2, 0, 0, 1,
       2, 2, 1, 2, 2, 2, 0, 2, 1, 0, 0, 1, 2, 1, 1, 2, 0, 1, 2, 0, 2, 1,
       2, 1, 2, 1, 0, 1, 2, 1, 0, 1, 1, 2, 2, 2, 0, 2, 1, 2, 1, 0, 1, 0,
       2, 2, 0, 2, 0, 1, 2, 0, 2, 1, 0, 0, 2, 0, 1, 2, 0, 2, 2, 2, 1, 1,
       1, 0, 1, 1, 1, 2, 0, 2, 1, 2, 1, 0, 2, 0, 1, 2, 1, 0])

In [162]:
# Add one dimension back to the flattened array: indexing with None inserts a
# new trailing axis, turning the 1-D vector into a column array again.
df_y.to_numpy().flatten()[:, None]

array([[0],
       [2],
       [1],
       [2],
       [0],
       [1],
       [1],
       [1],
       [0],
       [2],
       [1],
       [0],
       [2],
       [0],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2],
       [2],
       [0],
       [2],
       [0],
       [2],
       [1],
       [1],
       [1],
       [1],
       [1],
       [2],
       [2],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [2],
       [0],
       [0],
       [1],
       [2],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [2],
       [0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [2],
       [2],
       [0],
       [2],
       [1],
       [0],
       [0],
       [1],
       [2],
       [1],
       [1],
       [2],
       [0],
    