# 1. Preparation

## 1.1. Load Modules

In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt 

## 1.2. Get the Dataset from the scikit-learn Package

In [None]:
from sklearn.datasets import load_iris
# Keep the loaded dataset object around: the cells below read its
# .DESCR, .data, .target and .feature_names attributes.
iris = load_iris()

## 1.3. Inspect the Dataset Description

In [None]:
print(iris.DESCR)   # Print the description text that ships with the dataset

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

## 1.4. Get Data

In [None]:
# Pull the feature values, the class labels and the feature names
# out of the loaded dataset in one step.
data, label, columns = (
    iris.data,
    iris.target,
    iris.feature_names,
)

In [None]:
# Wrap the raw feature values in a DataFrame so each column carries its feature name.
data = pd.DataFrame(data=data, columns=columns)

## 1.5. Inspect the Data

In [None]:
# Preview the first five rows of the feature table.
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# (number of rows, number of columns) of the feature table.
data.shape

(150, 4)

# 2. Data Splitting: Train & Test Data

## 2.1. Load Modules

In [None]:
from sklearn.model_selection import train_test_split

## 2.2. Splitting the Data into a Train Set and a Test Set

In [None]:
# Split with the default settings. The result is only displayed, not stored.
# As the output shows, the rows come back shuffled and 112 of the 150 rows
# land in the training part.
train_test_split(data, label)

[     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 17                 5.1               3.5                1.4               0.3
 18                 5.7               3.8                1.7               0.3
 92                 5.8               2.6                4.0               1.2
 142                5.8               2.7                5.1               1.9
 122                7.7               2.8                6.7               2.0
 ..                 ...               ...                ...               ...
 12                 4.8               3.0                1.4               0.1
 33                 5.5               4.2                1.4               0.2
 127                6.1               3.0                4.9               1.8
 108                6.7               2.5                5.8               1.8
 36                 5.5               3.5                1.3               0.2
 
 [112 rows x 4 columns],
      sepal length (cm)  

In [None]:
# Set the train:test ratio. test_size=0.2 holds out 20% of the rows for testing,
# leaving 120 of the 150 rows for training (see the output).
train_test_split(data, label, test_size=0.2)

[     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 40                 5.0               3.5                1.3               0.3
 132                6.4               2.8                5.6               2.2
 8                  4.4               2.9                1.4               0.2
 2                  4.7               3.2                1.3               0.2
 149                5.9               3.0                5.1               1.8
 ..                 ...               ...                ...               ...
 86                 6.7               3.1                4.7               1.5
 62                 6.0               2.2                4.0               1.0
 53                 5.5               2.3                4.0               1.3
 34                 4.9               3.1                1.5               0.2
 146                6.3               2.5                5.0               1.9
 
 [120 rows x 4 columns],
      sepal length (cm)  

In [None]:
# Make the split reproducible. The rows are already shuffled without this argument;
# random_state=2023 fixes the seed so the same split comes back on every run.
train_test_split(data, label, test_size=0.2, random_state=2023)

[     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 9                  4.9               3.1                1.5               0.1
 38                 4.4               3.0                1.3               0.2
 18                 5.7               3.8                1.7               0.3
 119                6.0               2.2                5.0               1.5
 98                 5.1               2.5                3.0               1.1
 ..                 ...               ...                ...               ...
 52                 6.9               3.1                4.9               1.5
 116                6.5               3.0                5.5               1.8
 3                  4.6               3.1                1.5               0.2
 25                 5.0               3.0                1.6               0.2
 87                 6.3               2.3                4.4               1.3
 
 [120 rows x 4 columns],
      sepal length (cm)  

In [None]:
# Keep the split this time: unpack it into train/test features and
# train/test labels, using the same arguments as the previous cell.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=2023,
)

In [None]:
# Show the class labels (0, 1, 2) of the training rows.
y_train

array([0, 0, 0, 2, 1, 2, 0, 1, 2, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 0, 0, 2,
       0, 0, 1, 2, 1, 1, 0, 0, 1, 2, 2, 0, 1, 2, 1, 1, 1, 2, 0, 1, 2, 1,
       2, 2, 1, 0, 1, 1, 0, 2, 2, 0, 1, 1, 0, 2, 0, 1, 1, 2, 0, 0, 0, 2,
       2, 0, 0, 1, 1, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2, 0,
       0, 2, 2, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 2, 0, 1,
       2, 0, 1, 0, 2, 1, 2, 0, 0, 1])